In [0]:
# !pip install scattertext
# !pip install flashtext
# !pip install spacy

Collecting scattertext
[?25l  Downloading https://files.pythonhosted.org/packages/e0/6a/d2b2af772934a946cbebb47cb068b4631ed437a264d9cfc7ef5761f95e00/scattertext-0.0.2.56-py3-none-any.whl (6.8MB)
[K     |████████████████████████████████| 6.8MB 4.1MB/s 
Collecting mock
  Downloading https://files.pythonhosted.org/packages/05/d2/f94e68be6b17f46d2c353564da56e6fb89ef09faeeff3313a046cb810ca9/mock-3.0.5-py2.py3-none-any.whl
Installing collected packages: mock, scattertext
Successfully installed mock-3.0.5 scattertext-0.0.2.56
Collecting flashtext
  Downloading https://files.pythonhosted.org/packages/81/d8/2cd0656eae456d615c2f1efbcae8dfca2cb871a31f34ba8925aba47d5e09/flashtext-2.7.tar.gz
Building wheels for collected packages: flashtext
  Building wheel for flashtext (setup.py) ... [?25l[?25hdone
  Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9298 sha256=a03e5e7de607ac0e05bc97abba28d04a0f82be69d8032cbebaf293b3c8e4a8b7
  Stored in directory: /root/.cache/pip/

In [0]:
#breaks reviews up into individual words, tallies up word occurrences and extracts phrases where word appears.
#spacy and scattertext are not used because the results are decent without it and for companies with few reviews, compute time is instant.

from collections import Counter
from flashtext import KeywordProcessor
import scattertext as st
import json
import warnings
import pandas as pd
import numpy as np
from lxml import html
from requests import Session
from concurrent.futures import ThreadPoolExecutor as Executor
import requests
import re
import spacy
nlp = spacy.load("en_core_web_sm")
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.set_option('display.max_colwidth', 1000)
base_url = "https://www.yelp.com/biz/" 
api_url = "/review_feed?sort_by=date_desc&start="
bid = 'lJAGnYzku5zSaLnQ_T6_GQ'


class Scraper():
    def __init__(self):
        self.data = pd.DataFrame()

    def get_data(self, n, bid=bid):
        with Session() as s:
            with s.get(base_url+bid+api_url+str(n*20)) as resp: #makes an http get request to given url and returns response as json
                r = json.loads(resp.content) #converts json response into a dictionary
                _html = html.fromstring(r['review_list']) #loads from dictionary

                dates = _html.xpath("//div[@class='review-content']/descendant::span[@class='rating-qualifier']/text()")
                reviews = [el.text for el in _html.xpath("//div[@class='review-content']/p")]
                ratings = _html.xpath("//div[@class='review-content']/descendant::div[@class='biz-rating__stars']/div/@title")

                df = pd.DataFrame([dates, reviews, ratings]).T

                self.data = pd.concat([self.data,df])

    def scrape(self): #makes it faster
        # multithreaded looping
        with Executor(max_workers=40) as e:
            list(e.map(self.get_data, range(10)))

s = Scraper()
s.scrape()
df = s.data
df = df.dropna()


In [0]:
def customtokensize(text):
    return re.findall("[\w']+", str(text))

df['tokenized_text'] = df[1].apply(customtokensize)
stopwords = ['and','was','were','had','check-in','=','= =','u','want', 'u want', 'cuz','him',"i've",'on', 'her','told','ins', '1 check','I', 'i"m', 'i', ' ', 'it', "it's", 'it.','they', 'the', 'this','its', 'l','they','this',"don't",'the ', ' the', 'it', 'i"ve', 'i"m', '!', '1','2','3','4', '5','6','7','8','9','0','/','.',',']

# stopwords = [',','"','!','-','&','?','was']

def filter_stopwords(text):
  nonstopwords = []
  for i in text:
    if i not in stopwords:
      nonstopwords.append(i)
  return nonstopwords
df['tokenized_text'] = df['tokenized_text'].apply(filter_stopwords)
df['parts_of_speech_reference'] = df['tokenized_text'].apply(filter_stopwords)
df['parts_of_speech_reference'] = df['parts_of_speech_reference'].str.join(' ')

def find_noun_noun(x):
  noun_list = []
  doc = nlp(str(x))
  try:
    for token in range(len(doc)):
      sub_list = []
      if doc[token].pos_ == 'NOUN'and doc[token+1].pos_ == 'NOUN':
        sub_list.append(doc[token-1])
        sub_list.append(doc[token])
        sub_list.append(doc[token+1])
      if len(sub_list) != 0 and sub_list not in noun_list:
        noun_list.append(sub_list)
  except IndexError as e:
    pass
  return noun_list

def find_adj_noun(x):
  adj_noun_list = []
  doc = nlp(str(x))
  try:
    for token in range(len(doc)):
      sub_list = []
      if doc[token].pos_ == 'ADJ'and doc[token+1].pos_ == 'NOUN':
        sub_list.append(doc[token-1])
        sub_list.append(doc[token])
        sub_list.append(doc[token+2])
      if len(sub_list) != 0 and sub_list not in adj_noun_list:
        adj_noun_list.append(sub_list)
  except IndexError as e:
    pass
  return adj_noun_list

def find_the(x):
  the_list = []
  doc = nlp(str(x))
  try:
    for token in range(len(doc)):
      sub_list = []
      if doc[token].text == 'the' or doc[token].text == 'a' or doc[token].text == 'an':
        sub_list.append(doc[token+1])
        sub_list.append(doc[token+2])
        sub_list.append(doc[token+3])
        # sub_list.append(doc[token+4])
      if len(sub_list) != 0 and sub_list not in the_list:
        the_list.append(sub_list)
  except IndexError as e:
    pass
  return the_list

df['word_segments_nn'] = df['parts_of_speech_reference'].apply(find_noun_noun)
df['word_segments_adjn'] = df['parts_of_speech_reference'].apply(find_adj_noun)
df['word_segments_the'] = df['parts_of_speech_reference'].apply(find_the)

noun_noun_phrases = []
for i in df['word_segments_nn']:
  for x in i:
    string = ' '.join([str(elem) for elem in x]) 
    noun_noun_phrases.append(string)
adj_noun_phrases = []
for i in df['word_segments_adjn']:
  for x in i:
    string = ' '.join([str(elem) for elem in x]) 
    adj_noun_phrases.append(string)
the_phrases = []
for i in df['word_segments_the']:
  for x in i:
    string = ' '.join([str(elem) for elem in x]) 
    the_phrases.append(string)

all_phrases = noun_noun_phrases + adj_noun_phrases + the_phrases
print(all_phrases)

['high quality spot', 'fresh creole flavor', 'spicy tomato jam', 'tomato jam lightens', 'with jazz music', 'Clientele divers friends', 'doing lunch couples', 'lunch couples vacation', 'couples vacation couples', 'up lunch break', 'lunch break individuals', 'favorite stand bys', 'visual appeal flavor', 'ordered beignet flight', 'fried chicken bene', 'chicken bene pork', 'bene pork belly', 'Soul Food OWES', 'Food OWES ME', 'OWES ME NOTHING', 'new orleans style', 'your name number', 'down chalk board', 'small seating area', 'lol Love food', 'southern soul food', 'soul food ambiance', 'food ambiance lol', 'brunch chaser crowd', 'hot links breakfast', 'my eye stomach', 'BTC chicken breast', 'chicken breast side', 'breast side sooo', 'Tried shrimp poboy', 'Top notch spices', 'off hook good', 'plus size love', 'size love space', 'juicy tender chicken', 'a watermelon tea', 'other items menu', 'your table neighbor', 'a guava mimosa', 'grits beignet sampler', 'chose beignet sampler', 'The staff 

In [0]:
class FlashTextExtact(st.FeatsFromSpacyDoc):
    '''
    '''
    def set_keyword_processor(self, keyword_processor):
        '''
        :param keyword_processor: set, phrases to look for
        :return: self
        '''
        self.keyword_processor_ = keyword_processor
        return self

    def get_feats(self, doc):
        '''
        Parameters
        ----------
        doc, Spacy Doc
        Returns
        -------
        Counter noun chunk -> count
        '''
        return Counter(self.keyword_processor_.extract_keywords(str(doc)))

In [0]:
df.head(1)

Unnamed: 0,0,1,2,tokenized_text,parts_of_speech_reference,word_segments_nn,word_segments_adjn,word_segments_the,parse
0,\n 10/9/2019\n,"Looking for a unique, high quality spot in San Fran for a great bite to eat, this is it! We found it looking on Yelp and were not disappointed one bit. Unfortunately located with the surrounding decline of SF but a hidden gem for great service and fantastic food. The seafood was fresh and the creole flavor is true. The grits are some of the best I have ever had, the spicy tomato jam lightens the typically heavy dish. We will be back!",5.0 star rating,"[Looking, for, a, unique, high, quality, spot, in, San, Fran, for, a, great, bite, to, eat, is, We, found, looking, Yelp, not, disappointed, one, bit, Unfortunately, located, with, surrounding, decline, of, SF, but, a, hidden, gem, for, great, service, fantastic, food, The, seafood, fresh, creole, flavor, is, true, The, grits, are, some, of, best, have, ever, spicy, tomato, jam, lightens, typically, heavy, dish, We, will, be, back]",Looking for a unique high quality spot in San Fran for a great bite to eat is We found looking Yelp not disappointed one bit Unfortunately located with surrounding decline of SF but a hidden gem for great service fantastic food The seafood fresh creole flavor is true The grits are some of best have ever spicy tomato jam lightens typically heavy dish We will be back,"[[high, quality, spot], [fresh, creole, flavor], [spicy, tomato, jam], [tomato, jam, lightens]]","[[unique, high, spot], [a, great, to], [a, hidden, for], [for, great, fantastic], [service, fantastic, The], [seafood, fresh, flavor], [ever, spicy, jam], [typically, heavy, We]]","[[unique, high, quality], [great, bite, to], [hidden, gem, for]]","(looking, for, a, unique, ,, high, quality, spot, in, san, fran, for, a, great, bite, to, eat, ,, this, is, it, !, we, found, it, looking, on, yelp, and, were, not, disappointed, one, bit, ., unfortunately, located, with, the, surrounding, decline, of, sf, but, a, hidden, gem, for, great, service, and, fantastic, food, ., the, seafood, was, fresh, and, the, creole, flavor, is, true, ., the, grits, are, some, of, the, best, i, have, ever, had, ,, the, spicy, tomato, jam, lightens, the, typically, heavy, dish, ., we, will, be, back, !)"


In [0]:
keyword_processor = KeywordProcessor(case_sensitive=False)

for phrase in all_phrases:
    keyword_processor.add_keyword(phrase)
feature_extractor = FlashTextExtact().set_keyword_processor(keyword_processor)

df['parse'] = df['parts_of_speech_reference'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromPandas(df,
                              category_col=2,
                              text_col='parts_of_speech_reference',
                              nlp=st.whitespace_nlp_with_sentences,
                              feats_from_spacy_doc=feature_extractor)
          .build())

# print(corpus.get_term_freq_df())

term_freq_df = corpus.get_term_freq_df()
term_freq_df['highratingscore'] = corpus.get_scaled_f_scores('5.0 star rating')

term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores('1.0 star rating')
dh = term_freq_df.sort_values(by= 'highratingscore', ascending = False)
dh = dh[['highratingscore', 'poorratingscore']]
dh = dh.reset_index(drop=False)
dh = dh.rename(columns={'highratingscore': 'score'})
dh = dh.drop(columns='poorratingscore')
# positive_df = dh.head(10)
# negative_df = dh.tail(10)

# html = st.produce_scattertext_explorer(
#     corpus,
#     category='democrat',
#     category_name='Democratic',
#     not_category_name='Republican',
#     metadata=convention_df['speaker'],
#     term_scorer=st.RankDifference(),
#     transform=st.Scalers.dense_rank,
#     pmi_threshold_coefficient=0,
#     minimum_term_frequency=0,
#     minimum_not_category_term_frequency=0,
#     use_full_doc=True
# )

# file_name = 'demo_specific_phrases.html'
# open(file_name, 'wb').write(html.encode('utf-8'))
# print('Open %s in Chrome or Firefox.' % file_name)

In [0]:
dh

Unnamed: 0,term,score
0,line out door,1.0
1,watermelon iced tea,1.0
2,review because just,1.0
3,cup of gumbo,1.0
4,but food soooo,1.0
5,Great service food,1.0
6,fried chicken eggs,0.923501
7,fried chicken benedict,0.923501
8,French soul food,0.823535
9,line to get,0.805222


In [0]:
positive_df = dh.head(10)
negative_df = dh.tail(10)


results = {'positive': [{'term': pos_term, 'score': pos_score} for pos_term, pos_score in
                        zip(positive_df['term'], positive_df['score'])],
            'negative': [{'term': neg_term, 'score': neg_score} for neg_term, neg_score in
                        zip(negative_df['term'], negative_df['score'])]}

In [0]:
results

{'negative': [{'score': 0.28998794732248623, 'term': 'numerous times nothing'},
  {'score': 0.28998794732248623, 'term': 'rude customer service'},
  {'score': 0.28998794732248623, 'term': 'an orange juice'},
  {'score': 0.28998794732248623, 'term': 'small cup not'},
  {'score': 0.28998794732248623, 'term': 'ordered watermelon tea'},
  {'score': 0.28998794732248623, 'term': 'three way beignets'},
  {'score': 0.28998794732248623, 'term': 'lot of water'},
  {'score': 0.28998794732248623, 'term': 'Service umm time'},
  {'score': 0.14416388105176076, 'term': 'crispy pork belly'},
  {'score': 0.0, 'term': 'crazy long line'}],
 'positive': [{'score': 1.0, 'term': 'line out door'},
  {'score': 1.0, 'term': 'watermelon iced tea'},
  {'score': 1.0, 'term': 'review because just'},
  {'score': 1.0, 'term': 'cup of gumbo'},
  {'score': 1.0, 'term': 'but food soooo'},
  {'score': 1.0, 'term': 'Great service food'},
  {'score': 0.9235014647642518, 'term': 'fried chicken eggs'},
  {'score': 0.92350146