In [0]:
#breaks reviews up into individual words, tallies up word occurrences and extracts phrases where word appears.
#spacy and scattertext are not used because the results are decent without it and for companies with few reviews, compute time is instant.


import json
import warnings
import pandas as pd
import numpy as np
from lxml import html
from requests import Session
from concurrent.futures import ThreadPoolExecutor as Executor
import requests
import re
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.set_option('display.max_colwidth', 1000)
base_url = "https://www.yelp.com/biz/" 
api_url = "/review_feed?sort_by=date_desc&start="
bid = 'On6d83tR7_twBBRFzEH6HA'


class Scraper():
    def __init__(self):
        self.data = pd.DataFrame()

    def get_data(self, n, bid=bid):
        with Session() as s:
            with s.get(base_url+bid+api_url+str(n*20)) as resp: #makes an http get request to given url and returns response as json
                r = json.loads(resp.content) #converts json response into a dictionary
                _html = html.fromstring(r['review_list']) #loads from dictionary

                dates = _html.xpath("//div[@class='review-content']/descendant::span[@class='rating-qualifier']/text()")
                reviews = [el.text for el in _html.xpath("//div[@class='review-content']/p")]
                ratings = _html.xpath("//div[@class='review-content']/descendant::div[@class='biz-rating__stars']/div/@title")

                df = pd.DataFrame([dates, reviews, ratings]).T

                self.data = pd.concat([self.data,df])

    def scrape(self): #makes it faster
        # multithreaded looping
        with Executor(max_workers=40) as e:
            list(e.map(self.get_data, range(10)))

s = Scraper()
s.scrape()
df = s.data
df = df.dropna()

df['word_segments_unpacked'] = df[1].apply(lambda x: x[1:-1].split(' '))#turn string comma separated list per word

df['word_segments_unpacked'] = df['word_segments_unpacked'].astype(str)
df['word_segments_unpacked'] = df['word_segments_unpacked'].apply(lambda x: ''.join([str(i) for i in x]))
phrase_count = df[['word_segments_unpacked', 2]]


s= phrase_count.apply(lambda x: pd.Series(x['word_segments_unpacked']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'word_segments_unpacked'

phrase_count = phrase_count.drop('word_segments_unpacked', axis=1).join(s)
phrase_count = pd.DataFrame(df['word_segments_unpacked'].str.split(',').tolist(), index=df[2]).stack()

phrase_count = phrase_count.reset_index()[[0, 2]] # var1 variable is currently labeled 0
phrase_count.columns = ['word_segments_unpacked', 'ratings'] # renaming var1
phrase_count = phrase_count.reset_index(drop=False)
replace_dict_phrase_count = {'[':'',']':'','-':'','!':'','.':'',' ':'',"'":''}
for key in replace_dict_phrase_count.keys():
  phrase_count['word_segments_unpacked'] = phrase_count['word_segments_unpacked'].str.replace(key, replace_dict_phrase_count[key])
phrase_count['word_segments_unpacked'] = phrase_count['word_segments_unpacked'].str.lower()

stopwords = [')','(','\(','\xa0','0','1','2','3','4','5','6','7','8','9','/','$',"'d","'ll","'m",'+','maybe','from','first','here','only','put','where','got','sure','definitely','food','yet','our','go','since','really','very','two',"n't",'with','if',"'s",'which','came','all','me','(',')','makes','make','were','immediately','get','been','ahead','also','that','one','have','see','what','to','we','had','.',"'re",'it','or','he','she','we','us','how','went','no','"','of','has','by','bit','thing','place','so','ok','and','they','none','was','you',"'ve",'did','be','and','but','is','as','&','you','has','-',':','and','had','was','him','so','my','did','would','her','him','it','is','by','bit','thing','place','[',']','while','check-in','=','= =','want', 'good','husband', 'want','love','something','your','they','your','cuz','him',"i've",'her','told', 'check', 'i"m', "it's",'they', 'this','its','they','this',"don't",'the',',', 'it', 'i"ve', 'i"m', '!', '1','2','3','4', '5','6','7','8','9','0','/','.']
def filter_stopwords(text):
  for i in str(text):
    if i not in stopwords:
      return str(text)

#if item in stopwords list partially matches, delete, single letters like 'i' would be deleted from inside individual words if in list
phrase_count = phrase_count[~phrase_count['word_segments_unpacked'].isin(stopwords)]
#if the following words fully matches, filter out
full_match_list = ['i','a','an','am','at','are','in','on','for','','\xa0\xa0','\xa0','\(']
phrase_count = phrase_count[~phrase_count['word_segments_unpacked'].isin(full_match_list)]

#pivot table ratings
phrase_count_pivot = pd.pivot_table(phrase_count, index='word_segments_unpacked', columns='ratings', aggfunc='count', fill_value=0)
phrase_count_pivot.columns = [''.join(col).strip() for col in phrase_count_pivot.columns.values]#flatten index levels part 1
phrase_count_pivot = pd.DataFrame(phrase_count_pivot.to_records())#flatten index levels part 2

#if there are no _# star reviews, add a column of zeros
required_column_names = ['index1.0 star rating', 'index2.0 star rating','index3.0 star rating','index4.0 star rating','index5.0 star rating']
for i in required_column_names:
  if i not in phrase_count_pivot.columns:
    phrase_count_pivot[i] = 0
phrase_count_pivot.sample(10)

#replace the original count by getting an exaggerated scaled tally of reviews to calculate score
phrase_count_pivot['index1.0 star rating'] = phrase_count_pivot['index1.0 star rating']*(-2)
phrase_count_pivot['index2.0 star rating'] = phrase_count_pivot['index2.0 star rating']*(-1)
phrase_count_pivot['index3.0 star rating'] = phrase_count_pivot['index3.0 star rating']*(-0.1)
phrase_count_pivot['index4.0 star rating'] = phrase_count_pivot['index4.0 star rating']*(1)
phrase_count_pivot['index5.0 star rating'] = phrase_count_pivot['index5.0 star rating']*(2)

#get a total score from the sum of exaggerated scores
phrase_count_pivot['score'] = phrase_count_pivot['index1.0 star rating'] + phrase_count_pivot['index2.0 star rating'] + phrase_count_pivot['index3.0 star rating'] + phrase_count_pivot['index4.0 star rating'] + phrase_count_pivot['index5.0 star rating']

phrase_count_pivot['score'] = phrase_count_pivot['score'].div(phrase_count_pivot['score'].max(), axis=0)#normalize
phrase_count_pivot['score'] = phrase_count_pivot['score'].round(decimals=4)#round to 4 decimal places
phrase_count_pivot = phrase_count_pivot.sort_values(by=('score'), ascending=False)
phrase_count_pivot.head(2)

Unnamed: 0,word_segments_unpacked,index5.0 star rating,index1.0 star rating,index2.0 star rating,index3.0 star rating,index4.0 star rating,score
43,tape,6,0,0,-0.0,0,1.0
0,"""painters""",2,0,0,-0.0,0,0.3333


In [0]:
phrase_count_pivot['word_segments_unpacked'] = phrase_count_pivot['word_segments_unpacked'].str.replace('\(', '')
phrase_count_pivot['word_segments_unpacked'] = phrase_count_pivot['word_segments_unpacked'].str.replace('(', '')
phrase_count_pivot['word_segments_unpacked'] = phrase_count_pivot['word_segments_unpacked'].str.replace(')', '')#without these, errors incurr

worst_terms_list = [] 
top_terms_list = []
x,y = phrase_count_pivot.shape#tuple unpacking to get the length of the dataframe
for i in reversed(range(x)):
  try:
    new_df = df[df[1].str.contains(phrase_count_pivot['word_segments_unpacked'].iloc[i])]#if word appears in review, create a dataframe with each row being the word occurring in a different review
    neg_first_df = new_df.sort_values(by=2, ascending=True)#rank the dataframe with worst reviews first
    pos_first_df = new_df.sort_values(by=2, ascending=False)#rank the dataframe with most positive reviews first
    if neg_first_df[1].iloc[0] not in worst_terms_list:#get the lowest star rating review
      worst_terms_list.append(neg_first_df[1].iloc[0])#prevent duplicates
    if pos_first_df[1].iloc[0] not in top_terms_list:#get the highest star rating review
      top_terms_list.append(pos_first_df[1].iloc[0])
  except IndexError as e:
    pass

In [0]:
negative_list = []
for i in range(-30,0):#take the worst 30 terms
  for list_of_words in worst_terms_list:
    word_list = list_of_words.split(' ')
    for word in word_list:
      word = word.replace(',','')
      word = word.replace('.','')
      try: 
        if phrase_count_pivot['word_segments_unpacked'].iloc[i] == word: #find word occurrence in original comma separated word list of reviews
          try:
            index = word_list.index(word)
            string_from_phrases = ','.join(word_list[max(0,index-5):min(index+20, len(word_list))])
            replace_dict_string_from_phrases= {'\xa0':'',' ':'',',':' ',' .':'.','!':'','[':'',']':'','\n':'','1':'','2':'','3':'','4':'','5':'','6':'','7':'','8':'','9':'','0':'','/':'',"'":"'","'":''}
            for key in replace_dict_string_from_phrases.keys():
              string_from_phrases=string_from_phrases.replace(key, replace_dict_string_from_phrases[key])
            negative_list.append(string_from_phrases)
          except ValueError as e:
            pass
      except IndexError as e:#if there are less than 30 words after stopword filtering, just get the first word and its occurrence in the original review
        if phrase_count_pivot['word_segments_unpacked'].iloc[0] == word:
          try:
            index = word_list.index(word)
            string_from_phrases = ','.join(word_list[max(0,index-5):min(index+20, len(word_list))])
            replace_dict_string_from_phrases= {'\xa0':'',' ':'',',':' ',' .':'.','!':'','[':'',']':'','\n':'','1':'','2':'','3':'','4':'','5':'','6':'','7':'','8':'','9':'','0':'','/':'',"'":"'","'":''}
            for key in replace_dict_string_from_phrases.keys():
              string_from_phrases=string_from_phrases.replace(key, replace_dict_string_from_phrases[key])
            negative_list.append(string_from_phrases)
          except ValueError as e:
            pass
negative_df = pd.DataFrame(negative_list)
negative_df = negative_df.reset_index(drop=False)
negative_df = negative_df.rename(columns={'index':'score', 0 : 'term'})
negative_df = negative_df.drop_duplicates(subset='term')
x,y = negative_df.shape#tuple unpacking to get the length of the dataframe
if x < 10:
  for i in range(-40,-30):
    for list_of_words in worst_terms_list:
      word_list = list_of_words.split(' ')
      for word in word_list:
        word = word.replace(',','')
        word = word.replace('.','')
        try:
          if phrase_count_pivot['word_segments_unpacked'].iloc[i] == word:
            try:
              index = word_list.index(word)
              string_from_phrases = ','.join(word_list[max(0,index-5):min(index+20, len(word_list))])
              replace_dict = {'\xa0':'',' ':'',',':' ',' .':'.','!':'','[':'',']':'','\n':'','1':'','2':'','3':'','4':'','5':'','6':'','7':'','8':'','9':'','0':'','/':'',"'":"'","'":''}
              for key in replace_dict.keys():
                string_from_phrases=string_from_phrases.replace(key, replace_dict[key])
              negative_list.append(string_from_phrases)
            except ValueError as e:
              pass
        except IndexError as e:
          if phrase_count_pivot['word_segments_unpacked'].iloc[0] == word:
            try:
              index = word_list.index(word)
              string_from_phrases = ','.join(word_list[max(0,index-5):min(index+20, len(word_list))])
              replace_dict_string_from_phrases= {'\xa0':'',' ':'',',':' ',' .':'.','!':'','[':'',']':'','\n':'','1':'','2':'','3':'','4':'','5':'','6':'','7':'','8':'','9':'','0':'','/':'',"'":"'","'":''}
              for key in replace_dict_string_from_phrases.keys():
                string_from_phrases=string_from_phrases.replace(key, replace_dict_string_from_phrases[key])
              negative_list.append(string_from_phrases)
            except ValueError as e:
              pass
negative_df_addon = pd.DataFrame(negative_list)
negative_df_addon = negative_df_addon.reset_index(drop=False)
negative_df_addon = negative_df_addon.rename(columns={'index':'score', 0 : 'term'})
negative_df = pd.concat([negative_df, negative_df_addon])
negative_df = negative_df.head(10)



In [0]:
positive_list = []
for i in range(0,30):
  for list_of_words in top_terms_list:
    word_list = list_of_words.split(' ')
    for word in word_list:
      word = word.replace(',','')
      word = word.replace('.','')
      try: 
        if phrase_count_pivot['word_segments_unpacked'].iloc[i] == word:
          try:
            index = word_list.index(word)
            string_from_phrases = ','.join(word_list[max(0,index-5):min(index+20, len(word_list))])
            replace_dict = {'\xa0':'',' ':'',',':' ',' .':'.','!':'','[':'',']':'','\n':'','1':'','2':'','3':'','4':'','5':'','6':'','7':'','8':'','9':'','0':'','/':'',"'":"'","'":''}
            for key in replace_dict.keys():
              string_from_phrases=string_from_phrases.replace(key, replace_dict[key])
            negative_list.append(string_from_phrases)
          except ValueError as e:
            pass
      except IndexError as e:
        if phrase_count_pivot['word_segments_unpacked'].iloc[0] == word:
          try:
            index = word_list.index(word)
            string_from_phrases = ','.join(word_list[max(0,index-5):min(index+20, len(word_list))])
            replace_dict = {'\xa0':'',' ':'',',':' ',' .':'.','!':'','[':'',']':'','\n':'','1':'','2':'','3':'','4':'','5':'','6':'','7':'','8':'','9':'','0':'','/':'',"'":"'","'":''}
            for key in replace_dict.keys():
              string_from_phrases=string_from_phrases.replace(key, replace_dict[key])
            negative_list.append(string_from_phrases)
          except ValueError as e:
            pass
  positive_list.append(string_from_phrases)
positive_df = pd.DataFrame(positive_list)
positive_df = positive_df.reset_index(drop=False)
positive_df = positive_df.rename(columns={'index':'score', 0 : 'term'})
positive_df = positive_df.drop_duplicates(subset='term')
x,y = positive_df.shape#tuple unpacking to get the length of the dataframe
for i in range(30,40):
  for list_of_words in top_terms_list:
    word_list = list_of_words.split(' ')
    for word in word_list:
      word = word.replace(',','')
      word = word.replace('.','')
      try:
        if phrase_count_pivot['word_segments_unpacked'].iloc[i] == word:
          try:
            index = word_list.index(word)
            string_from_phrases = ','.join(word_list[max(0,index-5):min(index+20, len(word_list))])
            replace_dict = {'\xa0':'',' ':'',',':' ',' .':'.','!':'','[':'',']':'','\n':'','1':'','2':'','3':'','4':'','5':'','6':'','7':'','8':'','9':'','0':'','/':'',"'":"'","'":''}
            for key in replace_dict.keys():
              string_from_phrases=string_from_phrases.replace(key, replace_dict[key])
            negative_list.append(string_from_phrases)
          except ValueError as e:
            pass
      except IndexError as e:
        if phrase_count_pivot['word_segments_unpacked'].iloc[0] == word:
          try:
            index = word_list.index(word)
            string_from_phrases = ','.join(word_list[max(0,index-5):min(index+20, len(word_list))])
            replace_dict = {'\xa0':'',' ':'',',':' ',' .':'.','!':'','[':'',']':'','\n':'','1':'','2':'','3':'','4':'','5':'','6':'','7':'','8':'','9':'','0':'','/':'',"'":"'","'":''}
            for key in replace_dict.keys():
              string_from_phrases=string_from_phrases.replace(key, replace_dict[key])
            negative_list.append(string_from_phrases)
          except ValueError as e:
            pass
positive_df_addon = pd.DataFrame(negative_list)
positive_df_addon = positive_df_addon.reset_index(drop=False)
positive_df_addon = positive_df_addon.rename(columns={'index':'score', 0 : 'term'})
positive_df = pd.concat([positive_df, positive_df_addon])
positive_df = positive_df.head(10)

In [0]:
results = {'positive': [{'term': pos_term, 'score': pos_score} for pos_term, pos_score in zip(positive_df['term'], positive_df['score'])], 'negative': [{'term': neg_term, 'score': neg_score} for neg_term, neg_score in zip(negative_df['term'], negative_df['score'])]}
results

{'negative': [{'score': 0,
   'term': 'realized I needed some plumbers tape and had no idea where it might be. But I went to the "tape" section of our newly organized'},
  {'score': 3,
   'term': 'cabinet and it was right there (along with the painters tape  packing tape and masking tape)  allowing me to complete my project before I went'},
  {'score': 4,
   'term': 'to charity and what to throw away. Just this morning I realized I needed some plumbers tape and had no idea where it might be.'},
  {'score': 5,
   'term': 'chaos. She was able to understand the interpersonal dynamics of our family and help us make decisions about what to save  what to sell  what'},
  {'score': 6,
   'term': 'had no idea where it might be. But I went to the "tape" section of our newly organized supply cabinet and it was right there'},
  {'score': 7,
   'term': 'was able to understand the interpersonal dynamics of our family and help us make decisions about what to save  what to sell  what to donate'},
  {'