# Scattertext spaCy with Yelp Dataset
Exploratory data analysis and visualization for text data

Medium Article - [Analyze Yelp Dataset with Scattertext spaCy](https://link.medium.com/k3DRTC57I1)

[GitHub Repo](https://github.com/gyhou/yelp_dataset)

Data source: [Yelp Open Dataset](https://www.yelp.com/dataset/)

In [1]:
import pandas as pd

# The CSV ships with the GitHub repo linked at the top of this notebook
reviews_csv = 'yelp_reviews_RV_categories.csv'
df = pd.read_csv(reviews_csv)
# Print (rows, columns), then display the first five rows
print(df.shape)
df.head()

(5342, 15)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,attributes,categories,user_id,review_stars,text,date
0,q0B39iv1bs16PO--eiMfIw,Hobo Camper Country,4020 Edmonton Trail NE,Calgary,AB,T2E 3P6,51.088523,-114.051507,1.5,,"RV Repair, Auto Repair, Automotive",DLURbx1V0QyJrntcTyMR4w,1,it is todays experience that got HCC (Hobo Cam...,2016-09-06 18:06:23
1,q0B39iv1bs16PO--eiMfIw,Hobo Camper Country,4020 Edmonton Trail NE,Calgary,AB,T2E 3P6,51.088523,-114.051507,1.5,,"RV Repair, Auto Repair, Automotive",0RsS3-oJ_MBJ-Fb9I-oBhA,2,"Meh, needed parts got attitude. went somewher...",2016-08-11 17:56:24
2,q0B39iv1bs16PO--eiMfIw,Hobo Camper Country,4020 Edmonton Trail NE,Calgary,AB,T2E 3P6,51.088523,-114.051507,1.5,,"RV Repair, Auto Repair, Automotive",X_W-pHzDboM1jGz60f81cA,1,My father took his motorhome in to get the Ant...,2016-09-26 02:25:28
3,SW_bePWPlMZJZGQ4eT9vrA,E's RV Appliance Repair,"1959 S Power Rd, Ste 103-238",Mesa,AZ,85206,33.379981,-111.68722,5.0,"{'ByAppointmentOnly': 'True', 'BusinessAccepts...","Automotive, Water Heater Installation/Repair, ...",pR29E9_ird-h26CY7t18Ag,5,Eric was great! He diagnosed my LQ horse trail...,2018-05-21 15:46:27
4,SW_bePWPlMZJZGQ4eT9vrA,E's RV Appliance Repair,"1959 S Power Rd, Ste 103-238",Mesa,AZ,85206,33.379981,-111.68722,5.0,"{'ByAppointmentOnly': 'True', 'BusinessAccepts...","Automotive, Water Heater Installation/Repair, ...",KGZGa7Hnx6WiAVqtV_uizQ,5,Always ready to help us whenever we are in tow...,2018-04-29 23:23:56


In [2]:
# Check how rating is distributed
import seaborn as sns
# review_stars holds discrete star values, so plot the count per star.
# sns.distplot is deprecated (seaborn >= 0.11) and overlays a KDE that
# smooths between the integer ratings; countplot works on old and new seaborn.
sns.countplot(x='review_stars', data=df);

In [3]:
# Collapse the 1-5 star scale into two classes: 1-3 -> low, 4-5 -> high
rating_labels = {stars: 'Low Rating' for stars in (1, 2, 3)}
rating_labels.update({stars: 'High Rating' for stars in (4, 5)})
df['rating'] = df['review_stars'].replace(rating_labels)
# Show how many reviews landed in each class
df['rating'].value_counts()

High Rating    3223
Low Rating     2119
Name: rating, dtype: int64

In [None]:
# Split the reviews into two business groups by matching the category text
# (case-insensitive regex; rows with a missing category match neither group)
rv_auto_pattern = 'RV Repair|RV Dealers|RV Rental'
parks_camp_pattern = 'RV Parks|Campgrounds'

is_rv_auto = df['categories'].str.contains(rv_auto_pattern, case=False, na=False)
is_parks_camp = df['categories'].str.contains(parks_camp_pattern, case=False, na=False)

df_RV_Auto = df[is_rv_auto]
df_Parks_Camp = df[is_parks_camp]

## Use NLP on review text

In [None]:
# Make sure the English language model is installed (uncomment the next line to download it)
# !python -m spacy download en_core_web_sm

In [None]:
import spacy
import scattertext
# Available English pipelines are listed at https://spacy.io/models/en
# Load whichever English model is installed locally
nlp = spacy.load('en_core_web_sm')

In [None]:
# Read extra stop words (one per line) from a text file and add them to
# spaCy's default stop-word set
with open('stopwords.txt', 'r') as f:
    # Strip surrounding whitespace and skip blank lines: a plain split('\n')
    # turns a trailing newline into an empty-string "stop word" and keeps
    # padded entries that can never match a term.
    stopwords_file = {line.strip() for line in f if line.strip()}
nlp.Defaults.stop_words |= stopwords_file

In [None]:
# Also fold NLTK's English stop-word list into spaCy's default stop words
# NOTE(review): presumably needs the NLTK 'stopwords' corpus downloaded beforehand — confirm
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
nlp.Defaults.stop_words |= stopWords 

### Set up corpus - Term Frequency and Scaled F-Score

In [None]:
def term_freq(df_yelp):
    """Build term-frequency tables with scaled F-scores for a review frame.

    A scattertext corpus is built from ``df_yelp`` using the ``rating``
    column as the category and the ``text`` column as the document text,
    parsed with the notebook-level ``nlp`` pipeline. Terms found in
    ``nlp.Defaults.stop_words`` are removed from the corpus.

    Parameters
    ----------
    df_yelp : pandas.DataFrame
        Reviews with at least the columns ``rating`` and ``text``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same term table twice: first sorted by ``'High Rating freq'``
        descending, then by ``'Low Rating freq'`` descending. Each has its
        index reset into an ordinary column and carries the extra columns
        ``High_Rating_Score`` and ``Low_Rating_Score`` (scaled F-scores
        rounded to two decimals).
    """
    corpus_builder = scattertext.CorpusFromPandas(df_yelp,
                                                  category_col='rating',
                                                  text_col='text',
                                                  nlp=nlp)
    # ignore_absences=True: stop words that never occur in the corpus are
    # skipped silently instead of raising an error.
    corpus = corpus_builder.build().remove_terms(nlp.Defaults.stop_words,
                                                 ignore_absences=True)

    term_stats = corpus.get_term_freq_df()

    # Attach the raw scaled F-scores for both categories first ...
    score_columns = (('High_Rating_Score', 'High Rating'),
                     ('Low_Rating_Score', 'Low Rating'))
    for score_col, category in score_columns:
        term_stats[score_col] = corpus.get_scaled_f_scores(category)
    # ... then round each score column to two decimals.
    for score_col, _ in score_columns:
        term_stats[score_col] = term_stats[score_col].round(2)

    def sorted_by(freq_col):
        # Most frequent terms first; the old index becomes a regular column.
        return term_stats.sort_values(by=freq_col, ascending=False).reset_index()

    return sorted_by('High Rating freq'), sorted_by('Low Rating freq')

In [None]:
# Frequency and Scaled F-Score for RV Parks and Campgrounds:
# one table sorted by 'High Rating freq', one sorted by 'Low Rating freq'
Parks_Camp_high, Parks_Camp_low = term_freq(df_Parks_Camp)

In [None]:
# First 10 rows when sorted by 'High Rating freq' (descending)
Parks_Camp_high.head(10)

In [None]:
# First 10 rows when sorted by 'Low Rating freq' (descending)
Parks_Camp_low.head(10)

In [None]:
# Frequency and Scaled F-Score for RV Repair, RV Dealers and RV Rental:
# one table sorted by 'High Rating freq', one sorted by 'Low Rating freq'
RV_Auto_high, RV_Auto_low = term_freq(df_RV_Auto)

In [None]:
# First 10 rows when sorted by 'High Rating freq' (descending)
RV_Auto_high.head(10)

In [None]:
# First 10 rows when sorted by 'Low Rating freq' (descending)
RV_Auto_low.head(10)

In [None]:
# Frequency and Scaled F-Score over the full review frame (all 5 RV categories):
# one table sorted by 'High Rating freq', one sorted by 'Low Rating freq'
RV_all_high, RV_all_low = term_freq(df)

In [None]:
# First 10 rows when sorted by 'High Rating freq' (descending)
RV_all_high.head(10)

In [None]:
# First 10 rows when sorted by 'Low Rating freq' (descending)
RV_all_low.head(10)

## Using Scattertext to visualize term associations

In [None]:
# Label each excerpt with the name of business using the metadata parameter
corpus_dataframe = df_Parks_Camp

# The corpus built inside term_freq() is local to that function, so no
# notebook-level `corpus` exists on a fresh kernel. Build it here from the
# same frame that supplies the metadata, with the same settings
# (category = rating, text = review text, stop words removed).
corpus = (scattertext.CorpusFromPandas(corpus_dataframe,
                                       category_col='rating',
                                       text_col='text',
                                       nlp=nlp)
          .build()
          .remove_terms(nlp.Defaults.stop_words, ignore_absences=True))

# 'Low Rating' is the focus category; everything else is shown as 'High Rating'
html = scattertext.produce_scattertext_explorer(corpus,
                                                category='Low Rating',
                                                category_name='Low Rating',
                                                not_category_name='High Rating',
                                                width_in_pixels=1000,
                                                metadata=corpus_dataframe['name'])

In [None]:
html_file_name = "RV-Parks-Campgrounds-Yelp-Review-Scattertext.html"
# Write the explorer page as UTF-8 bytes. The context manager closes the
# file even if the write fails; a bare open(...).write(...) leaves the
# handle open until it is garbage-collected.
with open(html_file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))