In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

from spacy import displacy
from nltk.stem import WordNetLemmatizer


In [2]:
df = pd.read_csv('../data/San_Francisco_restaurant_reviews.csv')

In [3]:
df.shape

(50000, 5)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,restaurant_name,restaurant_rating,customer_rating,review_text
0,0,Brenda's French Soul Food,4.0,5,Absolutely amazing. I&#39;ve dined here at lea...
1,1,Brenda's French Soul Food,4.0,5,So good we went twice in one week! After a rec...
2,2,Brenda's French Soul Food,4.0,2,Hadn&#39;t been here in years because the neig...
3,3,Brenda's French Soul Food,4.0,3,I love the other Brenda&#39;s in SF but this o...
4,4,Brenda's French Soul Food,4.0,5,Another repeat visit and no disappointment fro...


In [5]:
# Rename column 'Unnamed: 0' to Review Id
df.rename({'Unnamed: 0':'review_id'})

Unnamed: 0.1,Unnamed: 0,restaurant_name,restaurant_rating,customer_rating,review_text
0,0,Brenda's French Soul Food,4.0,5,Absolutely amazing. I&#39;ve dined here at lea...
1,1,Brenda's French Soul Food,4.0,5,So good we went twice in one week! After a rec...
2,2,Brenda's French Soul Food,4.0,2,Hadn&#39;t been here in years because the neig...
3,3,Brenda's French Soul Food,4.0,3,I love the other Brenda&#39;s in SF but this o...
4,4,Brenda's French Soul Food,4.0,5,Another repeat visit and no disappointment fro...
...,...,...,...,...,...
49995,49995,Dumpling Time,4.0,2,I live up the hill from Dumpling Time and woul...
49996,49996,Dumpling Time,4.0,5,I was thoroughly happy with Dumpling Time. Our...
49997,49997,Dumpling Time,4.0,5,Do you like dumplings? Chinese dumplings? Xial...
49998,49998,Dumpling Time,4.0,4,Oh no Diana why did you just dock a star from ...


In [6]:
for row in df['review_text'][:5]:
    print(f'{row}\n')

Absolutely amazing. I&#39;ve dined here at least 6 times since moving to the Bay Area in February of 2022. Every experience has been great and those I recommended to eat here all let me know how good the food was as well.

So good we went twice in one week! After a recommendation from the barista by our hotel we walked over to Brenda&#39;s for brunch on a Saturday. <br><br>There was a wait but it moved quickly. I got the Florentine Benedict and my friend got the Corn Succotash &amp; White Cheddar Omelette (not pictured). Everything was great - savory, a little spice, comforting!<br><br> Friendly and quick service. I ended up going back to try the dinner menu a few nights later - the shrimp and grits was everything I wanted. The beignets are a must try - SO big and fluffy and filled to perfection.

Hadn&#39;t been here in years because the neighborhood has become increasingly rough and filthy and waiting for a table first thing in the morning under such circumstances isn&#39;t a good wa

- Need to change the string of characters ' & #39; ' into a " ' "
- Remove the break characters

In [7]:
def remove_html_characters(df):
    #removing break characters and html character references
    df['review_text'] = df['review_text'].str.replace('&#39;', "'")
    df['review_text'] = df['review_text'].str.replace('&#34;', '"')
    df['review_text'] = df['review_text'].str.replace('<br>', '')
    df['review_text'] = df['review_text'].str.replace('&amp;', '&')

In [8]:
remove_html_characters(df)

In [9]:
for row in df['review_text'][:5]:
    print(f'{row}\n')

Absolutely amazing. I've dined here at least 6 times since moving to the Bay Area in February of 2022. Every experience has been great and those I recommended to eat here all let me know how good the food was as well.

So good we went twice in one week! After a recommendation from the barista by our hotel we walked over to Brenda's for brunch on a Saturday. There was a wait but it moved quickly. I got the Florentine Benedict and my friend got the Corn Succotash & White Cheddar Omelette (not pictured). Everything was great - savory, a little spice, comforting! Friendly and quick service. I ended up going back to try the dinner menu a few nights later - the shrimp and grits was everything I wanted. The beignets are a must try - SO big and fluffy and filled to perfection.

Hadn't been here in years because the neighborhood has become increasingly rough and filthy and waiting for a table first thing in the morning under such circumstances isn't a good way to start the day.But snagged a res

In [10]:
#Creating column for the character length of the review
df['review_text_length'] = df['review_text'].apply(lambda x: len(x))

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,restaurant_name,restaurant_rating,customer_rating,review_text,review_text_length
0,0,Brenda's French Soul Food,4.0,5,Absolutely amazing. I've dined here at least 6...,217
1,1,Brenda's French Soul Food,4.0,5,So good we went twice in one week! After a rec...,560
2,2,Brenda's French Soul Food,4.0,2,Hadn't been here in years because the neighbor...,462
3,3,Brenda's French Soul Food,4.0,3,I love the other Brenda's in SF but this one d...,753
4,4,Brenda's French Soul Food,4.0,5,Another repeat visit and no disappointment fro...,497


In [12]:
df.describe()

Unnamed: 0.1,Unnamed: 0,restaurant_rating,customer_rating,review_text_length
count,50000.0,50000.0,50000.0,50000.0
mean,24999.5,4.25,4.31974,682.02788
std,14433.901067,0.287231,1.018669,607.736702
min,0.0,4.0,1.0,17.0
25%,12499.75,4.0,4.0,263.0
50%,24999.5,4.0,5.0,508.0
75%,37499.25,4.5,5.0,896.0
max,49999.0,5.0,5.0,7778.0


- The average rating is 4.25 and the min max is 4 and 5 so there isn't high variance. However the customer rating ranges from 1-5 so there will be difference in opinions even if they average out to around a 4.
- This isn't all of the reviews since only 1000 of the most recent reviews were pulled from each restaurant

## Create sentences from each of the reviews

In [13]:
nlp = spacy.load('en_core_web_md')

In [14]:
#Creating a list of sentences from the review using spacey

def create_sentences_from_review(review, nlp):

    spacey_doc = nlp(review)
    
    return [str(sent) for sent in spacey_doc.sents]

In [15]:
# Creating new column sentences (list of sentences)

df['sentences'] = df['review_text'].apply(lambda x: create_sentences_from_review(x, nlp))

In [16]:
df.head()

Unnamed: 0.1,Unnamed: 0,restaurant_name,restaurant_rating,customer_rating,review_text,review_text_length,sentences
0,0,Brenda's French Soul Food,4.0,5,Absolutely amazing. I've dined here at least 6...,217,"[Absolutely amazing., I've dined here at least..."
1,1,Brenda's French Soul Food,4.0,5,So good we went twice in one week! After a rec...,560,"[So good we went twice in one week!, After a r..."
2,2,Brenda's French Soul Food,4.0,2,Hadn't been here in years because the neighbor...,462,[Hadn't been here in years because the neighbo...
3,3,Brenda's French Soul Food,4.0,3,I love the other Brenda's in SF but this one d...,753,[I love the other Brenda's in SF but this one ...
4,4,Brenda's French Soul Food,4.0,5,Another repeat visit and no disappointment fro...,497,[Another repeat visit and no disappointment fr...


In [17]:
# Creating column sentence count

df['sentence_count'] = df['sentences'].apply(lambda x: len(x))

# Looking at total amount of sentences

df['sentence_count'].sum()

469663

In [18]:
def create_sentence_df(df):
    """Creates rows for each sentence for each review"""
    
    #Loops through every row
    for index, row in df.iterrows():
        
        # Creates x duplicates where x is the sentence_count
        dup_rows = pd.DataFrame(np.repeat([row.values],
                                          [row['sentence_count']],
                                          axis = 0),
                                columns = df.columns)

        # Populates the duplicates with the sentences from the curent row
        for idx, sent in zip(dup_rows.index, (row['sentences'])):
            dup_rows.iloc[idx]['sentence'] = sent

        # Concatenates duplicates back to the original df
        df = pd.concat([df, dup_rows])
    
    # resets the index since duplicates were added
    df.reset_index(inplace = True, drop = True)
    # Dropping all of original rows where sentence is blank
    df.drop(df[df['sentence'] == ""].index, inplace = True)
    return df

In [19]:
# Using the index column to create review id's
df['review_id'] = df.index
# Creating empty sentence column
df['sentence'] = ""
df = create_sentence_df(df)

In [20]:
df.shape

(469663, 10)

- 469_663 sentences created from 50_000 reviews

In [21]:
df.to_csv(f'../data/San_Francisco_restaurant_reviews_sentences.csv')

## Creating sentence Dataframe for the 5 restaurants with all their reviews

In [24]:
five_rest_df = pd.read_csv('../data/San_Francisco_restaurant_reviews_first_5.csv')

# Dropping column
five_rest_df.rename({'Unnamed: 0':'review_id'})

# Removing html characters
remove_html_characters(five_rest_df)

# Creating sentences with spacey
five_rest_df['sentences'] = five_rest_df['review_text'].apply(lambda x: create_sentences_from_review(x, nlp))

# Creating sentence count for each review
five_rest_df['sentence_count'] = five_rest_df['sentences'].apply(lambda x: len(x))
print(five_rest_df['sentence_count'].sum())

# Using the index column to create review id's
five_rest_df['review_id'] = five_rest_df.index

# Creating empty sentence column
five_rest_df['sentence'] = ""

# Creating rows for each individual sentence
five_rest_df = create_sentence_df(five_rest_df)

# Saving the 
five_rest_df.to_csv(f'../data/San_Francisco_restaurant_reviews_first_5_sentences.csv')

390257
