In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

from spacy import displacy
from nltk.stem import WordNetLemmatizer


In [2]:
df = pd.read_csv('../data/San_Francisco_50_restaurant_1000_reviews')

In [3]:
df.shape

(50000, 5)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,restaurant_name,restaurant_rating,customer_rating,review_text
0,0,Brenda's French Soul Food,4.0,5,Absolutely amazing. I&#39;ve dined here at lea...
1,1,Brenda's French Soul Food,4.0,5,So good we went twice in one week! After a rec...
2,2,Brenda's French Soul Food,4.0,2,Hadn&#39;t been here in years because the neig...
3,3,Brenda's French Soul Food,4.0,3,I love the other Brenda&#39;s in SF but this o...
4,4,Brenda's French Soul Food,4.0,5,Another repeat visit and no disappointment fro...


In [5]:
# Rename column 'Unnamed: 0' to Review Id
df.rename({'Unnamed: 0':'review_id'})

In [6]:
for row in df['review_text'][:5]:
    print(f'{row}\n')

Absolutely amazing. I&#39;ve dined here at least 6 times since moving to the Bay Area in February of 2022. Every experience has been great and those I recommended to eat here all let me know how good the food was as well.

So good we went twice in one week! After a recommendation from the barista by our hotel we walked over to Brenda&#39;s for brunch on a Saturday. <br><br>There was a wait but it moved quickly. I got the Florentine Benedict and my friend got the Corn Succotash &amp; White Cheddar Omelette (not pictured). Everything was great - savory, a little spice, comforting!<br><br> Friendly and quick service. I ended up going back to try the dinner menu a few nights later - the shrimp and grits was everything I wanted. The beignets are a must try - SO big and fluffy and filled to perfection.

Hadn&#39;t been here in years because the neighborhood has become increasingly rough and filthy and waiting for a table first thing in the morning under such circumstances isn&#39;t a good wa

- Need to change the string of characters ' & #39; ' into a " ' "
- Remove the break characters

In [7]:
#removing break characters and html character references
df['review_text'] = df['review_text'].str.replace('&#39;', "'")
df['review_text'] = df['review_text'].str.replace('<br>', '')
df['review_text'] = df['review_text'].str.replace('&amp;', '&')

In [8]:
for row in df['review_text'][:5]:
    print(f'{row}\n')

Absolutely amazing. I've dined here at least 6 times since moving to the Bay Area in February of 2022. Every experience has been great and those I recommended to eat here all let me know how good the food was as well.

So good we went twice in one week! After a recommendation from the barista by our hotel we walked over to Brenda's for brunch on a Saturday. There was a wait but it moved quickly. I got the Florentine Benedict and my friend got the Corn Succotash & White Cheddar Omelette (not pictured). Everything was great - savory, a little spice, comforting! Friendly and quick service. I ended up going back to try the dinner menu a few nights later - the shrimp and grits was everything I wanted. The beignets are a must try - SO big and fluffy and filled to perfection.

Hadn't been here in years because the neighborhood has become increasingly rough and filthy and waiting for a table first thing in the morning under such circumstances isn't a good way to start the day.But snagged a res

In [9]:
#Creating column for the character length of the review
df['review_text_length'] = df['review_text'].apply(lambda x: len(x))

In [10]:
df.head()

Unnamed: 0,restaurant_name,restaurant_rating,customer_rating,review_text,review_text_length
0,Brenda's French Soul Food,4.0,5,Absolutely amazing. I've dined here at least 6...,217
1,Brenda's French Soul Food,4.0,5,So good we went twice in one week! After a rec...,560
2,Brenda's French Soul Food,4.0,2,Hadn't been here in years because the neighbor...,462
3,Brenda's French Soul Food,4.0,3,I love the other Brenda's in SF but this one d...,753
4,Brenda's French Soul Food,4.0,5,Another repeat visit and no disappointment fro...,497


In [12]:
df.describe()

Unnamed: 0,restaurant_rating,customer_rating,review_text_length
count,50000.0,50000.0,50000.0
mean,4.25,4.31974,683.22092
std,0.287231,1.018669,609.410535
min,4.0,1.0,17.0
25%,4.0,4.0,263.0
50%,4.0,5.0,508.0
75%,4.5,5.0,898.0
max,5.0,5.0,7778.0


## Create sentences from each of the reviews

In [30]:
nlp = spacy.load('en_core_web_md')

In [151]:
#Creating a list of sentences from the review using spacey

def create_sentences_from_review(review, nlp):

    spacey_doc = nlp(review)
    
    return [str(sent) for sent in spacey_doc.sents]

In [66]:
# Creating new column sentences (list of sentences)

df['sentences'] = df['review_text'].apply(lambda x: create_sentences_from_review(x, nlp))

In [47]:
df.head()

Unnamed: 0,restaurant_name,restaurant_rating,customer_rating,review_text,review_text_length,sentences
0,Brenda's French Soul Food,4.0,5,Absolutely amazing. I've dined here at least 6...,217,"[(Absolutely, amazing, .), (I, 've, dined, her..."
1,Brenda's French Soul Food,4.0,5,So good we went twice in one week! After a rec...,560,"[(So, good, we, went, twice, in, one, week, !)..."
2,Brenda's French Soul Food,4.0,2,Hadn't been here in years because the neighbor...,462,"[(Had, n't, been, here, in, years, because, th..."
3,Brenda's French Soul Food,4.0,3,I love the other Brenda's in SF but this one d...,753,"[(I, love, the, other, Brenda, 's, in, SF, but..."
4,Brenda's French Soul Food,4.0,5,Another repeat visit and no disappointment fro...,497,"[(Another, repeat, visit, and, no, disappointm..."


In [48]:
# Creating column sentence count

df['sentence_count'] = df['sentences'].apply(lambda x: len(x))

# Looking at total amount of sentences

df['sentence_count'].sum()

In [152]:
def create_sentence_df(df):
    """Creates rows for each sentence for each review"""
    
    #Loops through every row
    for index, row in df.iterrows():
        
        # Creates x duplicates where x is the sentence_count
        dup_rows = pd.DataFrame(np.repeat([row.values],
                                          [row['sentence_count']],
                                          axis = 0),
                                columns = df.columns)

        # Populates the duplicates with the sentences from the curent row
        for idx, sent in zip(dup_rows.index, (row['sentences'])):
            dup_rows.iloc[idx]['sentence'] = sent

        # Concatenates duplicates back to the original df
        df = pd.concat([df, dup_rows])
    
    # resets the index since duplicates were added
    df.reset_index(inplace = True, drop = True)
    # Dropping all of original rows where sentence is blank
    df.drop(df[df['sentence'] == ""].index, inplace = True)
    return df

In [150]:
# Using the index column to create review id's
df['review_id'] = df.index
# Creating empty sentence column
df['sentence'] = ""
df = create_sentence_df(df)

In [155]:
df.shape

(468444, 9)

In [156]:
df.to_csv(f'../data/San_Francisco_restaurant_reviews_sentences.csv')