In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

In [None]:
# Load the sentence-level restaurant review data from the project's data folder
sentences_csv = '../data/San_Francisco_restaurant_reviews_sentences.csv'
df = pd.read_csv(sentences_csv)

In [None]:
# 'Unnamed: 0' is presumably the index column that was saved alongside the CSV — TODO confirm.
# Reassign instead of using inplace=True, and ignore a missing column, so this
# cell can be re-run without raising KeyError once the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the first rows to check the loaded columns
df.head()

## Checking for and getting rid of useless sentences

In [None]:
# Look at the 30 most repeated sentences to spot junk entries
df['sentence'].value_counts().head(30)

In [None]:
# Removing sentences that are just a number and a period (e.g. "1." or "12.")
# \d+ accepts numbers of any length, and \. matches a literal period
# (an unescaped "." matches any character, so "1a" would also be blanked out).
df['sentence'] = df['sentence'].apply(lambda x: re.sub(r'^\d+\.$', '', x))

In [None]:
# Blank out sentences made up entirely of non-word characters (punctuation, whitespace, symbols)
df['sentence'] = df['sentence'].map(lambda text: re.sub(r'^\W+$', '', text))

In [None]:
# Blank out sentences that consist of a single word followed by a colon
df['sentence'] = df['sentence'].map(lambda text: re.sub(r'^\w+:$', '', text))

In [None]:
# Drop the sentences that were blanked out by the regex clean-up and
# print the shape before and after to show how many rows were removed.
print(df.shape)
# .copy() makes the filtered frame independent of the original, so the column
# assignments made later on `df` do not raise SettingWithCopyWarning.
df = df[df['sentence'] != ''].copy()
print(df.shape)

In [None]:
# Re-check the 60 most repeated sentences after the clean-up
df['sentence'].value_counts().head(60)

## Lemmatizing sentences and getting the count of occurrences of each of the words

In [None]:
# Tokenizer that keeps runs of word characters only, so punctuation is dropped
tokenizer = RegexpTokenizer(r'\w+')
# WordNet lemmatizer applied to each token
lemmatizer = WordNetLemmatizer()

In [None]:
# Lower-case each sentence, split it into word tokens, and lemmatize every token
df['lemmatized'] = df['sentence'].map(
    lambda text: list(map(lemmatizer.lemmatize, tokenizer.tokenize(text.lower())))
)

In [None]:
# Show NLTK's English stopword list
print(stopwords.words('english'))

In [None]:
# Removing stopwords from lemmatized tokens
# Fetch the stopword list once and keep it as a set: the lookup then happens
# against a set instead of calling stopwords.words('english') for every token.
english_stopwords = set(stopwords.words('english'))
df['lemmatized'] = df['lemmatized'].apply(
    lambda tokens: [token for token in tokens if token not in english_stopwords]
)

In [None]:
# Create the word_df by flattening all of the lemmatized lists into one column

# Read the token lists straight from the column; this avoids iterrows(),
# which builds a full row Series just to read a single field.
all_words = [word for tokens in df['lemmatized'] for word in tokens]

word_df = pd.DataFrame(all_words, columns=['word'])

In [None]:
# Changing df to be a count of each word found
# NOTE: this rebinds word_df from a one-column DataFrame to a Series of counts
# indexed by word, so the cell only works once per run of the cell that builds word_df.
word_df = word_df['word'].value_counts()

In [None]:
# Most frequent words first (value_counts sorts by count, descending)
word_df.head()

In [None]:
# Number of distinct words
word_df.shape

In [None]:
# Keep only the words that occur at least 1000 times
word_count_greater_than_1000 = word_df.loc[word_df.ge(1000)]

In [None]:
# How many words meet the 1000-occurrence threshold
word_count_greater_than_1000.shape

In [None]:
def barchart_of_words(counts, words, title):
    """Draw a horizontal bar chart of word frequencies and save it to ../images.

    Parameters
    ----------
    counts : sequence of numbers
        Bar lengths, one per word.
    words : sequence of str
        Tick labels, in the same order as ``counts``.
    title : str
        Figure title. The slice ``title[5:7]`` is also embedded in the output
        file name (``../images/top_<slice>_most_frequent_words``).

    Notes
    -----
    The ``plt.rc`` calls change matplotlib's global font-size settings, so they
    also affect figures drawn after this function returns.
    """
    plt.figure(figsize=(10, 8))
    # One bar per word: np.arange (not "arrange"), sized from the input
    # instead of a hard-coded 10 so any number of words can be plotted.
    y_pos = np.arange(len(words))
    plt.rc('axes', titlesize=30)
    plt.rc('axes', labelsize=20)
    plt.rc('xtick', labelsize=15)
    plt.rc('ytick', labelsize=15)
    plt.barh(y_pos, counts)
    plt.yticks(y_pos, words)
    plt.title(title)
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.tight_layout()
    plt.savefig(f'../images/top_{title[5:7]}_most_frequent_words')

In [None]:
# Print the frequent words from position 600 onwards together with their counts
for word, count in word_count_greater_than_1000.iloc[600:].items():
    print(word, count)

In [None]:
# Lemmas treated as evidence that a sentence talks about the food itself.
# Entries must match tokenizer output exactly: lower-case, no surrounding
# whitespace, and one word per string (every string needs its own comma,
# otherwise adjacent literals are silently concatenated).
food_words = ['food', 'get', 'delicious', 'dish', 'ordered', 'order', 'pizza', 'flavor', 'menu',
              'sauce', 'chicken', 'meat', 'fresh', 'pork', 'bit', 'burrito', 'bread', 'taste',
              'fried', 'eat', 'salad', 'dinner', 'cheese', 'drink', 'oyster', 'sweet', 'try',
              'fish', 'egg', 'brunch', 'potato', 'sandwich', 'rice', 'spicy', 'seafood', 'shrimp',
              'clam', 'tried', 'coffee', 'dessert', 'crab', 'portion', 'toast', 'tasty', 'bacon', 'taco',
              'soup', 'beef', 'plate', 'big', 'course', 'crispy', 'chip', 'dumpling', 'cooked', 'tasted',
              'lunch', 'quality', 'flavorful', 'cream', 'huge', 'excellent', 'perfectly', 'noodle',
              'eating', 'wine', 'appetizer', 'chowder', 'slice', 'served', 'roll', 'rib', 'large', 'piece',
              'pancake', 'tender', 'breakfast', 'style', 'fry', 'bean', 'benedict', 'cut', 'cocktail',
              'steak', 'ramen', 'soft', 'chocolate', 'light', 'lemon', 'crust', 'half', 'yummy', 'sausage',
              'entree', 'texture', 'sushi', 'filling', 'bowl', 'grilled', 'wing', 'lamb', 'korean', 'butter',
              'tea', 'ordering', 'start', 'ingredient', 'mushroom', 'tomato', 'onion', 'salmon', 'salsa',
              'curry', 'water', 'broth', 'cioppino', 'veggie', 'mexican', 'started', 'ate', 'cold', 'duck',
              'creamy', 'belly', 'mouth', 'juicy', 'salty', 'burger', 'ice', 'thick', 'rich', 'thai', 'scallop',
              'chef', 'savory', 'vegan', 'avocado', 'shared', 'pasta', 'spice', 'red', 'crunchy',
              'chop', 'pepper', 'yum', 'vegetarian', 'seasoned', 'thin', 'roasted', 'kimchi', 'beignet',
              'combo', 'tart', 'heavy', 'risotto', 'stuffed', 'italian', 'bakery', 'generous', 'asada',
              'mussel', 'lobster', 'sour', 'serving', 'pastry', 'ricotta', 'topping', 'spinach', 'fusion',
              'bone', 'authentic', 'vegetable', 'craving', 'carne', 'banana', 'coconut', 'croissant', 'apple',
              'fluffy']

# Lemmas treated as evidence that a sentence talks about the service
service_words = [
    'service', 'staff', 'friendly', 'waiter', 'served', 'check', 'customer',
    'finally', 'ask', 'attentive', 'told', 'tip', 'waitress',
]
# Lemmas treated as evidence that a sentence talks about waiting or timing
time_words = [
    'wait', 'line', 'reservation', 'minute', 'hour', 'night', 'seated',
    'server', 'waiting', 'busy', 'early', 'waited', 'quick', 'quickly',
    'fast', 'weekend', 'saturday', 'morning', 'sunday', 'waitlist',
    'evening', 'serve', 'month', 'friday', 'crowded',
]

The word lists above were built by going through the words that were used at least 1000 times; they are used below to filter the sentences by topic (food, service, time).

In [None]:
# Vocabularies as sets so each sentence can be intersected with them directly
food_set = set(food_words)
service_set = set(service_words)
time_set = set(time_words)

# Each *_review column holds the set of vocabulary words found in the sentence's
# lemmatized tokens; an empty set means the sentence did not mention that topic.
topic_vocabularies = {
    'food_review': food_set,
    'service_review': service_set,
    'time_review': time_set,
}
for column, vocabulary in topic_vocabularies.items():
    df[column] = df['lemmatized'].apply(vocabulary.intersection)
df.head()

In [None]:
# Food words matched in the first sentence
df['food_review'].iloc[0]

In [None]:
# Sentences that matched at least one food word (a non-empty set is truthy)
df[df['food_review'].astype(bool)].shape

In [None]:
# Sentences that matched at least one service word (a non-empty set is truthy)
df[df['service_review'].astype(bool)].shape

In [None]:
# Sentences that matched at least one time word (a non-empty set is truthy)
df[df['time_review'].astype(bool)].shape

In [None]:
# Sentences whose food, service and time matches are all empty
topic_columns = ['food_review', 'service_review', 'time_review']
matches_any_topic = df[topic_columns].astype(bool).any(axis=1)
non_specific_reviews = df[~matches_any_topic]

In [None]:
# Print a sample of the sentences that matched none of the topic vocabularies.
# iloc[:12] takes the first 12 rows (the previous iloc[12:12] slice was empty and
# printed nothing); iterating the Series directly avoids Series.iteritems(),
# which no longer exists in pandas 2.x.
for sentence in non_specific_reviews['sentence'].iloc[:12]:
    print(sentence)