In [6]:
import pandas as pd
import nltk
from nltk import word_tokenize
from sklearn.metrics import pairwise_distances
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import normalize
import re
import string
# import spacy
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
#Step 1 extract the data from the csv files
data_1 = pd.read_csv('Brooklyn_openreviews_10.csv')
data_2 = pd.read_csv('Brooklyn_reviews_closed_10.csv')
data_3 = pd.read_csv('Manhattan__openreviews_10.csv')
data_4 = pd.read_csv('Manhattan_reviews_closed_10.csv')

data_1 = data_1.values.flatten().astype('str')
data_2 = data_2.values.flatten().astype('str')
data_3 = data_3.values.flatten().astype('str')
data_4 = data_4.values.flatten().astype('str')

In [8]:
#Step 2 define the tokenize function and compute_tf_idf function
def tokenize(doc, lemmatized=True, stopword=True, punctuation=True):
    tokens = []
    tokens = word_tokenize(doc.lower())
    if stopword == False:
        stop_words = nltk.corpus.stopwords.words('english')
        tokens = [token for token in tokens
                  if token.strip() not in stop_words]
    if punctuation == False:
        tokens = [token for token in tokens
                  if token.strip() not in string.punctuation]
    if lemmatized == True:
        wordnet_lemmatizer = WordNetLemmatizer()
        tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
    return tokens

In [9]:
def compute_tfidf(docs, lemmatized=True, stopword=True, punctuation=False):
    docs_tokens = {idx:nltk.FreqDist(tokenize(doc, lemmatized, stopword, punctuation))for idx, doc in enumerate(docs)}
    dtm = pd.DataFrame.from_dict(docs_tokens, orient="index")
    dtm = dtm.fillna(0)
    dtm = dtm.sort_index(axis=0)
    vocabulary = dtm.columns.values
    tf = dtm.values
    doc_len = tf.sum(axis=1)
    tf = np.divide(tf.T, doc_len).T
    df = np.where(tf > 0, 1, 0)
    smoothed_idf = np.log(np.divide(len(docs) + 1, np.sum(df, axis=0) + 1)) + 1
    smoothed_tf_idf = tf * smoothed_idf
    return smoothed_tf_idf, smoothed_idf, vocabulary

In [10]:
def extract_top_5(tf_idf, vocabulary):
    return [list(vocabulary[np.argsort(row)][-1:-6:-1]) for row in tf_idf]

In [11]:
for i, data in enumerate([data_1, data_2, data_3, data_4]):
    tf_idf_data, _, vocab_data = compute_tfidf(data, True, False, False)
    print('The top_5 frequent words mentioned in each review in File {} are:'.format(i+1))
    print(extract_top_10(tf_idf_data, vocab_data))

The top_5 frequent words mentioned in each review in File 1 are:
[['otis', 'fishy', 'speel', 'onto', 'accompaniment'], ['cous', 'since', 'also', 'otisbk.com/menu-hours', 'pick-up'], ['time', '5', 'area', 'first', 'decided'], ['squash', 'nervous', 'chanelle', 'fennel', 'outbreak'], ['meat', 'lot', 'commute', 'writing', 'exquisite'], ['definitely', 'crumble', 'legroom', 'mind-blowing', 'rave'], ['ear', 'curry', 'night', 'mussel', 'beef'], ['must', 'chef', 'long', 'potato', 'stable'], ['restaurant', 'visit', 'drink', 'addicting', 'elite'], ['broth', 'mussel', 'gave', 'wanted', 'heading'], ['spritz', 'aperol', 'confit', 'spaced', 'entrance'], ['heaven', 'taste', 'excitement', 'onto', 'accompaniment'], ['justify', 'daniel', 'charging', '3-', 'tony'], ['visit', 'wont', 'capture', 'first', 'cornbread'], ['tasha', 'prayerfully', 'magic', 'tge', 'meanwhile'], ['healed', 'leg', 'wake', '..took', 'prob'], ['grateful', 'crazy.lol', 'reopened', 'stir', 'covid/quarantine'], ['cornbread', 'go', '--',

The top_5 frequent words mentioned in each review in File 2 are:
[['champion', 'coffee', 'surely', 'dare', 'strike'], ['frequent', 'card', 'add', 'visitor', 'employee'], ['lost', 'unwelcoming', 'woman', 'worst', 'working'], ['yes', 'yep', 'champ', 'patio', 'lit'], ['coffee', 'minimal', 'mix', 'get', 'shop'], ['spot', 'croissant', 'spending', 'various', 'includes'], ['couple', 'coffee', 'west', 'played', 'coast'], ['le', 'store', 'top', 'space', 'nice'], ['café', 'helpful', 'small-group', 'informal', 'conducting'], ['sandwich', 'coffee', 'lic', 'walk/drive', 'greenpoint'], ['baristas', 'cashier', 'rudely', '3x', 'smile'], ['ristorante', 'sorrentina', 'la', 'diner', 'yelp'], ['--', 'alla', 'main', 'course', 'dining'], ['scampi', 'shrimp', 'drenched', 'effectively', '24'], ['event', 'bridal', 'shower', 'beautiful', 'party'], ['really', 'listen', 'angry', 'crappy', 'terrible'], ['event', 'dining', 'beautiful', '3', 'awesome'], ['8clams', '12', 'order..', 'lady', 'excuse'], ['creme', 'baked

The top_5 frequent words mentioned in each review in File 4 are:
[['151', 'flight', 'patate', 'coated', 'hook'], ['killian', 'cali', 'provided', 'visited', 'thanks'], ['beer', 'since', 'kicker', 'erase', 'disgusting'], ['game', 'pitt', 'bar', 'finger', 'week'], ["'s", 'catch', 'super', 'clean', "n't"], ['bathroom', 'long', 'situation-', 'drink', 'food'], ['beer', '4pm', 'stating', 'breading', 'waffle'], ['5/10', 'margarita', 'okay', 'ambiance', 'margz'], ['beer', 'wine', 'reveals', 'poor', 'thirsty'], ['substance', 'strongbow', 'sunrise', 'ketchup', 'sticky/gooey'], ['wing', '10', "n't", 'ca', 'breeze'], ['sage', 'kitchen', 'tamika', 'con', 'hook'], ['love', 'donated', 'sinai', 'soo', 'pandemic'], ['paper', 'israeli', 'craft', 'wrapped', 'perfection'], ['sandwich', 'schnitzel', 'kosher', 'sage', 'office'], ['client', 'sandwich', 'horseradish', 'taking', 'also'], ['fresh', 'baguette', 'salami', 'israeli', 'egg'], ['favs', 'killer', 'ive', '7', 'everytime'], ['avoid', 'plastic', 'single'