### Imports

In [154]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [155]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.model_selection import train_test_split

In [156]:
from nltk.stem import WordNetLemmatizer 

In [157]:
from afinn import Afinn

In [158]:
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

In [159]:
pd.set_option('display.max_rows',600)
pd.set_option('display.max_columns',500)

In [160]:
df = pd.read_csv('final_df_v3.csv', index_col='Unnamed: 0')

# Cleaning DF

In [161]:
df.dropna(subset=['review','rating'], inplace=True)

In [162]:
df.reset_index(drop=True, inplace=True)

## Removing Bad Strings

In [163]:
bad_str_1 = 'Follow NYT Food on Twitter and NYT Cooking on Instagram, Facebook, YouTube and Pinterest.'
bad_str_2 = 'Get regular updates from NYT Cooking, with recipe suggestions, cooking tips and shopping advice.'
bad_str_3 = 'What the stars mean: (None) Poor to Satisfactory * Good ** Very Good *** Excellent ****' 
bad_str_4 = 'Extraordinary Ratings reflect the reviewer\'s reaction primarily to food, with ambiance and service taken into consideration.' 
bad_str_5 = 'Menu listings and prices are subject to change.'
bad_str_6 = 'Extraordinary Ratings reflect the reviewer\'s reaction to food, ambiance and service with price taken into consideration.'
bad_str_7 = 'ON THE WEB PAST REVIEWS from the New York Times, with additional capsule reviews by Times critics: nytimes.com/dining'

def del_bad_str(review):
    
    for string in [bad_str_1, bad_str_2, bad_str_3, bad_str_4, bad_str_5, bad_str_6, bad_str_7]:
        
        if string in review:
            
            review = review.replace(string, '').strip()
    
    return review


In [164]:
df['review'] = df['review'].apply(lambda x: del_bad_str(x))

## Fixing Rating

In [165]:
def categorize(rating):
    if rating in ['0.25 star', '0.5 star', '0.75 star', 'Fair', 'Satisfactory', 'Poor', 'SATISFACTORY']:
        return '0 star'
    elif rating in ['1 star', '★']:
        return '★'
    elif rating in ['2 star','★★']:
        return '★★'
    else:
        return '★★★'

In [166]:
df['rating'] = df['rating'].apply(lambda x: categorize(x))

In [167]:
df.head()

Unnamed: 0,name,rating,review_date,reviewer,review_url,neighborhood,cuisine,recommendations,review,headline
0,Hanon,★★,2019-05-21,Pete Wells,https://www.nytimes.com/2019/05/21/dining/hano...,Williamsburg,Japanese,Japanese omelet; fried chicken with spice; ric...,"Hanon, a new udon shop in Williamsburg, Brookl...","Udon, Innovated for Your Pleasure"
1,Del Posto,★★★,2019-05-14,Pete Wells,https://www.nytimes.com/2019/05/14/dining/del-...,Chelsea,Italian,Grilled salsify salad; vitello tonnato; minest...,"More than a year ago, I was on the verge of re...","In Del Posto’s New Era, Cuisine and Service Ar..."
2,The Freakin Rican,★,2019-05-07,Pete Wells,https://www.nytimes.com/2019/05/07/dining/the-...,Astoria,"Caribbean, Latin American",Alcapurrias; pasteles; bacalaitos; chicharrone...,It is true that the bacalaitos at the Freakin ...,Pasteles and Two Kinds of Fried Chicken at the...
3,Wayan,★★,2019-04-23,Pete Wells,https://www.nytimes.com/2019/04/23/dining/waya...,NoLIta,Indonesian,Hearts-of-palm salad; clams Jimbaran style; av...,If you ate at Spice Market before it closed tw...,Tapping a Family Connection to Indonesian Food
4,Niche,★,2019-04-16,Pete Wells,https://www.nytimes.com/2019/04/16/dining/nich...,Lower East Side,Japanese,Avocado crunch; yuzu scallop crudo; umami komb...,"In 2012 and 2013, when people were lining up a...",Ramen Without Broth? A Chef Doubles Down on a ...


# Making a Stopwords List

In [168]:
NYT_stoplist = ['follow','NYT','nyt','food','twitter','cooking','instagram','facebook','youtube','pinterest','Cooking',
 'recipe','suggestions','cooking','tips','shopping','advice','food', 'restaurant', '00',
 '000','0000','0005','0008','0030','0050','0082','0101','0105','0200','0220','0233','0242','0271','03','0303','0327',
 '0350','0404', '05', '0505', '0553', '0555', '0665', '07', '0808', '0812', '0820', '0905', '0906', '0999', 'street', 
 'ave','★','*','atmosphere','sound','wine','list','am','pm', 'daily', 'monday','tuesday','wednesday','thursday',
 'friday','saturday','sunday','hours','wheelchair','access','0003', '0012', '0020', '0033', '0085', '0100', '02',
 '0202', '0211', '0215', '0221', '0228', '0300', '0400', '0412', '0470', '0606', '0700', '08', '0844', '0880', '0955',
 'mr', 'dinner']

numbers_stoplist = [str(x) for x in range(0,2020)]

In [169]:
new_stop_words = [';','.',',','’','“','”','—', ':', '(', ')', 'nyt', '$','would', 'one', 'mr.', \
                  'came', 'table', 'get', 'year', 'water', 'go', 'along', 'first', 'second', \
                  'restaurant', 'new', 'york', 'cooking', 'shop', 'might', 'also', 'make', 'made', 'also'\
                  'food', 'menu', 'meal', 'list', 'dish', 'table', 'dining', 'server', 'food', 'room', 'floor',\
                  'served', 'may', 'want', 'brought', 'night', 'customer', 'said', 'say', 'come', 'dinner', 'le',\
                  'name', 'time', 'chef', '!', 'side', 'day', 'night', 'ate', '?', 'back', 'customers', 'restaurants',\
                  'ingredient', 'cook','dish', 'lunch', 'a.m.', 'p.m.', '\'s', 'way', 'ms.', '--', 'n\'t']

In [170]:
stop_entities = ['GPE', 'CARDINAL', 'PERSON', 'DATE']

def remove_named_entities(review):
    
    # Get Entities
    nlp = English()
    nlp = en_core_web_sm.load()
    
    doc = nlp(review)
    
    stop_entity_list = []
    entities=[(i, i.label_, i.label) for i in doc.ents]
    
    for entity in entities:
        
        if entity[1] in stop_entities:
            stop_entity_list.append(str(entity[0]).lower())

    return stop_entity_list

In [171]:
nested = []
for review in df['review']:
    se_list = remove_named_entities(review)
    nested.append(se_list)

In [172]:
stopped_entities = [word for sublist in nested for word in sublist]

In [173]:
stopped_entities = list(set(stopped_entities))

In [174]:
stopped_entities

['georg breuer',
 'february',
 'josh ochoa',
 'harrison',
 'jeremy piven',
 'meat hook sandwich',
 'kolokithi',
 'hava nagila',
 'jun ichikawa',
 'two to four',
 'atkins',
 'michael laiskonis’s',
 'six days',
 'salon millesime',
 'more than 200 years',
 'philippe massoud',
 'sri lanka',
 'luciano',
 'rotisserie',
 'kowloon',
 'robert aikens',
 'tocqueville',
 'raveneau',
 '2000',
 'game changing',
 'a year and a half ago',
 'della rovere',
 'genoa',
 'his week',
 'bianco',
 'rialto',
 'the time of year',
 'hospoda',
 'gaon',
 'redeye grill',
 'kajitsu',
 'james calvert',
 'stephen gaghan',
 'turley',
 'jell',
 'spooky',
 'kim',
 'puerto',
 'suarez',
 'williamsburg',
 'pierre schutz',
 'simone bonelli',
 'the previous decade',
 'the first month of spring',
 'sun',
 'prague',
 'klink',
 'nearly four years ago',
 '1854',
 'at least a week ahead',
 'the 1940s',
 'online frank bruni',
 'rice',
 'last fall',
 'colleen grapes',
 'warmly',
 '165',
 'din tai fung',
 'the fall of 2008',
 'anita 

In [175]:
shortlist = stopwords.words('english')+ list(string.digits)+ NYT_stoplist + numbers_stoplist +['dishes', 'dish', 'new', 'sauce', 'made', 'mr.', 'room', 'table', 'would', 'new york', 'york','menu','one','two','three']+new_stop_words

In [176]:
shortlist = list(set(shortlist))

In [177]:
stopwords_list = stopwords.words('english')+ list(string.digits)+ NYT_stoplist + numbers_stoplist + [str(x) for x in range(2020, 99999)]+['dishes', 'dish', 'new', 'sauce', 'made', 'mr.', 'room', 'table', 'would', 'new york', 'york','menu','one','two','three']+new_stop_words+stopped_entities

In [178]:
stopwords_list = list(set(stopwords_list)) 

In [179]:
stopwords_list

['7882',
 '8245',
 '14088',
 '2833',
 '18339',
 '52309',
 '63564',
 '74987',
 '99222',
 '43413',
 '74921',
 '15007',
 'josh ochoa',
 '54660',
 '67148',
 '30592',
 '68317',
 '8150',
 '86775',
 '72772',
 'meat hook sandwich',
 '3100',
 '496',
 '19513',
 '3367',
 '44024',
 '99931',
 '51179',
 '87571',
 '50739',
 '78435',
 'six days',
 '84456',
 '4231',
 '17311',
 '87092',
 '16048',
 '37558',
 '55892',
 '54855',
 'tocqueville',
 '48334',
 '70484',
 '66420',
 '15361',
 '57144',
 'della rovere',
 '11926',
 '37913',
 '628',
 '3887',
 '33609',
 '60368',
 '54609',
 '61390',
 'his week',
 '76953',
 '89722',
 '71449',
 '41098',
 '94666',
 '72346',
 '80179',
 '28455',
 '49512',
 '89553',
 '33477',
 '50255',
 '69157',
 '26391',
 '63917',
 '95650',
 '25911',
 '78171',
 '8729',
 '22057',
 '58140',
 '88209',
 '72626',
 '28589',
 '61930',
 '59129',
 '94675',
 '7219',
 '33717',
 '56076',
 '62114',
 '79866',
 '14436',
 '69647',
 '2126',
 '60648',
 '75663',
 '17159',
 '50309',
 'williamsburg',
 'the previ

# More DF Cleaning

In [180]:
from nltk.tokenize import WhitespaceTokenizer

from nltk.stem import LancasterStemmer, SnowballStemmer, RegexpStemmer, WordNetLemmatizer 

stemmer = SnowballStemmer('english')

lemmer = WordNetLemmatizer()

token = WhitespaceTokenizer()


def make_tokens(review, token):
    tokenized_review = token.tokenize(review)
    return tokenized_review

def lower(tokenized_review):
    lowercase_tokens = [token.lower() for token in tokenized_review]
    return lowercase_tokens

def remove_punc(lowercase_tokens):
    stripped_tokens = [token.translate(str.maketrans('', '', string.punctuation)) for token in lowercase_tokens]
    return stripped_tokens

def remove_stopwords(stripped_tokens, stopwords_list):
    stopped_tokens = [token for token in stripped_tokens if token not in stopwords_list]
    return stopped_tokens

def stem_tokens(stopped_tokens, stemmer):
    stems = [stemmer.stem(token) for token in stopped_tokens]
    return stems

def lem_tokens(stopped_tokens, lemmer):
    lems = [lemmer.lemmatize(token) for token in stopped_tokens]
    return lems

def normalize(review, stopword_list, lemmer, token):
    tokens = make_tokens(review, token)
    lower_tokens = lower(tokens)
    no_punc = remove_punc(lower_tokens)
    stopped = remove_stopwords(no_punc, stopword_list)
    lems = lem_tokens(stopped, lemmer)
    return lems

In [181]:
df['tokens'] = df['review'].apply(lambda x: normalize(x, shortlist, lemmer, token))

In [182]:
# Number of unique words
df['vocabulary'] = df['tokens'].apply(lambda x: len(set(x)))

In [183]:
# Number of total words
# df['num_tokens'] = df['tokens'].apply(lambda x: len(x))

In [184]:
df.drop(['cuisine','recommendations'], axis=1, inplace=True)

df = df[(df['reviewer'] == 'Pete Wells')|(df['reviewer'] == 'Frank Bruni')|(df['reviewer'] == 'Sam Sifton')]

In [185]:
def hood_to_area(hood):
    if hood in ['West Village', 'East Village', 'TriBeCa','SoHo', 'Lower East Side', 'Greenwich Village',
                'NoHo', 'Nolita', 'Financial District', 'Gramercy Park', 'Chinatown', 'Union Square', 
                'Battery Park City', 'Meatpacking District','NoLIta', 'Little Italy', 'Hudson Square','Fulton Ferry']:
        return 'Downtown'
    elif hood in ['Midtown', 'Midtown East', 'Midtown South', 'Times Square Theatre District', 'Kips Bay',
                  'Murray Hill', 'Koreatown', 'Hudson Yards', 'Chelsea', 'Flatiron district','Flatiron', 
                  'NoMad', 'Columbus Circle']:
        return 'Midtown'
    elif hood in ['Upper East Side', 'Upper West Side', 'Harlem']:
        return 'Uptown'
    
    elif hood in ['Williamsburg','Clinton', 'Park Slope','Greenpoint', 'Gowanus','Prospect Heights', 'Red Hook', 
                  'Carroll Gardens', 'Bushwick', 'Crown Heights', 'Cobble Hill', 'Boerum Hill', 'Fort Greene',
                  'Park Slope North', 'Columbia Street Waterfront District', 'Coney Island', 'Brooklyn Heights', 
                  'Bay Ridge','Ditmas', 'Downtown Brooklyn','Sheepshead Bay', 'Midwood']:
        
        return 'Brooklyn'
    else:
        
        return 'Other'


In [186]:
df['area'] = df['neighborhood'].apply(lambda x: hood_to_area(x))

In [187]:
df.head()

Unnamed: 0,name,rating,review_date,reviewer,review_url,neighborhood,review,headline,tokens,vocabulary,area
0,Hanon,★★,2019-05-21,Pete Wells,https://www.nytimes.com/2019/05/21/dining/hano...,Williamsburg,"Hanon, a new udon shop in Williamsburg, Brookl...","Udon, Innovated for Your Pleasure","[hanon, udon, williamsburg, brooklyn, produced...",416,Brooklyn
1,Del Posto,★★★,2019-05-14,Pete Wells,https://www.nytimes.com/2019/05/14/dining/del-...,Chelsea,"More than a year ago, I was on the verge of re...","In Del Posto’s New Era, Cuisine and Service Ar...","[ago, verge, reviewing, del, posto, news, inte...",606,Midtown
2,The Freakin Rican,★,2019-05-07,Pete Wells,https://www.nytimes.com/2019/05/07/dining/the-...,Astoria,It is true that the bacalaitos at the Freakin ...,Pasteles and Two Kinds of Fried Chicken at the...,"[true, bacalaitos, freakin, rican, astoria, qu...",471,Other
3,Wayan,★★,2019-04-23,Pete Wells,https://www.nytimes.com/2019/04/23/dining/waya...,NoLIta,If you ate at Spice Market before it closed tw...,Tapping a Family Connection to Indonesian Food,"[spice, market, closed, year, ago, experience,...",425,Downtown
4,Niche,★,2019-04-16,Pete Wells,https://www.nytimes.com/2019/04/16/dining/nich...,Lower East Side,"In 2012 and 2013, when people were lining up a...",Ramen Without Broth? A Chef Doubles Down on a ...,"[people, lining, smorgasburg, houston, whole, ...",414,Downtown


In [188]:
df.shape

(615, 11)

In [189]:
df.describe()

Unnamed: 0,vocabulary
count,615.0
mean,467.162602
std,44.873106
min,291.0
25%,437.0
50%,464.0
75%,492.0
max,666.0


# Sentiment Analysis

In [190]:
af = Afinn()

In [191]:
# df['afinn_score'] = df['review'].apply(lambda x: af.score(x))

In [192]:
# plt.figure(figsize=(12,8))
# sns.boxplot(df['rating'], df['afinn_score'], palette ='Set3', order= ['0 star','★','★★','★★★'])

In [193]:
from nltk import sent_tokenize

def blob_polarity(review):
    
    polarity = []
    
    for sentence in sent_tokenize(review):
        polarity.append(af.score(sentence))
    
    avg_pol = np.mean(polarity)
    min_pol = np.min(polarity)
    max_pol = np.max(polarity)
    std_pol = np.std(polarity)
    
    return avg_pol, min_pol, max_pol, std_pol

def add_blob_pol(df):
    
    df['af_avg_pol'] = df['review'].apply(lambda x: blob_polarity(x)[0])
    df['af_min_pol'] = df['review'].apply(lambda x: blob_polarity(x)[1])
    df['af_max_pol'] = df['review'].apply(lambda x: blob_polarity(x)[2])
    df['af_std_pol'] = df['review'].apply(lambda x: blob_polarity(x)[3])
    
#     df['sentiment'] = np.where(df['blob_avg_pol'] > 0.05, 1, 0)
        
    return df

df = add_blob_pol(df)

In [194]:
df.head()

Unnamed: 0,name,rating,review_date,reviewer,review_url,neighborhood,review,headline,tokens,vocabulary,area,af_avg_pol,af_min_pol,af_max_pol,af_std_pol
0,Hanon,★★,2019-05-21,Pete Wells,https://www.nytimes.com/2019/05/21/dining/hano...,Williamsburg,"Hanon, a new udon shop in Williamsburg, Brookl...","Udon, Innovated for Your Pleasure","[hanon, udon, williamsburg, brooklyn, produced...",416,Brooklyn,0.886364,-2.0,6.0,1.654414
1,Del Posto,★★★,2019-05-14,Pete Wells,https://www.nytimes.com/2019/05/14/dining/del-...,Chelsea,"More than a year ago, I was on the verge of re...","In Del Posto’s New Era, Cuisine and Service Ar...","[ago, verge, reviewing, del, posto, news, inte...",606,Midtown,0.716216,-6.0,8.0,2.016957
2,The Freakin Rican,★,2019-05-07,Pete Wells,https://www.nytimes.com/2019/05/07/dining/the-...,Astoria,It is true that the bacalaitos at the Freakin ...,Pasteles and Two Kinds of Fried Chicken at the...,"[true, bacalaitos, freakin, rican, astoria, qu...",471,Other,0.865385,-2.0,5.0,1.593603
3,Wayan,★★,2019-04-23,Pete Wells,https://www.nytimes.com/2019/04/23/dining/waya...,NoLIta,If you ate at Spice Market before it closed tw...,Tapping a Family Connection to Indonesian Food,"[spice, market, closed, year, ago, experience,...",425,Downtown,0.75,-4.0,5.0,1.85405
4,Niche,★,2019-04-16,Pete Wells,https://www.nytimes.com/2019/04/16/dining/nich...,Lower East Side,"In 2012 and 2013, when people were lining up a...",Ramen Without Broth? A Chef Doubles Down on a ...,"[people, lining, smorgasburg, houston, whole, ...",414,Downtown,0.387755,-4.0,4.0,1.53618


In [195]:
# sent_tokens = sent_tokenize(df['review'][0])

In [196]:
# polarity = [af.score(x) for x in sent_tokens]

In [197]:
# list(zip(sent_tokens, polarity))

In [198]:
# sum(polarity)

In [199]:
# tokens0 = []
# tokens1 = []
# tokens2 = []
# tokens3 = []

# for i in range(0,len(df)):
#     if df.iloc[i]['rating'] == '0 star':
#         tokens0.append(df.iloc[i]['review'])
#     elif df.iloc[i]['rating'] == '★':
#         tokens1.append(df.iloc[i]['review'])
#     if df.iloc[i]['rating'] == '★★':
#         tokens2.append(df.iloc[i]['review'])
#     if df.iloc[i]['rating'] == '★★★':
#         tokens3.append(df.iloc[i]['review'])

In [200]:
# vec0 = TfidfVectorizer(stop_words=stopwords_list_3,preprocessor=lambda x: stemmer.stem(x), ngram_range=(1,3),max_features = 20)
# vec1 = TfidfVectorizer(stop_words=stopwords_list_3,preprocessor=lambda x: stemmer.stem(x), ngram_range=(1,3),max_features = 20)
# vec2 = TfidfVectorizer(stop_words=stopwords_list_3,preprocessor=lambda x: stemmer.stem(x), ngram_range=(1,3),max_features = 20)
# vec3 = TfidfVectorizer(stop_words=stopwords_list_3,preprocessor=lambda x: stemmer.stem(x), ngram_range=(1,3),max_features = 20)

In [201]:
# vec0 = TfidfVectorizer(stop_words=stopwords_list,ngram_range=(1,3),max_features = 20)
# vec1 = TfidfVectorizer(stop_words=stopwords_list,ngram_range=(1,3),max_features = 20)
# vec2 = TfidfVectorizer(stop_words=stopwords_list,ngram_range=(1,3),max_features = 20)
# vec3 = TfidfVectorizer(stop_words=stopwords_list,ngram_range=(1,3),max_features = 20)

In [202]:
# tfidf0 = vec0.fit_transform(tokens0)
# tfidf1 = vec1.fit_transform(tokens1)
# tfidf2 = vec2.fit_transform(tokens2)
# tfidf3 = vec3.fit_transform(tokens3)

In [203]:
# tfidf_df0 = pd.DataFrame(tfidf0.toarray(), columns=vec0.get_feature_names())
# tfidf_df1 = pd.DataFrame(tfidf1.toarray(), columns=vec1.get_feature_names())
# tfidf_df2 = pd.DataFrame(tfidf2.toarray(), columns=vec2.get_feature_names())
# tfidf_df3 = pd.DataFrame(tfidf3.toarray(), columns=vec3.get_feature_names())

# TF-IDF Analysis

In [204]:
tfidf_total = TfidfVectorizer(stop_words=stopwords_list)

In [205]:
tfidf_total.fit(df['review'])

  'stop_words.' % sorted(inconsistent))


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['7882', '8245', '14088', '2833', '18339', '52309', '63564', '74987', '99222', '43413', '74921', '15007', 'josh ochoa', '54660', '67148', '30592', '68317', '8150', '86775', '72772', 'meat hook sandwich', '3100', '496', '19513', '3367', '44024', '99931', '51179', '87571', '50739', '78435',...43971', 'de tejas', '55220', '29412', '87805', '56534', '14029', '46319', '29126', '40616', '73688'],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [206]:
tfidf_total_transform = tfidf_total.transform(df['review'])

In [207]:
ngrams = tfidf_total.get_feature_names()

In [208]:
ngram_polarity = [(word, af.score(word)) for word in ngrams]

In [209]:
polar_words = []
for word,score in ngram_polarity:
    if score != 0:
        polar_words.append((word, score))

In [252]:
most_polar_words = []
for word,score in polar_words:
    if score < -3.5 or score > 3.5:
        most_polar_words.append((word, score))

In [253]:
most_polar_words

[('amazing', 4.0),
 ('awesome', 4.0),
 ('bastard', -5.0),
 ('breathtaking', 5.0),
 ('brilliant', 4.0),
 ('catastrophic', -4.0),
 ('damned', -4.0),
 ('ecstatic', 4.0),
 ('exuberant', 4.0),
 ('fabulous', 4.0),
 ('fantastic', 4.0),
 ('fraud', -4.0),
 ('fun', 4.0),
 ('funnier', 4.0),
 ('funny', 4.0),
 ('godsend', 4.0),
 ('heavenly', 4.0),
 ('hell', -4.0),
 ('masterpiece', 4.0),
 ('miracle', 4.0),
 ('outstanding', 5.0),
 ('prick', -5.0),
 ('rapturous', 4.0),
 ('rejoice', 4.0),
 ('slut', -5.0),
 ('stunning', 4.0),
 ('superb', 5.0),
 ('supreme', 4.0),
 ('terrific', 4.0),
 ('terrifically', 4.0),
 ('thrilled', 5.0),
 ('torture', -4.0),
 ('tortured', -4.0),
 ('triumph', 4.0),
 ('triumphant', 4.0),
 ('win', 4.0),
 ('winning', 4.0),
 ('wins', 4.0),
 ('wonderful', 4.0),
 ('wonderfully', 4.0),
 ('wow', 4.0)]

In [254]:
tfidf_vocab = [word for word, score in most_polar_words]

In [255]:
tfidf_vocab = list(set([lemmer.lemmatize(x) for x in tfidf_vocab]))

In [256]:
tfidf_vocab = [word for word in tfidf_vocab if word not in 'hater' and word not in 'bribe']

In [257]:
tfidf_total_df = pd.DataFrame(tfidf_total_transform.toarray(), columns = ngrams)

# Creating a DF for Modeling

In [258]:
modeling = df.copy()

In [259]:
modeling.shape

(615, 15)

In [260]:
modeling.reset_index(drop=True, inplace=True)

In [261]:
modeling = pd.concat([modeling,tfidf_total_df[tfidf_vocab]], axis=1)

In [262]:
modeling.shape

(615, 55)

In [263]:
from sklearn.preprocessing import OneHotEncoder

In [264]:
ohe_rev = OneHotEncoder(sparse=False)
ohe_area = OneHotEncoder(sparse=False)

In [265]:
reviewer_dummies = ohe_rev.fit_transform(np.array(df['reviewer']).reshape(-1,1))

In [266]:
reviewer_dum_df = pd.DataFrame(reviewer_dummies, columns = ohe_rev.get_feature_names())

In [267]:
area_dummies = ohe_area.fit_transform(np.array(df['area']).reshape(-1,1))

In [268]:
area_dum_df = pd.DataFrame(area_dummies, columns = ohe_area.get_feature_names())

In [269]:
modeling = pd.concat([modeling, reviewer_dum_df, area_dum_df], axis=1)

In [270]:
modeling.shape

(615, 63)

In [271]:
modeling.columns

Index(['name', 'rating', 'review_date', 'reviewer', 'review_url',
       'neighborhood', 'review', 'headline', 'tokens', 'vocabulary', 'area',
       'af_avg_pol', 'af_min_pol', 'af_max_pol', 'af_std_pol', 'triumphant',
       'win', 'hell', 'masterpiece', 'thrilled', 'wonderful', 'damned',
       'breathtaking', 'heavenly', 'wonderfully', 'terrifically', 'rejoice',
       'miracle', 'godsend', 'fabulous', 'torture', 'wow', 'ecstatic', 'slut',
       'brilliant', 'outstanding', 'amazing', 'supreme', 'tortured', 'triumph',
       'awesome', 'fraud', 'prick', 'funny', 'superb', 'fantastic',
       'exuberant', 'catastrophic', 'stunning', 'terrific', 'fun', 'funnier',
       'winning', 'bastard', 'rapturous', 'x0_Frank Bruni', 'x0_Pete Wells',
       'x0_Sam Sifton', 'x0_Brooklyn', 'x0_Downtown', 'x0_Midtown', 'x0_Other',
       'x0_Uptown'],
      dtype='object')

In [272]:
modeling.to_csv('modeling_3.csv')