In [1]:
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
from pathlib import Path
import json

# Only needs to be run once to create a csv file for the training data

# set path to file
# p = Path(r'C:\Users\anish\Desktop\cse142_nltk\data_train.json')


# # read json
# with p.open('r', encoding='utf-8') as f:
#     data = json.loads(f.read())

# # create dataframe
# df = json_normalize(data)
# # create dataframe
# df = json_normalize(data)
# # save to csv
# df.to_csv('data_train.csv', index=False, encoding='utf-8')


In [2]:
# extract stars/ratings column. Will be used later when 
# extract text column since that is what we'll be using
df = pd.read_csv("data_train.csv")
ratings = df['stars'].values.tolist()
textdf = df.drop(['stars' ,'useful', 'funny','cool', 'date'], axis=1)
print(textdf.head())
total = len(textdf["text"]) 

                                                text
0  Total bill for this horrible service? Over $8G...
1  I *adore* Travis at the Hard Rock's new Kelly ...
2  I have to say that this office really has it t...
3  Went in for a lunch. Steak sandwich was delici...
4  Today was my second out of three sessions I ha...


In [3]:
# Word count for every review
text_len = []
for index, row in textdf.iterrows():
    text_len.append(len(row["text"]))

In [4]:
textdf['word_count'] = text_len
textdf.head()

Unnamed: 0,text,word_count
0,Total bill for this horrible service? Over $8G...,204
1,I *adore* Travis at the Hard Rock's new Kelly ...,1561
2,I have to say that this office really has it t...,615
3,Went in for a lunch. Steak sandwich was delici...,407
4,Today was my second out of three sessions I ha...,3509


In [5]:
from nltk.corpus import stopwords
words = stopwords.words("english")


In [6]:
# lowercase words in review and remove stopwords
review_list = []
for index, row in textdf.iterrows():
    temp = row["text"].split()
    temp2 = [word.lower() for word in temp if word.lower() not in words]
    review = " ".join(word for word in temp2)
    review_list.append(review)
review_list[:1]


['total bill horrible service? $8gs. crooks actually nerve charge us $69 3 pills. checked online pills 19 cents each! avoid hospital ers costs.']

In [41]:
from nltk.tokenize import sent_tokenize
sentence_count = []
for index, row in textdf.iterrows():
    num_of_sent = sent_tokenize(row["text"])
    sentence_count.append(len(num_of_sent))
print(sentence_count[:2])

[5, 17]


In [44]:
textdf['sentence_count'] = sentence_count
textdf.head()

Unnamed: 0,text,word_count,clean_stopwords_punc_reviews,final_clean,lemmatized_review,sentence_count
0,Total bill for this horrible service? Over $8G...,204,total bill horrible service 8gs crooks actuall...,total bill horrible service 8gs crooks actuall...,total bill horrible service 8gs crook actually...,5
1,I *adore* Travis at the Hard Rock's new Kelly ...,1561,adore travis hard rocks new kelly cardenas sal...,adore travis hard rocks new kelly cardenas sal...,adore travis hard rock new kelly cardenas salo...,17
2,I have to say that this office really has it t...,615,say office really together organized friendly ...,say office really together organized friendly ...,say office really together organized friendly ...,5
3,Went in for a lunch. Steak sandwich was delici...,407,went lunch steak sandwich delicious caesar sal...,went lunch steak sandwich delicious caesar sal...,went lunch steak sandwich delicious caesar sal...,9
4,Today was my second out of three sessions I ha...,3509,today second three sessions paid for although ...,today second three sessions paid for although ...,today second three session paid for although f...,29


In [7]:

# removes all punctuation from reviews
import string
clean_review = []
for review in review_list:
    clean_review.append(review.translate(str.maketrans('', '', string.punctuation)))
print(clean_review[:2])
textdf['clean_stopwords_punc_reviews'] = clean_review


['total bill horrible service 8gs crooks actually nerve charge us 69 3 pills checked online pills 19 cents each avoid hospital ers costs', 'adore travis hard rocks new kelly cardenas salon im always fan great blowout stranger chains offer service however travis taken flawless blowout whole new level traviss greets perfectly green swoosh otherwise perfectly styled black hair vegasworthy rockstar outfit next comes relaxing incredible shampoo  get full head message could cure even worst migraine minutes  scented shampoo room travis freakishly strong fingers in good way use perfect amount pressure superb starts glorious blowout one two three people involved best roundbrush action hair ever seen team stylists clearly gets along extremely well evident way talk help one another really genuine corporate requirement much fun there next travis started flat iron way flipped wrist get volume around without overdoing making look like texas pagent girl admirable also worth noting fry hair  something

In [8]:
# top 20 occuring words across all cleaned reviews to see what I could further remove
pd.Series(" ".join(textdf['clean_stopwords_punc_reviews']).split()).value_counts()[:20]

food       171013
good       161099
place      160453
great      143014
service    119359
like       116287
time       113930
get        108899
one        106897
would      100492
back        98733
go          86714
really      84101
also        71608
us          68329
it          65544
got         64238
even        63441
nice        59916
well        59233
dtype: int64

In [9]:
# more preprocessing to remove words that have no meaning
more_stopwords = ["us","im","ive","it","get"]
final_clean = []
for index, row in textdf.iterrows():
    temp = row["clean_stopwords_punc_reviews"].split()
    temp2 = [word for word in temp if word not in more_stopwords]
    clean_review = " ".join(word for word in temp2)
    final_clean.append(clean_review)
textdf['final_clean'] = final_clean

In [10]:
textdf.head()

Unnamed: 0,text,word_count,clean_stopwords_punc_reviews,final_clean
0,Total bill for this horrible service? Over $8G...,204,total bill horrible service 8gs crooks actuall...,total bill horrible service 8gs crooks actuall...
1,I *adore* Travis at the Hard Rock's new Kelly ...,1561,adore travis hard rocks new kelly cardenas sal...,adore travis hard rocks new kelly cardenas sal...
2,I have to say that this office really has it t...,615,say office really together organized friendly ...,say office really together organized friendly ...
3,Went in for a lunch. Steak sandwich was delici...,407,went lunch steak sandwich delicious caesar sal...,went lunch steak sandwich delicious caesar sal...
4,Today was my second out of three sessions I ha...,3509,today second three sessions paid for although ...,today second three sessions paid for although ...


In [11]:
from textblob import Word
from textblob import TextBlob

In [12]:
# creates a lemmatized version of reviews
lemmatized_reviews = []
for index, row in textdf.iterrows():
    temp = row["final_clean"].split()
    temp2 = [Word(word).lemmatize() for word in temp]
    lemmatized_review = " ".join(word for word in temp2)
    lemmatized_reviews.append(lemmatized_review)
textdf['lemmatized_review'] = lemmatized_reviews

In [13]:
textdf.head()

Unnamed: 0,text,word_count,clean_stopwords_punc_reviews,final_clean,lemmatized_review
0,Total bill for this horrible service? Over $8G...,204,total bill horrible service 8gs crooks actuall...,total bill horrible service 8gs crooks actuall...,total bill horrible service 8gs crook actually...
1,I *adore* Travis at the Hard Rock's new Kelly ...,1561,adore travis hard rocks new kelly cardenas sal...,adore travis hard rocks new kelly cardenas sal...,adore travis hard rock new kelly cardenas salo...
2,I have to say that this office really has it t...,615,say office really together organized friendly ...,say office really together organized friendly ...,say office really together organized friendly ...
3,Went in for a lunch. Steak sandwich was delici...,407,went lunch steak sandwich delicious caesar sal...,went lunch steak sandwich delicious caesar sal...,went lunch steak sandwich delicious caesar sal...
4,Today was my second out of three sessions I ha...,3509,today second three sessions paid for although ...,today second three sessions paid for although ...,today second three session paid for although f...


In [45]:
# Loops through all rows and calculates polarity for each lemmatized review
sentiment_list = []
for index, row in textdf.iterrows():
    sentiment_list.append([TextBlob(row["lemmatized_review"]).sentiment[0],row["word_count"],row["sentence_count"]] )
print(sentiment_list[:2])

[[-0.3333333333333333, 204, 5], [0.3301852559205501, 1561, 17]]


In [23]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [46]:
# split data into test and training for experimental purposes to see logisitc regression accuracy
X_train, X_test, y_train, y_test = train_test_split(sentiment_list, ratings, test_size=0.2)
print(X_train[:2])
print(X_test[:2])

[[0.20625000000000002, 436, 7], [0.07361111111111113, 485, 7]]
[[0.0, 87, 3], [0.0, 217, 4]]


In [51]:
# convert array to numpy array for fit function and fit model
sentiments = np.asarray(X_test)
print(sentiments[:2])
clf = LogisticRegression(multi_class='auto',solver='lbfgs', max_iter=1000).fit(X_train, y_train)

[[  0.  87.   3.]
 [  0. 217.   4.]]




In [54]:
# predicts rating for test set
clf.predict(sentiments[:100, :])

array([1., 1., 5., 5., 1., 5., 5., 5., 5., 1., 5., 5., 5., 5., 5., 5., 5.,
       1., 1., 1., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 1., 5., 5., 4.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 1., 1., 5.,
       5., 4., 4., 5., 5., 5., 1., 5., 5., 5., 1., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 1., 5., 5., 5., 1., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 4., 1., 5., 1., 5., 5., 5., 5., 1.])

In [55]:
# mean accuracy on test set
clf.score(X_test, y_test)

0.5287994136915001