In [1]:
import pandas as pd
import os 
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use("ggplot")
os.getcwd()
os.chdir('/Users/caden/Desktop/yelp_project/data')

# 1. Load data

In [14]:
from sklearn.model_selection import train_test_split

# Raw review table; only the review text and the star rating are used here.
df = pd.read_csv('last_2_year_restaurant_reviews_Las_Vegas.csv')
documents = df['text'].copy()
# Binary label: a 5-star review is 'perfect', anything else is 'imperfect'.
target = df['stars'].apply(lambda stars: 'perfect' if stars == 5 else 'imperfect')

# 80/20 split with a fixed seed. train_test_split returns
# (documents_train, documents_test, target_train, target_test) in that order;
# every piece gets a fresh 0..n-1 index so it can be addressed by position.
split_parts = train_test_split(documents, target, test_size = 0.2, random_state = 0)
documents_train, documents_test, target_train, target_test = (
    part.reset_index(drop = True) for part in split_parts
)

# 2. NLP processing (TF-IDF vectorization)

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# NLTK's English stop-word list is handed to the vectorizer as its stop words.
english_stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(stop_words = english_stop_words)

# The vectorizer is fitted on the training reviews only; the test reviews are
# transformed with that already-fitted vectorizer.
train_vector = vectorizer.fit_transform(documents_train)
test_vector = vectorizer.transform(documents_test)


# 3. Similar review search engine

In [37]:
# find the n closest vectors
def get_top_values(lst, n, labels):
    """Return the labels of the ``n`` largest entries of ``lst``, largest first.

    ``lst`` holds one score per label and ``labels`` is indexed by position.
    Each selected label is wrapped in its own one-element list, so the result
    has the form ``[[label_a], [label_b], ...]``.
    """
    # argsort is ascending; reversing it puts the highest score first.
    descending_positions = np.argsort(lst)[::-1]
    selected = []
    for position in descending_positions[:n]:
        selected.append([labels[position]])
    return selected
# find the n furthest vectors
def get_bottom_values(lst, n, labels):
    """Return the labels of the ``n`` smallest entries of ``lst``.

    ``lst`` holds one score per label and ``labels`` is indexed by position.
    The result is ordered so that the smallest entry comes last, and each
    label is wrapped in its own one-element list (``[[label_a], ...]``).
    A non-positive ``n`` yields an empty list.
    """
    # Without this guard a slice such as [n-1::-1] with n == 0 turns into
    # [-1::-1] and selects every element instead of none.
    if n <= 0:
        return []
    ascending_positions = np.argsort(lst)
    # Take the n smallest, then flip them so the smallest one ends up last.
    return [[labels[i]] for i in ascending_positions[:n][::-1]]


In [41]:
from sklearn.metrics.pairwise import cosine_similarity
import random
# Pick the query review with a dedicated, seeded generator so that re-running
# the notebook searches for the same review every time.
query_rng = random.Random(0)
query_position = query_rng.randrange(len(documents_test))
# Indexing with a one-element list keeps `ar` a length-1 array of documents,
# which is what gets passed to vectorizer.transform below.
ar = documents_test.iloc[[query_position]].values
arbitrary_vector = vectorizer.transform(ar)
# One query row against all training rows; row 0 of the result is the
# similarity of the query to each training review.
similarity = cosine_similarity(arbitrary_vector, train_vector)
# The five training reviews with the highest cosine similarity to the query.
similar_review = get_top_values(similarity[0], 5, documents_train)

In [42]:
# Show the heading and the review used as the search query, one per line.
print('Search query', ar, sep='\n')

Search query
["SuperGeeks designed our new and improved website and I can't say enough good things about James and the experience. They made it super easy on our end and did a fantastic job. I would highly recommend this company if you are looking to revamp or create your new website.  A++\n\nwww.capital-flight.com"]


In [43]:
# Heading first, then every retrieved review on its own line.
print('The 5 most similar reviews', *similar_review, sep='\n')

The 5 most similar reviews
["My website was built professional by Mike. It was a great experience to watch my website come about. I'd highly recommend him because he made sure I was happy with the results. It was a great process and he did an excellent job. Even after he completed the website, he would still answer any questions I had afterwards. www.Balphawear.com"]
['Fantastic company. We needed to get more sales for our service company and hired SuperGeeks to do SEO. Two months later we are busier than ever. Great customer support from James and the team. Clear communication. I highly recommend them.']
['We recently updated our website and used James at Snelling Web Design. James has a lot of great ideas, answered all of our questions, and was a pleasure to work with. The new website turned out AMAZING and works perfect. We have had many complements about the website and would highly recommend him.']
["It has been almost 3 years since Bjorn created the website and logo for my busine

# 4. Build a classifier for positive vs. negative reviews

### Naive-Bayes Classifier

In [44]:
from sklearn.naive_bayes import MultinomialNB
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(train_vector, target_train)
# Score on the same data the model was fitted on (training score).
model.score(X=train_vector, y=target_train)

0.8452500643503474

In [45]:
# Score of the current `model` on the held-out test split.
model.score(X=test_vector, y=target_test)

0.8367594776303284

### Logistic Regression Classifier

In [46]:
from sklearn.linear_model import LogisticRegression
# `model` is rebound here: from this cell on it refers to the logistic
# regression classifier. fit() returns the estimator itself.
model = LogisticRegression().fit(train_vector, target_train)
# Score on the same data the model was fitted on (training score).
model.score(X=train_vector, y=target_train)

0.8707944022897723

In [47]:
# Score of the current `model` on the held-out test split.
model.score(X=test_vector, y=target_test)

0.860743954380654

#### Key features that drive a positive prediction

In [48]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()
# The 20 vocabulary terms with the largest coefficients in the first row of
# model.coef_, largest first.
# NOTE(review): presumably that row scores the 'perfect' class - confirm with model.classes_.
get_top_values(model.coef_[0], 20, words)

[['amazing'],
 ['best'],
 ['thank'],
 ['highly'],
 ['awesome'],
 ['incredible'],
 ['professional'],
 ['happier'],
 ['phenomenal'],
 ['beyond'],
 ['fantastic'],
 ['exceeded'],
 ['grateful'],
 ['excellent'],
 ['heaven'],
 ['perfect'],
 ['love'],
 ['thanks'],
 ['great'],
 ['outstanding']]

#### Key features that drive a negative prediction

In [49]:
# The 20 vocabulary terms with the smallest coefficients in the first row of model.coef_.
get_bottom_values(lst=model.coef_[0], n=20, labels=words)

[['unacceptable'],
 ['reason'],
 ['slow'],
 ['meh'],
 ['lacking'],
 ['okay'],
 ['disgusting'],
 ['awful'],
 ['lacked'],
 ['worse'],
 ['bland'],
 ['mediocre'],
 ['poor'],
 ['ok'],
 ['terrible'],
 ['disappointing'],
 ['rude'],
 ['unprofessional'],
 ['horrible'],
 ['worst']]