# Yelp Data Challenge - NLP

BitTiger DS501

Jun 2017

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/last_3_years_restaurant_reviews.csv')

In [3]:
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,LDMCrFlGIFUN6L-FEFgzWg,El Pollo Loco,"['Restaurants', 'American (Traditional)', 'Mex...",3.0,0,2015-06-26,0,4gH-5f0ewrH2Vvl0UYtQQA,1,I'm at training for work and went for a quick ...,0,GLGz9sSNHIbguwv90XStYA
1,LDMCrFlGIFUN6L-FEFgzWg,El Pollo Loco,"['Restaurants', 'American (Traditional)', 'Mex...",3.0,2,2015-06-18,1,qlnMpBo8_GWhlJE6zR51qA,3,Given this location is close to my work I find...,2,YxqLJwDgcL4OoDB1hN-Ikg
2,LDMCrFlGIFUN6L-FEFgzWg,El Pollo Loco,"['Restaurants', 'American (Traditional)', 'Mex...",3.0,0,2015-05-15,0,woYGBjLlsrezUvBKiBwwhg,1,Never again will I return. The culture of thi...,0,i1zH7hGJs_accdfjEzrwUQ
3,LDMCrFlGIFUN6L-FEFgzWg,El Pollo Loco,"['Restaurants', 'American (Traditional)', 'Mex...",3.0,0,2017-02-15,0,_u6u4NB9XTNPgDOqJUbF7Q,5,this one is one of my favorite locations . Foo...,0,5TRDoYHqVvC81mq_p76HlQ
4,LDMCrFlGIFUN6L-FEFgzWg,El Pollo Loco,"['Restaurants', 'American (Traditional)', 'Mex...",3.0,0,2015-06-16,0,ilj0ZswUKjNMjw2kz7cgTg,4,A great mix between an authentic Mexican taco ...,0,Pg-sMoiilKCVPs41vf5V_Q


### Define your feature variables, here is the text of the review

In [4]:
# Take the values of the column that contains review text data, save to a variable named "documents"
documents = df['text'].values

In [5]:
# inspect your documents, e.g. check the size, take a peek at elements of the numpy array
documents.shape

(437524,)

### Define your target variable (any categorical variable that may be meaningful)

#### For example, I am interested in perfect (5 stars) and imperfect (1-4 stars) rating

In [6]:
# Make a column and take the values, save to a variable named "target"
df['favorable'] = df['stars']>4
target = df['favorable'].values

#### You may want to look at the statistic of the target variable

In [7]:
# To be implemented
target.mean()

0.45755661403717279

## Let's create training dataset and test dataset

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Documents is your X, target is your y
# Now split the data to training set and test set
documents_train, documents_test, target_train, target_test = train_test_split(documents, target, test_size=0.8, random_state=42)

In [10]:
# Split to documents_train, documents_test, target_train, target_test
documents_train.shape, documents_test.shape, target_train.shape, target_test.shape

((87504,), (350020,), (87504,), (350020,))

## Let's get NLP representation of the documents

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Create TfidfVectorizer, and name it vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

In [13]:
# Train the model with your training data
vector_train = vectorizer.fit_transform(documents_train)

In [18]:
vector_train = vector_train.toarray()

In [15]:
# Get the vocab of your tfidf
words = vectorizer.get_feature_names()
words

['00',
 '000',
 '00pm',
 '10',
 '100',
 '1000',
 '10am',
 '10pm',
 '11',
 '110',
 '11am',
 '11pm',
 '12',
 '120',
 '13',
 '14',
 '15',
 '150',
 '16',
 '17',
 '18',
 '19',
 '1am',
 '1pm',
 '1st',
 '20',
 '200',
 '2014',
 '2015',
 '2016',
 '2017',
 '21',
 '215',
 '22',
 '23',
 '24',
 '25',
 '250',
 '26',
 '27',
 '28',
 '29',
 '2am',
 '2nd',
 '2pm',
 '30',
 '300',
 '30am',
 '30pm',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '3am',
 '3pm',
 '3rd',
 '40',
 '400',
 '42',
 '45',
 '48',
 '49',
 '4pm',
 '4th',
 '50',
 '500',
 '55',
 '59',
 '5pm',
 '5th',
 '60',
 '65',
 '6pm',
 '70',
 '75',
 '7pm',
 '80',
 '85',
 '8am',
 '8pm',
 '90',
 '95',
 '99',
 '9am',
 '9pm',
 'ability',
 'able',
 'absolute',
 'absolutely',
 'abundance',
 'ac',
 'acai',
 'accent',
 'accept',
 'acceptable',
 'accepted',
 'access',
 'accessible',
 'accident',
 'accidentally',
 'accommodate',
 'accommodated',
 'accommodating',
 'accompanied',
 'accompanying',
 'according',
 'accordingly',
 'account',
 'acc

In [16]:
# Use the trained model to transform your test data
vector_test = vectorizer.transform(documents_test)

## Similar review search engine

In [17]:
import numpy as np

# We will need these helper methods pretty soon

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]  # np.argsort by default sorts values in ascending order

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["mouse", "rabbit"]
    '''
    #pass  # To be implemented
    return [labels[i] for i in np.argsort(lst)[:n]]


In [19]:
# Let's use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
# Draw an arbitrary review from test (unseen in training) documents
sample = np.random.choice(len(documents_test))
sample_view = documents_test[sample]

In [21]:
sample_view

"Best Ramen ever!!! We got the Brussels Sprouts tempura to start and the Creamy Spicy Vegan Ramen and Vegetable Ramen and it was SO delicious!!! Can't wait to come back!!"

In [22]:
# Transform the drawn review(s) to vector(s)
sample_vector = vectorizer.transform([sample_view]).toarray()

In [23]:
# Calculate the similarity score(s) between vector(s) and training vectors
sim_scores = cosine_similarity(vector_train, sample_vector)

In [24]:
sim_scores.shape

(87504, 1)

In [25]:
# Let's find top 5 similar reviews
n = 5
top5 = get_top_values(sim_scores, n, documents_train)
for i in range(n):
    print(i)
    print('{}'.format(top5[i]))
    print('')

0
[ 'I went for happy hour and it was amazing! Brandon and Robert were amazing bartenders, super friendly and attentive. The Mai tai was delicious! The appetizers like the crab balls were great and filling! Def would come back again!']

1
[ 'I went for happy hour and it was amazing! Brandon and Robert were amazing bartenders, super friendly and attentive. The Mai tai was delicious! The appetizers like the crab balls were great and filling! Def would come back again!']

2
[ 'I went for happy hour and it was amazing! Brandon and Robert were amazing bartenders, super friendly and attentive. The Mai tai was delicious! The appetizers like the crab balls were great and filling! Def would come back again!']

3
[ 'I went for happy hour and it was amazing! Brandon and Robert were amazing bartenders, super friendly and attentive. The Mai tai was delicious! The appetizers like the crab balls were great and filling! Def would come back again!']

4
[ 'I went for happy hour and it was amazing! Brand

In [None]:
print 'Our search query:'
print  # To be added

In [None]:
print 'Most %s similar reviews:' % n
print  # To be added

#### Q: Does the result make sense to you?

A: (insert your comments here)

## Classifying positive/negative review

#### Naive-Bayes Classifier

In [28]:
# Build a Naive-Bayes Classifier

from sklearn.naive_bayes import MultinomialNB
model_nb = MultinomialNB()

model_nb.fit(vector_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
# Get score for training set
model_nb.score(vector_train, target_train) # accuracy

0.80776878771256166

In [31]:
# Get score for test set
model_nb.score(vector_test, target_test)

0.80208273812924979

#### Logistic Regression Classifier

In [32]:
# Build a Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression

model_lrc = LogisticRegression()
model_lrc.fit(vector_train, target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [33]:
# Get score for training set
model_lrc.score(vector_train, target_train)

0.84123011519473401

In [34]:
# Get score for test set
model_lrc.score(vector_test, target_test)

0.82505856808182387

#### Q: What are the key features(words) that make the positive prediction?

In [35]:
# Let's find it out by ranking
n = 20
get_top_values(model_lrc.coef_[0], n, words)

['amazing',
 'best',
 'perfect',
 'highly',
 'awesome',
 'delicious',
 'phenomenal',
 'excellent',
 'heaven',
 'fantastic',
 'thank',
 'incredible',
 'perfection',
 'perfectly',
 'great',
 'die',
 'favorite',
 'gem',
 'outstanding',
 'fabulous']

A: (insert your comments here)

#### Q: What are the key features(words) that make the negative prediction?

In [36]:
# Let's find it out by ranking
n = 20
get_bottom_values(model_lrc.coef_[0], n, words)

['worst',
 'ok',
 'horrible',
 'slow',
 'rude',
 'bland',
 'terrible',
 'mediocre',
 'disappointing',
 'average',
 'okay',
 'poor',
 'dry',
 'unfortunately',
 'decent',
 'lacking',
 'overpriced',
 'wasn',
 'bad',
 'worse']

A: (insert your comments here)

#### Random Forest Classifier

In [38]:
# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

model_rfc = RandomForestClassifier(max_depth = None,
                                   n_estimators = 5,
                                   min_samples_leaf = 10)
model_rfc.fit(vector_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [40]:
# Get score for training set
model_rfc.score(vector_train, target_train)

0.81291140976412501

In [41]:
# Get score for test set
model_rfc.score(vector_test, target_test)

0.76867607565281981

#### Q: What do you see from the training score and the test score?

A: (insert your comments here)

#### Q: Can you tell what features (words) are important by inspecting the RFC model?

In [42]:
n = 20
get_top_values(model_rfc.feature_importances_, n, words)

['best',
 'great',
 'bad',
 'amazing',
 'delicious',
 'love',
 'ok',
 'definitely',
 'worst',
 'okay',
 'terrible',
 'awesome',
 'horrible',
 'wasn',
 'decent',
 'excellent',
 'vegas',
 'said',
 'favorite',
 'place']

## Extra Credit #1: Use cross validation to evaluate your classifiers

[sklearn cross validation](http://scikit-learn.org/stable/modules/cross_validation.html)

In [44]:
# To be implemented
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model_lrc,
                            vector_train,
                            target_train,
                            cv = 5,
                            scoring="accuracy")
cv_scores

array([ 0.82149591,  0.8213245 ,  0.82006742,  0.82309582,  0.82285714])

## Extra Credit #2: Use grid search to find best predictable classifier


[sklearn grid search tutorial (with cross validation)](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)

[sklearn grid search documentation (with cross validation)](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV)

In [47]:
# To be implemented
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

param_grid = [{'penalty':['l1'], 'C':[0.1, 100]},
              {'penalty':['l2'], 'C':[0.1, 100]}]

scores = ['accuracy']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score + "\n\n")
    clf = GridSearchCV(LogisticRegression(),
                       param_grid,
                       cv=5,
                       scoring=score)
    clf.fit(vector_train[:500,:], target_train[:500])
    print("Best parameters set found on development set:\n\n")
    print(clf.best_params_)
    print("\nGrid scores on development set:\n\n")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    
    print("\nDetailed classification report:\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print("\n")
    y_true, y_pred = target_test, clf.predict(vector_test)
    print(classification_report(y_true, y_pred))
    print("\n")

# Tuning hyper-parameters for accuracy


Best parameters set found on development set:


{'C': 100, 'penalty': 'l2'}

Grid scores on development set:


0.548 (+/-0.005) for {'C': 0.1, 'penalty': 'l1'}
0.722 (+/-0.078) for {'C': 100, 'penalty': 'l1'}
0.562 (+/-0.024) for {'C': 0.1, 'penalty': 'l2'}
0.730 (+/-0.053) for {'C': 100, 'penalty': 'l2'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.


             precision    recall  f1-score   support

      False       0.75      0.79      0.77    190008
       True       0.73      0.68      0.70    160012

avg / total       0.74      0.74      0.74    350020



