In [0]:
#Basic imports 
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re

#NLP imports 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import spacy
spacy.load('en')

# Feature Creations 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#Dimension Reduction 
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import Normalizer

#Model Imports 
from sklearn import ensemble
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

#Time
import time 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
df = pd.read_csv('https://www.dropbox.com/s/d4ye48a67tth2ae/Reviews.csv?dl=1')

In [0]:
#drop uneccessary columns 
df.drop(['Id','ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Time', 'Summary'],axis=1,inplace=True)
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 2 columns):
Score    568454 non-null int64
Text     568454 non-null object
dtypes: int64(1), object(1)
memory usage: 8.7+ MB


In [0]:
df.isnull().sum()

Score    0
Text     0
dtype: int64

In [0]:
# Remove duplicate reviews  
df.drop_duplicates(subset=['Score','Text'],keep='first',inplace=True)  

In [0]:
df.Score.value_counts()

5    250745
4     56074
1     36280
3     29772
2     20804
Name: Score, dtype: int64

In [0]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    
    text = re.sub(r'--',' ',text)
    
    # Removes hyperlinks 
    text = re.sub(r'<a\s+href=(?:"([^"]+)"|\'([^\']+)\').*?>(.*?)</a>',' ', text)
    
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    text = text.lower()
    
    return text

In [0]:
df['Clean'] = df['Text'].apply(lambda x: text_cleaner(x))

In [0]:
# Declare X and Y values
X = df.Clean
y = df.Score

In [0]:
#split out training/test data 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

#split training set again to test best params with reduce execution times 
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train,y_train, test_size=0.8, random_state=0)

In [0]:
# Display the shape to make sure the length is the same.
print(X_train2.shape)
print(y_train2.shape)

(62988,)
(62988,)


In [0]:
# Lemmatize our text reviews to limit variations on same words

stop_words = set(stopwords.words('english'))
lemmatizer = spacy.lang.en.English()

def lemmatize_sen(text):
    tokens = lemmatizer(text) 
    return([token.lemma_ for token in tokens if token not in stop_words])

In [0]:
# Setting max features to 1000 will choose the 1000 with the highest count.
vectorizer = CountVectorizer(max_features=1000, tokenizer=lemmatize_sen)
# Train the model and transform it
X_train2_matrix = vectorizer.fit_transform(X_train2)

X_train2_matrix

<62988x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 2994439 stored elements in Compressed Sparse Row format>

In [0]:
# X_test
X_test_matrix = vectorizer.transform(X_test)
X_test_matrix

<78735x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 3727078 stored elements in Compressed Sparse Row format>

### Logisitc Regression Model

In [0]:
start_time = time.time()
param_dist = {'penalty':['l1','l2'],
                'C':[1,100,1000]}

lr = LogisticRegression()

random_search = RandomizedSearchCV(LogisticRegression(class_weight = 'balanced',random_state=0, n_jobs= -1), param_distributions=param_dist, n_iter=5)
#Fit the Data
random_search.fit(X_train2_matrix, y_train2)
print(random_search.score(X_test_matrix, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.6808280942401728
-- Execution time: 529.4680571556091 seconds ---


In [0]:
random_search.best_params_

{'C': 1, 'penalty': 'l1'}

In [0]:
from sklearn.metrics import classification_report

print(classification_report(y_test, random_search.predict(X_test_matrix)))

              precision    recall  f1-score   support

           1       0.51      0.63      0.57      7335
           2       0.23      0.25      0.24      4160
           3       0.31      0.33      0.32      5980
           4       0.39      0.27      0.32     11089
           5       0.84      0.86      0.85     50171

   micro avg       0.68      0.68      0.68     78735
   macro avg       0.46      0.47      0.46     78735
weighted avg       0.67      0.68      0.67     78735



### Random Forest Model

In [0]:
#need to re-run cell for max_depth optimization and n_estimators 
from scipy.stats import randint as sp_randint
start_time = time.time()
param_dist = {'n_estimators':[500,700,800],
                'criterion':['entropy'],
                'max_depth':[6,8,10],
                'min_samples_split': [4],
               'max_features':sp_randint(1, 11),
              'bootstrap': [True]
              }

rf_random_search = RandomizedSearchCV(RandomForestClassifier(class_weight ='balanced_subsample', random_state=0), param_distributions=param_dist,
                                   n_iter=10)
#Fit the Data
rf_random_search.fit(X_train2_matrix, y_train2)
print(rf_random_search.score(X_test_matrix, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))



0.617971677144853
-- Execution time: 877.2284212112427 seconds ---


In [0]:
rf_random_search.best_params_

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 1,
 'min_samples_split': 4,
 'n_estimators': 800}

In [0]:
print(classification_report(y_test, rf_random_search.predict(X_test_matrix)))

              precision    recall  f1-score   support

           1       0.43      0.62      0.51      7335
           2       0.23      0.13      0.16      4160
           3       0.23      0.27      0.24      5980
           4       0.28      0.25      0.27     11089
           5       0.80      0.78      0.79     50171

   micro avg       0.62      0.62      0.62     78735
   macro avg       0.40      0.41      0.40     78735
weighted avg       0.62      0.62      0.62     78735



### XGBoost Model

In [0]:
from scipy.stats import randint as sp_randint
start_time = time.time()
param_dist = {'max_depth':[5,7,9],
              'subsample':[0.5,0.7,0.9],
              'colsample_bytree': [0.5,0.7,0.9],
            'colsample_bylevel':[0.5,0.7,0.9]
              }
#scale_pos_weight 
xgb_random_search = RandomizedSearchCV(xgb.XGBClassifier(learning_rate =.01, random_state=0), param_distributions=param_dist,
                                   n_iter=10)
#Fit the Data
xgb_random_search.fit(X_train2_matrix, y_train2)
print(xgb_random_search.score(X_test_matrix, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))



0.6678605448656887
-- Execution time: 3249.2764225006104 seconds ---


In [0]:
xgb_random_search.best_params_

{'colsample_bylevel': 0.7,
 'colsample_bytree': 0.9,
 'max_depth': 9,
 'subsample': 0.7}

In [0]:
print(classification_report(y_test, xgb_random_search.predict(X_test_matrix)))

              precision    recall  f1-score   support

           1       0.69      0.18      0.29      7335
           2       0.30      0.01      0.01      4160
           3       0.48      0.09      0.15      5980
           4       0.53      0.09      0.15     11089
           5       0.67      0.99      0.80     50171

   micro avg       0.67      0.67      0.67     78735
   macro avg       0.54      0.27      0.28     78735
weighted avg       0.62      0.67      0.57     78735



## Model #2

In [0]:
df2 = df.copy()
df2.head()

Unnamed: 0,Score,Text,Clean
0,5,I have bought several of the Vitality canned d...,i have bought several of the vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled as jumbo salted peanut...
2,4,This is a confection that has been around a fe...,this is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...,great taffy at a great price. there was a wide...


In [0]:
#create training/test sets
X = df2.Clean
y = df2.Score

#split out training/test data 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

#split training set again to test best params with reduce execution times 
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train,y_train, test_size=0.8, random_state=0)

In [0]:
vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             lowercase=True, #convert everything to lower case
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True, #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                             tokenizer=lemmatize_sen)


#Applying the vectorizer
tfidf_X_train2_matrix = vectorizer.fit_transform(X_train2)

tfidf_X_train2_matrix

<62988x25634 sparse matrix of type '<class 'numpy.float64'>'
	with 2852791 stored elements in Compressed Sparse Row format>

In [0]:
# Apply to X test
tfidf_X_test_matrix = vectorizer.transform(X_test)

tfidf_X_test_matrix

<78735x25634 sparse matrix of type '<class 'numpy.float64'>'
	with 3530659 stored elements in Compressed Sparse Row format>

### Logistic Regression 2

In [0]:
start_time = time.time()
param_dist = {'penalty':['l1','l2'],
                'C':[1,100,1000]}

lr = LogisticRegression()

random_search = RandomizedSearchCV(LogisticRegression(class_weight = 'balanced',random_state=0, n_jobs= -1), param_distributions=param_dist, n_iter=5)
#Fit the Data
random_search.fit(tfidf_X_train2_matrix, y_train2)
print(random_search.score(tfidf_X_test_matrix, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.6901886073537816
-- Execution time: 736.85671043396 seconds ---


In [0]:
random_search.best_params_

{'C': 1, 'penalty': 'l2'}

In [0]:
from sklearn.metrics import classification_report

print(classification_report(y_test, random_search.predict(tfidf_X_test_matrix)))

              precision    recall  f1-score   support

           1       0.57      0.65      0.61      7335
           2       0.24      0.25      0.25      4160
           3       0.32      0.34      0.33      5980
           4       0.39      0.30      0.34     11089
           5       0.84      0.86      0.85     50171

   micro avg       0.69      0.69      0.69     78735
   macro avg       0.47      0.48      0.47     78735
weighted avg       0.68      0.69      0.69     78735



### Random Forest 2 

In [0]:
start_time = time.time()
param_dist = {'n_estimators':[500,700,800],
                'criterion':['entropy'],
                'max_depth':[6,8,10],
                'min_samples_split': [4],
               'max_features':sp_randint(1, 11),
              'bootstrap': [True]
              }

rf_random_search = RandomizedSearchCV(RandomForestClassifier(class_weight ='balanced_subsample', random_state=0), param_distributions=param_dist,
                                   n_iter=10)
#Fit the Data
rf_random_search.fit(tfidf_X_train2_matrix, y_train2)
print(rf_random_search.score(tfidf_X_test_matrix, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))



0.6103638788340636
-- Execution time: 927.0712497234344 seconds ---


In [0]:
rf_random_search.best_params_

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 9,
 'min_samples_split': 4,
 'n_estimators': 800}

In [0]:
print(classification_report(y_test, rf_random_search.predict(tfidf_X_test_matrix)))

              precision    recall  f1-score   support

           1       0.46      0.60      0.52      7335
           2       0.22      0.16      0.18      4160
           3       0.22      0.28      0.25      5980
           4       0.28      0.24      0.26     11089
           5       0.79      0.77      0.78     50171

   micro avg       0.61      0.61      0.61     78735
   macro avg       0.39      0.41      0.40     78735
weighted avg       0.61      0.61      0.61     78735



### XGB 2 

In [0]:
from scipy.stats import randint as sp_randint
start_time = time.time()
param_dist = {'max_depth':[5,7,9],
              'subsample':[0.5,0.7,0.9],
              'colsample_bytree': [0.5,0.7,0.9],
            'colsample_bylevel':[0.5,0.7,0.9]
              }
#scale_pos_weight 
xgb_random_search = RandomizedSearchCV(xgb.XGBClassifier(learning_rate =.01, random_state=0), param_distributions=param_dist,
                                   n_iter=10)
#Fit the Data
xgb_random_search.fit(tfidf_X_train2_matrix, y_train2)
print(xgb_random_search.score(tfidf_X_test_matrix, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))



In [0]:
xgb_random_search.best_params_

In [0]:
print(classification_report(y_test, rxgb_random_search.predict(tfidf_X_test_matrix)))