In [94]:
#Basic imports 
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re

#NLP imports 
import nltk
nltk.download('gutenberg')
nltk.download('stopwords')

import spacy
spacy.load('en')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#Dimension Reduction 
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import Normalizer

#Model Imports 
from sklearn import ensemble
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

#Time
import time 

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
df = pd.read_csv('https://www.dropbox.com/s/d4ye48a67tth2ae/Reviews.csv?dl=1')

In [85]:
#drop uneccessary columns 
df.drop(['Id','ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Time', 'Summary'],axis=1,inplace=True)
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 2 columns):
Score    568454 non-null int64
Text     568454 non-null object
dtypes: int64(1), object(1)
memory usage: 8.7+ MB


In [87]:
df.isnull().sum()

Score    0
Text     0
dtype: int64

In [88]:
df.Score.value_counts()

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Score, dtype: int64

In [0]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    
    text = re.sub(r'--',' ',text)
    
    # Removes hyperlinks 
    text = re.sub(r'<a\s+href=(?:"([^"]+)"|\'([^\']+)\').*?>(.*?)</a>',' ', text)
    
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    return text

In [0]:
df['Clean'] = df['Text'].apply(lambda x: text_cleaner(x))

In [0]:
#create training/test sets
X = df.Clean
y = df.Score

#split out training/test data 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

#split training set again to test best params with reduce execution times 
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train,y_train, test_size=0.8, random_state=0)

In [92]:
# Display the shape to make sure the length is the same.
print(X_train2.shape)
print(y_train2.shape)

(90952,)
(90952,)


In [96]:
# Setting max features to 1000 will choose the 1000 with the highest count.
vectorizer = CountVectorizer(max_features=1000)
# Train the model and transform it
X_train2_matrix = vectorizer.fit_transform(X_train2)

X_train2_matrix

<90952x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 3878310 stored elements in Compressed Sparse Row format>

In [99]:
# X_test
X_test_matrix = vectorizer.transform(X_test)
X_test_matrix

<113691x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 4812344 stored elements in Compressed Sparse Row format>

### Logisitc Regression Model

In [103]:
start_time = time.time()
param_dist = {'penalty':['l1','l2'],
                'C':[1,100,1000]}

lr = LogisticRegression()

random_search = RandomizedSearchCV(LogisticRegression(class_weight = 'balanced',random_state=0, n_jobs= -1), param_distributions=param_dist, n_iter=5)
#Fit the Data
random_search.fit(X_train2_matrix, y_train2)
print(random_search.score(X_test_matrix, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.6869057357222648
-- Execution time: 260.53825545310974 seconds ---


In [104]:
random_search.best_params_

{'C': 1, 'penalty': 'l1'}

In [106]:
from sklearn.metrics import classification_report

print(classification_report(y_test, random_search.predict(X_test_matrix)))

              precision    recall  f1-score   support

           1       0.52      0.65      0.57     10267
           2       0.25      0.27      0.26      6185
           3       0.32      0.34      0.33      8450
           4       0.42      0.28      0.34     16229
           5       0.84      0.86      0.85     72560

   micro avg       0.69      0.69      0.69    113691
   macro avg       0.47      0.48      0.47    113691
weighted avg       0.68      0.69      0.68    113691



### Random Forest Model

In [107]:
#need to re-run cell for max_depth optimization and n_estimators 
from scipy.stats import randint as sp_randint
start_time = time.time()
param_dist = {'n_estimators':[500,700,800],
                'criterion':['entropy'],
                'max_depth':[6,8,10],
                'min_samples_split': [4],
               'max_features':sp_randint(1, 11),
              'bootstrap': [True]
              }

rf_random_search = RandomizedSearchCV(RandomForestClassifier(class_weight ='balanced_subsample', random_state=0), param_distributions=param_dist,
                                   n_iter=10)
#Fit the Data
rf_random_search.fit(X_train2_matrix, y_train2)
print(rf_random_search.score(X_test_matrix, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))



0.6109190701110906
-- Execution time: 264.1375939846039 seconds ---


In [108]:
rf_random_search.best_params_

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 6,
 'max_features': 3,
 'min_samples_split': 4,
 'n_estimators': 500}

In [109]:
print(classification_report(y_test, rf_random_search.predict(X_test_matrix)))

              precision    recall  f1-score   support

           1       0.44      0.58      0.50     10267
           2       0.25      0.20      0.22      6185
           3       0.24      0.35      0.29      8450
           4       0.31      0.25      0.28     16229
           5       0.79      0.76      0.78     72560

   micro avg       0.61      0.61      0.61    113691
   macro avg       0.41      0.43      0.41    113691
weighted avg       0.62      0.61      0.61    113691



### XGBoost Model

In [0]:
from scipy.stats import randint as sp_randint
start_time = time.time()
param_dist = {'max_depth':[5,7,9],
              'subsample':[0.5,0.7,0.9],
              'colsample_bytree': [0.5,0.7,0.9],
            'colsample_bylevel':[0.5,0.7,0.9]
              }
#scale_pos_weight 
xgb_random_search = RandomizedSearchCV(xgb.XGBClassifier(learning_rate =.01, random_state=0), param_distributions=param_dist,
                                   n_iter=10)
#Fit the Data
xgb_random_search.fit(X_train2_matrix, y_train2)
print(xgb_random_search.score(X_test_matrix, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))



In [0]:
xgb_random_search.best_params_

In [0]:
print(classification_report(y_test, rxgb_random_search.predict(X_test_matrix)))

## Model #2

In [66]:
df2 = df.copy()
df2.head()

Unnamed: 0,Score,Text,Clean
0,5,I have bought several of the Vitality canned d...,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...,Great taffy at a great price. There was a wide...


In [0]:
#create training/test sets
X = df2.Clean
y = df2.Score

#split out training/test data 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

#split training set again to test best params with reduce execution times 
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train,y_train, test_size=0.8, random_state=0)

In [81]:
X_train2 = X_train2.str.cat()

AttributeError: ignored

In [0]:
def bag_of_words(text):
    allwords = [token.lemma_
               for token in text
               if not token.is_punct
               and not token.is_stop]
    return [item[0] for item in Counter(allwords).most_common(2000)]

def bow_features(sentences, common_words):
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:,common_words] = 0
    
    for i, sentence in enumerate(df['text_sentence']):
        words = [token.lemma_ 
                for token in sentence
                if (
                    not token.is_punct
                    and not token.is_stop
                    and token.lemma_ in common_words
                )]
        for word in words:
            df.loc[i, word] += 1
        if i%500 == 0:
            print('Processing row {}'.format(i))
    return df

In [82]:
nlp = spacy.load('en')
df_text = nlp(X_train2)

ValueError: ignored