In [1]:
import pandas
import numpy as np

In [2]:
columns = ["review_text", "score"]

training = pandas.read_csv('input/training.txt', delimiter = "\t", header=None)
training.columns = columns

## Text Pre-processing
__[Text Preprocessing in Python: Steps, Tools, and Examples](https://medium.com/@datamonsters/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908)__

In [3]:
import string
from sklearn.feature_extraction import text

import nltk
nltk.download('punkt')
nltk.download('wordnet')

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/mhssain9/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mhssain9/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def process_dataframe(dataframe, input_column, output_column):
    for input, output in zip(input_column,output_column):
        dataframe[output] = dataframe[input].astype(str).str.lower()

        dataframe[output] = dataframe[output].str.replace('\d+', '')

        dataframe[output] = dataframe[output].str.replace('[{}]'.format(string.punctuation), '')

        stop_words = text.ENGLISH_STOP_WORDS
        dataframe[output] = dataframe[output].str.replace( '\b(?:{})\b'.format('|'.join(stop_words)), '')
        dataframe[output] = dataframe[output].str.replace(r'\s+', ' ')

        stemmer= SnowballStemmer("english")
        lemmatizer=WordNetLemmatizer()

        dataframe[output] = dataframe[output].apply(word_tokenize)
        dataframe[output] = dataframe[output].apply(lambda x: [stemmer.stem(y) for y in x])
        dataframe[output] = dataframe[output].apply(lambda x: [lemmatizer.lemmatize(y) for y in x])
        dataframe[output] = dataframe[output].apply(lambda x: " ".join(x) )
    
    return dataframe

In [5]:
training = process_dataframe(training, ["review_text"], ["word_token"])

## Bag of Words

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

### Vectorization with SVD

In [7]:
def vectorize_words(dataframe, x, keys=None):
    
    vectorizer = TfidfVectorizer(vocabulary = keys)
    # tokenize and build vocab
    vectorizer.fit(dataframe[x])
    # encode document
    vectorized_word_token = vectorizer.transform(dataframe[x])
    
    return vectorized_word_token

In [8]:
def svd_words(vectorized_word_token, pca_n_components=1000, random_state=None):
    
    svd = TruncatedSVD(n_components=pca_n_components, random_state=random_state)
    vectorized_word_token = svd.fit_transform(vectorized_word_token) 
    
    return svd, vectorized_word_token

In [9]:
x_naive = vectorize_words(training, "word_token")
_, x = svd_words(x_naive)

In [10]:
y = training["score"]
y = np.ravel(y)

## Classification

### Cross-validation

In [11]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import recall_score

In [12]:
def cross_validation(classifier_dic, X, y):
    scoring = ['precision_weighted', 'recall_weighted', 'f1_weighted']
    
    scores = {}
    for key, classifier in classifier_dic.items():
        scores[key] = cross_validate(classifier, X, y, cv=3, return_train_score=True, scoring=scoring)
        print (key, " scores: ", scores[key], "\n" )
        
    return scores

### Classifier

In [13]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [14]:
from sklearn.model_selection import GridSearchCV
import itertools    

In [15]:
classifier_dic = {}
classifier_dic['MLPClassifier'] = MLPClassifier(max_iter=500, early_stopping=True) 
classifier_dic['LogisticRegression'] = LogisticRegression(solver='liblinear', multi_class='auto')
classifier_dic['AdaBoostClassifier'] = AdaBoostClassifier()
# SVC is very slow running 
classifier_dic['SVC'] = SVC(gamma='auto')

#### Grid Search to find parameters to to achieve best possible results 

In [16]:
params = {}

units = []
for hidden_layers in range(1, 4):
    units = units + [x for x in itertools.product(np.arange(32, 257, 32),repeat=hidden_layers)]

params['MLPClassifier'] = dict(hidden_layer_sizes=units)
params['SVC'] = dict( kernel=['linear', 'poly', 'rbf', 'sigmoid'], C=np.arange(0.1, 2.0, 0.3) )
params['AdaBoostClassifier'] = dict( n_estimators=np.arange(20, 101, 10) )
params['LogisticRegression'] = dict( penalty=['l1', 'l2'], C=np.arange(0.1, 2.0, 0.3) )

In [17]:
grid = {}
grid_result = {}
for key, classifier in classifier_dic.items():
        if key in params:
            grid[key] = GridSearchCV(classifier, param_grid=params[key], cv=3, verbose=2)
            grid_result[key] = grid[key].fit(x, y)

            # summarize results
            print(key, " : Best: %f using %s" % (grid_result[key].best_score_, grid_result[key].best_params_))

Fitting 3 folds for each of 584 candidates, totalling 1752 fits
[CV] hidden_layer_sizes=(32,) ........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ......................... hidden_layer_sizes=(32,), total=  18.9s
[CV] hidden_layer_sizes=(32,) ........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   18.9s remaining:    0.0s


[CV] ......................... hidden_layer_sizes=(32,), total=  17.8s
[CV] hidden_layer_sizes=(32,) ........................................
[CV] ......................... hidden_layer_sizes=(32,), total=  24.5s
[CV] hidden_layer_sizes=(64,) ........................................
[CV] ......................... hidden_layer_sizes=(64,), total=  18.1s
[CV] hidden_layer_sizes=(64,) ........................................
[CV] ......................... hidden_layer_sizes=(64,), total=  14.3s
[CV] hidden_layer_sizes=(64,) ........................................
[CV] ......................... hidden_layer_sizes=(64,), total=  15.3s
[CV] hidden_layer_sizes=(96,) ........................................
[CV] ......................... hidden_layer_sizes=(96,), total=  19.4s
[CV] hidden_layer_sizes=(96,) ........................................
[CV] ......................... hidden_layer_sizes=(96,), total=  20.5s
[CV] hidden_layer_sizes=(96,) ........................................
[CV] .

[Parallel(n_jobs=1)]: Done 1752 out of 1752 | elapsed: 1393.7min finished


MLPClassifier  : Best: 0.451071 using {'hidden_layer_sizes': (224, 192, 160)}
Fitting 3 folds for each of 14 candidates, totalling 42 fits
[CV] C=0.1, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................ C=0.1, penalty=l1, total=   2.7s
[CV] C=0.1, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.7s remaining:    0.0s


[CV] ................................ C=0.1, penalty=l1, total=   2.3s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ................................ C=0.1, penalty=l1, total=   2.0s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ................................ C=0.1, penalty=l2, total=   2.9s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ................................ C=0.1, penalty=l2, total=   2.8s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ................................ C=0.1, penalty=l2, total=   3.4s
[CV] C=0.4, penalty=l1 ...............................................
[CV] ................................ C=0.4, penalty=l1, total=   2.8s
[CV] C=0.4, penalty=l1 ...............................................
[CV] ................................ C=0.4, penalty=l1, total=   2.6s
[CV] C=0.4, penalty=l1 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  42 out of  42 | elapsed:  3.3min finished


LogisticRegression  : Best: 0.439964 using {'C': 0.4, 'penalty': 'l2'}
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] n_estimators=20 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................................. n_estimators=20, total=  15.4s
[CV] n_estimators=20 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.5s remaining:    0.0s


[CV] .................................. n_estimators=20, total=  15.0s
[CV] n_estimators=20 .................................................
[CV] .................................. n_estimators=20, total=  15.0s
[CV] n_estimators=30 .................................................
[CV] .................................. n_estimators=30, total=  22.4s
[CV] n_estimators=30 .................................................
[CV] .................................. n_estimators=30, total=  22.2s
[CV] n_estimators=30 .................................................
[CV] .................................. n_estimators=30, total=  22.3s
[CV] n_estimators=40 .................................................
[CV] .................................. n_estimators=40, total=  29.5s
[CV] n_estimators=40 .................................................
[CV] .................................. n_estimators=40, total=  29.4s
[CV] n_estimators=40 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 20.2min finished


AdaBoostClassifier  : Best: 0.386232 using {'n_estimators': 60}
Fitting 3 folds for each of 28 candidates, totalling 84 fits
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................. C=0.1, kernel=linear, total= 1.9min
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.8min remaining:    0.0s


[CV] ............................. C=0.1, kernel=linear, total= 1.9min
[CV] C=0.1, kernel=linear ............................................
[CV] ............................. C=0.1, kernel=linear, total= 1.9min
[CV] C=0.1, kernel=poly ..............................................
[CV] ............................... C=0.1, kernel=poly, total= 2.2min
[CV] C=0.1, kernel=poly ..............................................
[CV] ............................... C=0.1, kernel=poly, total= 2.1min
[CV] C=0.1, kernel=poly ..............................................
[CV] ............................... C=0.1, kernel=poly, total= 2.1min
[CV] C=0.1, kernel=rbf ...............................................
[CV] ................................ C=0.1, kernel=rbf, total= 2.2min
[CV] C=0.1, kernel=rbf ...............................................
[CV] ................................ C=0.1, kernel=rbf, total= 2.2min
[CV] C=0.1, kernel=rbf ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed: 244.0min finished


SVC  : Best: 0.445667 using {'C': 0.4, 'kernel': 'linear'}


In [18]:
classifier_dic

{'MLPClassifier': MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
        beta_2=0.999, early_stopping=True, epsilon=1e-08,
        hidden_layer_sizes=(100,), learning_rate='constant',
        learning_rate_init=0.001, max_iter=500, momentum=0.9,
        n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
        random_state=None, shuffle=True, solver='adam', tol=0.0001,
        validation_fraction=0.1, verbose=False, warm_start=False),
 'LogisticRegression': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='auto',
           n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
           tol=0.0001, verbose=0, warm_start=False),
 'AdaBoostClassifier': AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
           learning_rate=1.0, n_estimators=50, random_state=None),
 'SVC': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
   decisi

In [19]:
for key in grid_result:
    classifier_dic[key] = grid_result[key].best_estimator_

In [20]:
classifier_dic

{'MLPClassifier': MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
        beta_2=0.999, early_stopping=True, epsilon=1e-08,
        hidden_layer_sizes=(224, 192, 160), learning_rate='constant',
        learning_rate_init=0.001, max_iter=500, momentum=0.9,
        n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
        random_state=None, shuffle=True, solver='adam', tol=0.0001,
        validation_fraction=0.1, verbose=False, warm_start=False),
 'LogisticRegression': LogisticRegression(C=0.4, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='auto',
           n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
           tol=0.0001, verbose=0, warm_start=False),
 'AdaBoostClassifier': AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
           learning_rate=1.0, n_estimators=60, random_state=None),
 'SVC': SVC(C=0.4, cache_size=200, class_weight=None, coef0=0.0,


#### Cross-validation

In [21]:
scores = cross_validation(classifier_dic, x, y)

MLPClassifier  scores:  {'fit_time': array([51.44965816, 56.59893203, 46.19132113]), 'score_time': array([0.12717366, 0.10879779, 0.17913914]), 'test_precision_weighted': array([0.42537576, 0.43951351, 0.47879742]), 'train_precision_weighted': array([0.78676397, 0.79959736, 0.68934307]), 'test_recall_weighted': array([0.4140414 , 0.43667467, 0.45539201]), 'train_recall_weighted': array([0.78306561, 0.79930952, 0.68747187]), 'test_f1_weighted': array([0.38261431, 0.43735707, 0.46259672]), 'train_f1_weighted': array([0.7803419 , 0.7984398 , 0.68738088])} 

LogisticRegression  scores:  {'fit_time': array([3.87779951, 4.54455662, 4.33399916]), 'score_time': array([0.02047086, 0.02287316, 0.01790047]), 'test_precision_weighted': array([0.42213601, 0.41271386, 0.46889787]), 'train_precision_weighted': array([0.6655802 , 0.65958325, 0.63241605]), 'test_recall_weighted': array([0.40624062, 0.43187275, 0.48182637]), 'train_recall_weighted': array([0.66791773, 0.66196337, 0.63630908]), 'test_f1_

## Sentiment Words

In [22]:
from scipy.sparse import hstack

In [23]:
positive_words = pandas.read_csv('input/positive-words.txt', delimiter = "\t", header=None, 
                                 encoding='utf-8', names=["positive_words"])
negative_words = pandas.read_csv('input/negative-words.txt', delimiter = "\t", header=None, 
                                 encoding='utf-8', names=["negative_words"])

In [24]:
positive_words = process_dataframe(positive_words, ["positive_words"], ["positive_word_token"])
negative_words = process_dataframe(negative_words, ["negative_words"], ["negative_word_token"])

In [25]:
positive_words.drop_duplicates(subset ="positive_word_token", keep = False, inplace = True)
negative_words.drop_duplicates(subset ="negative_word_token", keep = False, inplace = True)

In [26]:
positive_word_vector = vectorize_words(training, "word_token", positive_words["positive_word_token"].values )
negative_word_vector = vectorize_words(training, "word_token", negative_words["negative_word_token"].values )

In [27]:
x_train_naive = hstack((positive_word_vector, negative_word_vector))

In [28]:
svd, x_train = svd_words(x_train_naive)

#### Cross-validation

In [29]:
scores = cross_validation(classifier_dic, x_train, y)

MLPClassifier  scores:  {'fit_time': array([46.51154065, 49.24570537, 54.11338973]), 'score_time': array([0.29101944, 0.17219973, 0.10764194]), 'test_precision_weighted': array([0.37512241, 0.35848457, 0.39421927]), 'train_precision_weighted': array([0.50620237, 0.48660251, 0.57099751]), 'test_recall_weighted': array([0.37233723, 0.3622449 , 0.39681586]), 'train_recall_weighted': array([0.49947455, 0.48649054, 0.55828957]), 'test_f1_weighted': array([0.36494665, 0.34836838, 0.38622813]), 'train_f1_weighted': array([0.49740406, 0.47707109, 0.54723746])} 

LogisticRegression  scores:  {'fit_time': array([4.34997749, 4.56386471, 4.57805753]), 'score_time': array([0.02422929, 0.01689863, 0.01790404]), 'test_precision_weighted': array([0.34012933, 0.37132648, 0.38715564]), 'train_precision_weighted': array([0.50874328, 0.49847636, 0.49308033]), 'test_recall_weighted': array([0.34983498, 0.37635054, 0.39591469]), 'train_recall_weighted': array([0.50923285, 0.49759832, 0.49407352]), 'test_f1_

## Testing

In [30]:
from sklearn.metrics import classification_report

In [31]:
testing = pandas.read_csv('input/test.txt', delimiter = "\t", header=None)
testing.columns = columns

In [32]:
testing = process_dataframe(testing, ["review_text"], ["word_token"])

In [33]:
testing_positive_word_dataframe = vectorize_words(testing, "word_token", positive_words["positive_word_token"].values )
testing_negative_word_dataframe = vectorize_words(testing, "word_token", negative_words["negative_word_token"].values )

In [34]:
x_true_naive = hstack((testing_positive_word_dataframe, testing_negative_word_dataframe))

In [35]:
x_true = svd.transform(x_true_naive)

In [36]:
y_true = testing["score"]
y_true = np.ravel(y_true)

In [37]:
for key, classifier in classifier_dic.items():
    classifier.fit(x_train, y)
    y_pred = classifier.predict(x_true)
    print(key, " : \n", classification_report(y_true, y_pred))

MLPClassifier  : 
               precision    recall  f1-score   support

         1.0       0.43      0.58      0.49       200
         2.0       0.30      0.13      0.18       200
         3.0       0.27      0.39      0.32       200
         4.0       0.33      0.17      0.23       200
         5.0       0.39      0.50      0.44       200

   micro avg       0.35      0.35      0.35      1000
   macro avg       0.34      0.35      0.33      1000
weighted avg       0.34      0.35      0.33      1000

LogisticRegression  : 
               precision    recall  f1-score   support

         1.0       0.46      0.52      0.48       200
         2.0       0.33      0.29      0.31       200
         3.0       0.27      0.28      0.28       200
         4.0       0.32      0.25      0.28       200
         5.0       0.42      0.49      0.46       200

   micro avg       0.37      0.37      0.37      1000
   macro avg       0.36      0.37      0.36      1000
weighted avg       0.36      0.37 

## Special Section for Naive Bayes

In [38]:
# MultinomialNB does not work with PCA, SVD or any other matrix factorization
# Non-Negative Matrix Factorization (NMF) works but gives 0.04% accuracy
multinomialNB = MultinomialNB()

In [39]:
MultinomialNB = {"MultinomialNB": multinomialNB}

### For base case

In [40]:
scores = cross_validation(MultinomialNB, x_naive, y)

MultinomialNB  scores:  {'fit_time': array([0.01318097, 0.01035047, 0.01041579]), 'score_time': array([0.0080874 , 0.00922632, 0.0090096 ]), 'test_precision_weighted': array([0.39931921, 0.36904878, 0.48401381]), 'train_precision_weighted': array([0.77569485, 0.76233826, 0.75907781]), 'test_recall_weighted': array([0.35283528, 0.35204082, 0.44127366]), 'train_recall_weighted': array([0.7710554 , 0.74707295, 0.75363841]), 'test_f1_weighted': array([0.32224638, 0.35493944, 0.45020898]), 'train_f1_weighted': array([0.77124942, 0.7485308 , 0.75393127])} 



### including sentiment

In [41]:
scores = cross_validation(MultinomialNB, x_train_naive, y)

MultinomialNB  scores:  {'fit_time': array([0.00374031, 0.00329423, 0.0032599 ]), 'score_time': array([0.00464392, 0.00451684, 0.00445366]), 'test_precision_weighted': array([0.34205694, 0.35014896, 0.37046502]), 'train_precision_weighted': array([0.53031574, 0.5197462 , 0.51206761]), 'test_recall_weighted': array([0.34533453, 0.35744298, 0.37999399]), 'train_recall_weighted': array([0.52950008, 0.51921345, 0.51237809]), 'test_f1_weighted': array([0.33442996, 0.35164079, 0.37290812]), 'train_f1_weighted': array([0.52654192, 0.51552732, 0.507029  ])} 



In [42]:
multinomialNB.fit(x_train_naive, y)
y_pred = multinomialNB.predict(x_true_naive)
print("MultinomialNB : \n", classification_report(y_true, y_pred))

MultinomialNB : 
               precision    recall  f1-score   support

         1.0       0.41      0.52      0.45       200
         2.0       0.35      0.34      0.34       200
         3.0       0.26      0.27      0.26       200
         4.0       0.32      0.30      0.31       200
         5.0       0.40      0.32      0.36       200

   micro avg       0.35      0.35      0.35      1000
   macro avg       0.35      0.35      0.35      1000
weighted avg       0.35      0.35      0.35      1000



### finding the tough

In [43]:
classifier = classifier_dic["AdaBoostClassifier"]
classifier.fit(x_train, y)
for i in range(1,6):
    
    df = testing.loc[testing['score'] == i]
    
    testing_positive_word_dataframe = vectorize_words(df, "word_token", positive_words["positive_word_token"].values )
    testing_negative_word_dataframe = vectorize_words(df, "word_token", negative_words["negative_word_token"].values )
    
    x_true_naive = hstack((testing_positive_word_dataframe, testing_negative_word_dataframe)) 
    x_true = svd.transform(x_true_naive)
    
    y_true = df["score"]
    y_true = np.ravel(y_true)
    
    y_pred = classifier.predict(x_true)
    print("Score = ", i, " : \n", classification_report(y_true, y_pred))

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Score =  1  : 
               precision    recall  f1-score   support

         1.0       1.00      0.43      0.61       200
         2.0       0.00      0.00      0.00         0
         3.0       0.00      0.00      0.00         0
         4.0       0.00      0.00      0.00         0
         5.0       0.00      0.00      0.00         0

   micro avg       0.43      0.43      0.43       200
   macro avg       0.20      0.09      0.12       200
weighted avg       1.00      0.43      0.61       200

Score =  2  : 
               precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         0
         2.0       1.00      0.29      0.45       200
         3.0       0.00      0.00      0.00         0
         4.0       0.00      0.00      0.00         0
         5.0       0.00      0.00      0.00         0

   micro avg       0.29      0.29      0.29       200
   macro avg       0.20      0.06      0.09       200
weighted avg       1.00      0.29      0.45  

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
