In [1]:
import pandas
import numpy as np

In [2]:
columns = ["review_text", "score"]

training = pandas.read_csv('input/training.txt', delimiter = "\t", header=None)
training.columns = columns

## Text Pre-processing
__[Text Preprocessing in Python: Steps, Tools, and Examples](https://medium.com/@datamonsters/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908)__

In [3]:
import string
from sklearn.feature_extraction import text

import nltk
nltk.download('punkt')
nltk.download('wordnet')

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/mhssain9/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mhssain9/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def process_dataframe(dataframe, input_column, output_column):
    for input, output in zip(input_column,output_column):
        dataframe[output] = dataframe[input].astype(str).str.lower()

        dataframe[output] = dataframe[output].str.replace('\d+', '')

        dataframe[output] = dataframe[output].str.replace('[{}]'.format(string.punctuation), '')

        stop_words = text.ENGLISH_STOP_WORDS
        dataframe[output] = dataframe[output].str.replace( '\b(?:{})\b'.format('|'.join(stop_words)), '')
        dataframe[output] = dataframe[output].str.replace(r'\s+', ' ')

        stemmer= SnowballStemmer("english")
        lemmatizer=WordNetLemmatizer()

        dataframe[output] = dataframe[output].apply(word_tokenize)
        dataframe[output] = dataframe[output].apply(lambda x: [stemmer.stem(y) for y in x])
        dataframe[output] = dataframe[output].apply(lambda x: [lemmatizer.lemmatize(y) for y in x])
        dataframe[output] = dataframe[output].apply(lambda x: " ".join(x) )
    
    return dataframe

In [5]:
training = process_dataframe(training, ["review_text"], ["word_token"])

In [6]:
training.sample(5)

Unnamed: 0,review_text,score,word_token
5662,quaint place. good service. extremely authenti...,4.0,quaint place good servic extrem authent went f...
875,"The sandwich at Geno's was dry, tasteless, the...",1.0,the sandwich at geno wa dri tasteless the brea...
8013,While White Dog's brunch food earns a spot amo...,3.0,while white dog brunch food earn a spot among ...
2322,lots of pictures on the walls....blinking ligh...,3.0,lot of pictur on the wallsblink light everywhe...
1628,This place did not live up to the hype. The ch...,1.0,this place did not live up to the hype the che...


## Bag of Words

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

### PCA

In [8]:
def pca_vectorize_words(dataframe, pca_n_components=100, random_state=None):
    
    pca = PCA(n_components=pca_n_components)  
    pca.fit(dataframe.iloc[ : , : -1])
    principalComponents = pca.transform(dataframe.iloc[ : , : -1])
    
    principalComponents = pandas.concat([pandas.DataFrame(data = principalComponents), dataframe.iloc[ : , -1 ] ], axis = 1)     
    return pca, principalComponents

### Vectorization

In [9]:
def vectorize_words(dataframe, x, y, keys=None):
    vectorizer = TfidfVectorizer(vocabulary = keys)
    # tokenize and build vocab
    vectorizer.fit(dataframe[x])
    # encode document
    vectorized_word_token = vectorizer.transform(dataframe[x])
    
    vectorized_word_dataframe = pandas.DataFrame(vectorized_word_token.todense())
    vectorized_word_dataframe.columns = vectorizer.get_feature_names()
    
    vectorized_word_dataframe = pandas.concat([vectorized_word_dataframe, dataframe[[y]]], axis = 1)
    
    return vectorizer, vectorized_word_dataframe

In [10]:
vectorizer, vectorized_word_dataframe = vectorize_words(training, "word_token", "score")

In [11]:
vectorized_word_dataframe.sample(5)

Unnamed: 0,aa,aaaa,aaaaa,aaaaah,aaaah,aaaand,aaaannnnnnd,aaamaz,aachar,aam,...,各种奇怪的意式酱真的,同行的朋友都没怎么吃,总之,正宗台灣排骨飯,特别特别不适合中国人的口味,看到评价很高就来了,第一次来费城,结果,面包硬的挑战你的口腔,score
8921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
8838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [12]:
pca, principalComponents = pca_vectorize_words(vectorized_word_dataframe)

In [13]:
principalComponents.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,score
2449,-0.168439,0.269521,-0.081053,0.065131,-0.024999,0.088525,0.108261,0.091525,0.085998,-0.166116,...,0.029441,-0.030448,0.022122,-0.035076,-0.015809,0.023519,-0.036008,0.063106,-0.026739,2.0
4500,0.069201,0.024737,-0.219671,-0.102271,0.041292,-0.002687,-0.006764,0.060097,0.048161,-0.015674,...,0.043687,0.019479,0.002753,-0.014344,0.029442,0.007608,-0.011481,-0.022975,-0.010353,3.0
9547,0.036526,0.00202,0.053513,0.030996,0.048721,0.001106,-0.093268,0.088735,0.128506,0.187495,...,0.079577,0.031917,-0.048876,-0.006378,-0.013755,0.053378,-0.009473,0.003135,0.012592,2.0
8168,-0.039254,-0.010442,0.112099,-0.060277,0.000282,0.114715,0.021303,0.032858,-0.029632,-0.068834,...,-0.024968,0.09759,-0.048537,-0.00993,-0.120937,-0.035169,0.006627,-0.007681,0.02616,3.0
4570,-0.163995,-0.039754,-0.052797,0.073203,0.107733,0.056447,-0.166538,0.16929,0.150677,0.007742,...,-0.01695,0.020313,-0.01186,-0.002797,0.027172,-0.030241,-0.039262,-0.076902,0.021668,2.0


In [14]:
x = principalComponents.iloc[ : , : -1]

y = principalComponents.iloc[ : , -1 ]
y = np.ravel(y)

## Classification

### Cross-validation

In [15]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import recall_score

In [16]:
def cross_validation(classifier_dic, X, y):
    scoring = ['precision_weighted', 'recall_weighted', 'f1_weighted']
    
    scores = {}
    for key, classifier in classifier_dic.items():
        scores[key] = cross_validate(classifier, X, y, cv=5, return_train_score=True, scoring=scoring)
        print (key, " scores: ", scores[key], "\n" )
        
    return scores

### Classifier

In [17]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

In [18]:
from sklearn.model_selection import GridSearchCV
import itertools    

In [19]:
classifier_dic = {}
classifier_dic['MLPClassifier'] = MLPClassifier(max_iter=500, early_stopping=True) 
classifier_dic['LogisticRegression'] = LogisticRegression(solver='liblinear', multi_class='auto')
classifier_dic['AdaBoostClassifier'] = AdaBoostClassifier()
# SVC is very slow running 
classifier_dic['SVC'] = SVC(gamma='auto')

#### Cross-validation

In [20]:
scores = cross_validation(classifier_dic, x, y)

MLPClassifier  scores:  {'fit_time': array([6.5107131 , 4.2095983 , 3.77017808, 4.40897465, 4.01801109]), 'score_time': array([0.0804472 , 0.08920884, 0.02526283, 0.02725291, 0.04615712]), 'test_precision_weighted': array([0.44554548, 0.45809553, 0.41987488, 0.49705276, 0.49207204]), 'train_precision_weighted': array([0.56074511, 0.57519501, 0.55251248, 0.54878062, 0.55069453]), 'test_recall_weighted': array([0.46673337, 0.44872436, 0.42821411, 0.50375188, 0.49049049]), 'train_recall_weighted': array([0.56860538, 0.5811132 , 0.55934959, 0.5558474 , 0.55690345]), 'test_f1_weighted': array([0.44796995, 0.44661596, 0.42289776, 0.49750854, 0.48375914]), 'train_f1_weighted': array([0.56148228, 0.57449602, 0.55321806, 0.55082253, 0.55218195])} 

LogisticRegression  scores:  {'fit_time': array([1.05770135, 0.46532679, 0.38678551, 0.38315058, 0.36537218]), 'score_time': array([0.05392122, 0.01202846, 0.01323795, 0.01224828, 0.02742505]), 'test_precision_weighted': array([0.43884677, 0.44963584

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


SVC  scores:  {'fit_time': array([16.15431857, 16.37906146, 16.36010933, 16.90243292, 17.00251222]), 'score_time': array([7.36271763, 6.91217518, 6.91531396, 7.58973813, 6.99309373]), 'test_precision_weighted': array([0.32164435, 0.34676092, 0.30743299, 0.38909788, 0.38038973]), 'train_precision_weighted': array([0.37313021, 0.40034186, 0.36854001, 0.38325081, 0.37620066]), 'test_recall_weighted': array([0.33516758, 0.34967484, 0.36218109, 0.44722361, 0.43843844]), 'train_recall_weighted': array([0.40925578, 0.45716073, 0.44565353, 0.44490306, 0.42958979]), 'test_f1_weighted': array([0.30378562, 0.31161113, 0.32572317, 0.39600445, 0.39154416]), 'train_f1_weighted': array([0.37341174, 0.41322921, 0.39436847, 0.39260038, 0.38413996])} 



  'precision', 'predicted', average, warn_for)


#### Grid Search to find parameters to to achieve best possible results 

In [21]:
params = {}

units = []
for hidden_layers in range(1, 3):
    units = units + [x for x in itertools.product(np.arange(32, 100, 32),repeat=hidden_layers)]

params['MLPClassifier'] = dict(hidden_layer_sizes=units)
params['SVC'] = dict( kernel=['linear', 'poly', 'rbf', 'sigmoid'], C=np.arange(0.1, 2.0, 0.1) )
params['AdaBoostClassifier'] = dict( n_estimators=np.arange(20, 101, 10) )
params['LogisticRegression'] = dict( penalty=['l1', 'l2'], C=np.arange(0.1, 2.0, 0.3) )

In [22]:
grid = {}
grid_result = {}
for key, classifier in classifier_dic.items():
        if key in params:
            grid[key] = GridSearchCV(classifier, param_grid=params[key], cv=3, verbose=2)
            grid_result[key] = grid[key].fit(x, y)

            # summarize results
            print(key, " : Best: %f using %s" % (grid_result[key].best_score_, grid_result[key].best_params_))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] hidden_layer_sizes=(32,) ........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ......................... hidden_layer_sizes=(32,), total=   8.2s
[CV] hidden_layer_sizes=(32,) ........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.2s remaining:    0.0s


[CV] ......................... hidden_layer_sizes=(32,), total=   5.3s
[CV] hidden_layer_sizes=(32,) ........................................
[CV] ......................... hidden_layer_sizes=(32,), total=   5.2s
[CV] hidden_layer_sizes=(64,) ........................................
[CV] ......................... hidden_layer_sizes=(64,), total=   7.6s
[CV] hidden_layer_sizes=(64,) ........................................
[CV] ......................... hidden_layer_sizes=(64,), total=   6.1s
[CV] hidden_layer_sizes=(64,) ........................................
[CV] ......................... hidden_layer_sizes=(64,), total=   7.0s
[CV] hidden_layer_sizes=(96,) ........................................
[CV] ......................... hidden_layer_sizes=(96,), total=   4.3s
[CV] hidden_layer_sizes=(96,) ........................................
[CV] ......................... hidden_layer_sizes=(96,), total=   5.7s
[CV] hidden_layer_sizes=(96,) ........................................
[CV] .

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  7.4min finished


MLPClassifier  : Best: 0.465980 using {'hidden_layer_sizes': (32, 64)}
Fitting 3 folds for each of 14 candidates, totalling 42 fits
[CV] C=0.1, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................ C=0.1, penalty=l1, total=   0.6s
[CV] C=0.1, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] ................................ C=0.1, penalty=l1, total=   0.3s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ................................ C=0.1, penalty=l1, total=   0.2s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ................................ C=0.1, penalty=l2, total=   0.3s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ................................ C=0.1, penalty=l2, total=   0.3s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ................................ C=0.1, penalty=l2, total=   0.2s
[CV] C=0.4, penalty=l1 ...............................................
[CV] ................................ C=0.4, penalty=l1, total=   0.3s
[CV] C=0.4, penalty=l1 ...............................................
[CV] ................................ C=0.4, penalty=l1, total=   0.2s
[CV] C=0.4, penalty=l1 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  42 out of  42 | elapsed:   15.7s finished


LogisticRegression  : Best: 0.441365 using {'C': 1.9000000000000004, 'penalty': 'l2'}
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] n_estimators=20 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................................. n_estimators=20, total=   1.5s
[CV] n_estimators=20 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s


[CV] .................................. n_estimators=20, total=   1.5s
[CV] n_estimators=20 .................................................
[CV] .................................. n_estimators=20, total=   1.5s
[CV] n_estimators=30 .................................................
[CV] .................................. n_estimators=30, total=   2.3s
[CV] n_estimators=30 .................................................
[CV] .................................. n_estimators=30, total=   2.3s
[CV] n_estimators=30 .................................................
[CV] .................................. n_estimators=30, total=   2.3s
[CV] n_estimators=40 .................................................
[CV] .................................. n_estimators=40, total=   3.0s
[CV] n_estimators=40 .................................................
[CV] .................................. n_estimators=40, total=   3.0s
[CV] n_estimators=40 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  2.1min finished


AdaBoostClassifier  : Best: 0.400340 using {'n_estimators': 100}
Fitting 3 folds for each of 76 candidates, totalling 228 fits
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................. C=0.1, kernel=linear, total=   9.2s
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.6s remaining:    0.0s


[CV] ............................. C=0.1, kernel=linear, total=   9.0s
[CV] C=0.1, kernel=linear ............................................
[CV] ............................. C=0.1, kernel=linear, total=  10.0s
[CV] C=0.1, kernel=poly ..............................................
[CV] ............................... C=0.1, kernel=poly, total=  10.7s
[CV] C=0.1, kernel=poly ..............................................
[CV] ............................... C=0.1, kernel=poly, total=  10.6s
[CV] C=0.1, kernel=poly ..............................................
[CV] ............................... C=0.1, kernel=poly, total=  10.7s
[CV] C=0.1, kernel=rbf ...............................................
[CV] ................................ C=0.1, kernel=rbf, total=  14.5s
[CV] C=0.1, kernel=rbf ...............................................
[CV] ................................ C=0.1, kernel=rbf, total=  14.6s
[CV] C=0.1, kernel=rbf ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done 228 out of 228 | elapsed: 65.5min finished


SVC  : Best: 0.452872 using {'C': 1.3000000000000003, 'kernel': 'linear'}


In [23]:
for key in grid_result:
    classifier_dic[key] = grid_result[key].best_estimator_

In [24]:
classifier_dic

{'MLPClassifier': MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
        beta_2=0.999, early_stopping=True, epsilon=1e-08,
        hidden_layer_sizes=(32, 64), learning_rate='constant',
        learning_rate_init=0.001, max_iter=500, momentum=0.9,
        n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
        random_state=None, shuffle=True, solver='adam', tol=0.0001,
        validation_fraction=0.1, verbose=False, warm_start=False),
 'LogisticRegression': LogisticRegression(C=1.9000000000000004, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='auto', n_jobs=None, penalty='l2', random_state=None,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
 'AdaBoostClassifier': AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
           learning_rate=1.0, n_estimators=100, random_state=None),
 'SVC': SVC(C=1.3000000000000003, cache_size=200, class_

#### Cross-validation

In [25]:
scores = cross_validation(classifier_dic, x, y)

MLPClassifier  scores:  {'fit_time': array([30.06809831, 16.69814634, 15.23569703,  8.82147431, 16.36937571]), 'score_time': array([0.11579561, 0.05235195, 0.03613639, 0.0780952 , 0.05653238]), 'test_precision_weighted': array([0.46921924, 0.46018472, 0.46323987, 0.52323857, 0.50685332]), 'train_precision_weighted': array([0.57151267, 0.58971781, 0.56512664, 0.55975231, 0.54963768]), 'test_recall_weighted': array([0.48374187, 0.44772386, 0.45672836, 0.52226113, 0.50650651]), 'train_recall_weighted': array([0.57736085, 0.59362101, 0.56672921, 0.56010006, 0.55527764]), 'test_f1_weighted': array([0.46951662, 0.44991313, 0.45840296, 0.52241164, 0.50043507]), 'train_f1_weighted': array([0.57324981, 0.59050501, 0.56507088, 0.55942837, 0.55099614])} 

LogisticRegression  scores:  {'fit_time': array([0.54912567, 0.5648694 , 0.53032541, 0.59849215, 0.60615134]), 'score_time': array([0.03349328, 0.06306148, 0.01161861, 0.0166142 , 0.01593828]), 'test_precision_weighted': array([0.4439131 , 0.452

## Sentiment Words

In [26]:
positive_words = pandas.read_csv('input/positive-words.txt', delimiter = "\t", header=None, 
                                 encoding='utf-8', names=["positive_words"])
negative_words = pandas.read_csv('input/negative-words.txt', delimiter = "\t", header=None, 
                                 encoding='utf-8', names=["negative_words"])

In [27]:
positive_words = process_dataframe(positive_words, ["positive_words"], ["positive_word_token"])
negative_words = process_dataframe(negative_words, ["negative_words"], ["negative_word_token"])

In [28]:
positive_words.drop_duplicates(subset ="positive_word_token", keep = False, inplace = True)
negative_words.drop_duplicates(subset ="negative_word_token", keep = False, inplace = True)

In [29]:
_, positive_word_dataframe = vectorize_words(training, "word_token", "score", 
                                                                   positive_words["positive_word_token"].values )
_, negative_word_dataframe = vectorize_words(training, "word_token", "score", 
                                                                   negative_words["negative_word_token"].values )

sentiment_word_dataframe = pandas.concat([
    positive_word_dataframe.iloc[ : , :-1 ], negative_word_dataframe.iloc[ : , : ] ], axis = 1) 

In [30]:
sentiment_word_dataframe.sample(5)

Unnamed: 0,a,acclam,accommod,accomod,acumen,adequ,adjust,adulatori,advanc,adventuresom,...,wreck,wrest,wrestl,wretched,writh,wrought,yawn,zealot,zombi,score
6869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
3830,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
7562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [31]:
sentiment_word_dataframe.shape

(9994, 2694)

In [32]:
pca = PCA(n_components=10)  
pca.fit(sentiment_word_dataframe.iloc[ : , : -1], sentiment_word_dataframe.iloc[ : , -1 ])

PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [33]:
x = pca.transform(sentiment_word_dataframe.iloc[ : , : -1])

y = sentiment_word_dataframe.iloc[ : , -1 ]
y = np.ravel(y)

#### Cross-validation

In [34]:
scores = cross_validation(classifier_dic, x, y)

MLPClassifier  scores:  {'fit_time': array([13.5766046 , 11.01875997,  4.64318514,  3.7267096 , 29.18529153]), 'score_time': array([0.03940797, 0.04304552, 0.0228138 , 0.02629137, 0.02749085]), 'test_precision_weighted': array([0.31300149, 0.28546358, 0.31289541, 0.32686765, 0.33347747]), 'train_precision_weighted': array([0.32502215, 0.34898738, 0.30015794, 0.31445419, 0.34891271]), 'test_recall_weighted': array([0.31765883, 0.27113557, 0.30865433, 0.32466233, 0.34684685]), 'train_recall_weighted': array([0.33771107, 0.34158849, 0.30556598, 0.30769231, 0.35917959]), 'test_f1_weighted': array([0.29109697, 0.26989987, 0.28733855, 0.29657605, 0.31335128]), 'train_f1_weighted': array([0.3177867 , 0.32885765, 0.28479091, 0.29044408, 0.33454342])} 

LogisticRegression  scores:  {'fit_time': array([0.12286782, 0.12389851, 0.11534977, 0.1091094 , 0.11225772]), 'score_time': array([0.00390482, 0.00401235, 0.00390911, 0.01450467, 0.01443982]), 'test_precision_weighted': array([0.30363113, 0.278

## Testing

In [35]:
from sklearn.metrics import classification_report

In [36]:
testing = pandas.read_csv('input/test.txt', delimiter = "\t", header=None)
testing.columns = columns

In [37]:
testing = process_dataframe(testing, ["review_text"], ["word_token"])

In [38]:
_, testing_positive_word_dataframe = vectorize_words(training, "word_token", "score", 
                                                                   positive_words["positive_word_token"].values )
_, testing_negative_word_dataframe = vectorize_words(training, "word_token", "score", 
                                                                   negative_words["negative_word_token"].values )

testing_sentiment_word_dataframe = pandas.concat([
    testing_positive_word_dataframe.iloc[ : , :-1 ], testing_negative_word_dataframe.iloc[ : , : ] ], axis = 1) 

In [39]:
X_true = pca.transform(testing_sentiment_word_dataframe.iloc[ : , :-1 ])
y_true = testing_sentiment_word_dataframe.iloc[ : , -1 ]

In [40]:
for key, classifier in classifier_dic.items():
    classifier.fit(x, y)
    y_pred = classifier.predict(X_true)
    print(key, " : \n", classification_report(y_true, y_pred))

MLPClassifier  : 
               precision    recall  f1-score   support

         1.0       0.39      0.44      0.42      2000
         2.0       0.30      0.37      0.33      1999
         3.0       0.28      0.10      0.15      1995
         4.0       0.27      0.16      0.20      2000
         5.0       0.36      0.61      0.45      2000

   micro avg       0.34      0.34      0.34      9994
   macro avg       0.32      0.34      0.31      9994
weighted avg       0.32      0.34      0.31      9994

LogisticRegression  : 
               precision    recall  f1-score   support

         1.0       0.35      0.49      0.41      2000
         2.0       0.31      0.21      0.25      1999
         3.0       0.30      0.11      0.16      1995
         4.0       0.25      0.27      0.26      2000
         5.0       0.37      0.55      0.44      2000

   micro avg       0.32      0.32      0.32      9994
   macro avg       0.32      0.32      0.30      9994
weighted avg       0.32      0.32 

### finding the tough

In [41]:
classifier = classifier_dic["AdaBoostClassifier"]
classifier.fit(x, y)
for i in range(1,6):
    
    df = testing_sentiment_word_dataframe.loc[testing_sentiment_word_dataframe['score'] == i]
    
    X_true = pca.transform(df.iloc[ : , :-1 ])
    y_true = df.iloc[ : , -1 ]    
    
    y_pred = classifier.predict(X_true)
    print("Score = ", i, " : \n", classification_report(y_true, y_pred))

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Score =  1  : 
               precision    recall  f1-score   support

         1.0       1.00      0.52      0.69      2000
         2.0       0.00      0.00      0.00         0
         3.0       0.00      0.00      0.00         0
         4.0       0.00      0.00      0.00         0
         5.0       0.00      0.00      0.00         0

   micro avg       0.52      0.52      0.52      2000
   macro avg       0.20      0.10      0.14      2000
weighted avg       1.00      0.52      0.69      2000

Score =  2  : 
               precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         0
         2.0       1.00      0.30      0.46      1999
         3.0       0.00      0.00      0.00         0
         4.0       0.00      0.00      0.00         0
         5.0       0.00      0.00      0.00         0

   micro avg       0.30      0.30      0.30      1999
   macro avg       0.20      0.06      0.09      1999
weighted avg       1.00      0.30      0.46  

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Score =  3  : 
               precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         0
         2.0       0.00      0.00      0.00         0
         3.0       1.00      0.20      0.33      1995
         4.0       0.00      0.00      0.00         0
         5.0       0.00      0.00      0.00         0

   micro avg       0.20      0.20      0.20      1995
   macro avg       0.20      0.04      0.07      1995
weighted avg       1.00      0.20      0.33      1995

Score =  4  : 
               precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         0
         2.0       0.00      0.00      0.00         0
         3.0       0.00      0.00      0.00         0
         4.0       1.00      0.23      0.38      2000
         5.0       0.00      0.00      0.00         0

   micro avg       0.23      0.23      0.23      2000
   macro avg       0.20      0.05      0.08      2000
weighted avg       1.00      0.23      0.38  

  'recall', 'true', average, warn_for)
