In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import json
import re
import nltk
import pickle
from catboost import CatBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
import operator
from sklearn.model_selection import train_test_split
import math
from sklearn import metrics
from gensim.models import word2vec
from sklearn.model_selection import cross_validate
%matplotlib inline

# Bag Of Words Model with Gradient Boosted Tree

In [2]:
BagOfWords = pickle.load(open("BagWords.pkl","rb"))

In [3]:
X_BoW = list()
y_BoW = list()

for i in range(len(BagOfWords)):
    X_BoW.append(BagOfWords[i][0])
    y_BoW.append(BagOfWords[i][1])

In [4]:
X_BoW_Train, X_BoW_Test, y_BoW_Train, y_BoW_Test = train_test_split(X_BoW,y_BoW, test_size = 0.2, random_state = 42)

In [8]:
Classifier = OneVsRestClassifier(CatBoostClassifier()).fit(X_BoW_Train, y_BoW_Train)
predicted = Classifier.predict(X_BoW_Test)
print("Accuracy", Classifier.score(X_BoW_Test,y_BoW_Test))
print("Metrics", metrics.classification_report(predicted,y_BoW_Test))

Accuracy 0.739403453689
Metrics              precision    recall  f1-score   support

         -1       0.34      0.80      0.48       873
          0       0.87      0.72      0.79     13166
          1       0.66      0.77      0.71      6345

avg / total       0.78      0.74      0.75     20384



In [9]:
scoring = ['precision_macro', 'recall_macro']
scores = cross_validate(Classifier, X_BoW_Train, y_BoW_Train, scoring=scoring,cv=5,return_train_score=False)
print(scores["test_precision_macro"])
print(scores["test_recall_macro"])

[ 0.75162868  0.75208625  0.76499164  0.76280028  0.76024529]
[ 0.62167867  0.60269444  0.61687831  0.61760752  0.61378019]


### Upload new prepared testing set

In [10]:
TestBagOfWords = pickle.load(open("testing.pkl", "rb"))
X_BoW_Test = list()
y_BoW_Test = list()

for i in range(len(TestBagOfWords)):
    X_BoW_Test.append(TestBagOfWords[i][0])
    y_BoW_Test.append(TestBagOfWords[i][1])

In [11]:
Classifier = OneVsRestClassifier(CatBoostClassifier()).fit(X_BoW, y_BoW)

In [12]:
predicted = Classifier.predict(X_BoW_Test)
print("Accuracy", Classifier.score(X_BoW_Test,y_BoW_Test))
print("Metrics", metrics.classification_report(predicted,y_BoW_Test))

Accuracy 0.770709177592
Metrics              precision    recall  f1-score   support

         -1       0.31      0.55      0.40       287
          0       0.89      0.82      0.86      5442
          1       0.46      0.55      0.50       983

avg / total       0.80      0.77      0.78      6712



# Word2Vec model with Gradient Boosted Tree

In [13]:
X_W2V = pickle.load(open("text.pkl","rb"))
y_W2V = pickle.load(open("sentiment.pkl","rb"))

In [14]:
X_W2V_Train, X_W2V_Test, y_W2V_Train, y_W2V_Test = train_test_split(X_W2V,y_W2V, test_size = 0.2, random_state = 42)

In [43]:
Classifier = OneVsRestClassifier(CatBoostClassifier(iterations=500,depth=10,learning_rate=0.03))
Classifier.fit(X_W2V_Train,y_W2V_Train)

OneVsRestClassifier(estimator=<catboost.core.CatBoostClassifier object at 0x7f8dbd7ed358>,
          n_jobs=1)

In [44]:
predicted = Classifier.predict(X_W2V_Test)
print("Accuracy", Classifier.score(X_W2V_Test,y_W2V_Test))
print("Metrics", metrics.classification_report(predicted,y_W2V_Test))

Accuracy 0.724076281287
Metrics              precision    recall  f1-score   support

         -1       0.47      0.34      0.40       713
          0       0.78      0.85      0.81      4648
          1       0.58      0.51      0.54      1351

avg / total       0.71      0.72      0.71      6712



In [35]:
scoring = ['precision_macro', 'recall_macro']
scores = cross_validate(Classifier, X_W2V_Train, y_W2V_Train, scoring=scoring,cv=5,return_train_score=False)
print(scores["test_precision_macro"])
print(scores["test_recall_macro"])

[ 0.80563738  0.80912373  0.81587733  0.80913044  0.81102667]
[ 0.77830542  0.78646356  0.78935734  0.7803588   0.78556843]


### Upload new testing set

In [46]:
X_W2V_Test = pickle.load(open("test_text.pkl","rb"))
y_W2V_Test = pickle.load(open("test_sentiment.pkl","rb"))

In [45]:
Classifier = OneVsRestClassifier(CatBoostClassifier(iterations=500,depth=10,learning_rate=0.03))
Classifier.fit(X_W2V,y_W2V)

OneVsRestClassifier(estimator=<catboost.core.CatBoostClassifier object at 0x7f8dbd7ed7f0>,
          n_jobs=1)

In [None]:
predicted = Classifier.predict(X_W2V_Test)
print("Accuracy", Classifier.score(X_W2V_Test,y_W2V_Test))
print("Metrics", metrics.classification_report(predicted,y_W2V_Test))

Accuracy 0.712455303933
Metrics              precision    recall  f1-score   support

         -1       0.48      0.30      0.37       809
          0       0.77      0.84      0.80      4580
          1       0.58      0.51      0.54      1323

avg / total       0.70      0.71      0.70      6712



In [None]:
scoring = ['precision_macro', 'recall_macro']
scores = cross_validate(Classifier, X_W2V, y_W2V, scoring=scoring,cv=5,return_train_score=False)
print(scores["test_precision_macro"])
print(scores["test_recall_macro"])

#### In future work I want to tune parameters of my model. Good results was shown with Random Forest(no tuning) and Extreme Gradient Boosting(no tuning) with CatBoost