In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import json
import re
import nltk
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
import operator
from sklearn.model_selection import train_test_split
import math
from sklearn import metrics
from gensim.models import word2vec
from sklearn.model_selection import cross_validate
%matplotlib inline

# Bag Of Words Model with Random Forest

In [2]:
BagOfWords = pickle.load(open("BagWords.pkl","rb"))

In [17]:
X_BoW = list()
y_BoW = list()

for i in range(len(BagOfWords)):
    X_BoW.append(BagOfWords[i][0])
    y_BoW.append(BagOfWords[i][1])

In [18]:
X_BoW_Train, X_BoW_Test, y_BoW_Train, y_BoW_Test = train_test_split(X_BoW,y_BoW, test_size = 0.2, random_state = 42)

In [19]:
Classifier = RandomForestClassifier()
Classifier.fit(X_BoW_Train,y_BoW_Train)
predicted = Classifier.predict(X_BoW_Test)
print("Accuracy", Classifier.score(X_BoW_Test,y_BoW_Test))
print("Metrics", metrics.classification_report(predicted,y_BoW_Test))

Accuracy 0.829130690738
Metrics              precision    recall  f1-score   support

         -1       0.65      0.84      0.73      1601
          0       0.90      0.82      0.85     11970
          1       0.78      0.85      0.81      6813

avg / total       0.84      0.83      0.83     20384



In [20]:
scoring = ['precision_macro', 'recall_macro']
scores = cross_validate(Classifier, X_BoW_Train, y_BoW_Train, scoring=scoring,cv=5,return_train_score=False)
print(scores["test_precision_macro"])
print(scores["test_recall_macro"])

[ 0.81977707  0.82642706  0.82679412  0.82052032  0.82452233]
[ 0.76644628  0.75643255  0.76486527  0.76421128  0.76181689]


### Upload new prepared testing set

In [7]:
TestBagOfWords = pickle.load(open("testing.pkl", "rb"))
X_BoW_Test = list()
y_BoW_Test = list()

for i in range(len(TestBagOfWords)):
    X_BoW_Test.append(TestBagOfWords[i][0])
    y_BoW_Test.append(TestBagOfWords[i][1])

In [8]:
Classifier = RandomForestClassifier(n_estimators = 300)
Classifier.fit(X_BoW,y_BoW)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
predicted = Classifier.predict(X_BoW_Test)
print("Accuracy", Classifier.score(X_BoW_Test,y_BoW_Test))
print("Metrics", metrics.classification_report(predicted,y_BoW_Test))

Accuracy 0.699493444577
Metrics              precision    recall  f1-score   support

         -1       0.34      0.28      0.31       621
          0       0.80      0.81      0.80      4972
          1       0.43      0.45      0.44      1119

avg / total       0.70      0.70      0.70      6712



# Word2Vec model with Random Forest

In [10]:
X_W2V = pickle.load(open("text.pkl","rb"))
y_W2V = pickle.load(open("sentiment.pkl","rb"))

In [11]:
X_W2V_Train, X_W2V_Test, y_W2V_Train, y_W2V_Test = train_test_split(X_W2V,y_W2V, test_size = 0.2, random_state = 42)

In [12]:
Classifier = RandomForestClassifier()
Classifier.fit(X_W2V_Train,y_W2V_Train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
predicted = Classifier.predict(X_W2V_Test)
print("Accuracy", Classifier.score(X_W2V_Test,y_W2V_Test))
print("Metrics", metrics.classification_report(predicted,y_W2V_Test))

Accuracy 0.794433854365
Metrics              precision    recall  f1-score   support

         -1       0.74      0.81      0.78      1932
          0       0.83      0.76      0.79      6411
          1       0.78      0.83      0.80      4772

avg / total       0.80      0.79      0.79     13115



In [14]:
X_W2V_Test = pickle.load(open("test_text.pkl","rb"))
y_W2V_Test = pickle.load(open("test_sentiment.pkl","rb"))

In [15]:
Classifier = RandomForestClassifier()
Classifier.fit(X_W2V,y_W2V)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
predicted = Classifier.predict(X_W2V_Test)
print("Accuracy", Classifier.score(X_W2V_Test,y_W2V_Test))
print("Metrics", metrics.classification_report(predicted,y_W2V_Test))

Accuracy 0.624404052443
Metrics              precision    recall  f1-score   support

         -1       0.48      0.23      0.31      1044
          0       0.68      0.80      0.74      4267
          1       0.45      0.38      0.41      1401

avg / total       0.60      0.62      0.60      6712



In [21]:
scoring = ['precision_macro', 'recall_macro']
scores = cross_validate(Classifier, X_W2V, y_W2V, scoring=scoring,cv=5,return_train_score=False)
print(scores["test_precision_macro"])
print(scores["test_recall_macro"])

[ 0.67691791  0.63672382  0.58981847  0.66582776  0.65361641]
[ 0.61957273  0.59595847  0.55562837  0.67267386  0.65083826]


#### In future work I want to tune parameters of my model. Good results was shown with Random Forest(no tuning) and Extreme Gradient Boosting(no tuning) with CatBoost