In [2]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import xgboost as xgb
import matplotlib.pyplot as plt
from mlxtend.plotting import heatmap
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [4]:
df = pd.read_csv('sentiment_dataset.csv')
print(df.shape)
df.head()

(102148, 7)


Unnamed: 0,keyword,Comment,affin_sentiments,nltk_sentiment,affin_score,nltk_score,Real_Sentiment
0,electric,Aa Chal ke tunea8 didnemeu ejebe,neu,neu,0,0.0,neu
1,electric,Only 25999,neu,neu,0,0.0,neu
2,electric,Im getting a new 85kw put in my 2013 model s B...,neu,neu,0,0.0,pos
3,electric,The budget is 5 per video a Very Cheap budget ...,pos,pos,2,0.3612,pos
4,electric,So natural materials still come from earth,pos,pos,1,0.4201,pos


In [5]:
df = df.drop('Comment',axis = 1)
df.head()

Unnamed: 0,keyword,affin_sentiments,nltk_sentiment,affin_score,nltk_score,Real_Sentiment
0,electric,neu,neu,0,0.0,neu
1,electric,neu,neu,0,0.0,neu
2,electric,neu,neu,0,0.0,pos
3,electric,pos,pos,2,0.3612,pos
4,electric,pos,pos,1,0.4201,pos


In [6]:
df = pd.get_dummies(df, columns = ['keyword','affin_sentiments','nltk_sentiment'])
print(df.shape)
df.head()

(102148, 21)


Unnamed: 0,affin_score,nltk_score,Real_Sentiment,keyword_battery-electric truck,keyword_electric,keyword_engine,keyword_fuel cell,keyword_fuel cell truck,keyword_h2 ice,keyword_hydrogen,...,keyword_power,keyword_powertrain,keyword_truck,keyword_truck engine,affin_sentiments_neg,affin_sentiments_neu,affin_sentiments_pos,nltk_sentiment_neg,nltk_sentiment_neu,nltk_sentiment_pos
0,0,0.0,neu,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,0,0.0,neu,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,0,0.0,pos,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,2,0.3612,pos,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
4,1,0.4201,pos,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [7]:
label_dict = {'neu': 0,
              'pos': 1,
              'neg': 2}

df['ClassLabel'] = df['Real_Sentiment'].map(label_dict)
df.tail()

Unnamed: 0,affin_score,nltk_score,Real_Sentiment,keyword_battery-electric truck,keyword_electric,keyword_engine,keyword_fuel cell,keyword_fuel cell truck,keyword_h2 ice,keyword_hydrogen,...,keyword_powertrain,keyword_truck,keyword_truck engine,affin_sentiments_neg,affin_sentiments_neu,affin_sentiments_pos,nltk_sentiment_neg,nltk_sentiment_neu,nltk_sentiment_pos,ClassLabel
102143,0,0.0,neu,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
102144,2,0.2716,pos,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,1
102145,0,0.0,neg,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,2
102146,0,0.0,neu,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
102147,0,-0.21,neg,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,0,2


In [8]:
X = df.drop(['Real_Sentiment','ClassLabel'],axis = 1)
y = df['ClassLabel']

In [9]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=1, shuffle = 'True')
print('Train/Valid/Test sizes:', y_train.shape[0], y_test.shape[0])

Train/Valid/Test sizes: 71503 30645


In [47]:
RF_param_grid = [{'max_depth': [20,30,14,16, None],
               'n_estimators': [500,700,800]}]

RF_gs = GridSearchCV(estimator=RandomForestClassifier(random_state=1),
                  param_grid=RF_param_grid,
                  refit=True,
                  cv=10,
                  n_jobs=-1)

RF_gs.fit(X_train, y_train)

print('Best Accuracy: %.2f%%' % (RF_gs.best_score_*100))
print('Best Params:', RF_gs.best_params_)

Best Accuracy: 70.20%
Best Params: {'max_depth': 14, 'n_estimators': 800}


In [51]:
XG_param_grid = [{'max_depth': [6,8,12,14,20,None],
               'objective': ['reg:squarederror'],
               'learning_nrate': [0.1,0.2,0.3],
               'n_estimators': [200,300,500]}]

XG_gs = GridSearchCV(estimator=xgb.XGBClassifier(random_state=1),
                  param_grid=XG_param_grid,
                  refit=True,
                  cv=10,
                  n_jobs=-1)

XG_gs.fit(X_train, y_train)

print('Best Accuracy: %.2f%%' % (XG_gs.best_score_*100))
print('Best Params:', XG_gs.best_params_)

Best Accuracy: 70.70%
Best Params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 500, 'objective': 'reg:squarederror'}


In [58]:
param_grid = [{'max_depth': [2, 4, 6,12,20,None],
               'criterion': ['gini', 'entropy']}]

gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=1),
                  param_grid=param_grid,
                  refit=True,
                  cv=10,
                  n_jobs=None)

gs.fit(X_train, y_train)

print('Best Accuracy: %.2f%%' % (gs.best_score_*100))
print('Best Params:', gs.best_params_)


Best Accuracy: 69.65%
Best Params: {'criterion': 'entropy', 'max_depth': 12}


In [12]:
RF_gs = RandomForestClassifier(max_depth = 14, n_estimators = 800, random_state = 1)
RF_gs.fit(X_train, y_train)
y_pred = RF_gs.predict(X_test)
accuracy_score(y_pred,y_test)

0.7019742209169522

In [14]:
XG_gs = xgb.XGBClassifier(random_state=1, learning_rate = 0.1, max_depth = 6, n_estimators = 500, objective = 'reg:squarederror')
XG_gs.fit(X_train, y_train)
y_pred = XG_gs.predict(X_test)
accuracy_score(y_pred,y_test)

0.7058247674987763

In [15]:
Tree_gs = DecisionTreeClassifier(random_state=1, max_depth = 12, criterion = 'entropy')
Tree_gs.fit(X_train, y_train)
y_pred = Tree_gs.predict(X_test)
accuracy_score(y_pred,y_test)

0.6967205090553108