In [1]:
## This file consists of the extended TF model using DT algorithm.

In [2]:
# import standard libraries
%matplotlib inline
import numpy as np
import pandas as pd
import os
import random
import re
import matplotlib.pyplot as plt
import time
import pickle

In [3]:
# import needed libraries
import nltk
import gensim
import sklearn
from sklearn.tree import DecisionTreeClassifier

In [4]:
# set working directory to get the data
os.chdir(os.path.join(os.getcwd(), "..", "..", "data", "preprocessed"))

In [5]:
# read the data
df_tf = pd.read_csv("2.2-sh-data-preprocessed.csv", encoding = "ISO-8859-1")

In [6]:
df_tf.head()

Unnamed: 0,emotion,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# set model name
MODEL_ALGO = "DT"

In [8]:
def run_model(name, estimator, parameters, scoring="accuracy", cv=5):
    start = time.time()
    
    print("Evaluating {} ...".format(name))
    
    clf = sklearn.model_selection.GridSearchCV(estimator=estimator,
                                       param_grid=parameters,
                                       cv=cv,
                                       scoring=scoring,
                                       n_jobs=-1)
    # fit the training data
    clf.fit(X_train, y_train)
    
    # get the best params
    print("\nBest params: {}".format(clf.best_params_))
    
    # predict on test data with best params
    y_pred = clf.predict(X_test)
    
    # get the accuracy score
    print("\nAccuracy score: {0:.3f}%".format(sklearn.metrics.accuracy_score(y_test, y_pred) * 100))
    
    # get the confusin matrix
    print("\nConfusion Matrix:\n {}".format(sklearn.metrics.confusion_matrix(y_test, y_pred)))

    FP = sklearn.metrics.confusion_matrix(y_test, y_pred)[0][1]
    FN = sklearn.metrics.confusion_matrix(y_test, y_pred)[1][0]
    TP = sklearn.metrics.confusion_matrix(y_test, y_pred)[1][1]
    TN = sklearn.metrics.confusion_matrix(y_test, y_pred)[0][0]
    
    type_1_error = FP / (FP + FN + TP + TN)
    type_2_error = FN / (FP + FN + TP + TN)

    print("\nTP: {}".format(TP))
    print("TN: {}".format(TN))
    print("FP: {}".format(FP))
    print("FN: {}".format(FN))
    print("\nType I Error (percent): {0:.3f}%".format(type_1_error * 100))
    print("Type II Error (percent): {0:.3f}%".format(type_2_error * 100))
    
    # print the classification report
    print("\nClassification Report:\n {}".format(sklearn.metrics.classification_report(y_test, y_pred)))
    
    # save the pkl file for future
    print("\nSaving model pkl file...")
    dump_file = '{}.pkl'.format(name.replace(" ", "_").lower())
    sklearn.externals.joblib.dump(clf, dump_file, compress = 1)
    
    end = time.time()
    total_time = (0.1 * round(end - start)) / 6
    print("\nTotal execution time: {0:.2f} minutes".format(total_time))
    
    return y_pred

In [9]:
# parameters required for hyper-tuning the model
criterion = ["gini", "entropy"]
splitter = ["best", "random"]
max_features = ["sqrt", "log2", None]
max_depth = [5, 10, 20, None]
min_samples_split = [5, 10, 20, 40]
min_samples_leaf = [1, 2, 4, 8]

In [10]:
# combination of parameters
parameters = {'criterion': criterion,
              'splitter': splitter,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf}

### Modelling

In [11]:
# # define the target variable and independent variable
X = df_tf.loc[:, df_tf.columns != "emotion"]
y = df_tf.loc[:, df_tf.columns == "emotion"]

# X = df_tf.loc[:, ~df_tf.columns.isin(['negative_emotion', 'neutral_emotion', 'positive_emotion'])]
# y = df_tf[['negative_emotion', 'neutral_emotion', 'positive_emotion']]

In [12]:
# split the train and test data
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.25, random_state = 123)

In [13]:
X_train = np.array(X_train)
X_train.shape

(6416, 14)

In [14]:
y_train = y_train.values.reshape(-1,)
# y_train = np.array(y_train)
y_train.shape

(6416,)

In [15]:
X_test = np.array(X_test)
X_test.shape

(2139, 14)

In [16]:
y_test = y_test.values.reshape(-1,)
# y_test = np.array(y_test)
y_test.shape

(2139,)

In [17]:
os.chdir(os.path.join(os.getcwd(), "..", "classifiers", "tf"))

In [18]:
y_pred = run_model(name = MODEL_ALGO + " TF",
          estimator = DecisionTreeClassifier(),
          parameters = parameters, 
          cv = 10)

Evaluating DT TF ...

Best params: {'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 5, 'splitter': 'best'}

Accuracy score: 60.262%

Confusion Matrix:
 [[   0  114    4]
 [   0 1224   57]
 [   0  675   65]]

TP: 1224
TN: 0
FP: 114
FN: 0

Type I Error (percent): 8.520%
Type II Error (percent): 0.000%

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       118
           1       0.61      0.96      0.74      1281
           2       0.52      0.09      0.15       740

   micro avg       0.60      0.60      0.60      2139
   macro avg       0.37      0.35      0.30      2139
weighted avg       0.54      0.60      0.50      2139


Saving model pkl file...

Total execution time: 0.13 minutes


  'precision', 'predicted', average, warn_for)


In [19]:
y_actu = pd.Series(y_test, name='Actual')
y_predc = pd.Series(y_pred, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_predc, rownames=['Actual'], colnames=['Predicted'], margins=True)
df_confusion

Predicted,1,2,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,114,4,118
1,1224,57,1281
2,675,65,740
All,2013,126,2139
