In [1]:
## This file consists of the extended TF model using LR algorithm.

In [2]:
# import standard libraries
%matplotlib inline
import numpy as np
import pandas as pd
import os
import random
import re
import matplotlib.pyplot as plt
import time
import pickle

In [3]:
# import needed libraries
import nltk
import gensim
import sklearn
from sklearn.linear_model import LogisticRegression

In [4]:
# set working directory to get the data
os.chdir(os.path.join(os.getcwd(), "..", "..", "data", "preprocessed"))

In [5]:
# read the data
df_tf = pd.read_csv("2.2-sh-data-preprocessed.csv", encoding = "ISO-8859-1")

In [6]:
df_tf.head()

Unnamed: 0,emotion,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# set model name
MODEL_ALGO = "LR"

In [8]:
def run_model(name, estimator, parameters, scoring="accuracy", cv=5):
    start = time.time()
    
    print("Evaluating {} ...".format(name))
    
    clf = sklearn.model_selection.GridSearchCV(estimator=estimator,
                                       param_grid=parameters,
                                       cv=cv,
                                       scoring=scoring,
                                       n_jobs=-1)
    # fit the training data
    clf.fit(X_train, y_train)
    
    # get the best params
    print("\nBest params: {}".format(clf.best_params_))
    
    # predict on test data with best params
    y_pred = clf.predict(X_test)
    
    # get the accuracy score
    print("\nAccuracy score: {0:.3f}%".format(sklearn.metrics.accuracy_score(y_test, y_pred) * 100))
    
    # get the confusin matrix
    print("\nConfusion Matrix:\n {}".format(sklearn.metrics.confusion_matrix(y_test, y_pred)))

    FP = sklearn.metrics.confusion_matrix(y_test, y_pred)[0][1]
    FN = sklearn.metrics.confusion_matrix(y_test, y_pred)[1][0]
    TP = sklearn.metrics.confusion_matrix(y_test, y_pred)[1][1]
    TN = sklearn.metrics.confusion_matrix(y_test, y_pred)[0][0]
    
    type_1_error = FP / (FP + FN + TP + TN)
    type_2_error = FN / (FP + FN + TP + TN)

    # print details about the confusion matrix
    print("\nTP: {}".format(TP))
    print("TN: {}".format(TN))
    print("FP: {}".format(FP))
    print("FN: {}".format(FN))
    print("\nType I Error (percent): {0:.3f}%".format(type_1_error * 100))
    print("Type II Error (percent): {0:.3f}%".format(type_2_error * 100))
    
    # print the classification report
    print("\nClassification Report:\n {}".format(sklearn.metrics.classification_report(y_test, y_pred)))
    
    
    # save the pkl file for future
    print("\nSaving model pkl file...")
    dump_file = '{}.pkl'.format(name.replace(" ", "_").lower())
    sklearn.externals.joblib.dump(clf, dump_file, compress = 1)
    
    end = time.time()
    total_time = (0.1 * round(end - start)) / 6
    print("\nTotal execution time: {0:.2f} minutes".format(total_time))
    
    return y_pred

In [9]:
# parameters required for hyper-tuning the model
C = [0.00001, 0.0001, 0.001, 0.01, 1, 10, 100]
solver = ["newton-cg", "lbfgs", "sag", "saga"]
multi_class = ["ovr", "multinomial", "auto"]
max_iter = [100, 200, 400]
fit_intercept = [True, False]

In [10]:
# combination of parameters
parameters = {'C': C,
             'solver': solver,
             'multi_class': multi_class,
             'max_iter': max_iter,
             'fit_intercept': fit_intercept}

### Modelling

In [33]:
# define the target variable and independent variable
X = df_tf.loc[:, df_tf.columns != "emotion"]
y = df_tf.loc[:, df_tf.columns == "emotion"]

# X = df_tf.loc[:, ~df_tf.columns.isin(['negative_emotion', 'neutral_emotion', 'positive_emotion'])]
# y = df_tf.loc[:, df_tf.columns == "negative_emotion"]

In [34]:
# split the train and test data
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.25, random_state = 123)

In [22]:
X_train = np.array(X_train)
X_train.shape

(6416, 14)

In [14]:
y_train = y_train.values.reshape(-1,)
y_train.shape

(6416,)

In [36]:
X_test_org = X_test
X_test = np.array(X_test)
X_test.shape

(2139, 14)

In [37]:
y_test_org = y_test
y_test = y_test.values.reshape(-1,)
y_test = np.array(y_test)
y_test.shape

(2139,)

In [17]:
os.chdir(os.path.join(os.getcwd(), "..", "classifiers", "tf"))

In [18]:
y_pred = run_model(name = MODEL_ALGO + " TF",
          estimator = LogisticRegression(),
          parameters = parameters, 
          cv = 10)

Evaluating LR TF ...

Best params: {'C': 10, 'fit_intercept': True, 'max_iter': 100, 'multi_class': 'ovr', 'solver': 'newton-cg'}

Accuracy score: 60.729%

Confusion Matrix:
 [[   0  113    5]
 [   0 1255   26]
 [   0  696   44]]

TP: 1255
TN: 0
FP: 113
FN: 0

Type I Error (percent): 8.260%
Type II Error (percent): 0.000%

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       118
           1       0.61      0.98      0.75      1281
           2       0.59      0.06      0.11       740

   micro avg       0.61      0.61      0.61      2139
   macro avg       0.40      0.35      0.29      2139
weighted avg       0.57      0.61      0.49      2139


Saving model pkl file...

Total execution time: 1.07 minutes


  'precision', 'predicted', average, warn_for)


In [20]:
y_actu = pd.Series(y_test, name='Actual')
y_predc = pd.Series(y_pred, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_predc, rownames=['Actual'], colnames=['Predicted'], margins=True)
df_confusion

Predicted,1,2,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,113,5,118
1,1255,26,1281
2,696,44,740
All,2064,75,2139


In [39]:
X_test_org.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
4025,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5899,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2943,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1769,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7863,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [41]:
y_test_org.head()

Unnamed: 0,emotion
4025,1
5899,1
2943,2
1769,2
7863,2


In [49]:
y_pred[3]

1