In [30]:
import pandas as pd

# Load the tab-separated training file into a DataFrame and display it.
df = pd.read_csv('Project Files/train.tsv/train.tsv', sep='\t')
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [31]:
# Rows at positions 1 through 9 (positional slice).
df.iloc[1:10]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


The sample above shows that each sentence is split into phrases, and each phrase is assigned its own sentiment label.

## Checking whether every phrase is assigned a unique ID, so that PhraseId can serve as a unique identifier for records in the dataset.

In [32]:
def _total_number_phrases(df, column_name):
    phrases = df[column_name]
    return phrases, len(phrases)

# Pull out the PhraseId column and its length for the uniqueness checks below.
phrases, _n_phrases = _total_number_phrases(df, "PhraseId")

In [33]:
## Number of distinct PhraseId values
len(set(phrases))

156060

In [34]:
## Total number of PhraseId entries (duplicates, if any, included)
len(phrases)

156060

In [35]:
def _are_unique(phrases):
    if len(list(set(phrases)))==len(phrases):
        return True
    return False

# Report the outcome of the PhraseId uniqueness check.
print("All the Phrases have UNIQUE ID." if _are_unique(phrases) else "Phrase ID is not unique.")

All the Phrases have UNIQUE ID.


## Analyzing all the attributes

In [36]:
## Distinct sentiment labels present in the data
set(df["Sentiment"].tolist())

{0, 1, 2, 3, 4}

There are 5 sentiment classes in the dataset.

In [37]:
# Number of distinct sentences.
df['SentenceId'].nunique()

8529

There are 8529 unique sentences.

In [38]:
# Count distinct IDs directly: the largest ID only equals the number of
# phrases when the IDs run 1..N without gaps.
df['PhraseId'].nunique()

156060

There are 156060 unique phrases.

In [39]:
# Number of distinct phrase texts.
df['Phrase'].nunique()

156060

The phrase text is unique as well: there are 156060 distinct phrases.

In [40]:
# Missing-value count per column.
df.isnull().sum()

PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64

## Finding Embeddings of the Phrases.

SETTING UP CLASS FOR INDEPENDENT MODELS

In [41]:
from sentence_transformers import SentenceTransformer
class PreTrained_EmbeddingModels:
    """Holder for a sentence-embedding model.

    If a ready model object is passed in, it is kept as-is; otherwise a
    ``SentenceTransformer`` is constructed from ``model_url``.

    Parameters
    ----------
    model_url : str, optional
        Identifier handed to ``SentenceTransformer`` when no model is supplied.
    model_name : str, optional
        Free-form label stored on the instance; not used for loading.
    model : object, optional
        An already-constructed model. When given, nothing is loaded.
    """

    def __init__(self, model_url=None, model_name=None, model=None):
        self.model_url = model_url
        self.model_name = model_name
        self.model = model
        self._load(model_url)

    def _load(self, model_url):
        """Load the model from ``model_url`` unless one is already set."""
        # Identity test rather than `!= None`: the latter calls the model
        # object's own comparison operators, which may be overloaded
        # (element-wise, always-False, ...) and give the wrong answer.
        if self.model is not None:
            print("Model Switching is disabled.")
            return
        self.model = SentenceTransformer(model_url)

    def _get_model(self):
        """Return the wrapped model object."""
        return self.model

LOADING BERT MODEL

In [42]:
# Build the wrapper and keep only the underlying model object.
model_instance_bert = PreTrained_EmbeddingModels(
    model_url="sentence-transformers/bert-base-nli-mean-tokens",
    model_name="BERT",
)._get_model()

FINDING EMBEDDINGS

In [43]:
# Encode only the first 10000 phrases.
embeddings_bert = model_instance_bert.encode(df['Phrase'].iloc[:10000].tolist())

In [45]:
import pickle
# Serialize the embeddings, wrapped in a one-element list, to mypickle.pickle.
with open('mypickle.pickle', mode='wb') as pickle_file:
    pickle.dump([embeddings_bert], pickle_file)

In [46]:
# NOTE(review): this renders every PhraseId in the cell output; a short
# preview (e.g. phrases.head()) would keep the notebook readable.
list(phrases)

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

## Training and Testing Data

In [49]:
# Take exactly as many rows as there are embeddings, so the two can never get
# out of step if the encoded subset size changes, and work on an explicit copy
# so adding the column cannot touch (or warn about) the original frame.
Processing_DF = df.iloc[:len(embeddings_bert)].copy()
Processing_DF['Embeds'] = list(embeddings_bert)

In [50]:
from sklearn.model_selection import train_test_split
# No manual shuffle here: DataFrame.sample returns a new frame (the previous
# calls discarded it), and train_test_split already shuffles the rows
# reproducibly via random_state.
X_train, X_test, y_train, y_test = train_test_split(
    Processing_DF.drop(['Sentiment'], axis=1),
    list(Processing_DF['Sentiment']),
    test_size=0.33,
    random_state=42,
)

## Classification Models

In [51]:
class ClassificationModel:
    """Build and drive a scikit-learn classifier selected by a short name.

    Recognised ``model_name`` values (long or short form):

    * ``"Naive Bayes"`` / ``"NB"``                      -> ``GaussianNB``
    * ``"Support Vector Machine"`` / ``"SVC"``          -> ``StandardScaler`` + ``SVC`` pipeline
    * ``"Logistic Regression"`` / ``"LR"``              -> ``LogisticRegression``
    * ``"Decision Tree"`` / ``"DT"``                    -> ``DecisionTreeClassifier``
    * ``"K Nearest Neighbour"`` / ``"KNN"``             -> ``KNeighborsClassifier``
    * ``"Multi Layer Perceptron"`` / ``"ANN"``          -> ``MLPClassifier``
    * ``"Gradient Boosted Decision Tree"`` / ``"GBDT"`` -> ``GradientBoostingClassifier``

    Any other name leaves ``self.model`` as ``None``.

    Parameters
    ----------
    model_name : str, optional
        One of the names listed above.
    parameter_dict : dict, optional
        Keyword arguments forwarded to the estimator constructor (for the SVC
        pipeline, to ``SVC`` only). ``None`` means "use the estimator defaults".
    """

    def __init__(self, model_name=None, parameter_dict=None):
        self.model_name = model_name
        # Normalise a missing dict to {}: unpacking None with ** raises
        # TypeError, which made the default argument unusable.
        self.parameter_dict = {} if parameter_dict is None else parameter_dict
        self.model = self.load_model()

    def load_model(self):
        """Instantiate the estimator matching ``self.model_name``.

        Imports are done lazily so only the selected estimator's module is
        loaded. Returns the new estimator, or ``None`` for an unknown name.
        """
        # Guard again here in case parameter_dict was reset to None after construction.
        params = self.parameter_dict or {}

        if self.model_name in ["Naive Bayes", "NB"]:
            from sklearn.naive_bayes import GaussianNB
            return GaussianNB(**params)

        elif self.model_name in ["Support Vector Machine", "SVC"]:
            from sklearn.svm import SVC
            from sklearn.pipeline import make_pipeline
            from sklearn.preprocessing import StandardScaler
            # Features are standardised before being fed to the SVC.
            return make_pipeline(StandardScaler(), SVC(**params))

        elif self.model_name in ["Logistic Regression", "LR"]:
            from sklearn.linear_model import LogisticRegression
            return LogisticRegression(**params)

        elif self.model_name in ["Decision Tree", "DT"]:
            from sklearn.tree import DecisionTreeClassifier
            return DecisionTreeClassifier(**params)

        elif self.model_name in ["K Nearest Neighbour", "KNN"]:
            from sklearn.neighbors import KNeighborsClassifier
            return KNeighborsClassifier(**params)

        elif self.model_name in ["Multi Layer Perceptron", "ANN"]:
            from sklearn.neural_network import MLPClassifier
            return MLPClassifier(**params)

        elif self.model_name in ["Gradient Boosted Decision Tree", "GBDT"]:
            from sklearn.ensemble import GradientBoostingClassifier
            return GradientBoostingClassifier(**params)
        else:
            return None

    def fit(self, features, labels):
        """Train the wrapped estimator and keep the fitted object."""
        self.model = self.model.fit(features, labels)

    def predict(self, test_data):
        """Return the wrapped estimator's predictions for ``test_data``."""
        return self.model.predict(test_data)

    def get_confusion_matrix(self, actual, prediction):
        """Return ``sklearn.metrics.confusion_matrix(actual, prediction)``."""
        from sklearn.metrics import confusion_matrix
        return confusion_matrix(actual, prediction)

In [55]:
from sklearn import metrics
import numpy as np

## Naive Bayes

In [57]:
_naive_bayes = ClassificationModel(model_name="NB", parameter_dict={})
_naive_bayes.fit(np.array(X_train["Embeds"].tolist()),y_train)
_naive_bayes_predictions = _naive_bayes.predict(np.array(X_test["Embeds"].tolist()))
_naive_bayes_cm = _naive_bayes.get_confusion_matrix(y_test, _naive_bayes_predictions)
print("CONFUSION MATRIX\n")
print(_naive_bayes_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_naive_bayes_cm))+"\n")
print(metrics.classification_report(y_test, _naive_bayes_predictions, digits=3))

CONFUSION MATRIX

[[  81   29    6    3    4]
 [ 189  201   84   36   26]
 [  93  323 1173  219   55]
 [  32   66  109  223  182]
 [   4    3    5   42  112]]

1790

              precision    recall  f1-score   support

           0      0.203     0.659     0.310       123
           1      0.323     0.375     0.347       536
           2      0.852     0.630     0.724      1863
           3      0.426     0.364     0.393       612
           4      0.296     0.675     0.411       166

    accuracy                          0.542      3300
   macro avg      0.420     0.540     0.437      3300
weighted avg      0.635     0.542     0.570      3300



## Logistic Regression

In [58]:
# NOTE(review): the recorded run of this cell stopped at the solver's iteration
# limit (see the warning in the output); max_iter is left at its default here.
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_logistic_regression_cm))+"\n")
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  33   70   19    1    0]
 [  36  282  212    6    0]
 [   5  169 1554  128    7]
 [   0   27  242  297   46]
 [   0    0   16   93   57]]

2223

              precision    recall  f1-score   support

           0      0.446     0.268     0.335       123
           1      0.515     0.526     0.520       536
           2      0.761     0.834     0.796      1863
           3      0.566     0.485     0.522       612
           4      0.518     0.343     0.413       166

    accuracy                          0.674      3300
   macro avg      0.561     0.491     0.517      3300
weighted avg      0.661     0.674     0.664      3300



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [60]:
# NOTE(review): the recorded run of this cell stopped at the solver's iteration
# limit (see the warning in the output); max_iter is left at its default here.
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2','tol':0.00000001,'class_weight':'balanced'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_logistic_regression_cm))+"\n")
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  66   52    2    3    0]
 [ 119  298   84   27    8]
 [  40  289 1223  273   38]
 [   8   47  121  308  128]
 [   0    2    6   61   97]]

1992

              precision    recall  f1-score   support

           0      0.283     0.537     0.371       123
           1      0.433     0.556     0.487       536
           2      0.852     0.656     0.741      1863
           3      0.458     0.503     0.480       612
           4      0.358     0.584     0.444       166

    accuracy                          0.604      3300
   macro avg      0.477     0.567     0.505      3300
weighted avg      0.665     0.604     0.623      3300



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [61]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2','tol':0.00000001,'solver':'saga'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_logistic_regression_cm))+"\n")
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  33   69   20    1    0]
 [  38  280  208    9    1]
 [  10  156 1548  140    9]
 [   0   23  242  288   59]
 [   0    0   15   82   69]]

2218

              precision    recall  f1-score   support

           0      0.407     0.268     0.324       123
           1      0.530     0.522     0.526       536
           2      0.761     0.831     0.795      1863
           3      0.554     0.471     0.509       612
           4      0.500     0.416     0.454       166

    accuracy                          0.672      3300
   macro avg      0.551     0.502     0.521      3300
weighted avg      0.659     0.672     0.663      3300





In [62]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2','tol':0.00000001,'solver':'sag'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_logistic_regression_cm))+"\n")
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  39   63   20    1    0]
 [  49  258  218   10    1]
 [  13  145 1558  138    9]
 [   0   23  244  273   72]
 [   0    0   15   81   70]]

2198

              precision    recall  f1-score   support

           0      0.386     0.317     0.348       123
           1      0.528     0.481     0.503       536
           2      0.758     0.836     0.795      1863
           3      0.543     0.446     0.490       612
           4      0.461     0.422     0.440       166

    accuracy                          0.666      3300
   macro avg      0.535     0.500     0.515      3300
weighted avg      0.652     0.666     0.657      3300





In [63]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l1','tol':0.00000001, 'solver':'liblinear'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_logistic_regression_cm))+"\n")
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  29   76   17    1    0]
 [  27  277  222   10    0]
 [   8  139 1586  127    3]
 [   0   22  255  297   38]
 [   0    0   16   89   61]]

2250

              precision    recall  f1-score   support

           0      0.453     0.236     0.310       123
           1      0.539     0.517     0.528       536
           2      0.757     0.851     0.801      1863
           3      0.567     0.485     0.523       612
           4      0.598     0.367     0.455       166

    accuracy                          0.682      3300
   macro avg      0.583     0.491     0.523      3300
weighted avg      0.667     0.682     0.669      3300





In [64]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2','tol':0.00000001, 'solver':'newton-cg'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_logistic_regression_cm))+"\n")
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  33   68   21    1    0]
 [  45  272  207   11    1]
 [  12  159 1543  139   10]
 [   0   19  243  288   62]
 [   0    0   14   88   64]]

2200

              precision    recall  f1-score   support

           0      0.367     0.268     0.310       123
           1      0.525     0.507     0.516       536
           2      0.761     0.828     0.793      1863
           3      0.546     0.471     0.506       612
           4      0.467     0.386     0.422       166

    accuracy                          0.667      3300
   macro avg      0.533     0.492     0.509      3300
weighted avg      0.653     0.667     0.658      3300





## Support Vector Machine

In [65]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print("CONFUSION MATRIX\n")
print(_svc_classification_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_svc_classification_cm))+"\n")
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

CONFUSION MATRIX

[[  19   89   14    1    0]
 [  10  298  218   10    0]
 [   4  118 1655   86    0]
 [   0   17  267  315   13]
 [   0    1   20  117   28]]

2315

              precision    recall  f1-score   support

           0      0.576     0.154     0.244       123
           1      0.570     0.556     0.563       536
           2      0.761     0.888     0.820      1863
           3      0.595     0.515     0.552       612
           4      0.683     0.169     0.271       166

    accuracy                          0.702      3300
   macro avg      0.637     0.456     0.490      3300
weighted avg      0.689     0.702     0.679      3300



In [66]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={'kernel':'poly'})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print("CONFUSION MATRIX\n")
print(_svc_classification_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_svc_classification_cm))+"\n")
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

CONFUSION MATRIX

[[  26   82   14    1    0]
 [  21  280  228    7    0]
 [   5  118 1648   91    1]
 [   0   11  284  297   20]
 [   0    1   18  109   38]]

2289

              precision    recall  f1-score   support

           0      0.500     0.211     0.297       123
           1      0.569     0.522     0.545       536
           2      0.752     0.885     0.813      1863
           3      0.588     0.485     0.532       612
           4      0.644     0.229     0.338       166

    accuracy                          0.694      3300
   macro avg      0.611     0.467     0.505      3300
weighted avg      0.677     0.694     0.674      3300



In [68]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={'kernel':'sigmoid'})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print("CONFUSION MATRIX\n")
print(_svc_classification_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_svc_classification_cm))+"\n")
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

CONFUSION MATRIX

[[   3   78   36    6    0]
 [   7  236  268   25    0]
 [   2  171 1530  159    1]
 [   0   38  281  280   13]
 [   1    3   39  111   12]]

2061

              precision    recall  f1-score   support

           0      0.231     0.024     0.044       123
           1      0.449     0.440     0.444       536
           2      0.710     0.821     0.762      1863
           3      0.482     0.458     0.469       612
           4      0.462     0.072     0.125       166

    accuracy                          0.625      3300
   macro avg      0.467     0.363     0.369      3300
weighted avg      0.595     0.625     0.597      3300



In [67]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={'kernel':'poly','gamma':'auto'})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print("CONFUSION MATRIX\n")
print(_svc_classification_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_svc_classification_cm))+"\n")
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

CONFUSION MATRIX

[[  26   82   14    1    0]
 [  21  280  228    7    0]
 [   5  118 1648   91    1]
 [   0   11  284  297   20]
 [   0    1   18  109   38]]

2289

              precision    recall  f1-score   support

           0      0.500     0.211     0.297       123
           1      0.569     0.522     0.545       536
           2      0.752     0.885     0.813      1863
           3      0.588     0.485     0.532       612
           4      0.644     0.229     0.338       166

    accuracy                          0.694      3300
   macro avg      0.611     0.467     0.505      3300
weighted avg      0.677     0.694     0.674      3300



In [69]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={'gamma':'auto'})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print("CONFUSION MATRIX\n")
print(_svc_classification_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_svc_classification_cm))+"\n")
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

CONFUSION MATRIX

[[  19   89   14    1    0]
 [  10  298  218   10    0]
 [   4  118 1655   86    0]
 [   0   17  267  315   13]
 [   0    1   20  117   28]]

2315

              precision    recall  f1-score   support

           0      0.576     0.154     0.244       123
           1      0.570     0.556     0.563       536
           2      0.761     0.888     0.820      1863
           3      0.595     0.515     0.552       612
           4      0.683     0.169     0.271       166

    accuracy                          0.702      3300
   macro avg      0.637     0.456     0.490      3300
weighted avg      0.689     0.702     0.679      3300



## K-Nearest Neighbours

In [70]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':3})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_knn_classification_cm))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  48   61   13    1    0]
 [  88  272  165   10    1]
 [  32  175 1530  120    6]
 [  11   34  263  261   43]
 [   1    4   23   82   56]]

2167

              precision    recall  f1-score   support

           0      0.267     0.390     0.317       123
           1      0.498     0.507     0.503       536
           2      0.767     0.821     0.793      1863
           3      0.551     0.426     0.481       612
           4      0.528     0.337     0.412       166

    accuracy                          0.657      3300
   macro avg      0.522     0.497     0.501      3300
weighted avg      0.653     0.657     0.651      3300



In [71]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':7,'weights':'distance'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_knn_classification_cm))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  33   75   14    1    0]
 [  38  294  191   12    1]
 [  15  133 1579  133    3]
 [   2   19  250  295   46]
 [   0    0   19   93   54]]

2255

              precision    recall  f1-score   support

           0      0.375     0.268     0.313       123
           1      0.564     0.549     0.556       536
           2      0.769     0.848     0.806      1863
           3      0.552     0.482     0.515       612
           4      0.519     0.325     0.400       166

    accuracy                          0.683      3300
   macro avg      0.556     0.494     0.518      3300
weighted avg      0.668     0.683     0.673      3300



In [72]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':7,'weights':'distance','algorithm':'kd_tree'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_knn_classification_cm))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  33   75   14    1    0]
 [  38  294  191   12    1]
 [  15  133 1579  133    3]
 [   2   19  250  295   46]
 [   0    0   19   93   54]]

2255

              precision    recall  f1-score   support

           0      0.375     0.268     0.313       123
           1      0.564     0.549     0.556       536
           2      0.769     0.848     0.806      1863
           3      0.552     0.482     0.515       612
           4      0.519     0.325     0.400       166

    accuracy                          0.683      3300
   macro avg      0.556     0.494     0.518      3300
weighted avg      0.668     0.683     0.673      3300



In [73]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'p':1, 'n_neighbors':7,'weights':'distance','algorithm':'kd_tree'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_knn_classification_cm))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  36   72   14    1    0]
 [  42  289  191   12    2]
 [  15  142 1572  131    3]
 [   2   20  248  294   48]
 [   0    1   18   93   54]]

2245

              precision    recall  f1-score   support

           0      0.379     0.293     0.330       123
           1      0.552     0.539     0.545       536
           2      0.769     0.844     0.805      1863
           3      0.554     0.480     0.514       612
           4      0.505     0.325     0.396       166

    accuracy                          0.680      3300
   macro avg      0.552     0.496     0.518      3300
weighted avg      0.666     0.680     0.671      3300



In [74]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':7,'weights':'distance','algorithm':'ball_tree'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_knn_classification_cm))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  33   75   14    1    0]
 [  38  294  191   12    1]
 [  15  133 1579  133    3]
 [   2   19  250  295   46]
 [   0    0   19   93   54]]

2255

              precision    recall  f1-score   support

           0      0.375     0.268     0.313       123
           1      0.564     0.549     0.556       536
           2      0.769     0.848     0.806      1863
           3      0.552     0.482     0.515       612
           4      0.519     0.325     0.400       166

    accuracy                          0.683      3300
   macro avg      0.556     0.494     0.518      3300
weighted avg      0.668     0.683     0.673      3300



In [75]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'p':1, 'n_neighbors':7,'weights':'distance','algorithm':'ball_tree'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
# Correctly classified samples = sum of the confusion-matrix diagonal.
print("\n"+str(np.trace(_knn_classification_cm))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  36   72   14    1    0]
 [  42  289  191   12    2]
 [  15  142 1572  131    3]
 [   2   20  248  294   48]
 [   0    1   18   93   54]]

2245

              precision    recall  f1-score   support

           0      0.379     0.293     0.330       123
           1      0.552     0.539     0.545       536
           2      0.769     0.844     0.805      1863
           3      0.554     0.480     0.514       612
           4      0.505     0.325     0.396       166

    accuracy                          0.680      3300
   macro avg      0.552     0.496     0.518      3300
weighted avg      0.666     0.680     0.671      3300



In [76]:
# KNN experiment: p=1, 10 neighbours, distance weighting (search algorithm left unspecified).
# NOTE(review): parameter_dict is presumably forwarded to the underlying KNN estimator — confirm in ClassificationModel.
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':10,'weights':'distance','p':1})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
# Sum of the confusion-matrix diagonal (the cells where row index == column index).
# np.trace replaces the hand-rolled nested loop over every (i, j) pair.
print("\n"+str(np.trace(_knn_classification_cm))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  36   71   15    1    0]
 [  37  281  205   13    0]
 [  12  130 1596  123    2]
 [   1   21  245  302   43]
 [   1    1   16   99   49]]

2264

              precision    recall  f1-score   support

           0      0.414     0.293     0.343       123
           1      0.558     0.524     0.540       536
           2      0.768     0.857     0.810      1863
           3      0.561     0.493     0.525       612
           4      0.521     0.295     0.377       166

    accuracy                          0.686      3300
   macro avg      0.564     0.492     0.519      3300
weighted avg      0.670     0.686     0.674      3300



## Gradient Boosted Decision Tree

In [77]:
# Gradient-boosted decision tree baseline: empty parameter_dict, so nothing is overridden in this cell.
_gbdt_classification = ClassificationModel(model_name="GBDT", parameter_dict={})
_gbdt_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_gbdt_classification_predictions = _gbdt_classification.predict(np.array(X_test["Embeds"].tolist()))
_gbdt_classification_cm = _gbdt_classification.get_confusion_matrix(y_test, list(_gbdt_classification_predictions))
print("CONFUSION MATRIX\n")
print(_gbdt_classification_cm)
# Sum of the confusion-matrix diagonal (the cells where row index == column index).
# np.trace replaces the hand-rolled nested loop over every (i, j) pair.
print("\n"+str(np.trace(_gbdt_classification_cm))+"\n")
print(metrics.classification_report(y_test, _gbdt_classification_predictions, digits=3))

CONFUSION MATRIX

[[  28   71   22    2    0]
 [  22  265  225   23    1]
 [   6  114 1631  110    2]
 [   1   21  262  299   29]
 [   0    2   16  110   38]]

2261

              precision    recall  f1-score   support

           0      0.491     0.228     0.311       123
           1      0.560     0.494     0.525       536
           2      0.756     0.875     0.812      1863
           3      0.550     0.489     0.517       612
           4      0.543     0.229     0.322       166

    accuracy                          0.685      3300
   macro avg      0.580     0.463     0.497      3300
weighted avg      0.666     0.685     0.667      3300



## Decision Tree

In [78]:
# Decision-tree baseline: empty parameter_dict, so nothing is overridden in this cell.

# Turn the per-row embedding lists into 2-D arrays for fitting and prediction.
_dt_train_features = np.array(X_train["Embeds"].tolist())
_dt_test_features = np.array(X_test["Embeds"].tolist())

_dt_classification = ClassificationModel(
    model_name="DT",
    parameter_dict={},
)
_dt_classification.fit(_dt_train_features, y_train)
_dt_classification_predictions = _dt_classification.predict(_dt_test_features)
_dt_classification_cm = _dt_classification.get_confusion_matrix(
    y_test,
    list(_dt_classification_predictions),
)

# Reporting: confusion matrix first, then the per-class metrics table.
print("CONFUSION MATRIX\n")
print(_dt_classification_cm)
print("\n")
print(
    metrics.classification_report(
        y_test,
        _dt_classification_predictions,
        digits=3,
    )
)

CONFUSION MATRIX

[[  40   52   24    7    0]
 [  48  226  221   37    4]
 [  28  213 1335  247   40]
 [   3   43  258  228   80]
 [   0    6   39   78   43]]


              precision    recall  f1-score   support

           0      0.336     0.325     0.331       123
           1      0.419     0.422     0.420       536
           2      0.711     0.717     0.714      1863
           3      0.382     0.373     0.377       612
           4      0.257     0.259     0.258       166

    accuracy                          0.567      3300
   macro avg      0.421     0.419     0.420      3300
weighted avg      0.566     0.567     0.567      3300



In [79]:
# Decision-tree variant: split criterion set to 'entropy'.

# Turn the per-row embedding lists into 2-D arrays for fitting and prediction.
_dt_train_features = np.array(X_train["Embeds"].tolist())
_dt_test_features = np.array(X_test["Embeds"].tolist())

_dt_classification = ClassificationModel(
    model_name="DT",
    parameter_dict={'criterion':'entropy'},
)
_dt_classification.fit(_dt_train_features, y_train)
_dt_classification_predictions = _dt_classification.predict(_dt_test_features)
_dt_classification_cm = _dt_classification.get_confusion_matrix(
    y_test,
    list(_dt_classification_predictions),
)

# Reporting: confusion matrix first, then the per-class metrics table.
print("CONFUSION MATRIX\n")
print(_dt_classification_cm)
print("\n")
print(
    metrics.classification_report(
        y_test,
        _dt_classification_predictions,
        digits=3,
    )
)

CONFUSION MATRIX

[[  33   57   24    8    1]
 [  54  232  211   35    4]
 [  32  208 1362  241   20]
 [   8   48  231  267   58]
 [   0   10   28   86   42]]


              precision    recall  f1-score   support

           0      0.260     0.268     0.264       123
           1      0.418     0.433     0.425       536
           2      0.734     0.731     0.732      1863
           3      0.419     0.436     0.428       612
           4      0.336     0.253     0.289       166

    accuracy                          0.587      3300
   macro avg      0.433     0.424     0.428      3300
weighted avg      0.587     0.587     0.586      3300



In [80]:
# Decision-tree variant: depth capped at 5.
_dt_classification = ClassificationModel(model_name="DT", parameter_dict={'max_depth':5})
_dt_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_dt_classification_predictions = _dt_classification.predict(np.array(X_test["Embeds"].tolist()))
print("CONFUSION MATRIX\n")
_dt_classification_cm = _dt_classification.get_confusion_matrix(y_test, list(_dt_classification_predictions))
print(_dt_classification_cm)
print("\n")
# In the recorded run this shallow tree never predicts class 4 (the last
# confusion-matrix column is all zeros), so precision for that class is 0/0.
# zero_division=0 makes the reported 0.000 explicit and avoids the repeated
# undefined-metric warnings that classification_report otherwise emits.
print(metrics.classification_report(y_test, _dt_classification_predictions, digits=3, zero_division=0))

CONFUSION MATRIX

[[   8   64   48    3    0]
 [  10  208  299   19    0]
 [   6  145 1609  103    0]
 [   0   43  325  244    0]
 [   0   13   50  103    0]]


              precision    recall  f1-score   support

           0      0.333     0.065     0.109       123
           1      0.440     0.388     0.412       536
           2      0.690     0.864     0.767      1863
           3      0.517     0.399     0.450       612
           4      0.000     0.000     0.000       166

    accuracy                          0.627      3300
   macro avg      0.396     0.343     0.348      3300
weighted avg      0.569     0.627     0.588      3300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Multi-Layer Perceptron

In [81]:
# Neural-network baseline: empty parameter_dict, so nothing is overridden in this cell.

# Turn the per-row embedding lists into 2-D arrays for fitting and prediction.
_ann_train_features = np.array(X_train["Embeds"].tolist())
_ann_test_features = np.array(X_test["Embeds"].tolist())

_ann_classification = ClassificationModel(
    model_name="ANN",
    parameter_dict={},
)
_ann_classification.fit(_ann_train_features, y_train)
_ann_classification_predictions = _ann_classification.predict(_ann_test_features)
_ann_classification_cm = _ann_classification.get_confusion_matrix(
    y_test,
    list(_ann_classification_predictions),
)

# Reporting: confusion matrix first, then the per-class metrics table.
print("CONFUSION MATRIX\n")
print(_ann_classification_cm)
print("\n")
print(
    metrics.classification_report(
        y_test,
        _ann_classification_predictions,
        digits=3,
    )
)

CONFUSION MATRIX

[[  43   64   13    2    1]
 [  55  267  192   19    3]
 [  15  163 1441  238    6]
 [   2   14  217  316   63]
 [   1    0   17   96   52]]


              precision    recall  f1-score   support

           0      0.371     0.350     0.360       123
           1      0.526     0.498     0.511       536
           2      0.766     0.773     0.770      1863
           3      0.471     0.516     0.493       612
           4      0.416     0.313     0.357       166

    accuracy                          0.642      3300
   macro avg      0.510     0.490     0.498      3300
weighted avg      0.640     0.642     0.641      3300





In [82]:
# Neural-network variant: activation set to 'tanh'.

# Turn the per-row embedding lists into 2-D arrays for fitting and prediction.
_ann_train_features = np.array(X_train["Embeds"].tolist())
_ann_test_features = np.array(X_test["Embeds"].tolist())

_ann_classification = ClassificationModel(
    model_name="ANN",
    parameter_dict={'activation':'tanh'},
)
_ann_classification.fit(_ann_train_features, y_train)
_ann_classification_predictions = _ann_classification.predict(_ann_test_features)
_ann_classification_cm = _ann_classification.get_confusion_matrix(
    y_test,
    list(_ann_classification_predictions),
)

# Reporting: confusion matrix first, then the per-class metrics table.
print("CONFUSION MATRIX\n")
print(_ann_classification_cm)
print("\n")
print(
    metrics.classification_report(
        y_test,
        _ann_classification_predictions,
        digits=3,
    )
)

CONFUSION MATRIX

[[  38   67   17    1    0]
 [  40  304  173   17    2]
 [  12  169 1487  194    1]
 [   0   20  214  339   39]
 [   0    0   12  112   42]]


              precision    recall  f1-score   support

           0      0.422     0.309     0.357       123
           1      0.543     0.567     0.555       536
           2      0.781     0.798     0.790      1863
           3      0.511     0.554     0.532       612
           4      0.500     0.253     0.336       166

    accuracy                          0.670      3300
   macro avg      0.552     0.496     0.514      3300
weighted avg      0.665     0.670     0.665      3300





In [83]:
# Neural-network variant: early stopping enabled.

# Turn the per-row embedding lists into 2-D arrays for fitting and prediction.
_ann_train_features = np.array(X_train["Embeds"].tolist())
_ann_test_features = np.array(X_test["Embeds"].tolist())

_ann_classification = ClassificationModel(
    model_name="ANN",
    parameter_dict={'early_stopping':True},
)
_ann_classification.fit(_ann_train_features, y_train)
_ann_classification_predictions = _ann_classification.predict(_ann_test_features)
_ann_classification_cm = _ann_classification.get_confusion_matrix(
    y_test,
    list(_ann_classification_predictions),
)

# Reporting: confusion matrix first, then the per-class metrics table.
print("CONFUSION MATRIX\n")
print(_ann_classification_cm)
print("\n")
print(
    metrics.classification_report(
        y_test,
        _ann_classification_predictions,
        digits=3,
    )
)

CONFUSION MATRIX

[[  19   82   21    1    0]
 [  20  263  240   10    3]
 [   2  119 1645   95    2]
 [   1   21  276  281   33]
 [   0    2   18   90   56]]


              precision    recall  f1-score   support

           0      0.452     0.154     0.230       123
           1      0.540     0.491     0.514       536
           2      0.748     0.883     0.810      1863
           3      0.589     0.459     0.516       612
           4      0.596     0.337     0.431       166

    accuracy                          0.686      3300
   macro avg      0.585     0.465     0.500      3300
weighted avg      0.666     0.686     0.667      3300



In [84]:
# Neural-network variant: early stopping with an explicit validation_fraction of 0.1.
# NOTE(review): if parameter_dict is forwarded unchanged to scikit-learn's
# MLPClassifier, 0.1 appears to be that estimator's default validation_fraction,
# so this run may differ from an early_stopping-only run solely through random
# initialisation — confirm.

# Turn the per-row embedding lists into 2-D arrays for fitting and prediction.
_ann_train_features = np.array(X_train["Embeds"].tolist())
_ann_test_features = np.array(X_test["Embeds"].tolist())

_ann_classification = ClassificationModel(
    model_name="ANN",
    parameter_dict={'early_stopping':True, 'validation_fraction': 0.1},
)
_ann_classification.fit(_ann_train_features, y_train)
_ann_classification_predictions = _ann_classification.predict(_ann_test_features)
_ann_classification_cm = _ann_classification.get_confusion_matrix(
    y_test,
    list(_ann_classification_predictions),
)

# Reporting: confusion matrix first, then the per-class metrics table.
print("CONFUSION MATRIX\n")
print(_ann_classification_cm)
print("\n")
print(
    metrics.classification_report(
        y_test,
        _ann_classification_predictions,
        digits=3,
    )
)

CONFUSION MATRIX

[[  31   70   21    1    0]
 [  25  283  221    6    1]
 [   7  140 1640   76    0]
 [   1   23  310  266   12]
 [   1    3   18  115   29]]


              precision    recall  f1-score   support

           0      0.477     0.252     0.330       123
           1      0.545     0.528     0.536       536
           2      0.742     0.880     0.805      1863
           3      0.573     0.435     0.494       612
           4      0.690     0.175     0.279       166

    accuracy                          0.682      3300
   macro avg      0.606     0.454     0.489      3300
weighted avg      0.666     0.682     0.660      3300



In [85]:
# Neural-network variant: 'invscaling' learning-rate schedule, early stopping,
# validation_fraction 0.1 and shuffling turned on.
# NOTE(review): in scikit-learn's MLPClassifier, learning_rate is documented as
# applying only to the 'sgd' solver and shuffle=True appears to be the default;
# if the wrapper keeps the default solver these two settings may have no
# effect — confirm against ClassificationModel.

# Turn the per-row embedding lists into 2-D arrays for fitting and prediction.
_ann_train_features = np.array(X_train["Embeds"].tolist())
_ann_test_features = np.array(X_test["Embeds"].tolist())

_ann_classification = ClassificationModel(
    model_name="ANN",
    parameter_dict={'learning_rate':'invscaling', 'early_stopping':True, 'validation_fraction': 0.1, 'shuffle':True},
)
_ann_classification.fit(_ann_train_features, y_train)
_ann_classification_predictions = _ann_classification.predict(_ann_test_features)
_ann_classification_cm = _ann_classification.get_confusion_matrix(
    y_test,
    list(_ann_classification_predictions),
)

# Reporting: confusion matrix first, then the per-class metrics table.
print("CONFUSION MATRIX\n")
print(_ann_classification_cm)
print("\n")
print(
    metrics.classification_report(
        y_test,
        _ann_classification_predictions,
        digits=3,
    )
)

CONFUSION MATRIX

[[  25   86   11    1    0]
 [  25  330  175    6    0]
 [   4  192 1552  107    8]
 [   2   28  244  306   32]
 [   0    1   13   86   66]]


              precision    recall  f1-score   support

           0      0.446     0.203     0.279       123
           1      0.518     0.616     0.563       536
           2      0.778     0.833     0.805      1863
           3      0.605     0.500     0.547       612
           4      0.623     0.398     0.485       166

    accuracy                          0.691      3300
   macro avg      0.594     0.510     0.536      3300
weighted avg      0.683     0.691     0.682      3300

