In [1]:
import pandas as pd

# Read TSV file into DataFrame
df = pd.read_table('Project Files/train.tsv/train.tsv')
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [2]:
df[1:10]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


We can see that the sentences are divided into phrases which are assigned sentiments in the above data sample.

## To chech if all  the phrases are assigned a unique ID, making them all of them unique identifier for records in the dataset.

In [3]:
def _total_number_phrases(df, column_name):
    phrases = df[column_name]
    return phrases, len(phrases)

phrases, _n_phrases = _total_number_phrases(df, "PhraseId")

In [4]:
## Checking the length of list of unique phrase set
len(list(set(phrases)))

156060

In [5]:
## Checking the length of list of all phrases
len(phrases)

156060

In [6]:
def _are_unique(phrases):
    if len(list(set(phrases)))==len(phrases):
        return True
    return False

if _are_unique(phrases):
    print("All the Phrases have UNIQUE ID.")
else:
    print("Phrase ID is not unique.")

All the Phrases have UNIQUE ID.


## Analyzing all the attributes

In [18]:
## Sentiments
set(df["Sentiment"])

{0, 1, 2, 3, 4}

There are 5 sentiments classes in the dataset.

In [8]:
len(set(df['SentenceId']))

8529

There are unique 8529 sentences.

In [9]:
max(set(df['PhraseId']))

156060

There are 156060 unique phrases.

In [10]:
len(set(df['Phrase']))

156060

The phrase content is unique as well and is 156060 in number.

In [11]:
df.isna().sum()

PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64

## Finding Embeddings of the Phrases.

SETTING UP CLASS FOR INDEPENDENT MODELS

In [12]:
from sentence_transformers import SentenceTransformer
class PreTrained_EmbeddingModels:
    def __init__(self, model_url=None, model_name=None, model=None):
        self.model_url = model_url
        self.model_name = model_name
        self.model = model
        self._load(model_url)
        
    def _load(self, model_url):
        if self.model != None:
            print("Model Switching is disabled.")
            return
        self.model = SentenceTransformer(model_url)
        
    def _get_model(self):
        return self.model

LOADING BERT MODEL

model_instance_bert = PreTrained_EmbeddingModels(model_url="sentence-transformers/all-mpnet-base-v2", model_name="MPNET")._get_model()

FINDING EMBEDDINGS

In [14]:
embeddings_bert = model_instance_bert.encode(df['Phrase'].tolist()[:10000])

In [17]:
1+1

2

In [15]:
import pickle
with open('mypickle_03.pickle', 'wb') as f:
    pickle.dump([embeddings_bert], f)

In [None]:
list(phrases)

## Training and Testing Data

In [19]:
Processing_DF = pd.DataFrame(df.iloc[:10000,:])
Processing_DF['Embeds'] = list(embeddings_bert)

In [20]:
from sklearn.model_selection import train_test_split
Processing_DF.sample(frac=1)
Processing_DF.sample(frac=1)
Processing_DF.sample(frac=1)
X_train, X_test, y_train, y_test = train_test_split(Processing_DF.drop(['Sentiment'],axis=1), list(Processing_DF['Sentiment']), test_size=0.33, random_state=42)

## Classification Models

In [21]:
class ClassificationModel:
    def __init__(self, model_name=None, parameter_dict=None):
        self.model_name = model_name
        self.parameter_dict = parameter_dict
        self.model = self.load_model()
        
    def load_model(self):            
        if self.model_name in ["Naive Bayes", "NB"]:
            from sklearn.naive_bayes import GaussianNB
            return GaussianNB(**self.parameter_dict)

        elif self.model_name in ["Support Vector Machine", "SVC"]:
            from sklearn.svm import SVC
            from sklearn.pipeline import make_pipeline
            from sklearn.preprocessing import StandardScaler
            return make_pipeline(StandardScaler(), SVC(**self.parameter_dict))

        elif self.model_name in ["Logistic Regression", "LR"]:
            from sklearn.linear_model import LogisticRegression
            return LogisticRegression(**self.parameter_dict)

        elif self.model_name in ["Decision Tree", "DT"]:
            from sklearn.tree import DecisionTreeClassifier
            return DecisionTreeClassifier(**self.parameter_dict)

        elif self.model_name in ["K Nearest Neighbour", "KNN"]:
            from sklearn.neighbors import KNeighborsClassifier
            return KNeighborsClassifier(**self.parameter_dict)

        elif self.model_name in ["Multi Layer Perceptron", "ANN"]:
            from sklearn.neural_network import MLPClassifier
            return MLPClassifier(**self.parameter_dict)

        elif self.model_name in ["Gradient Boosted Decision Tree", "GBDT"]:
            from sklearn.ensemble import GradientBoostingClassifier
            return GradientBoostingClassifier(**self.parameter_dict)
        else:
            return None
        
    def fit(self, features, labels):
        self.model = self.model.fit(features, labels)
        
    def predict(self, test_data):
        return self.model.predict(test_data)
    
    def get_confusion_matrix(self, actual, prediction):
        from sklearn.metrics import confusion_matrix
        return confusion_matrix(actual,prediction)

In [25]:
from sklearn import metrics
import numpy as np

## Naive Bayes

In [26]:
_naive_bayes = ClassificationModel(model_name="NB", parameter_dict={})
_naive_bayes.fit(np.array(X_train["Embeds"].tolist()),y_train)
_naive_bayes_predictions = _naive_bayes.predict(np.array(X_test["Embeds"].tolist()))
_naive_bayes_cm = _naive_bayes.get_confusion_matrix(y_test, _naive_bayes_predictions)
print("CONFUSION MATRIX\n")
print(_naive_bayes_cm)
print("\n"+str(sum([_naive_bayes_cm[i][j] for i in range(len(_naive_bayes_cm)) for j in range(len(_naive_bayes_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _naive_bayes_predictions, digits=3))

CONFUSION MATRIX

[[  57   45   14    4    3]
 [ 114  210  156   31   25]
 [  46  193 1330  164  130]
 [  22   53  167  228  142]
 [   1    3   12   55   95]]

1920

              precision    recall  f1-score   support

           0      0.237     0.463     0.314       123
           1      0.417     0.392     0.404       536
           2      0.792     0.714     0.751      1863
           3      0.473     0.373     0.417       612
           4      0.241     0.572     0.339       166

    accuracy                          0.582      3300
   macro avg      0.432     0.503     0.445      3300
weighted avg      0.624     0.582     0.596      3300



## Logistic Regression

In [27]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
print("\n"+str(sum([_logistic_regression_cm[i][j] for i in range(len(_logistic_regression_cm)) for j in range(len(_logistic_regression_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  12   83   24    4    0]
 [   9  254  252   21    0]
 [   4   98 1677   82    2]
 [   1   25  287  283   16]
 [   0    1   26  109   30]]

2256

              precision    recall  f1-score   support

           0      0.462     0.098     0.161       123
           1      0.551     0.474     0.510       536
           2      0.740     0.900     0.812      1863
           3      0.567     0.462     0.509       612
           4      0.625     0.181     0.280       166

    accuracy                          0.684      3300
   macro avg      0.589     0.423     0.455      3300
weighted avg      0.661     0.684     0.656      3300



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2','tol':0.00000001,'class_weight':'balanced'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
print("\n"+str(sum([_logistic_regression_cm[i][j] for i in range(len(_logistic_regression_cm)) for j in range(len(_logistic_regression_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  69   43    4    4    3]
 [ 136  269   85   33   13]
 [  57  263 1253  253   37]
 [  17   55  116  301  123]
 [   0    3    3   39  121]]

2013

              precision    recall  f1-score   support

           0      0.247     0.561     0.343       123
           1      0.425     0.502     0.460       536
           2      0.858     0.673     0.754      1863
           3      0.478     0.492     0.485       612
           4      0.407     0.729     0.523       166

    accuracy                          0.610      3300
   macro avg      0.483     0.591     0.513      3300
weighted avg      0.672     0.610     0.629      3300



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2','tol':0.00000001,'solver':'saga'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
print("\n"+str(sum([_logistic_regression_cm[i][j] for i in range(len(_logistic_regression_cm)) for j in range(len(_logistic_regression_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  12   83   25    3    0]
 [   9  254  252   21    0]
 [   4   98 1677   82    2]
 [   1   25  287  283   16]
 [   0    1   26  110   29]]

2255

              precision    recall  f1-score   support

           0      0.462     0.098     0.161       123
           1      0.551     0.474     0.510       536
           2      0.740     0.900     0.812      1863
           3      0.567     0.462     0.509       612
           4      0.617     0.175     0.272       166

    accuracy                          0.683      3300
   macro avg      0.587     0.422     0.453      3300
weighted avg      0.661     0.683     0.655      3300





In [30]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2','tol':0.00000001,'solver':'sag'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
print("\n"+str(sum([_logistic_regression_cm[i][j] for i in range(len(_logistic_regression_cm)) for j in range(len(_logistic_regression_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  12   83   25    3    0]
 [   9  254  252   21    0]
 [   4   98 1677   82    2]
 [   1   25  287  283   16]
 [   0    1   26  110   29]]

2255

              precision    recall  f1-score   support

           0      0.462     0.098     0.161       123
           1      0.551     0.474     0.510       536
           2      0.740     0.900     0.812      1863
           3      0.567     0.462     0.509       612
           4      0.617     0.175     0.272       166

    accuracy                          0.683      3300
   macro avg      0.587     0.422     0.453      3300
weighted avg      0.661     0.683     0.655      3300





In [31]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l1','tol':0.00000001, 'solver':'liblinear'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
print("\n"+str(sum([_logistic_regression_cm[i][j] for i in range(len(_logistic_regression_cm)) for j in range(len(_logistic_regression_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[   5   86   30    2    0]
 [   3  235  276   22    0]
 [   1   81 1711   68    2]
 [   0   29  330  248    5]
 [   0    1   35  114   16]]

2215

              precision    recall  f1-score   support

           0      0.556     0.041     0.076       123
           1      0.544     0.438     0.486       536
           2      0.718     0.918     0.806      1863
           3      0.546     0.405     0.465       612
           4      0.696     0.096     0.169       166

    accuracy                          0.671      3300
   macro avg      0.612     0.380     0.400      3300
weighted avg      0.651     0.671     0.632      3300



In [32]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2','tol':0.00000001, 'solver':'newton-cg'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
print("\n"+str(sum([_logistic_regression_cm[i][j] for i in range(len(_logistic_regression_cm)) for j in range(len(_logistic_regression_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  12   83   25    3    0]
 [   9  254  252   21    0]
 [   4   98 1677   82    2]
 [   1   25  287  283   16]
 [   0    1   26  110   29]]

2255

              precision    recall  f1-score   support

           0      0.462     0.098     0.161       123
           1      0.551     0.474     0.510       536
           2      0.740     0.900     0.812      1863
           3      0.567     0.462     0.509       612
           4      0.617     0.175     0.272       166

    accuracy                          0.683      3300
   macro avg      0.587     0.422     0.453      3300
weighted avg      0.661     0.683     0.655      3300





## Support Vector Classifier

In [33]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print("CONFUSION MATRIX\n")
print(_svc_classification_cm)
print("\n"+str(sum([_svc_classification_cm[i][j] for i in range(len(_svc_classification_cm)) for j in range(len(_svc_classification_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

CONFUSION MATRIX

[[  20   84   19    0    0]
 [  13  268  246    9    0]
 [   5   92 1697   68    1]
 [   0   17  292  287   16]
 [   0    0   18  112   36]]

2308

              precision    recall  f1-score   support

           0      0.526     0.163     0.248       123
           1      0.581     0.500     0.538       536
           2      0.747     0.911     0.821      1863
           3      0.603     0.469     0.528       612
           4      0.679     0.217     0.329       166

    accuracy                          0.699      3300
   macro avg      0.627     0.452     0.493      3300
weighted avg      0.682     0.699     0.674      3300



In [34]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={'kernel':'poly'})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print("CONFUSION MATRIX\n")
print(_svc_classification_cm)
print("\n"+str(sum([_svc_classification_cm[i][j] for i in range(len(_svc_classification_cm)) for j in range(len(_svc_classification_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

CONFUSION MATRIX

[[  20   72   31    0    0]
 [  13  205  312    6    0]
 [   7   55 1733   67    1]
 [   0   13  358  225   16]
 [   0    0   40   94   32]]

2215

              precision    recall  f1-score   support

           0      0.500     0.163     0.245       123
           1      0.594     0.382     0.465       536
           2      0.700     0.930     0.799      1863
           3      0.574     0.368     0.448       612
           4      0.653     0.193     0.298       166

    accuracy                          0.671      3300
   macro avg      0.604     0.407     0.451      3300
weighted avg      0.650     0.671     0.634      3300



In [36]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={'kernel':'sigmoid'})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print("CONFUSION MATRIX\n")
print(_svc_classification_cm)
print("\n"+str(sum([_svc_classification_cm[i][j] for i in range(len(_svc_classification_cm)) for j in range(len(_svc_classification_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

CONFUSION MATRIX

[[   5   90   26    2    0]
 [   6  257  258   15    0]
 [   1  112 1650  100    0]
 [   0   36  286  279   11]
 [   0    2   20  124   20]]

2211

              precision    recall  f1-score   support

           0      0.417     0.041     0.074       123
           1      0.517     0.479     0.498       536
           2      0.737     0.886     0.804      1863
           3      0.537     0.456     0.493       612
           4      0.645     0.120     0.203       166

    accuracy                          0.670      3300
   macro avg      0.570     0.396     0.414      3300
weighted avg      0.647     0.670     0.639      3300



In [35]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={'kernel':'poly','gamma':'auto'})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print("CONFUSION MATRIX\n")
print(_svc_classification_cm)
print("\n"+str(sum([_svc_classification_cm[i][j] for i in range(len(_svc_classification_cm)) for j in range(len(_svc_classification_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

CONFUSION MATRIX

[[  19   73   31    0    0]
 [  12  203  315    6    0]
 [   7   55 1733   67    1]
 [   0   13  360  224   15]
 [   0    0   40   95   31]]

2210

              precision    recall  f1-score   support

           0      0.500     0.154     0.236       123
           1      0.590     0.379     0.461       536
           2      0.699     0.930     0.798      1863
           3      0.571     0.366     0.446       612
           4      0.660     0.187     0.291       166

    accuracy                          0.670      3300
   macro avg      0.604     0.403     0.447      3300
weighted avg      0.648     0.670     0.632      3300



In [37]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={'gamma':'auto'})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print("CONFUSION MATRIX\n")
print(_svc_classification_cm)
print("\n"+str(sum([_svc_classification_cm[i][j] for i in range(len(_svc_classification_cm)) for j in range(len(_svc_classification_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

CONFUSION MATRIX

[[  20   84   19    0    0]
 [  13  268  246    9    0]
 [   5   92 1697   68    1]
 [   0   17  292  287   16]
 [   0    0   18  112   36]]

2308

              precision    recall  f1-score   support

           0      0.526     0.163     0.248       123
           1      0.581     0.500     0.538       536
           2      0.747     0.911     0.821      1863
           3      0.603     0.469     0.528       612
           4      0.679     0.217     0.329       166

    accuracy                          0.699      3300
   macro avg      0.627     0.452     0.493      3300
weighted avg      0.682     0.699     0.674      3300



## K-Nearest Neighbours

In [38]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':3})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
print("\n"+str(sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  47   61   14    1    0]
 [  91  241  181   22    1]
 [  35  203 1449  167    9]
 [  11   58  276  228   39]
 [   3    6   42   64   51]]

2016

              precision    recall  f1-score   support

           0      0.251     0.382     0.303       123
           1      0.424     0.450     0.436       536
           2      0.739     0.778     0.758      1863
           3      0.473     0.373     0.417       612
           4      0.510     0.307     0.383       166

    accuracy                          0.611      3300
   macro avg      0.479     0.458     0.459      3300
weighted avg      0.608     0.611     0.606      3300



In [39]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':7,'weights':'distance'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
print("\n"+str(sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  35   63   24    1    0]
 [  39  236  235   26    0]
 [  13  128 1567  144   11]
 [   1   28  272  277   34]
 [   1    0   23   88   54]]

2169

              precision    recall  f1-score   support

           0      0.393     0.285     0.330       123
           1      0.519     0.440     0.476       536
           2      0.739     0.841     0.787      1863
           3      0.517     0.453     0.483       612
           4      0.545     0.325     0.408       166

    accuracy                          0.657      3300
   macro avg      0.543     0.469     0.497      3300
weighted avg      0.639     0.657     0.644      3300



In [40]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':7,'weights':'distance','algorithm':'kd_tree'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
print("\n"+str(sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  35   63   24    1    0]
 [  39  236  235   26    0]
 [  13  128 1567  144   11]
 [   1   28  272  277   34]
 [   1    0   23   88   54]]

2169

              precision    recall  f1-score   support

           0      0.393     0.285     0.330       123
           1      0.519     0.440     0.476       536
           2      0.739     0.841     0.787      1863
           3      0.517     0.453     0.483       612
           4      0.545     0.325     0.408       166

    accuracy                          0.657      3300
   macro avg      0.543     0.469     0.497      3300
weighted avg      0.639     0.657     0.644      3300



In [41]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'p':1, 'n_neighbors':7,'weights':'distance','algorithm':'kd_tree'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
print("\n"+str(sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  34   65   23    1    0]
 [  40  230  241   25    0]
 [  13  123 1574  141   12]
 [   2   32  272  270   36]
 [   1    0   25   87   53]]

2161

              precision    recall  f1-score   support

           0      0.378     0.276     0.319       123
           1      0.511     0.429     0.467       536
           2      0.737     0.845     0.787      1863
           3      0.515     0.441     0.475       612
           4      0.525     0.319     0.397       166

    accuracy                          0.655      3300
   macro avg      0.533     0.462     0.489      3300
weighted avg      0.635     0.655     0.640      3300



In [42]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':7,'weights':'distance','algorithm':'ball_tree'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
print("\n"+str(sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  35   63   24    1    0]
 [  39  236  235   26    0]
 [  13  128 1567  144   11]
 [   1   28  272  277   34]
 [   1    0   23   88   54]]

2169

              precision    recall  f1-score   support

           0      0.393     0.285     0.330       123
           1      0.519     0.440     0.476       536
           2      0.739     0.841     0.787      1863
           3      0.517     0.453     0.483       612
           4      0.545     0.325     0.408       166

    accuracy                          0.657      3300
   macro avg      0.543     0.469     0.497      3300
weighted avg      0.639     0.657     0.644      3300



In [43]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'p':1, 'n_neighbors':7,'weights':'distance','algorithm':'ball_tree'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
print("\n"+str(sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  34   65   23    1    0]
 [  40  230  241   25    0]
 [  13  123 1574  141   12]
 [   2   32  272  270   36]
 [   1    0   25   87   53]]

2161

              precision    recall  f1-score   support

           0      0.378     0.276     0.319       123
           1      0.511     0.429     0.467       536
           2      0.737     0.845     0.787      1863
           3      0.515     0.441     0.475       612
           4      0.525     0.319     0.397       166

    accuracy                          0.655      3300
   macro avg      0.533     0.462     0.489      3300
weighted avg      0.635     0.655     0.640      3300



In [44]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':10,'weights':'distance','p':1})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
print("\n"+str(sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  29   68   25    1    0]
 [  40  224  247   25    0]
 [  10  107 1621  117    8]
 [   0   24  276  282   30]
 [   0    0   24   86   56]]

2212

              precision    recall  f1-score   support

           0      0.367     0.236     0.287       123
           1      0.530     0.418     0.467       536
           2      0.739     0.870     0.799      1863
           3      0.552     0.461     0.502       612
           4      0.596     0.337     0.431       166

    accuracy                          0.670      3300
   macro avg      0.557     0.464     0.497      3300
weighted avg      0.649     0.670     0.653      3300



## Gradient Boosted Decision Tree

In [45]:
_gbdt_classification = ClassificationModel(model_name="GBDT", parameter_dict={})
_gbdt_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_gbdt_classification_predictions = _gbdt_classification.predict(np.array(X_test["Embeds"].tolist()))
_gbdt_classification_cm = _gbdt_classification.get_confusion_matrix(y_test, list(_gbdt_classification_predictions))
print("CONFUSION MATRIX\n")
print(_gbdt_classification_cm)
print("\n"+str(sum([_gbdt_classification_cm[i][j] for i in range(len(_gbdt_classification_cm)) for j in range(len(_gbdt_classification_cm[i]))  if i==j]))+"\n")
print(metrics.classification_report(y_test, _gbdt_classification_predictions, digits=3))

CONFUSION MATRIX

[[  16   69   37    1    0]
 [   9  208  300   19    0]
 [   2   83 1707   64    7]
 [   1   22  347  222   20]
 [   0    3   45   89   29]]

2182

              precision    recall  f1-score   support

           0      0.571     0.130     0.212       123
           1      0.540     0.388     0.452       536
           2      0.701     0.916     0.794      1863
           3      0.562     0.363     0.441       612
           4      0.518     0.175     0.261       166

    accuracy                          0.661      3300
   macro avg      0.578     0.394     0.432      3300
weighted avg      0.635     0.661     0.625      3300



## Decision Tree

In [46]:
_dt_classification = ClassificationModel(model_name="DT", parameter_dict={})
_dt_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_dt_classification_predictions = _dt_classification.predict(np.array(X_test["Embeds"].tolist()))
print("CONFUSION MATRIX\n")
_dt_classification_cm = _dt_classification.get_confusion_matrix(y_test, list(_dt_classification_predictions))
print(_dt_classification_cm)
print("\n")
print(metrics.classification_report(y_test, _dt_classification_predictions, digits=3))

CONFUSION MATRIX

[[  17   42   46   14    4]
 [  46  182  234   65    9]
 [  63  202 1216  313   69]
 [  20   62  276  209   45]
 [   5   15   57   57   32]]


              precision    recall  f1-score   support

           0      0.113     0.138     0.124       123
           1      0.362     0.340     0.350       536
           2      0.665     0.653     0.659      1863
           3      0.318     0.342     0.329       612
           4      0.201     0.193     0.197       166

    accuracy                          0.502      3300
   macro avg      0.332     0.333     0.332      3300
weighted avg      0.507     0.502     0.504      3300



In [47]:
_dt_classification = ClassificationModel(model_name="DT", parameter_dict={'criterion':'entropy'})
_dt_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_dt_classification_predictions = _dt_classification.predict(np.array(X_test["Embeds"].tolist()))
print("CONFUSION MATRIX\n")
_dt_classification_cm = _dt_classification.get_confusion_matrix(y_test, list(_dt_classification_predictions))
print(_dt_classification_cm)
print("\n")
print(metrics.classification_report(y_test, _dt_classification_predictions, digits=3))

CONFUSION MATRIX

[[  20   39   44   14    6]
 [  50  161  255   59   11]
 [  37  236 1239  299   52]
 [  17   75  276  196   48]
 [   4    8   49   69   36]]


              precision    recall  f1-score   support

           0      0.156     0.163     0.159       123
           1      0.310     0.300     0.305       536
           2      0.665     0.665     0.665      1863
           3      0.308     0.320     0.314       612
           4      0.235     0.217     0.226       166

    accuracy                          0.501      3300
   macro avg      0.335     0.333     0.334      3300
weighted avg      0.501     0.501     0.501      3300



In [48]:
_dt_classification = ClassificationModel(model_name="DT", parameter_dict={'max_depth':5})
_dt_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_dt_classification_predictions = _dt_classification.predict(np.array(X_test["Embeds"].tolist()))
print("CONFUSION MATRIX\n")
_dt_classification_cm = _dt_classification.get_confusion_matrix(y_test, list(_dt_classification_predictions))
print(_dt_classification_cm)
print("\n")
print(metrics.classification_report(y_test, _dt_classification_predictions, digits=3))

CONFUSION MATRIX

[[   0   32   82    9    0]
 [   0   77  415   42    2]
 [   0   71 1651  132    9]
 [   0   25  453  126    8]
 [   0    9  110   39    8]]


              precision    recall  f1-score   support

           0      0.000     0.000     0.000       123
           1      0.360     0.144     0.205       536
           2      0.609     0.886     0.722      1863
           3      0.362     0.206     0.263       612
           4      0.296     0.048     0.083       166

    accuracy                          0.564      3300
   macro avg      0.325     0.257     0.255      3300
weighted avg      0.484     0.564     0.494      3300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Multi Layer Perceptron

In [49]:
_ann_classification = ClassificationModel(model_name="ANN", parameter_dict={})
_ann_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_ann_classification_predictions = _ann_classification.predict(np.array(X_test["Embeds"].tolist()))
_ann_classification_cm = _ann_classification.get_confusion_matrix(y_test, list(_ann_classification_predictions))
print("CONFUSION MATRIX\n")
print(_ann_classification_cm)
print("\n")
print(metrics.classification_report(y_test, _ann_classification_predictions, digits=3))

CONFUSION MATRIX

[[  39   60   20    2    2]
 [  47  277  191   19    2]
 [  12  168 1493  182    8]
 [   2   20  243  298   49]
 [   0    1   14   96   55]]


              precision    recall  f1-score   support

           0      0.390     0.317     0.350       123
           1      0.527     0.517     0.522       536
           2      0.761     0.801     0.781      1863
           3      0.499     0.487     0.493       612
           4      0.474     0.331     0.390       166

    accuracy                          0.655      3300
   macro avg      0.530     0.491     0.507      3300
weighted avg      0.646     0.655     0.650      3300





In [50]:
_ann_classification = ClassificationModel(model_name="ANN", parameter_dict={'activation':'tanh'})
_ann_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_ann_classification_predictions = _ann_classification.predict(np.array(X_test["Embeds"].tolist()))
_ann_classification_cm = _ann_classification.get_confusion_matrix(y_test, list(_ann_classification_predictions))
print("CONFUSION MATRIX\n")
print(_ann_classification_cm)
print("\n")
print(metrics.classification_report(y_test, _ann_classification_predictions, digits=3))

CONFUSION MATRIX

[[  40   62   18    2    1]
 [  68  267  181   18    2]
 [  27  160 1523  138   15]
 [   4   35  241  267   65]
 [   0    1   16   86   63]]


              precision    recall  f1-score   support

           0      0.288     0.325     0.305       123
           1      0.509     0.498     0.503       536
           2      0.770     0.817     0.793      1863
           3      0.523     0.436     0.476       612
           4      0.432     0.380     0.404       166

    accuracy                          0.655      3300
   macro avg      0.504     0.491     0.496      3300
weighted avg      0.646     0.655     0.649      3300





In [51]:
_ann_classification = ClassificationModel(model_name="ANN", parameter_dict={'early_stopping':True})
_ann_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_ann_classification_predictions = _ann_classification.predict(np.array(X_test["Embeds"].tolist()))
_ann_classification_cm = _ann_classification.get_confusion_matrix(y_test, list(_ann_classification_predictions))
print("CONFUSION MATRIX\n")
print(_ann_classification_cm)
print("\n")
print(metrics.classification_report(y_test, _ann_classification_predictions, digits=3))

CONFUSION MATRIX

[[  29   73   18    3    0]
 [  24  296  197   19    0]
 [   6  154 1584  115    4]
 [   1   29  247  294   41]
 [   0    2   10   86   68]]


              precision    recall  f1-score   support

           0      0.483     0.236     0.317       123
           1      0.534     0.552     0.543       536
           2      0.770     0.850     0.808      1863
           3      0.569     0.480     0.521       612
           4      0.602     0.410     0.487       166

    accuracy                          0.688      3300
   macro avg      0.592     0.506     0.535      3300
weighted avg      0.675     0.688     0.677      3300



In [52]:
_ann_classification = ClassificationModel(model_name="ANN", parameter_dict={'early_stopping':True, 'validation_fraction': 0.1})
_ann_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_ann_classification_predictions = _ann_classification.predict(np.array(X_test["Embeds"].tolist()))
_ann_classification_cm = _ann_classification.get_confusion_matrix(y_test, list(_ann_classification_predictions))
print("CONFUSION MATRIX\n")
print(_ann_classification_cm)
print("\n")
print(metrics.classification_report(y_test, _ann_classification_predictions, digits=3))

CONFUSION MATRIX

[[  18   83   19    3    0]
 [  11  299  207   19    0]
 [   2  134 1601  124    2]
 [   2   23  245  317   25]
 [   0    1   10  109   46]]


              precision    recall  f1-score   support

           0      0.545     0.146     0.231       123
           1      0.554     0.558     0.556       536
           2      0.769     0.859     0.812      1863
           3      0.554     0.518     0.535       612
           4      0.630     0.277     0.385       166

    accuracy                          0.691      3300
   macro avg      0.610     0.472     0.504      3300
weighted avg      0.679     0.691     0.676      3300



In [53]:
_ann_classification = ClassificationModel(model_name="ANN", parameter_dict={'learning_rate':'invscaling', 'early_stopping':True, 'validation_fraction': 0.1, 'shuffle':True})
_ann_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_ann_classification_predictions = _ann_classification.predict(np.array(X_test["Embeds"].tolist()))
_ann_classification_cm = _ann_classification.get_confusion_matrix(y_test, list(_ann_classification_predictions))
print("CONFUSION MATRIX\n")
print(_ann_classification_cm)
print("\n")
print(metrics.classification_report(y_test, _ann_classification_predictions, digits=3))

CONFUSION MATRIX

[[   0   97   23    3    0]
 [   0  273  237   26    0]
 [   0  121 1635  107    0]
 [   0   31  267  310    4]
 [   0    2   19  136    9]]


              precision    recall  f1-score   support

           0      0.000     0.000     0.000       123
           1      0.521     0.509     0.515       536
           2      0.750     0.878     0.809      1863
           3      0.533     0.507     0.519       612
           4      0.692     0.054     0.101       166

    accuracy                          0.675      3300
   macro avg      0.499     0.390     0.389      3300
weighted avg      0.641     0.675     0.642      3300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
