In [1]:
import pandas as pd

# Read TSV file into DataFrame
df = pd.read_table('Project Files/train.tsv/train.tsv')
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [2]:
df[1:10]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


We can see that the sentences are divided into phrases which are assigned sentiments in the above data sample.

## To chech if all  the phrases are assigned a unique ID, making them all of them unique identifier for records in the dataset.

In [3]:
def _total_number_phrases(df, column_name):
    phrases = df[column_name]
    return phrases, len(phrases)

phrases, _n_phrases = _total_number_phrases(df, "PhraseId")

In [4]:
## Checking the length of list of unique phrase set
len(list(set(phrases)))

156060

In [5]:
## Checking the length of list of all phrases
len(phrases)

156060

In [6]:
def _are_unique(phrases):
    if len(list(set(phrases)))==len(phrases):
        return True
    return False

if _are_unique(phrases):
    print("All the Phrases have UNIQUE ID.")
else:
    print("Phrase ID is not unique.")

All the Phrases have UNIQUE ID.


## Analyzing all the attributes

In [7]:
## Sentiments
set(df["Sentiment"])

{0, 1, 2, 3, 4}

There are 5 sentiments classes in the dataset.

In [8]:
len(set(df['SentenceId']))

8529

There are unique 8529 sentences.

In [9]:
max(set(df['PhraseId']))

156060

There are 156060 unique phrases.

In [10]:
len(set(df['Phrase']))

156060

The phrase content is unique as well and is 156060 in number.

In [11]:
df.isna().sum()

PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64

## Finding Embeddings of the Phrases.

SETTING UP CLASS FOR INDEPENDENT MODELS

In [13]:
from sentence_transformers import SentenceTransformer
class PreTrained_EmbeddingModels:
    def __init__(self, model_url=None, model_name=None, model=None):
        self.model_url = model_url
        self.model_name = model_name
        self.model = model
        self._load(model_url)
        
    def _load(self, model_url):
        if self.model != None:
            print("Model Switching is disabled.")
            return
        self.model = SentenceTransformer(model_url)
        
    def _get_model(self):
        return self.model

LOADING BERT MODEL

In [14]:
model_instance_bert = PreTrained_EmbeddingModels(model_url="all-MiniLM-L6-v2", model_name="Sentence BERT")._get_model()

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

FINDING EMBEDDINGS

In [15]:
embeddings_bert = model_instance_bert.encode(df['Phrase'].tolist()[:10000])

In [16]:
import pickle
with open('mypickle_01.pickle', 'wb') as f:
    pickle.dump([embeddings_bert], f)

In [17]:
len(embeddings_bert)

10000

In [20]:
len(embeddings_bert[0])

384

In [21]:
embeddings_bert.shape

(10000, 384)

In [36]:
Processing_DF = pd.DataFrame(df.iloc[:10000,:])
Processing_DF['Embeds'] = list(embeddings_bert)

In [39]:
Processing_DF

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Embeds
0,1,1,A series of escapades demonstrating the adage ...,1,"[-0.058720715, -0.032839425, 0.047727715, 0.03..."
1,2,1,A series of escapades demonstrating the adage ...,2,"[-0.01865507, -0.01602184, 0.048112143, 0.0666..."
2,3,1,A series,2,"[-0.081645176, 0.062079225, 0.004442434, 0.077..."
3,4,1,A,2,"[-0.036432922, 0.034703396, -0.045959894, 0.03..."
4,5,1,series,2,"[-0.06352139, 0.042595528, -0.0017584751, 0.05..."
...,...,...,...,...,...
9995,9996,420,plays like some corny television production fr...,1,"[-0.05058179, -0.024217928, 0.0065322598, -0.0..."
9996,9997,420,plays like some corny television,1,"[0.010927222, -0.10147872, 0.05232218, -0.0810..."
9997,9998,420,like some corny television,2,"[-0.021114677, -0.09299212, 0.044081654, -0.04..."
9998,9999,420,some corny television,1,"[-0.03643913, -0.06907892, 0.050544772, -0.075..."


## Training and Testing Data

In [41]:
from sklearn.model_selection import train_test_split
Processing_DF.sample(frac=1)
Processing_DF.sample(frac=1)
Processing_DF.sample(frac=1)
X_train, X_test, y_train, y_test = train_test_split(Processing_DF.drop(['Sentiment'],axis=1), list(Processing_DF['Sentiment']), test_size=0.33, random_state=42)

In [53]:
X_train

Unnamed: 0,PhraseId,SentenceId,Phrase,Embeds
8371,8372,348,saw this movie,"[-0.055958994, 0.00279459, -0.07206712, 0.0252..."
5027,5028,196,about life itself,"[-0.050040357, 0.014964542, -0.026103517, 0.02..."
9234,9235,386,Bear,"[-0.038280882, 0.05772985, 0.05183284, 0.11231..."
3944,3945,149,a silly -LRB- but not sophomoric -RRB- romp th...,"[-0.0048351, 0.0071490733, -0.06909708, -0.016..."
6862,6863,275,critique,"[-0.06255015, 0.14269522, -0.037193395, 0.0477..."
...,...,...,...,...
5734,5735,227,less interesting,"[0.039559014, 0.05566624, 0.062052976, 0.06134..."
5191,5192,206,the well-wrought story,"[-0.023460204, 0.11553944, 0.08540614, 0.10312..."
5390,5391,215,to make than it is to sit through,"[0.018826177, 0.033386596, 0.02532654, 0.07621..."
860,861,32,by a self-indulgent script,"[-0.006140305, 0.030622993, 0.00408741, 0.0061..."


In [54]:
y_train

[2,
 2,
 2,
 2,
 2,
 3,
 2,
 2,
 3,
 4,
 1,
 2,
 2,
 2,
 2,
 0,
 1,
 2,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 2,
 2,
 4,
 2,
 2,
 1,
 2,
 3,
 3,
 1,
 2,
 2,
 2,
 2,
 2,
 3,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 3,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 3,
 2,
 1,
 2,
 2,
 2,
 2,
 3,
 1,
 3,
 2,
 4,
 1,
 2,
 2,
 1,
 2,
 3,
 1,
 2,
 2,
 2,
 3,
 3,
 2,
 1,
 1,
 3,
 1,
 3,
 3,
 2,
 1,
 1,
 3,
 2,
 2,
 3,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 0,
 3,
 2,
 2,
 2,
 2,
 3,
 1,
 1,
 3,
 2,
 1,
 2,
 1,
 2,
 2,
 1,
 2,
 2,
 2,
 3,
 1,
 2,
 2,
 2,
 4,
 2,
 3,
 2,
 2,
 2,
 2,
 2,
 0,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 3,
 1,
 2,
 3,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 0,
 2,
 2,
 2,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 1,
 4,
 3,
 3,
 3,
 1,
 3,
 4,
 3,
 2,
 1,
 2,
 2,
 2,
 0,
 1,
 1,
 3,
 3,
 4,
 1,
 2,
 0,
 2,
 4,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 1,
 3,
 3,
 3,
 2,
 2,
 3,
 2,
 2,
 3,
 2,
 3,
 2,
 2,
 2,
 2,
 3,
 2,
 1,


## Class Definition for Classification Models

In [214]:
class ClassificationModel:
    def __init__(self, model_name=None, parameter_dict=None):
        self.model_name = model_name
        self.parameter_dict = parameter_dict
        self.model = self.load_model()
        
    def load_model(self):            
        if self.model_name in ["Naive Bayes", "NB"]:
            from sklearn.naive_bayes import GaussianNB
            return GaussianNB(**self.parameter_dict)

        elif self.model_name in ["Support Vector Machine", "SVC"]:
            from sklearn.svm import SVC
            from sklearn.pipeline import make_pipeline
            from sklearn.preprocessing import StandardScaler
            return make_pipeline(StandardScaler(), SVC(**self.parameter_dict))

        elif self.model_name in ["Logistic Regression", "LR"]:
            from sklearn.linear_model import LogisticRegression
            return LogisticRegression(**self.parameter_dict)

        elif self.model_name in ["Decision Tree", "DT"]:
            from sklearn.tree import DecisionTreeClassifier
            return DecisionTreeClassifier(**self.parameter_dict)

        elif self.model_name in ["K Nearest Neighbour", "KNN"]:
            from sklearn.neighbors import KNeighborsClassifier
            return KNeighborsClassifier(**self.parameter_dict)

        elif self.model_name in ["Multi Layer Perceptron", "ANN"]:
            from sklearn.neural_network import MLPClassifier
            return MLPClassifier(**self.parameter_dict)

        elif self.model_name in ["Gradient Boosted Decision Tree", "GBDT"]:
            from sklearn.ensemble import GradientBoostingClassifier
            return GradientBoostingClassifier(**self.parameter_dict)
        else:
            return None
        
    def fit(self, features, labels):
        self.model = self.model.fit(features, labels)
        
    def predict(self, test_data):
        return self.model.predict(test_data)
    
    def get_confusion_matrix(self, actual, prediction):
        from sklearn.metrics import confusion_matrix
        return confusion_matrix(actual,prediction)

## Naive Bayes

In [150]:
_naive_bayes = ClassificationModel(model_name="NB", parameter_dict=None)



In [151]:
_naive_bayes.fit(np.array(X_train["Embeds"].tolist()),y_train)

In [152]:
_naive_bayes_predictions = _naive_bayes.predict(np.array(X_test["Embeds"].tolist()))

In [153]:
_naive_bayes_cm = _naive_bayes.get_confusion_matrix(y_test, _naive_bayes_predictions)

In [274]:
print("CONFUSION MATRIX\n")
print(_naive_bayes_cm)

CONFUSION MATRIX

[[  52   44   12    4   11]
 [  77  208  154   44   53]
 [  61  201 1279  204  118]
 [  26   62  172  221  131]
 [   2    5   21   51   87]]


In [155]:
sum([_naive_bayes_cm[i][j] for i in range(len(_naive_bayes_cm)) for j in range(len(_naive_bayes_cm[i]))  if i==j])

1847

In [200]:
from sklearn import metrics
print(metrics.classification_report(y_test, _naive_bayes_predictions, digits=3))

              precision    recall  f1-score   support

           0      0.239     0.423     0.305       123
           1      0.400     0.388     0.394       536
           2      0.781     0.687     0.731      1863
           3      0.422     0.361     0.389       612
           4      0.217     0.524     0.307       166

    accuracy                          0.560      3300
   macro avg      0.412     0.477     0.425      3300
weighted avg      0.604     0.560     0.575      3300



In [271]:
_naive_bayes = ClassificationModel(model_name="NB", parameter_dict={})
_naive_bayes.fit(np.array(X_train["Embeds"].tolist()),y_train)
_naive_bayes_predictions = _naive_bayes.predict(np.array(X_test["Embeds"].tolist()))
_naive_bayes_cm = _naive_bayes.get_confusion_matrix(y_test, _naive_bayes_predictions)
print(_naive_bayes_cm)
print(sum([_naive_bayes_cm[i][j] for i in range(len(_naive_bayes_cm)) for j in range(len(_naive_bayes_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _naive_bayes_predictions, digits=3))

[[  52   44   12    4   11]
 [  77  208  154   44   53]
 [  61  201 1279  204  118]
 [  26   62  172  221  131]
 [   2    5   21   51   87]]
1847
              precision    recall  f1-score   support

           0      0.239     0.423     0.305       123
           1      0.400     0.388     0.394       536
           2      0.781     0.687     0.731      1863
           3      0.422     0.361     0.389       612
           4      0.217     0.524     0.307       166

    accuracy                          0.560      3300
   macro avg      0.412     0.477     0.425      3300
weighted avg      0.604     0.560     0.575      3300



## Logistic Regression

In [275]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2'})

In [276]:
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [277]:
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))

In [280]:
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)

CONFUSION MATRIX

[[  12   74   35    2    0]
 [   8  201  292   29    6]
 [   3   93 1670   96    1]
 [   2   37  304  258   11]
 [   0    5   34  102   25]]


In [279]:
print(metrics.classification_report(y_test, _naive_bayes_predictions, digits=3))

              precision    recall  f1-score   support

           0      0.239     0.423     0.305       123
           1      0.400     0.388     0.394       536
           2      0.781     0.687     0.731      1863
           3      0.422     0.361     0.389       612
           4      0.217     0.524     0.307       166

    accuracy                          0.560      3300
   macro avg      0.412     0.477     0.425      3300
weighted avg      0.604     0.560     0.575      3300



In [160]:
sum([_logistic_regression_cm[i][j] for i in range(len(_logistic_regression_cm)) for j in range(len(_logistic_regression_cm[i]))  if i==j])

2166

In [281]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2','tol':0.00000001})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
#print(sum([_logistic_regression_cm[i][j] for i in range(len(_logistic_regression_cm)) for j in range(len(_logistic_regression_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  12   74   35    2    0]
 [   8  201  292   29    6]
 [   3   93 1670   96    1]
 [   2   37  304  258   11]
 [   0    5   34  102   25]]
              precision    recall  f1-score   support

           0      0.480     0.098     0.162       123
           1      0.490     0.375     0.425       536
           2      0.715     0.896     0.796      1863
           3      0.530     0.422     0.470       612
           4      0.581     0.151     0.239       166

    accuracy                          0.656      3300
   macro avg      0.559     0.388     0.418      3300
weighted avg      0.629     0.656     0.623      3300



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [282]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2','tol':0.00000001,'class_weight':'balanced'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
print(sum([_logistic_regression_cm[i][j] for i in range(len(_logistic_regression_cm)) for j in range(len(_logistic_regression_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  57   57    4    3    2]
 [ 124  247   95   47   23]
 [  66  285 1197  253   62]
 [  26   59  122  253  152]
 [   4    2    4   40  116]]
1870
              precision    recall  f1-score   support

           0      0.206     0.463     0.285       123
           1      0.380     0.461     0.417       536
           2      0.842     0.643     0.729      1863
           3      0.424     0.413     0.419       612
           4      0.327     0.699     0.445       166

    accuracy                          0.567      3300
   macro avg      0.436     0.536     0.459      3300
weighted avg      0.640     0.567     0.590      3300



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [283]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2','tol':0.00000001, 'solver':'saga'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print("CONFUSION MATRIX\n")
print(_logistic_regression_cm)
print(sum([_logistic_regression_cm[i][j] for i in range(len(_logistic_regression_cm)) for j in range(len(_logistic_regression_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

CONFUSION MATRIX

[[  12   74   35    2    0]
 [   8  200  292   30    6]
 [   3   93 1670   96    1]
 [   2   37  304  258   11]
 [   0    5   36  100   25]]
2165
              precision    recall  f1-score   support

           0      0.480     0.098     0.162       123
           1      0.489     0.373     0.423       536
           2      0.715     0.896     0.795      1863
           3      0.531     0.422     0.470       612
           4      0.581     0.151     0.239       166

    accuracy                          0.656      3300
   macro avg      0.559     0.388     0.418      3300
weighted avg      0.628     0.656     0.623      3300





In [209]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2','tol':0.00000001, 'solver':'sag'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print(_logistic_regression_cm)
print(sum([_logistic_regression_cm[i][j] for i in range(len(_logistic_regression_cm)) for j in range(len(_logistic_regression_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

[[  12   74   35    2    0]
 [   8  200  292   30    6]
 [   3   93 1670   96    1]
 [   2   37  304  258   11]
 [   0    5   36  100   25]]
2165
              precision    recall  f1-score   support

           0      0.480     0.098     0.162       123
           1      0.489     0.373     0.423       536
           2      0.715     0.896     0.795      1863
           3      0.531     0.422     0.470       612
           4      0.581     0.151     0.239       166

    accuracy                          0.656      3300
   macro avg      0.559     0.388     0.418      3300
weighted avg      0.628     0.656     0.623      3300





In [211]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l1','tol':0.00000001, 'solver':'liblinear'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print(_logistic_regression_cm)
print(sum([_logistic_regression_cm[i][j] for i in range(len(_logistic_regression_cm)) for j in range(len(_logistic_regression_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

[[  10   68   41    3    1]
 [   6  195  297   35    3]
 [   3   81 1691   88    0]
 [   0   40  317  247    8]
 [   1    5   38  102   20]]
2163
              precision    recall  f1-score   support

           0      0.500     0.081     0.140       123
           1      0.501     0.364     0.422       536
           2      0.709     0.908     0.796      1863
           3      0.520     0.404     0.454       612
           4      0.625     0.120     0.202       166

    accuracy                          0.655      3300
   macro avg      0.571     0.375     0.403      3300
weighted avg      0.628     0.655     0.618      3300



In [212]:
_logistic_regression = ClassificationModel(model_name="LR", parameter_dict={'penalty':'l2','tol':0.00000001, 'solver':'newton-cg'})
_logistic_regression.fit(np.array(X_train["Embeds"].tolist()),y_train)
_logistic_regression_predictions = _logistic_regression.predict(np.array(X_test["Embeds"].tolist()))
_logistic_regression_cm = _logistic_regression.get_confusion_matrix(y_test, _logistic_regression_predictions)
print(_logistic_regression_cm)
print(sum([_logistic_regression_cm[i][j] for i in range(len(_logistic_regression_cm)) for j in range(len(_logistic_regression_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _logistic_regression_predictions, digits=3))

[[  12   74   35    2    0]
 [   8  200  292   30    6]
 [   3   93 1670   96    1]
 [   2   37  304  258   11]
 [   0    5   36  100   25]]
2165
              precision    recall  f1-score   support

           0      0.480     0.098     0.162       123
           1      0.489     0.373     0.423       536
           2      0.715     0.896     0.795      1863
           3      0.531     0.422     0.470       612
           4      0.581     0.151     0.239       166

    accuracy                          0.656      3300
   macro avg      0.559     0.388     0.418      3300
weighted avg      0.628     0.656     0.623      3300





## Support Vector Machine

In [284]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print("CONFUSION MATRIX\n")
print(_svc_classification_cm)

CONFUSION MATRIX

[[  17   82   23    1    0]
 [  14  243  264   15    0]
 [   5   85 1693   80    0]
 [   0   23  299  279   11]
 [   0    1   18  113   34]]


In [285]:
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

              precision    recall  f1-score   support

           0      0.472     0.138     0.214       123
           1      0.560     0.453     0.501       536
           2      0.737     0.909     0.814      1863
           3      0.572     0.456     0.507       612
           4      0.756     0.205     0.322       166

    accuracy                          0.687      3300
   macro avg      0.619     0.432     0.472      3300
weighted avg      0.669     0.687     0.659      3300



In [162]:
sum([_svc_classification_cm[i][j] for i in range(len(_svc_classification_cm)) for j in range(len(_svc_classification_cm[i]))  if i==j])

2266

In [215]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={'kernel':'poly'})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print(_svc_classification_cm)
#print(sum([_svc_classification_cm[i][j] for i in range(len(_svc_classification_cm)) for j in range(len(_svc_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

[[  18   69   36    0    0]
 [  13  211  302   10    0]
 [   6   77 1691   88    1]
 [   0   13  344  237   18]
 [   0    0   43   95   28]]
2185
              precision    recall  f1-score   support

           0      0.486     0.146     0.225       123
           1      0.570     0.394     0.466       536
           2      0.700     0.908     0.790      1863
           3      0.551     0.387     0.455       612
           4      0.596     0.169     0.263       166

    accuracy                          0.662      3300
   macro avg      0.581     0.401     0.440      3300
weighted avg      0.638     0.662     0.628      3300



In [216]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={'kernel':'sigmoid'})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print(_svc_classification_cm)
print(sum([_svc_classification_cm[i][j] for i in range(len(_svc_classification_cm)) for j in range(len(_svc_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

[[   8   71   39    5    0]
 [   7  201  284   42    2]
 [   2  126 1613  119    3]
 [   3   50  316  241    2]
 [   0   12   33  112    9]]
2072
              precision    recall  f1-score   support

           0      0.400     0.065     0.112       123
           1      0.437     0.375     0.404       536
           2      0.706     0.866     0.778      1863
           3      0.464     0.394     0.426       612
           4      0.562     0.054     0.099       166

    accuracy                          0.628      3300
   macro avg      0.514     0.351     0.364      3300
weighted avg      0.599     0.628     0.593      3300



In [218]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={'kernel':'poly','gamma':'auto'})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print(_svc_classification_cm)
print(sum([_svc_classification_cm[i][j] for i in range(len(_svc_classification_cm)) for j in range(len(_svc_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

[[  18   69   36    0    0]
 [  13  209  304   10    0]
 [   6   74 1698   84    1]
 [   0   13  346  235   18]
 [   0    0   43   95   28]]
2188
              precision    recall  f1-score   support

           0      0.486     0.146     0.225       123
           1      0.573     0.390     0.464       536
           2      0.700     0.911     0.792      1863
           3      0.554     0.384     0.454       612
           4      0.596     0.169     0.263       166

    accuracy                          0.663      3300
   macro avg      0.582     0.400     0.439      3300
weighted avg      0.639     0.663     0.628      3300



In [219]:
_svc_classification = ClassificationModel(model_name="SVC", parameter_dict={'gamma':'auto'})
_svc_classification.fit(np.array(X_train["Embeds"].tolist()),y_train)
_svc_classification_predictions = _svc_classification.predict(np.array(X_test["Embeds"].tolist()))
_svc_classification_cm = _svc_classification.get_confusion_matrix(y_test, _svc_classification_predictions)
print(_svc_classification_cm)
print(sum([_svc_classification_cm[i][j] for i in range(len(_svc_classification_cm)) for j in range(len(_svc_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _svc_classification_predictions, digits=3))

[[  17   82   23    1    0]
 [  14  243  264   15    0]
 [   5   86 1691   81    0]
 [   0   23  299  279   11]
 [   0    1   18  113   34]]
2264
              precision    recall  f1-score   support

           0      0.472     0.138     0.214       123
           1      0.559     0.453     0.501       536
           2      0.737     0.908     0.813      1863
           3      0.571     0.456     0.507       612
           4      0.756     0.205     0.322       166

    accuracy                          0.686      3300
   macro avg      0.619     0.432     0.471      3300
weighted avg      0.668     0.686     0.659      3300



## K-Nearest Neighbours

In [296]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':3})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
_knn_classification_cm

array([[  44,   59,   19,    1,    0],
       [  83,  242,  191,   18,    2],
       [  39,  188, 1453,  173,   10],
       [  16,   39,  290,  230,   37],
       [   1,    5,   36,   74,   50]], dtype=int64)

In [186]:
sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j])

2019

In [297]:
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

              precision    recall  f1-score   support

           0      0.240     0.358     0.288       123
           1      0.454     0.451     0.453       536
           2      0.731     0.780     0.754      1863
           3      0.464     0.376     0.415       612
           4      0.505     0.301     0.377       166

    accuracy                          0.612      3300
   macro avg      0.479     0.453     0.457      3300
weighted avg      0.607     0.612     0.606      3300



In [289]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':7,'weights':'distance'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print("CONFUSION MATRIX\n")
print(_knn_classification_cm)
#print(sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

CONFUSION MATRIX

[[  39   62   20    1    1]
 [  42  238  230   26    0]
 [  16  128 1549  159   11]
 [   1   21  283  266   41]
 [   0    1   29   84   52]]
              precision    recall  f1-score   support

           0      0.398     0.317     0.353       123
           1      0.529     0.444     0.483       536
           2      0.734     0.831     0.780      1863
           3      0.496     0.435     0.463       612
           4      0.495     0.313     0.384       166

    accuracy                          0.650      3300
   macro avg      0.530     0.468     0.492      3300
weighted avg      0.632     0.650     0.637      3300



In [231]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':7,'weights':'distance','algorithm':'kd_tree'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print(_knn_classification_cm)
print(sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

[[  39   62   20    1    1]
 [  42  238  230   26    0]
 [  16  128 1549  159   11]
 [   1   21  283  266   41]
 [   0    1   29   84   52]]
2144
              precision    recall  f1-score   support

           0      0.398     0.317     0.353       123
           1      0.529     0.444     0.483       536
           2      0.734     0.831     0.780      1863
           3      0.496     0.435     0.463       612
           4      0.495     0.313     0.384       166

    accuracy                          0.650      3300
   macro avg      0.530     0.468     0.492      3300
weighted avg      0.632     0.650     0.637      3300



In [241]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'p':1, 'n_neighbors':7,'weights':'distance','algorithm':'kd_tree'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print(_knn_classification_cm)
print(sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

[[  39   61   20    2    1]
 [  41  241  225   28    1]
 [  18  131 1545  158   11]
 [   0   22  289  261   40]
 [   0    0   27   88   51]]
2137
              precision    recall  f1-score   support

           0      0.398     0.317     0.353       123
           1      0.530     0.450     0.486       536
           2      0.734     0.829     0.779      1863
           3      0.486     0.426     0.454       612
           4      0.490     0.307     0.378       166

    accuracy                          0.648      3300
   macro avg      0.528     0.466     0.490      3300
weighted avg      0.630     0.648     0.635      3300



In [232]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':7,'weights':'distance','algorithm':'ball_tree'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print(_knn_classification_cm)
print(sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

[[  39   62   20    1    1]
 [  42  238  230   26    0]
 [  16  128 1549  159   11]
 [   1   21  283  266   41]
 [   0    1   29   84   52]]
2144
              precision    recall  f1-score   support

           0      0.398     0.317     0.353       123
           1      0.529     0.444     0.483       536
           2      0.734     0.831     0.780      1863
           3      0.496     0.435     0.463       612
           4      0.495     0.313     0.384       166

    accuracy                          0.650      3300
   macro avg      0.530     0.468     0.492      3300
weighted avg      0.632     0.650     0.637      3300



In [240]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'p':1, 'n_neighbors':7,'weights':'distance','algorithm':'ball_tree'})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print(_knn_classification_cm)
print(sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

[[  39   61   20    2    1]
 [  41  241  225   28    1]
 [  18  131 1545  158   11]
 [   0   22  289  261   40]
 [   0    0   27   88   51]]
2137
              precision    recall  f1-score   support

           0      0.398     0.317     0.353       123
           1      0.530     0.450     0.486       536
           2      0.734     0.829     0.779      1863
           3      0.486     0.426     0.454       612
           4      0.490     0.307     0.378       166

    accuracy                          0.648      3300
   macro avg      0.528     0.466     0.490      3300
weighted avg      0.630     0.648     0.635      3300



In [239]:
_knn_classification = ClassificationModel(model_name="KNN", parameter_dict={'n_neighbors':10,'weights':'distance','p':1})
_knn_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_knn_classification_predictions = _knn_classification.predict(np.array(X_test["Embeds"].tolist()))
_knn_classification_cm = _knn_classification.get_confusion_matrix(y_test, list(_knn_classification_predictions))
print(_knn_classification_cm)
print(sum([_knn_classification_cm[i][j] for i in range(len(_knn_classification_cm)) for j in range(len(_knn_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

[[  32   61   26    3    1]
 [  33  229  246   28    0]
 [  16  100 1611  129    7]
 [   1   20  311  243   37]
 [   0    0   32   88   46]]
2161
              precision    recall  f1-score   support

           0      0.390     0.260     0.312       123
           1      0.559     0.427     0.484       536
           2      0.724     0.865     0.788      1863
           3      0.495     0.397     0.441       612
           4      0.505     0.277     0.358       166

    accuracy                          0.655      3300
   macro avg      0.535     0.445     0.477      3300
weighted avg      0.631     0.655     0.635      3300



## Gradient Boosted Decision Tree

In [295]:
_gbdt_classification = ClassificationModel(model_name="GBDT", parameter_dict={})

In [195]:
_gbdt_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)

In [196]:
_gbdt_classification_predictions = _gbdt_classification.predict(np.array(X_test["Embeds"].tolist()))

In [197]:
_gbdt_classification_cm = _gbdt_classification.get_confusion_matrix(y_test, list(_gbdt_classification_predictions))
_gbdt_classification_cm

array([[  14,   59,   47,    3,    0],
       [  15,  171,  326,   21,    3],
       [   5,   66, 1715,   76,    1],
       [   2,   30,  364,  196,   20],
       [   0,    4,   58,   82,   22]], dtype=int64)

In [198]:
sum([_gbdt_classification_cm[i][j] for i in range(len(_gbdt_classification_cm)) for j in range(len(_gbdt_classification_cm[i]))  if i==j])

2118

In [290]:
print(metrics.classification_report(y_test, _knn_classification_predictions, digits=3))

              precision    recall  f1-score   support

           0      0.398     0.317     0.353       123
           1      0.529     0.444     0.483       536
           2      0.734     0.831     0.780      1863
           3      0.496     0.435     0.463       612
           4      0.495     0.313     0.384       166

    accuracy                          0.650      3300
   macro avg      0.530     0.468     0.492      3300
weighted avg      0.632     0.650     0.637      3300



## Decision Tree

In [292]:
_dt_classification = ClassificationModel(model_name="DT", parameter_dict={})
_dt_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_dt_classification_predictions = _dt_classification.predict(np.array(X_test["Embeds"].tolist()))
print("CONFUSION MATRIX\n")
_dt_classification_cm = _dt_classification.get_confusion_matrix(y_test, list(_dt_classification_predictions))
print(_dt_classification_cm)

CONFUSION MATRIX

[[  14   54   36   15    4]
 [  40  169  224   80   23]
 [  45  267 1168  321   62]
 [  18   80  263  212   39]
 [   4   13   56   67   26]]


In [293]:
print(metrics.classification_report(y_test, _dt_classification_predictions, digits=3))

              precision    recall  f1-score   support

           0      0.116     0.114     0.115       123
           1      0.290     0.315     0.302       536
           2      0.669     0.627     0.647      1863
           3      0.305     0.346     0.324       612
           4      0.169     0.157     0.163       166

    accuracy                          0.482      3300
   macro avg      0.310     0.312     0.310      3300
weighted avg      0.494     0.482     0.487      3300



In [191]:
sum([_dt_classification_cm[i][j] for i in range(len(_dt_classification_cm)) for j in range(len(_dt_classification_cm[i]))  if i==j])

1588

In [242]:
_dt_classification = ClassificationModel(model_name="DT", parameter_dict={'criterion':'entropy'})
_dt_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_dt_classification_predictions = _dt_classification.predict(np.array(X_test["Embeds"].tolist()))
_dt_classification_cm = _dt_classification.get_confusion_matrix(y_test, list(_dt_classification_predictions))
print(_dt_classification_cm)
print(sum([_dt_classification_cm[i][j] for i in range(len(_dt_classification_cm)) for j in range(len(_dt_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _dt_classification_predictions, digits=3))

[[  23   45   42   12    1]
 [  46  163  236   82    9]
 [  48  235 1260  276   44]
 [  12   62  290  198   50]
 [   2   13   47   57   47]]
1691
              precision    recall  f1-score   support

           0      0.176     0.187     0.181       123
           1      0.315     0.304     0.309       536
           2      0.672     0.676     0.674      1863
           3      0.317     0.324     0.320       612
           4      0.311     0.283     0.297       166

    accuracy                          0.512      3300
   macro avg      0.358     0.355     0.356      3300
weighted avg      0.511     0.512     0.512      3300



In [250]:
_dt_classification = ClassificationModel(model_name="DT", parameter_dict={'max_depth':5})
_dt_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_dt_classification_predictions = _dt_classification.predict(np.array(X_test["Embeds"].tolist()))
_dt_classification_cm = _dt_classification.get_confusion_matrix(y_test, list(_dt_classification_predictions))
print(_dt_classification_cm)
print(sum([_dt_classification_cm[i][j] for i in range(len(_dt_classification_cm)) for j in range(len(_dt_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _dt_classification_predictions, digits=3))

[[   0   44   73    6    0]
 [   0  130  380   26    0]
 [   0  159 1617   87    0]
 [   0   71  490   51    0]
 [   0   19  112   35    0]]
1798
              precision    recall  f1-score   support

           0      0.000     0.000     0.000       123
           1      0.307     0.243     0.271       536
           2      0.605     0.868     0.713      1863
           3      0.249     0.083     0.125       612
           4      0.000     0.000     0.000       166

    accuracy                          0.545      3300
   macro avg      0.232     0.239     0.222      3300
weighted avg      0.438     0.545     0.470      3300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Multi Layer Perceptron

In [286]:
_ann_classification = ClassificationModel(model_name="ANN", parameter_dict={})
_ann_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_ann_classification_predictions = _ann_classification.predict(np.array(X_test["Embeds"].tolist()))
_ann_classification_cm = _ann_classification.get_confusion_matrix(y_test, list(_ann_classification_predictions))
print("CONFUSION MATRIX\n")
_ann_classification_cm

CONFUSION MATRIX





array([[  34,   67,   21,    0,    1],
       [  40,  285,  192,   16,    3],
       [   8,  152, 1526,  167,   10],
       [   2,   34,  252,  271,   53],
       [   0,    0,   15,   93,   58]], dtype=int64)

In [287]:
print(metrics.classification_report(y_test, _ann_classification_predictions, digits=3))

              precision    recall  f1-score   support

           0      0.405     0.276     0.329       123
           1      0.530     0.532     0.531       536
           2      0.761     0.819     0.789      1863
           3      0.495     0.443     0.468       612
           4      0.464     0.349     0.399       166

    accuracy                          0.659      3300
   macro avg      0.531     0.484     0.503      3300
weighted avg      0.646     0.659     0.651      3300



In [294]:
sum([_ann_classification_cm[i][j] for i in range(len(_ann_classification_cm)) for j in range(len(_ann_classification_cm[i]))  if i==j])

2174

In [255]:
_ann_classification = ClassificationModel(model_name="ANN", parameter_dict={'activation':'tanh'})
_ann_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_ann_classification_predictions = _ann_classification.predict(np.array(X_test["Embeds"].tolist()))
_ann_classification_cm = _ann_classification.get_confusion_matrix(y_test, list(_ann_classification_predictions))
print(_ann_classification_cm)
#print(sum([_ann_classification_cm[i][j] for i in range(len(_ann_classification_cm)) for j in range(len(_ann_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _ann_classification_predictions, digits=3))

[[  33   68   17    3    2]
 [  41  221  225   40    9]
 [  20  140 1535  153   15]
 [   7   41  244  256   64]
 [   1    5   13   88   59]]
2104
              precision    recall  f1-score   support

           0      0.324     0.268     0.293       123
           1      0.465     0.412     0.437       536
           2      0.755     0.824     0.788      1863
           3      0.474     0.418     0.444       612
           4      0.396     0.355     0.375       166

    accuracy                          0.638      3300
   macro avg      0.483     0.456     0.467      3300
weighted avg      0.622     0.638     0.628      3300





In [258]:
_ann_classification = ClassificationModel(model_name="ANN", parameter_dict={'early_stopping':True})
_ann_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_ann_classification_predictions = _ann_classification.predict(np.array(X_test["Embeds"].tolist()))
_ann_classification_cm = _ann_classification.get_confusion_matrix(y_test, list(_ann_classification_predictions))
print(_ann_classification_cm)
print(sum([_ann_classification_cm[i][j] for i in range(len(_ann_classification_cm)) for j in range(len(_ann_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _ann_classification_predictions, digits=3))

[[  23   74   23    2    1]
 [  25  232  251   22    6]
 [   6  111 1633  108    5]
 [   4   34  275  274   25]
 [   0    5   22   95   44]]
2206
              precision    recall  f1-score   support

           0      0.397     0.187     0.254       123
           1      0.509     0.433     0.468       536
           2      0.741     0.877     0.803      1863
           3      0.547     0.448     0.492       612
           4      0.543     0.265     0.356       166

    accuracy                          0.668      3300
   macro avg      0.547     0.442     0.475      3300
weighted avg      0.644     0.668     0.648      3300



In [261]:
_ann_classification = ClassificationModel(model_name="ANN", parameter_dict={'early_stopping':True, 'validation_fraction': 0.1})
_ann_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_ann_classification_predictions = _ann_classification.predict(np.array(X_test["Embeds"].tolist()))
_ann_classification_cm = _ann_classification.get_confusion_matrix(y_test, list(_ann_classification_predictions))
print(_ann_classification_cm)
print(sum([_ann_classification_cm[i][j] for i in range(len(_ann_classification_cm)) for j in range(len(_ann_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _ann_classification_predictions, digits=3))

[[  28   72   19    4    0]
 [  29  245  233   26    3]
 [   5  124 1611  119    4]
 [   4   28  265  288   27]
 [   1    4   21   85   55]]
2227
              precision    recall  f1-score   support

           0      0.418     0.228     0.295       123
           1      0.518     0.457     0.486       536
           2      0.750     0.865     0.803      1863
           3      0.552     0.471     0.508       612
           4      0.618     0.331     0.431       166

    accuracy                          0.675      3300
   macro avg      0.571     0.470     0.505      3300
weighted avg      0.656     0.675     0.659      3300



In [269]:
_ann_classification = ClassificationModel(model_name="ANN", parameter_dict={'learning_rate':'invscaling', 'early_stopping':True, 'validation_fraction': 0.1, 'shuffle':True})
_ann_classification.fit(np.array(X_train["Embeds"].tolist()), y_train)
_ann_classification_predictions = _ann_classification.predict(np.array(X_test["Embeds"].tolist()))
_ann_classification_cm = _ann_classification.get_confusion_matrix(y_test, list(_ann_classification_predictions))
print(_ann_classification_cm)
print(sum([_ann_classification_cm[i][j] for i in range(len(_ann_classification_cm)) for j in range(len(_ann_classification_cm[i]))  if i==j]))
print(metrics.classification_report(y_test, _ann_classification_predictions, digits=3))

[[  26   73   21    3    0]
 [  26  223  256   29    2]
 [   7  111 1633  110    2]
 [   3   38  265  289   17]
 [   1    4   27  106   28]]
2199
              precision    recall  f1-score   support

           0      0.413     0.211     0.280       123
           1      0.497     0.416     0.453       536
           2      0.742     0.877     0.803      1863
           3      0.538     0.472     0.503       612
           4      0.571     0.169     0.260       166

    accuracy                          0.666      3300
   macro avg      0.552     0.429     0.460      3300
weighted avg      0.643     0.666     0.644      3300

