In [4]:
import numpy as np
import matplotlib.pyplot as plt

import sklearn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [11]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes') # load the file into a dictionary
    return dict

seed = 1234
np.random.seed(seed)

## KNN Classification Training

In [6]:
data = [1,2,3,4,5]

# initialize the accuracy lists
tr_acclst = []
val_acclst = []

seed = 1234
np.random.seed(seed)

for i in data:
    d = unpickle(f'data_batch_{i}')
    X = d[b'data'] # assign into X
    y = d[b'labels'] # assign into Y
    X_tr, X_val, y_tr, y_val = train_test_split(d[b'data'], d[b'labels'], 
                                test_size=0.2, random_state=seed, shuffle=True)
    # Transform the data
    sclr = StandardScaler()
    X_tr = sclr.fit_transform(X_tr)
    X_val = sclr.fit_transform(X_val)
    
    knn = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree', weights='uniform',
                               leaf_size=30, n_jobs=-1)
    knn.fit(X_tr, y_tr)
    
    knn_tr_pred = knn.predict(X_tr)
    knn_val_pred = knn.predict(X_val)
    
    # Gets the accuracy score
    tr_acc = accuracy_score(knn_tr_pred, y_tr)
    val_acc = accuracy_score(knn_val_pred, y_val)
    
    tr_acclst.append(tr_acc)
    val_acclst.append(val_acc)
    
    print(f'KNN Results in data_batch_{i}')
    print(classification_report(knn_val_pred, y_val))


KNN Results in data_batch_1
              precision    recall  f1-score   support

           0       0.52      0.30      0.38       326
           1       0.15      0.57      0.23        47
           2       0.40      0.21      0.27       433
           3       0.17      0.29      0.21       120
           4       0.42      0.18      0.25       419
           5       0.18      0.33      0.23        88
           6       0.20      0.26      0.22       173
           7       0.13      0.64      0.21        44
           8       0.56      0.37      0.44       313
           9       0.10      0.51      0.16        37

    accuracy                           0.28      2000
   macro avg       0.28      0.37      0.26      2000
weighted avg       0.39      0.28      0.30      2000

KNN Results in data_batch_2
              precision    recall  f1-score   support

           0       0.54      0.34      0.42       327
           1       0.12      0.47      0.19        51
           2       0.3

## Logistic Regression Training

In [8]:
data = [1,2,3,4,5]

# initialize the accuracy lists
tr_acclst = []
val_acclst = []

seed = 1234
np.random.seed(seed)

for i in data:
    d = unpickle(f'data_batch_{i}')
    X = d[b'data'] # assign into X
    y = d[b'labels'] # assign into Y
    X_tr, X_val, y_tr, y_val = train_test_split(d[b'data'], d[b'labels'], 
                                test_size=0.2, random_state=seed, shuffle=True)
    # Transform the data
    sclr = StandardScaler()
    X_tr = sclr.fit_transform(X_tr)
    X_val = sclr.fit_transform(X_val)
    
    log = LogisticRegression(penalty='l2', multi_class='auto', solver='lbfgs', 
                             C=0.001, max_iter=100, random_state=seed)

    log.fit(X_tr, y_tr)

    log_tr_pred = log.predict(X_tr)
    log_val_pred = log.predict(X_val)
    
    # Gets the accuracy score
    tr_acc = accuracy_score(log_tr_pred, y_tr)
    val_acc = accuracy_score(log_val_pred, y_val)
    
    tr_acclst.append(tr_acc)
    val_acclst.append(val_acc)


    print(f'Logistic Classifier Results in data_batch_{i}')
    print(classification_report(log_val_pred, y_val))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Classifier Results in data_batch_1
              precision    recall  f1-score   support

           0       0.52      0.44      0.48       223
           1       0.51      0.42      0.46       225
           2       0.24      0.30      0.27       181
           3       0.26      0.31      0.28       175
           4       0.32      0.30      0.31       186
           5       0.28      0.25      0.27       182
           6       0.43      0.42      0.43       234
           7       0.43      0.51      0.47       186
           8       0.47      0.45      0.46       215
           9       0.42      0.44      0.43       193

    accuracy                           0.39      2000
   macro avg       0.39      0.38      0.39      2000
weighted avg       0.40      0.39      0.39      2000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Classifier Results in data_batch_2
              precision    recall  f1-score   support

           0       0.43      0.44      0.43       203
           1       0.50      0.42      0.46       233
           2       0.29      0.28      0.28       192
           3       0.25      0.32      0.28       170
           4       0.27      0.32      0.29       162
           5       0.31      0.35      0.33       182
           6       0.46      0.41      0.43       228
           7       0.42      0.42      0.42       204
           8       0.54      0.50      0.52       227
           9       0.39      0.37      0.38       199

    accuracy                           0.39      2000
   macro avg       0.39      0.38      0.38      2000
weighted avg       0.40      0.39      0.39      2000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Classifier Results in data_batch_3
              precision    recall  f1-score   support

           0       0.44      0.43      0.43       203
           1       0.56      0.44      0.49       239
           2       0.26      0.29      0.28       156
           3       0.27      0.26      0.27       199
           4       0.35      0.40      0.37       172
           5       0.32      0.33      0.33       209
           6       0.42      0.38      0.40       224
           7       0.40      0.43      0.41       185
           8       0.52      0.51      0.52       216
           9       0.46      0.53      0.49       197

    accuracy                           0.40      2000
   macro avg       0.40      0.40      0.40      2000
weighted avg       0.41      0.40      0.40      2000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Classifier Results in data_batch_4
              precision    recall  f1-score   support

           0       0.41      0.37      0.39       210
           1       0.47      0.51      0.49       200
           2       0.29      0.30      0.30       188
           3       0.37      0.30      0.33       210
           4       0.25      0.33      0.29       162
           5       0.37      0.36      0.36       215
           6       0.51      0.43      0.47       239
           7       0.34      0.45      0.39       158
           8       0.51      0.45      0.48       229
           9       0.42      0.44      0.43       189

    accuracy                           0.40      2000
   macro avg       0.40      0.39      0.39      2000
weighted avg       0.40      0.40      0.40      2000

Logistic Classifier Results in data_batch_5
              precision    recall  f1-score   support

           0       0.48      0.45      0.46       211
           1       0.45      0.44      0.45 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## MLP Classifier Training

In [9]:
data = [1,2,3,4,5]

# initialize the accuracy lists
tr_acclst = []
val_acclst = []

seed = 1234
np.random.seed(seed)

for i in data:
    d = unpickle(f'data_batch_{i}')
    X = d[b'data'] # assign into X
    y = d[b'labels'] # assign into Y
    X_tr, X_val, y_tr, y_val = train_test_split(d[b'data'], d[b'labels'], 
                                test_size=0.2, random_state=seed, shuffle=True)
    # Transform the data
    sclr = StandardScaler()
    X_tr = sclr.fit_transform(X_tr)
    X_val = sclr.fit_transform(X_val)

    mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100), solver='sgd',
                    alpha=0.001, batch_size=32, activation='relu', 
                    max_iter=100, learning_rate='constant', 
                    learning_rate_init=0.001,
                    random_state=seed)


    mlp.fit(X_tr, y_tr)
    mlp_tr_pred = mlp.predict(X_tr)
    mlp_val_pred = mlp.predict(X_val)
    
    # Gets the accuracy score
    tr_acc = accuracy_score(mlp_tr_pred, y_tr)
    val_acc = accuracy_score(mlp_val_pred, y_val)

    tr_acclst.append(tr_acc)
    val_acclst.append(val_acc)

    print(f'MLP Classifier Results in data_batch_{i}')
    print(classification_report(mlp_val_pred, y_val))

MLP Classifier Results in data_batch_1
              precision    recall  f1-score   support

           0       0.49      0.48      0.49       197
           1       0.51      0.56      0.53       169
           2       0.32      0.36      0.34       202
           3       0.29      0.26      0.27       225
           4       0.43      0.34      0.38       222
           5       0.33      0.27      0.30       197
           6       0.43      0.47      0.45       207
           7       0.45      0.55      0.50       183
           8       0.58      0.53      0.56       225
           9       0.41      0.47      0.44       173

    accuracy                           0.42      2000
   macro avg       0.42      0.43      0.42      2000
weighted avg       0.42      0.42      0.42      2000

MLP Classifier Results in data_batch_2
              precision    recall  f1-score   support

           0       0.52      0.52      0.52       209
           1       0.48      0.52      0.50       183


## Random Forest Classifier Training

In [10]:
data = [1,2,3,4,5]

# initialize the accuracy lists
tr_acclst = []
val_acclst = []

seed = 1234
np.random.seed(seed)

for i in data:
    d = unpickle(f'data_batch_{i}')
    X = d[b'data'] # assign into X
    y = d[b'labels'] # assign into Y
    X_tr, X_val, y_tr, y_val = train_test_split(d[b'data'], d[b'labels'], 
                                test_size=0.2, random_state=seed, shuffle=True)
    # Transform the data
    sclr = StandardScaler()
    X_tr = sclr.fit_transform(X_tr)
    X_val = sclr.fit_transform(X_val)

    dt = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=20, min_samples_split=5,
                                min_samples_leaf=2, n_jobs=-1, random_state=seed)
    dt.fit(X_tr, y_tr)

    dt_tr_pred = dt.predict(X_tr)
    dt_val_pred = dt.predict(X_val)
    
    # Gets the accuracy score
    tr_acc = accuracy_score(dt_tr_pred, y_tr)
    val_acc = accuracy_score(dt_val_pred, y_val)
    
    tr_acclst.append(tr_acc)
    val_acclst.append(val_acc)
    
    print(f'Random Forest Results in data_batch_{i}')
    print(classification_report(dt_val_pred, y_val))

Random Forest Results in data_batch_1
              precision    recall  f1-score   support

           0       0.53      0.51      0.52       199
           1       0.44      0.50      0.47       165
           2       0.24      0.44      0.31       122
           3       0.21      0.33      0.26       134
           4       0.42      0.31      0.36       239
           5       0.43      0.35      0.38       199
           6       0.53      0.40      0.45       304
           7       0.40      0.55      0.46       160
           8       0.60      0.51      0.55       245
           9       0.55      0.47      0.51       233

    accuracy                           0.43      2000
   macro avg       0.44      0.44      0.43      2000
weighted avg       0.46      0.43      0.44      2000

Random Forest Results in data_batch_2
              precision    recall  f1-score   support

           0       0.54      0.58      0.56       193
           1       0.50      0.44      0.47       222
  