# Loading data & Importing

In [8]:
# Exporting Models
import pickle

# Allow multiple df print-outs within a single code block
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# dataframe
import pandas as pd

# Metrics
from sklearn.metrics import accuracy_score, mean_squared_error, f1_score, classification_report

# K-Cross Validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np

# Decision Trees
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,HistGradientBoostingClassifier

# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Feed-Forward Neural Network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import set_random_seed

from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

In [None]:
# Enable this if you do not have tensorflow installed
import sys
!pip install tensorflow



In [9]:
#loading features from skin_cancer_features.data --> train and test data (tubular and image)
X_train, X_test = pickle.load(open("skin_cancer_features.data", "rb"))
print(f"\n✓ Feature Data loaded sucessfully!")

# Checking Shapes
print(X_train.shape)
print(X_test.shape)

# Loading lables
y_train, y_test = pickle.load(open("skin_cancer_labels.data", "rb"))
print(f"\n✓ Label Data loaded sucessfully!")

# Copy labels for test data
just_in_case = y_test

# Checking Shapes
print(y_train.shape)
print(y_test.shape)

# Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
label_encoder = pickle.load(open("skin_cancer_encode_labels.preprocess", "rb"))
class_labels = label_encoder.classes_

# Class names - Label Encoding
print(class_labels)


✓ Feature Data loaded sucessfully!
(3983, 222)
(996, 222)

✓ Label Data loaded sucessfully!
(3983,)
(996,)
['actinic_keratoses' 'basal_cell_carcinoma'
 'benign_keratosis-like_lesions' 'dermatofibroma' 'melanocytic_Nevi'
 'melanoma' 'vascular_lesions']


# K-Fold Cross Validation

In [12]:
# Splitting data into Folds - Used by all Models: Spliting into 5 total folds
def split_data_into_k_fold(X, Y, k = 5, test_size=0.10, random_state=42):
  _x, x_test, _y, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

  kfold_spliiter = KFold(n_splits=k)

  folds_data = []

  fold = 1
  for train_index, validation_index in kfold_spliiter.split(_x):
      x_train , x_valid = _x[train_index,:],_x[validation_index,:]
      y_train , y_valid = _y[train_index] , _y[validation_index]
      print (f"Fold {fold} training data shape = {(x_train.shape,y_train.shape)}")
      print (f"Fold {fold} validation data shape = {(x_valid.shape,y_valid.shape)}")
      fold+=1
      folds_data.append((x_train,y_train,x_valid,y_valid))
  return folds_data, x_test, y_test


# K-Fold Cross Validation for Decision Tree Models
def perform_k_fold_cross_validation(model, folds_data, model_name=""):
    train_acc_for_all_folds = []
    valid_acc_for_all_folds = []

    # Iterate over all folds
    for i, fold in enumerate(folds_data):
        x_train, y_train, x_valid, y_valid = fold

        # Train the model
        model.fit(x_train, y_train.flatten())

        # Evaluate model on training data
        y_pred_train = model.predict(x_train)

        # Evaluate the model on validation data
        y_pred_valid = model.predict(x_valid)

        # Compute training accuracy
        train_acc = accuracy_score(y_train, y_pred_train)

        # Store training accuracy for each fold
        train_acc_for_all_folds.append(train_acc)

        # Compute validation accuracy
        valid_acc = accuracy_score(y_valid.flatten(), y_pred_valid)

        # Store validation accuracy for each fold
        valid_acc_for_all_folds.append(valid_acc)

    # Average training accuracy across k folds
    avg_training_acc = sum(train_acc_for_all_folds) / len(folds_data)
    print(f"Average training accuracy for model {model_name} = {avg_training_acc}")

    # Average validation accuracy across k folds
    avg_validation_acc = sum(valid_acc_for_all_folds) / len(folds_data)
    print(f"Average validation accuracy for model {model_name} = {avg_validation_acc}")

    return avg_training_acc, avg_validation_acc

# Documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Sequential
# Documentation: https://numpy.org/doc/stable/reference/generated/numpy.average.html
# K-Fold Cross Validation for Neural Network Models
def perform_k_fold_cross_validation_NN(model, folds_data, epochs=50, batch_size=32):
    train_acc_for_all_folds = []
    valid_acc_for_all_folds = []

    for i, fold in enumerate(folds_data):
        x_train, y_train, x_valid, y_valid = fold
        # Train the model
        _ = model.fit(x_train,y_train, epochs=epochs, batch_size=batch_size)

        #predict classes for training set
        y_pred_train = np.argmax(model.predict(x_train), axis=1)
        #predict classes for validation set
        y_pred_valid = np.argmax(model.predict(x_valid), axis=1)

        # Compute accuracies
        train_acc = np.mean(y_pred_train == y_train)
        valid_acc = np.mean(y_pred_valid == y_valid)

        train_acc_for_all_folds.append(train_acc)
        valid_acc_for_all_folds.append(valid_acc)

        print(f"Fold {i+1}: train_acc = {train_acc:.4f}, valid_acc = {valid_acc:.4f}")
    avg_training_acc = np.mean(train_acc_for_all_folds)
    avg_validation_acc = np.mean(valid_acc_for_all_folds)

    print(f"\nAverage training accuracy" , avg_training_acc)
    print(f"Average validation accuracy", avg_validation_acc)

    return avg_training_acc, avg_validation_acc

# K-Fold Cross Validation for Logistic Regression Models
def perform_k_fold_cross_validation_LR(model, folds_data):
    train_acc_for_all_folds = []
    valid_acc_for_all_folds = []

    for i, fold in enumerate(folds_data):
        x_train, y_train, x_valid, y_valid = fold

        _ = model.fit(x_train, y_train)

        y_pred_train = model.predict(x_train)
        y_pred_valid = model.predict(x_valid)

        # Compute training accuracy
        train_acc = accuracy_score(y_pred_train , y_train)
        valid_acc = accuracy_score(y_pred_valid , y_valid)

        train_acc_for_all_folds.append(train_acc)
        valid_acc_for_all_folds.append(valid_acc)
        print(f"Fold {i+1} - Training acc: {train_acc:.4f}, Validation acc: {valid_acc:.4f}")

    avg_training_acc = np.mean(train_acc_for_all_folds)
    avg_validation_acc = np.mean(valid_acc_for_all_folds)

    print(f"\nAverage training accuracy" , avg_training_acc)
    print(f"Average validation accuracy", avg_validation_acc)

    return avg_training_acc, avg_validation_acc




# Decision Tree Models

In [None]:
# instantiating models we will be using
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(random_state=23) # some random seed for reproducibility
grad_boost = GradientBoostingClassifier()
hg = HistGradientBoostingClassifier()
all_models = {"decision_tree":dt,
              "random_forest":rf,
              "grad_boost":grad_boost,
              "hist_gradient_boosting":hg}

print (f"We are working with classifiers {all_models.keys()}")

We are working with classifiers dict_keys(['decision_tree', 'random_forest', 'grad_boost', 'hist_gradient_boosting'])


In [None]:
# Spliting into 5 folds
folds_data, inner_x_test, inner_y_test = split_data_into_k_fold(X_train,y_train,k=5)

# Variables to define the best model from model family
best_validation_accuracy = 0
best_model_name = ""
best_model = None

# Iterate over all models
for model_name in all_models.keys():
    print (f"Evaluating {model_name} ...")
    model = all_models[model_name]
    avg_training_acc, avg_validation_acc = perform_k_fold_cross_validation(model,folds_data)
    # Select best model based on average validation accuracy
    if avg_validation_acc > best_validation_accuracy:
        best_validation_accuracy = avg_validation_acc
        best_model_name = model_name
        best_model = model
    print (f"-----------------------------------")

print (f"Best model for the task is {best_model_name} which offers the validation accuracy of {best_validation_accuracy}")

Fold 1 training data shape = ((2867, 222), (2867,))
Fold 1 validation data shape = ((717, 222), (717,))
Fold 2 training data shape = ((2867, 222), (2867,))
Fold 2 validation data shape = ((717, 222), (717,))
Fold 3 training data shape = ((2867, 222), (2867,))
Fold 3 validation data shape = ((717, 222), (717,))
Fold 4 training data shape = ((2867, 222), (2867,))
Fold 4 validation data shape = ((717, 222), (717,))
Fold 5 training data shape = ((2868, 222), (2868,))
Fold 5 validation data shape = ((716, 222), (716,))
Evaluating decision_tree ...
Average training accuracy for model  = 1.0
Average validation accuracy for model  = 0.6046371831732156
-----------------------------------
Evaluating random_forest ...
Average training accuracy for model  = 1.0
Average validation accuracy for model  = 0.6975471977435466
-----------------------------------
Evaluating grad_boost ...
Average training accuracy for model  = 0.984165872509716
Average validation accuracy for model  = 0.7455342324863842
-

In [None]:
# Sanity Check
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test (current) shape:", y_test.shape)
print("original y_test (just_in_case) shape:", just_in_case.shape)

# HistGradientBoostingClassifier Model
best_model = HistGradientBoostingClassifier()

# Training Model
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
print(y_pred)

#Accuracy
accuracy_hist_1 = accuracy_score (y_test, y_pred)
print("Accuracy for hist_gradient_boosting:", accuracy_hist_1)

#Export hist_gradient_boosting.model
file_to_write = open("hist_gradient_boosting.model", "wb")
pickle.dump(best_model, file_to_write)
file_to_write.close()

X_train shape: (3983, 222)
y_train shape: (3983,)
X_test shape:  (996, 222)
y_test (current) shape: (996,)
original y_test (just_in_case) shape: (996,)


[4 4 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 4 4 4 4 4 4 5
 4 4 4 4 5 4 4 4 4 4 4 4 4 4 4 4 2 4 4 5 4 4 4 5 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 5 4 4 4 4 1 1 4 4 4 4 5 4 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 5 4 4 4 4 4 4 4 4 2 2 1 4 4 4 4 4 4 4 5 4 4 2 2 1
 4 2 2 4 4 4 4 1 4 4 4 4 4 2 4 4 4 4 4 4 2 4 5 4 4 4 4 4 4 4 1 4 4 4 0 4 4
 5 4 4 1 4 4 4 4 4 4 5 4 2 4 4 4 4 4 1 4 4 4 4 2 4 4 5 4 4 4 4 4 1 4 4 2 4
 4 4 4 4 1 4 2 5 4 4 4 5 4 5 4 4 2 4 4 4 4 4 4 5 4 2 4 5 4 4 5 4 4 4 4 0 4
 4 4 4 4 4 4 4 4 4 4 4 5 4 5 5 4 5 4 4 4 4 4 2 4 4 4 2 4 4 5 4 4 4 4 4 4 4
 4 5 2 4 4 4 4 2 4 4 4 4 4 0 4 4 4 4 4 4 4 4 5 4 4 1 4 4 4 4 4 4 4 5 4 4 4
 4 4 4 4 4 1 5 2 4 4 4 4 4 4 2 4 4 4 4 4 4 4 5 4 4 4 1 4 4 4 4 4 4 0 4 4 4
 4 2 4 2 4 4 4 4 4 1 4 4 4 4 5 2 4 4 1 5 4 4 4 4 4 4 4 4 4 4 4 4 5 2 4 2 0
 4 4 5 4 4 4 4 5 5 2 2 5 4 4 4 4 4 4 4 4 4 4 2 4 4 4 4 4 4 4 4 5 1 4 4 4 4
 4 4 2 4 4 2 2 4 4 4 5 4 2 4 4 4 4 4 4 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 4 2 4
 4 4 4 4 4 4 4 4 4 4 4 4 

In [None]:
# Sanity Check
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test (current) shape:", y_test.shape)
print("original y_test (just_in_case) shape:", just_in_case.shape)

# Importing hist_gradient_boosting.model
with open ("hist_gradient_boosting.model", 'rb') as file:
  hist_model = pickle.load(file)

# Accuracy
y_predicted_hist_gradient_boosting_model = hist_model.predict(X_test)
accuracy_hist = accuracy_score (just_in_case, y_predicted_hist_gradient_boosting_model)
print("Accuracy for hist_gradient_boostin:", accuracy_hist)
print("Printing classification report for hist_gradient_boosting:")
print(classification_report(just_in_case, y_predicted_hist_gradient_boosting_model))

X_train shape: (3983, 222)
y_train shape: (3983,)
X_test shape:  (996, 222)
y_test (current) shape: (996,)
original y_test (just_in_case) shape: (996,)
Accuracy for hist_gradient_boostin: 0.7720883534136547
Printing classification report for hist_gradient_boosting:
              precision    recall  f1-score   support

           0       0.75      0.18      0.29        34
           1       0.65      0.29      0.40        58
           2       0.62      0.43      0.51       111
           3       0.00      0.00      0.00        12
           4       0.82      0.97      0.89       674
           5       0.48      0.41      0.44        95
           6       1.00      0.17      0.29        12

    accuracy                           0.77       996
   macro avg       0.62      0.35      0.40       996
weighted avg       0.75      0.77      0.74       996

Might be overfitting?


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Logistic Regression

In [14]:
# models we will be using
lr_vanilla = LogisticRegression(penalty=None, max_iter= 10000)

lr_l2 = LogisticRegression(penalty="l2", max_iter= 10000)

lr_l1 = LogisticRegression(penalty="l1", solver='liblinear', max_iter= 10000)

lr_en = LogisticRegression(penalty="elasticnet", solver='saga', l1_ratio=0.5, max_iter= 10000)

ovr_none = OneVsRestClassifier(lr_vanilla)
ovr_l2 = OneVsRestClassifier(lr_l2)
ovr_l1 = OneVsRestClassifier(lr_l1)
ovr_en = OneVsRestClassifier(lr_en)

all_models = {"OvR – None": ovr_none, "OvR – L2": ovr_l2, "OvR – L1": ovr_l1, "OvR – EN": ovr_en}

print (f"We are working with models {all_models.keys()}")

We are working with models dict_keys(['OvR – None', 'OvR – L2', 'OvR – L1', 'OvR – EN'])


In [13]:
# K-fold Cross Validation

# Spliting into folds
folds_data, x_test, y_test = split_data_into_k_fold(X_train, y_train, k=5)

best_validation_accuracy = 0
best_model_name = ""
best_model = None

# Iterate over all models
for model_name in all_models.keys():

    print (f"Evaluating {model_name} ...")
    model = all_models[model_name]
    avg_training_acc, avg_validation_acc = perform_k_fold_cross_validation_LR(model,folds_data)
    if avg_validation_acc > best_validation_accuracy:
        best_validation_accuracy = avg_validation_acc
        best_model_name = model_name
        best_model = model
    print (f"-----------------------------------")

print (f"Best model for the task is {best_model_name} which offers the validation accuracy of {best_validation_accuracy}")


Fold 1 training data shape = ((2867, 222), (2867,))
Fold 1 validation data shape = ((717, 222), (717,))
Fold 2 training data shape = ((2867, 222), (2867,))
Fold 2 validation data shape = ((717, 222), (717,))
Fold 3 training data shape = ((2867, 222), (2867,))
Fold 3 validation data shape = ((717, 222), (717,))
Fold 4 training data shape = ((2867, 222), (2867,))
Fold 4 validation data shape = ((717, 222), (717,))
Fold 5 training data shape = ((2868, 222), (2868,))
Fold 5 validation data shape = ((716, 222), (716,))
Evaluating OvR – None ...
Fold 1 - Training acc: 0.8975, Validation acc: 0.7225
Fold 2 - Training acc: 0.9044, Validation acc: 0.7155
Fold 3 - Training acc: 0.8943, Validation acc: 0.7001
Fold 4 - Training acc: 0.8950, Validation acc: 0.7252
Fold 5 - Training acc: 0.8961, Validation acc: 0.7388

Average training accuracy 0.897461032798074
Average validation accuracy 0.7204292403948793
-----------------------------------
Evaluating OvR – L2 ...
Fold 1 - Training acc: 0.8894, V



Fold 3 - Training acc: 0.8706, Validation acc: 0.7629
Fold 4 - Training acc: 0.8748, Validation acc: 0.7755
Fold 5 - Training acc: 0.8741, Validation acc: 0.7723

Average training accuracy 0.8752093387992735
Average validation accuracy 0.7625585345519429
-----------------------------------
Best model for the task is OvR – EN which offers the validation accuracy of 0.7625585345519429




In [None]:
best_model = ovr_en

best_model.fit(X_train, y_train)

# Predict on test data
y_pred = best_model.predict(X_test)
print(y_pred)

# Compute accuracy on the test set
accuracy_ovr_en = accuracy_score(just_in_case, y_pred)
print("Accuracy for OvR Logistic Regression (EN):", accuracy_ovr_en)

# Save the trained model to file
import pickle
with open("ovr_en_logreg.model", "wb") as file_to_write:
    pickle.dump(best_model, file_to_write)



[4 1 1 4 5 2 4 5 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 4 1 4 4 4 4 5
 0 4 4 4 5 4 4 4 4 4 4 4 4 4 4 4 5 4 4 5 4 4 2 4 4 4 4 4 4 4 4 5 4 4 4 4 4
 4 4 4 1 4 4 4 4 5 0 4 4 4 4 4 4 4 1 5 2 4 4 4 4 4 4 2 4 4 4 4 4 4 4 0 4 4
 4 4 5 2 4 4 4 4 4 4 4 4 5 4 4 4 1 4 4 4 4 2 2 1 4 4 4 4 4 4 4 5 4 4 5 4 1
 4 2 0 4 4 4 4 1 4 4 4 4 4 0 4 4 4 4 4 4 2 4 5 2 4 4 4 4 4 4 1 4 5 4 0 4 4
 1 4 4 5 4 1 4 4 4 4 5 4 2 6 4 4 4 4 5 4 4 4 1 0 4 4 5 4 4 4 5 4 1 4 4 0 4
 4 4 4 4 1 4 2 5 4 4 6 5 4 0 4 4 2 4 4 4 4 4 4 5 4 4 4 5 4 4 0 4 4 4 4 2 4
 4 4 4 4 4 4 4 4 4 4 4 5 4 5 5 1 4 4 4 4 4 4 2 4 4 4 2 4 4 4 5 4 4 4 4 4 4
 5 5 0 4 4 4 4 2 4 4 4 4 4 5 2 4 4 4 4 4 4 4 5 4 4 1 4 4 4 4 4 4 4 5 2 4 6
 4 4 2 4 5 1 5 2 4 4 4 4 4 4 0 4 4 4 4 4 6 4 2 4 0 4 1 4 4 4 4 4 4 5 4 5 4
 4 1 4 2 4 4 4 4 4 0 1 4 4 4 5 2 4 4 0 0 4 4 4 4 4 4 4 4 4 4 4 4 5 2 4 2 0
 4 4 2 4 4 4 4 4 5 0 2 5 4 4 2 0 4 4 4 4 4 4 2 4 4 4 4 4 4 4 4 5 3 4 4 4 4
 4 2 4 4 4 2 5 4 4 4 4 4 2 4 4 5 4 4 4 4 4 4 2 4 4 4 4 4 4 4 4 5 4 4 4 2 4
 1 4 4 4 4 4 4 4 4 4 4 4 

In [None]:
# Sanity Check
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test (current) shape:", y_test.shape)
print("original y_test (just_in_case) shape:", just_in_case.shape)

with open("ovr_en_logreg.model", "rb") as file:
    best_model = pickle.load(file)

y_pred_ovr_en = best_model.predict(X_test)

# Accuracy
accuracy_ovr_en = accuracy_score(just_in_case, y_pred_ovr_en)
print("Accuracy for OvR EN Logistic Regression:", accuracy_ovr_en)
print("Classification report for OvR EN Logistic Regression:")
print(classification_report(just_in_case, y_pred_ovr_en))

X_train shape: (3983, 222)
y_train shape: (3983,)
X_test shape:  (996, 222)
y_test (current) shape: (399,)
original y_test (just_in_case) shape: (996,)
Accuracy for OvR EN Logistic Regression: 0.8102409638554217
Classification report for OvR EN Logistic Regression:
              precision    recall  f1-score   support

           0       0.45      0.38      0.41        34
           1       0.69      0.53      0.60        58
           2       0.68      0.54      0.60       111
           3       0.25      0.08      0.12        12
           4       0.89      0.96      0.92       674
           5       0.53      0.51      0.52        95
           6       0.75      0.50      0.60        12

    accuracy                           0.81       996
   macro avg       0.61      0.50      0.54       996
weighted avg       0.79      0.81      0.80       996





# Neural Networks

In [None]:
# This is needed for replicability
set_random_seed(555)
total_num_classes = 7
num_features = X_train.shape[1] # Dynamically get the number of features

model = Sequential()
model.add(Dense(221, input_shape=(222,), activation='relu'))
model.add(Dense(222, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(total_num_classes, activation='softmax'))
optim = Adam(learning_rate=0.01)
loss_fn = SparseCategoricalCrossentropy()
model.compile(loss=loss_fn,optimizer=optim,metrics=["accuracy"])
model.summary()

set_random_seed(555)
model_2 = Sequential()
total_num_classes = 7

model_2.add(Dense(256, input_shape=(222,), activation='relu'))
model_2.add(BatchNormalization())
model_2.add(Dropout(0.3))
model_2.add(Dense(64, activation='relu'))
model_2.add(Dense(32, activation='relu'))
model_2.add(Dense(8, activation='relu'))
model_2.add(Dense(total_num_classes, activation='softmax'))
optim = Adam(learning_rate=0.001)
loss_fn = SparseCategoricalCrossentropy()
model_2.compile(loss=loss_fn,optimizer=optim,metrics=["accuracy"])
model_2.summary()


# Sanity Check
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

all_models_in_NN = {"Model_1": model,
                    "Model_2": model_2}

print (f"We are working with models {all_models_in_NN.keys()}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


X_train shape: (3983, 222)
y_train shape: (3983,)
We are working with models dict_keys(['Model_1', 'Model_2'])


In [None]:
#spliting into 5 folds
folds_data, x_test, y_test = split_data_into_k_fold(X_train, y_train, k=5)

best_validation_accuracy_NN = 0
best_model_name_NN = ""
best_model_NN = None

# Iterate over all models
for model_name in all_models_in_NN.keys():

    print (f"Evaluating {model_name} ...")
    model = all_models_in_NN[model_name]
    avg_training_acc, avg_validation_acc = perform_k_fold_cross_validation_NN(model,folds_data)
    # Select best model based on average validation accuracy
    if avg_validation_acc > best_validation_accuracy_NN:
        best_validation_accuracy_NN = avg_validation_acc
        best_model_name_NN = model_name
        best_model_NN = model
    print (f"-----------------------------------")

print (f"Best model for the task is {best_model_name_NN} which offers the validation accuracy of {best_validation_accuracy_NN}")

Fold 1 training data shape = ((2867, 222), (2867,))
Fold 1 validation data shape = ((717, 222), (717,))
Fold 2 training data shape = ((2867, 222), (2867,))
Fold 2 validation data shape = ((717, 222), (717,))
Fold 3 training data shape = ((2867, 222), (2867,))
Fold 3 validation data shape = ((717, 222), (717,))
Fold 4 training data shape = ((2867, 222), (2867,))
Fold 4 validation data shape = ((717, 222), (717,))
Fold 5 training data shape = ((2868, 222), (2868,))
Fold 5 validation data shape = ((716, 222), (716,))
Evaluating Model_1 ...
Epoch 1/50
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.5378 - loss: 1.8809
Epoch 2/50
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6810 - loss: 1.2599
Epoch 3/50
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6810 - loss: 1.1368
Epoch 4/50
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6810

In [None]:
# Creating the Best Model for Neural Network
set_random_seed(555)
best_model_nn = Sequential()
total_num_classes = 7

best_model_nn.add(Dense(256, input_shape=(222,), activation='relu'))
best_model_nn.add(BatchNormalization())
best_model_nn.add(Dropout(0.3))
best_model_nn.add(Dense(64, activation='relu'))
best_model_nn.add(Dense(32, activation='relu'))
best_model_nn.add(Dense(8, activation='relu'))
best_model_nn.add(Dense(total_num_classes, activation='softmax'))
optim = Adam(learning_rate=0.001)

# visualize the model design
best_model_nn.summary()

# Sanity Check
print(X_train.shape)
print(y_train.shape)

# Documentation: https://www.tensorflow.org/api_docs/python/tf/keras/losses/SparseCategoricalCrossentropy
loss_fn = SparseCategoricalCrossentropy()

# From tutorial
best_model_nn.compile(loss=loss_fn,optimizer=optim,metrics=["accuracy"])
best_model_nn.fit(X_train, y_train, epochs=50, batch_size=32)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


(3983, 222)
(3983,)
Epoch 1/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.2025 - loss: 1.8331
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7038 - loss: 0.8661
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7498 - loss: 0.6991
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7908 - loss: 0.5903
Epoch 5/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8192 - loss: 0.5051
Epoch 6/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8405 - loss: 0.4319
Epoch 7/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8623 - loss: 0.3792
Epoch 8/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8852 - loss: 0.3272
Epoch 9/50
[1m125/

<keras.src.callbacks.history.History at 0x7fd207360260>

In [None]:
# Predict on test data
y_pred_nn = best_model_nn.predict(X_test)
print(y_pred_nn)

# Compute accuracy on test data
accuracy_nn = accuracy_score(just_in_case, np.argmax(y_pred_nn, axis=1))
print("Accuracy for Neural Network:", accuracy_nn)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step
[[9.2553327e-24 3.3609514e-13 2.4518552e-11 ... 1.0000000e+00
  2.8283204e-10 2.5538985e-17]
 [9.9002904e-01 9.3390439e-03 4.2516018e-05 ... 3.6002216e-06
  1.3503065e-05 3.9131667e-09]
 [6.1056018e-01 3.8884810e-01 1.2344326e-04 ... 3.8052406e-04
  8.4160041e-05 5.1299018e-09]
 ...
 [4.8328343e-07 2.5884492e-05 1.3138336e-04 ... 9.9840695e-01
  1.4349819e-03 3.4489375e-07]
 [2.9595254e-02 3.7241750e-03 8.3483821e-01 ... 3.0794930e-02
  6.6916281e-03 1.3093683e-07]
 [1.7099135e-17 1.4308695e-15 3.0897788e-15 ... 9.9999821e-01
  1.8367100e-06 2.3719614e-13]]
Accuracy for Neural Network: 0.7871485943775101


In [None]:
predicted_classes_best_model_nn = np.argmax(y_pred_nn, axis = 1)
print("Predicted Class Indices:", predicted_classes_best_model_nn)

predicted_classes_in_string_format_best_model_nn = [class_labels[i] for i in predicted_classes_best_model_nn]
print("Predicted Class Labels:", predicted_classes_in_string_format_best_model_nn)

#Converting y_test (just_in_case to string labels)
y_test_string_labels = [class_labels[i] for i in just_in_case]

acc_best_model_nn = accuracy_score(y_test_string_labels, predicted_classes_in_string_format_best_model_nn)
print("Accuracy of NN Model 1:", acc_best_model_nn)

Predicted Class Indices: [4 0 0 4 5 5 4 5 4 4 4 4 4 4 4 1 4 4 4 4 4 4 4 4 4 4 2 5 4 4 4 1 4 4 4 4 5
 5 2 4 3 5 4 5 4 4 4 4 4 4 4 4 1 5 4 4 5 4 4 4 4 4 4 4 4 5 4 4 4 4 4 4 4 4
 4 4 4 5 1 4 4 4 1 1 5 4 4 4 5 4 4 1 5 2 4 4 4 4 4 4 2 4 4 4 4 5 4 4 0 4 4
 4 4 4 2 4 4 4 4 4 4 4 4 1 4 4 4 4 4 4 4 4 4 1 1 4 4 4 4 4 4 4 5 4 4 0 0 1
 4 0 0 4 4 4 4 1 4 4 4 4 4 0 4 4 4 4 4 4 2 4 4 0 4 4 4 2 4 4 1 4 5 4 0 4 4
 1 4 4 6 4 5 4 4 4 4 5 4 2 1 4 4 1 4 5 4 4 4 1 0 4 4 5 2 4 4 4 4 1 4 4 0 4
 4 4 4 4 1 4 0 5 4 4 4 5 4 5 5 4 2 4 4 4 4 4 4 5 4 5 4 1 4 4 0 4 4 4 4 0 4
 4 4 4 4 4 4 4 4 4 4 4 1 4 1 5 1 4 4 4 4 4 4 2 4 4 4 2 4 5 4 5 4 4 4 4 1 4
 5 5 0 4 5 4 4 2 4 4 4 4 4 0 2 4 2 5 4 4 4 4 5 4 4 1 4 4 4 4 4 2 4 5 2 4 6
 4 4 2 4 5 1 1 2 4 4 4 4 4 4 0 4 4 4 4 4 6 4 2 5 2 4 1 4 4 5 4 4 4 0 4 5 4
 4 1 4 1 4 4 4 4 4 0 4 4 4 4 5 2 4 4 0 0 4 5 4 4 4 4 4 4 4 4 4 4 5 0 4 5 0
 4 4 2 4 4 4 4 5 5 3 1 5 4 4 2 0 4 4 4 4 4 4 2 4 4 4 2 4 4 4 4 0 0 4 4 4 4
 4 2 2 4 4 2 2 4 4 4 2 4 2 4 4 4 4 4 4 4 4 4 2 4 4 4 4 4 2 4 4 4 4 4 4 2 4


In [None]:
#Export Best_Neural_Network.model
file_to_write = open("Best_Neural_Network.model", "wb")
pickle.dump(best_model_nn, file_to_write)
file_to_write.close()

In [None]:
# Sanity Check
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test (current) shape:", y_test.shape)
print("original y_test (just_in_case) shape:", just_in_case.shape)

# Importing Best_Neural_Network.model
with open ("Best_Neural_Network.model", 'rb') as file:
  best_nn_model = pickle.load(file)

# Accuracy
best_nn_model_y_pred = best_nn_model.predict(X_test)
predicted_classes_nn = np.argmax(best_nn_model_y_pred, axis=1)
best_model_accuracy = accuracy_score (just_in_case, predicted_classes_nn)

print("Accuracy for Neural Network Model:", best_model_accuracy)
print("Classification report for Neural Network Model:")
print(classification_report(just_in_case, predicted_classes_nn))

X_train shape: (3983, 222)
y_train shape: (3983,)
X_test shape:  (996, 222)
y_test (current) shape: (996,)
original y_test (just_in_case) shape: (996,)
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step
Accuracy for Neural Network Model: 0.7871485943775101
Classification report for Neural Network Model:
              precision    recall  f1-score   support

           0       0.45      0.62      0.52        34
           1       0.53      0.53      0.53        58
           2       0.64      0.42      0.51       111
           3       0.33      0.17      0.22        12
           4       0.89      0.94      0.92       674
           5       0.48      0.46      0.47        95
           6       0.67      0.33      0.44        12

    accuracy                           0.79       996
   macro avg       0.57      0.50      0.52       996
weighted avg       0.78      0.79      0.78       996

