In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report

In [2]:
# Load the training split from CSV.
# The directory defaults to the original hard-coded local folder; set the
# ML_PROJECT_DATA_DIR environment variable to read the same file name from a
# different location without editing the notebook.
df_train = pd.read_csv(
    os.path.join(
        os.environ.get(
            'ML_PROJECT_DATA_DIR',
            r'C:\Users\nadin\OneDrive\Documents\Machine Learning Project',
        ),
        'train_dump.csv',
    )
)

In [3]:
# Columns to downcast: everything from position 8 up to (not including) the
# last column. Positions are taken on the frame as read from the CSV, i.e.
# before the leading index column is removed in a later cell.
columns_to_convert = df_train.columns[8:-1]

# Cast only those columns to int8; all other columns keep their dtype.
df_train = df_train.astype({column: 'int8' for column in columns_to_convert})

In [4]:
# Load the test split from CSV.
# The directory defaults to the original hard-coded local folder; set the
# ML_PROJECT_DATA_DIR environment variable to read the same file name from a
# different location without editing the notebook.
df_test = pd.read_csv(
    os.path.join(
        os.environ.get(
            'ML_PROJECT_DATA_DIR',
            r'C:\Users\nadin\OneDrive\Documents\Machine Learning Project',
        ),
        'test_dump.csv',
    )
)

In [5]:
# Apply the same int8 downcast to the test frame, using the column list that
# was derived from the training frame so both frames are converted alike.
df_test = df_test.astype({column: 'int8' for column in columns_to_convert})

In [6]:
# Remove the index column that is read from the CSV (the first column of each
# frame). The drop is positional, so running this cell again would remove one
# more column from each frame.
df_train, df_test = (frame.iloc[:, 1:] for frame in (df_train, df_test))

In [7]:
# Split each frame into a feature matrix and a target vector (NumPy arrays).
# The target is the `label` column; the features are every other column.
# Features are selected by dropping `label` by name rather than by assuming it
# is the last column, so the label can never end up inside the feature matrix.
x_train = df_train.drop(columns='label').to_numpy()
y_train = df_train['label'].to_numpy()

x_test = df_test.drop(columns='label').to_numpy()
y_test = df_test['label'].to_numpy()

In [8]:
# Cross-validation splitter with five folds (all other KFold options left at
# their defaults).
kf = KFold(n_splits=5)

# One accumulator list per metric, filled by the cross-validation cell.
# `roc_aucs` is initialised here but is not appended to in the visible cells.
f1s, accuracies, precisions, recalls, roc_aucs = [], [], [], [], []

In [9]:
# K-fold cross-validation of a Gaussian Naive Bayes classifier on the training
# data. Uses `kf`, `x_train` and `y_train` from the earlier cells, prints the
# per-fold precision / recall / accuracy / F1, then prints their means.
model = GaussianNB()

# Start from empty accumulators so that re-running this cell does not append a
# second set of fold scores to lists left over from a previous run.
f1s, accuracies, precisions, recalls = [], [], [], []

for fold, (train, valid) in enumerate(kf.split(x_train, y_train), start=1):
    print(f"##### FOLD: {fold} #####")

    # Fit the model on this fold's training indices
    model.fit(x_train[train], y_train[train])

    # Predict on this fold's held-out validation indices (not the test set)
    y_valid = y_train[valid]
    predictions = model.predict(x_train[valid])

    # Evaluate the model; zero_division=0 reports 0 instead of warning when a
    # score's denominator is zero
    precision = precision_score(y_true=y_valid, y_pred=predictions, zero_division=0)
    recall = recall_score(y_true=y_valid, y_pred=predictions, zero_division=0)
    accuracy = accuracy_score(y_true=y_valid, y_pred=predictions)
    f1 = f1_score(y_true=y_valid, y_pred=predictions, zero_division=0)

    # Store the result
    f1s.append(f1)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)

    # Print the scores for each fold
    print(f"Precision = {precision}")
    print(f"Recall = {recall}")
    print(f"Accuracy = {accuracy}")
    print(f"F1 score = {f1}\n")

# Note: at this point `model` was last fitted on the final fold's training
# indices, not on all of `x_train`.

print("\nMean Scores: ")
print(f"Mean F1 score = {np.mean(f1s)}")
print(f"Mean Accuracy = {np.mean(accuracies)}")
print(f"Mean Precision = {np.mean(precisions)}")
print(f"Mean Recall = {np.mean(recalls)}")

##### FOLD: 1 #####
Precision = 0.997566199376947
Recall = 0.04368400185871229
Accuracy = 0.0652125
F1 score = 0.08370261759576544

##### FOLD: 2 #####
Precision = 0.9976171564733916
Recall = 0.04285117235144102
Accuracy = 0.06474166666666667
F1 score = 0.08217273611985705

##### FOLD: 3 #####
Precision = 0.9978310164645569
Recall = 0.0431569701001211
Accuracy = 0.06492916666666666
F1 score = 0.08273556255849979

##### FOLD: 4 #####
Precision = 0.9982
Recall = 0.042566619616806606
Accuracy = 0.06442083333333333
F1 score = 0.08165134988118755

##### FOLD: 5 #####
Precision = 0.9977615571776156
Recall = 0.04368669328549331
Accuracy = 0.064825
F1 score = 0.08370824589909613


Mean Scores: 
Mean F1 score = 0.0827941024108812
Mean Accuracy = 0.06482583333333333
Mean Precision = 0.9977951858985022
Mean Recall = 0.04318909144251486


In [10]:
# Predict labels for the held-out test features.
# NOTE(review): `model` is whatever the cross-validation cell left behind, i.e.
# its most recent fit was on the last fold's training split rather than on the
# full training set - confirm this is intended.
y_pred = model.predict(X=x_test)

In [11]:
# Build the text report comparing the test labels with the test predictions,
# then print it under a heading.
classification = classification_report(y_true=y_test, y_pred=y_pred)

print("\nClassification Report:\n", classification)


Classification Report:
               precision    recall  f1-score   support

           0       0.02      0.99      0.05      8062
           1       1.00      0.04      0.08    353872

    accuracy                           0.06    361934
   macro avg       0.51      0.52      0.06    361934
weighted avg       0.98      0.06      0.08    361934

