In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report

In [2]:
df_train = pd.read_csv(r'C:\Users\nadin\OneDrive\Documents\Machine Learning Project\train_dump.csv')

In [3]:
columns_to_convert = df_train.columns[8:-1]
df_train[columns_to_convert] = df_train[columns_to_convert].astype('int8')

In [4]:
df_test = pd.read_csv(r'C:\Users\nadin\OneDrive\Documents\Machine Learning Project\test_dump.csv')

In [5]:
df_test[columns_to_convert] = df_test[columns_to_convert].astype('int8')

In [6]:
# remove the index column thats read from the csv
df_train = df_train.iloc[:, 1:]
df_test = df_test.iloc[:, 1:]

In [7]:
x_train = df_train[df_train.columns[:-1]].to_numpy()
y_train = df_train['label'].to_numpy()

x_test = df_test[df_test.columns[:-1]].to_numpy()
y_test = df_test['label'].to_numpy()

In [8]:
kf = KFold(n_splits=5)

f1s = []
accuracies = []
precisions = []
recalls = []
roc_aucs = []

In [11]:
model = LinearSVC(dual=False)

fold = 1
for train, valid in kf.split(x_train, y_train):
    print(f"##### FOLD: {fold} #####")

    # Fit the model
    model.fit(x_train[train], y_train[train])

    # Predict on the test set
    predictions = model.predict(x_train[valid])

    # Evaluate the model
    precision = precision_score(y_true=y_train[valid], y_pred=predictions, zero_division=0)
    recall = recall_score(y_true=y_train[valid], y_pred=predictions, zero_division=0)
    accuracy = accuracy_score(y_true=y_train[valid], y_pred=predictions)
    f1 = f1_score(y_true=y_train[valid], y_pred=predictions, zero_division=0)

    # Store the result
    f1s.append(f1)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)

    # Print the scores for each fold
    print(f"Precision = {precision}")
    print(f"Recall = {recall}")
    print(f"Accuracy = {accuracy}")
    print(f"F1 score = {f1}\n")

    fold += 1

print("\nMean Scores: ")
print(f"Mean F1 score = {np.mean(f1s)}")
print(f"Mean Accuracy = {np.mean(accuracies)}")
print(f"Mean Precision = {np.mean(precisions)}")
print(f"Mean Recall = {np.mean(recalls)}")

##### FOLD: 1 #####
Precision = 0.9984633851497454
Recall = 1.0
Accuracy = 0.9984958333333334
F1 score = 0.9992311018246954

##### FOLD: 2 #####
Precision = 0.9984883389186726
Recall = 1.0
Accuracy = 0.9985208333333333
F1 score = 0.9992435977474128

##### FOLD: 3 #####
Precision = 0.9985012858286356
Recall = 1.0
Accuracy = 0.9985333333333334
F1 score = 0.9992500809571694

##### FOLD: 4 #####
Precision = 0.9985309647090884
Recall = 1.0
Accuracy = 0.9985625
F1 score = 0.9992649424417973

##### FOLD: 5 #####
Precision = 0.9985065227936109
Recall = 1.0
Accuracy = 0.9985375
F1 score = 0.9992527033615575


Mean Scores: 
Mean F1 score = 0.9992484852665264
Mean Accuracy = 0.99853
Mean Precision = 0.9984980994799505
Mean Recall = 1.0


In [12]:
y_pred = model.predict(x_test)

In [14]:
classification = classification_report(y_test, y_pred, zero_division=1)

print("\nClassification Report:\n", classification)


Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97      8062
           1       1.00      1.00      1.00    353872

    accuracy                           1.00    361934
   macro avg       1.00      0.97      0.98    361934
weighted avg       1.00      1.00      1.00    361934

