In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold

# Load the dataset. The path is absolute, so "/nb.csv" must exist at the
# filesystem root of the machine running this cell.
data = pd.read_csv("/nb.csv")

# Separate the features (every column except the last) from the target
# variable (the last column).
# .copy() gives X its own data: the column assignments below would otherwise
# write into a slice of `data`, which raises pandas' SettingWithCopyWarning
# and may or may not propagate the change back into `data`.
X = data.iloc[:, :-1].copy()
y = data.iloc[:, -1]

# Convert string features to numerical features.
# Every object-dtype column is replaced by integer codes; the single
# LabelEncoder instance is simply re-fitted for each such column.
encoder = LabelEncoder()
for column in X.columns:
    if X[column].dtype == object:
        X[column] = encoder.fit_transform(X[column])

# One-hot encode all feature columns. From here on X is no longer a DataFrame
# but the encoder's output (a SciPy sparse matrix with scikit-learn's default
# settings).
# NOTE(review): the encoder is applied to every column, so columns that were
# already numeric in the CSV are also expanded into one indicator per distinct
# value - confirm that this is intended for the dataset.
onehot_encoder = OneHotEncoder()
X = onehot_encoder.fit_transform(X)

# Initialize lists to store evaluation metrics for each split
accuracies = []
precisions = []
recalls = []
f1_scores = []
cms = []  # per-fold confusion matrices; collected here, not read again in this cell

# Number of neighbours for the classifier. It is the same for every fold, so it
# is set once here instead of being re-assigned on each loop iteration.
k = 5

# Split the dataset into training and testing sets using stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    # split() yields positional row indices, so the target Series must be
    # indexed by position (.iloc). Plain y[...] is label-based and only gives
    # the right rows while the index happens to be the default 0..n-1 range.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Test the model
    y_pred = knn.predict(X_test)
    # Fraction of test rows whose predicted label equals the true label
    accuracy = np.mean(y_pred == y_test)
    cm = confusion_matrix(y_test, y_pred)
    # Precision / recall / F1 are computed with the label 'M' as the positive class
    precision = precision_score(y_test, y_pred, pos_label='M')
    recall = recall_score(y_test, y_pred, pos_label='M')
    f1 = f1_score(y_test, y_pred, pos_label='M')

    # Append evaluation metrics to lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    cms.append(cm)

    # Print evaluation metrics for current split
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Confusion matrix:", cm)
    print("")

# Report the mean of each per-split metric, one line per metric, in the same
# order as the per-split printout: accuracy, precision, recall, F1.
print("Average evaluation metrics across all splits:")
for metric_label, per_split_values in (
    ("Accuracy:", accuracies),
    ("Precision:", precisions),
    ("Recall:", recalls),
    ("F1 Score:", f1_scores),
):
    print(metric_label, np.mean(per_split_values))


Accuracy: 0.8246602585349685
Precision: 0.8617242815493544
Recall: 0.9130626654898499
F1 Score: 0.8866509535033211
Confusion matrix: [[ 419  332]
 [ 197 2069]]

Accuracy: 0.8276433543254889
Precision: 0.8711734693877551
Recall: 0.9042365401588702
F1 Score: 0.8873971416197488
Confusion matrix: [[ 448  303]
 [ 217 2049]]

Accuracy: 0.8239389920424404
Precision: 0.867429055484964
Recall: 0.9037952338923213
F1 Score: 0.8852388156472876
Confusion matrix: [[ 437  313]
 [ 218 2048]]

Accuracy: 0.8149867374005305
Precision: 0.8579212070410729
Recall: 0.9033539276257723
F1 Score: 0.8800515907136714
Confusion matrix: [[ 411  339]
 [ 219 2047]]

Accuracy: 0.8199602122015915
Precision: 0.8566694283347142
Recall: 0.9130242825607064
F1 Score: 0.883949561872195
Confusion matrix: [[ 405  346]
 [ 197 2068]]

Accuracy: 0.8156498673740054
Precision: 0.8529533250722842
Recall: 0.9116997792494481
F1 Score: 0.8813486982501068
Confusion matrix: [[ 395  356]
 [ 200 2065]]

Accuracy: 0.8156498673740054
Precisi