Import necessary libraries and modules

In [105]:
import pandas as pd
import math
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

Load the dataset and set column names

In [106]:
data = pd.read_csv("D:\\term 7\\intro machine learning\\assignment 1\\magic04.data")
column_names=['Feature1','Feature2','Feature3','Feature4','Feature5','Feature6','Feature7','Feature8','Feature9','Feature10','class']
data.columns=column_names

Separate the dataset into 'gamma_data' and 'hadron_data' based on the 'class' column.

In [107]:
gamma_data = data[data['class'] == 'g']
hadron_data = data[data['class'] == 'h']

Randomly undersample gamma events to balance the dataset

In [108]:
gamma_data = shuffle(gamma_data, random_state=42)
gamma_data = gamma_data.sample(n=len(hadron_data), random_state=42)

Combine the two classes again to create a balanced dataset

In [109]:
balanced_data = pd.concat([gamma_data, hadron_data])

split the balanced data set into feature x and label y 

In [110]:
X = balanced_data.drop(columns=['class'])
y = balanced_data['class']

Split the dataset into training, validation, and test sets.
- 70% for training
- 15% for validation
- 15% for testing

In [111]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

Perform KNN classification according to different values of hyperparameter k and calculate Accuracy , ​Precision ,Recall ,F1-Score and Confusion Matrix:


Accuracy= Number of Correct Predictions / Total Number of Predictions 

​Precision= TP / (TP+FP)

Recall= TP/(TP+FN)

F1-Score= (2⋅Precision⋅Recall) /(Precision+Recall)

Confusion Matrix:

                    predicted negative      predicted positive 

Actual negative            TP                       FP

Actual positive            FN                       TN


In [145]:
k_values = range(1, math.floor(math.sqrt(len(balanced_data)))+1)
results = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred_val = knn.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred_val)
    precision = precision_score(y_val, y_pred_val, average='weighted')
    recall = recall_score(y_val, y_pred_val, average='weighted')
    f1 = f1_score(y_val, y_pred_val, average='weighted')
    cm = confusion_matrix(y_val, y_pred_val)

    results.append({
        'K': k,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Confusion Matrix': cm
    })

 Print and compare the results 

In [146]:
for result in results:
    print(f"K = {result['K']}:")
    print(f"Accuracy: {result['Accuracy']:.4f}")
    #print(f"Recall: {result['Recall']:.2f}")
    #print(f"F1-Score: {result['F1-Score']:.2f}")
    #print("Confusion Matrix:\n", result['Confusion Matrix'])
    print("-" * 25)


K = 1:
Accuracy: 0.7483
-------------------------
K = 2:
Accuracy: 0.7343
-------------------------
K = 3:
Accuracy: 0.7672
-------------------------
K = 4:
Accuracy: 0.7672
-------------------------
K = 5:
Accuracy: 0.7782
-------------------------
K = 6:
Accuracy: 0.7667
-------------------------
K = 7:
Accuracy: 0.7797
-------------------------
K = 8:
Accuracy: 0.7747
-------------------------
K = 9:
Accuracy: 0.7881
-------------------------
K = 10:
Accuracy: 0.7787
-------------------------
K = 11:
Accuracy: 0.7866
-------------------------
K = 12:
Accuracy: 0.7777
-------------------------
K = 13:
Accuracy: 0.7856
-------------------------
K = 14:
Accuracy: 0.7876
-------------------------
K = 15:
Accuracy: 0.7886
-------------------------
K = 16:
Accuracy: 0.7846
-------------------------
K = 17:
Accuracy: 0.7911
-------------------------
K = 18:
Accuracy: 0.7846
-------------------------
K = 19:
Accuracy: 0.7876
-------------------------
K = 20:
Accuracy: 0.7841
---------------

Find the best K value based on accuracy.

In [147]:
best_result = max(results, key=lambda x: x['Accuracy'])
best_k = best_result['K']

print(f"Best K value: K = {best_k}")


Best K value: K = 17


Evaluate the model with the best K value 

In [148]:
best_knn = KNeighborsClassifier(n_neighbors=best_k)
best_knn.fit(X_train, y_train)

calculate test_accuracy test_precision ,test_recall ,test_f1 ,test_cm on test set with best k

In [149]:
y_pred_test = best_knn.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='weighted')
test_recall = recall_score(y_test, y_pred_test, average='weighted')
test_f1 = f1_score(y_test, y_pred_test, average='weighted')
test_cm = confusion_matrix(y_test, y_pred_test)

Print the test set results for the best K value

In [150]:
print("Test Set Results for Best K:")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"Precision: {test_precision:.2f}")
print(f"Recall: {test_recall:.2f}")
print(f"F1-Score: {test_f1:.2f}")
print("Confusion Matrix:\n", test_cm)

Test Set Results for Best K:
Accuracy: 0.77
Precision: 0.78
Recall: 0.77
F1-Score: 0.77
Confusion Matrix:
 [[873 137]
 [318 679]]


calculate test_accuracy test_precision ,test_recall ,test_f1 ,test_cm on validate set with best k

In [151]:

y_pred_val = best_knn.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred_val)
val_precision = precision_score(y_val, y_pred_val, average='weighted')
val_recall = recall_score(y_val, y_pred_val, average='weighted')
val_f1 = f1_score(y_val, y_pred_val, average='weighted')
val_cm = confusion_matrix(y_val, y_pred_val)

In [152]:
print("validate Set Results for Best K:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Precision: {val_precision:.2f}")
print(f"Recall: {val_recall:.2f}")
print(f"F1-Score: {val_f1:.2f}")
print("Confusion Matrix:\n", val_cm)

validate Set Results for Best K:
Accuracy: 0.79
Precision: 0.80
Recall: 0.79
F1-Score: 0.79
Confusion Matrix:
 [[904 110]
 [309 683]]


calculate test_accuracy test_precision ,test_recall ,test_f1 ,test_cm on train set with best k

In [153]:

y_pred_train = best_knn.predict(X_train)

train_accuracy = accuracy_score(y_train, y_pred_train)
train_precision = precision_score(y_train, y_pred_train, average='weighted')
train_recall = recall_score(y_train, y_pred_train, average='weighted')
train_f1 = f1_score(y_train, y_pred_train, average='weighted')
train_cm = confusion_matrix(y_train, y_pred_train)

In [154]:
print("train Set Results for Best K:")
print(f"Accuracy: {train_accuracy:.2f}")
print(f"Precision: {train_precision:.2f}")
print(f"Recall: {train_recall:.2f}")
print(f"F1-Score: {train_f1:.2f}")
print("Confusion Matrix:\n", train_cm)

train Set Results for Best K:
Accuracy: 0.79
Precision: 0.80
Recall: 0.79
F1-Score: 0.79
Confusion Matrix:
 [[4120  544]
 [1400 3299]]
