In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [20]:
# Load the dataset
file_path = 'Raw Data/output.csv'
data = pd.read_csv(file_path)

In [22]:
# Convert the target variable 'Anaemic' to binary format if not already done
data['Anaemic'] = data['Anaemic'].map({'Yes': 1, 'No': 0})

# One-hot encode the 'Sex' column
data = pd.get_dummies(data, columns=['Sex'], drop_first=True)

# Define features and target variable
X = data.drop(columns=['Anaemic'])
y = data['Anaemic']

In [24]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Scale numerical features
scaler = StandardScaler()

numerical_features = ['%Red Pixel', '%Green pixel', '%Blue pixel', 'Hb']

X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])

X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Display the first few rows of the training data
X_train.head(), y_train.head()

(    Number  %Red Pixel  %Green pixel  %Blue pixel        Hb  Sex_F   Sex_M  \
 55      56    0.232238     -0.396178    -0.016480  0.364549   False  False   
 22      23   -0.849241      0.601300     0.706724 -0.014724   False  False   
 76      77    1.486522      0.031227    -2.063656 -0.941836   False  False   
 44      45   -0.520533      0.031895     0.689954  0.828104   False   True   
 72      73    2.720115     -1.430409    -2.641536 -0.267573    True  False   
 
     Sex_M   
 55    True  
 22   False  
 76   False  
 44   False  
 72   False  ,
 55    0
 22    0
 76    1
 44    0
 72    0
 Name: Anaemic, dtype: int64)

In [28]:
# Initialize k-NN classifier
knn = KNeighborsClassifier(n_neighbors=5)

In [29]:
# Fit the model
knn.fit(X_train, y_train)

In [30]:
# Predict on the test set
y_pred = knn.predict(X_test)

In [31]:
# Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)
print(f"Accuracy: {accuracy:.2f}")


Confusion Matrix:
[[14  2]
 [ 2  3]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88        16
           1       0.60      0.60      0.60         5

    accuracy                           0.81        21
   macro avg       0.74      0.74      0.74        21
weighted avg       0.81      0.81      0.81        21

Accuracy: 0.81
