# Lab 03 - KNN and ML Pipeline

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, LabelEncoder

## Task 1: Occupancy Dataset (Modified)

In [2]:
# Load Data
train = pd.read_csv(r"C:\Ali\Programming\MLFall25\Lab03\occupancy_train.txt", sep = ",")
test = pd.read_csv(r"C:\Ali\Programming\MLFall25\Lab03\occupancy_test.txt", sep = ",")

# Drop unused columns (date, Temperature, CO2)
# Attributes: date, Temperature, Humidity, Light, CO2, HumidityRatio, Occupancy
train = train.drop(columns = ["date","Temperature","CO2"], axis=1)
test = test.drop(columns = ["date","Temperature","CO2"], axis=1)

y_train = train["Occupancy"]
X_train = train.drop(columns = ["Occupancy"])
y_test = test["Occupancy"]
X_test = test.drop(columns = ["Occupancy"])

acc = []
print("Task 1 Accuracies:")
for n in range(1,11):
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    acc.append(accuracy)
    print(f"K={n}: {accuracy}")

best_acc = max(acc)
best_k = acc.index(best_acc) + 1
print(f"\nHighest Accuracy: {best_acc} at K={best_k}")

Task 1 Accuracies:
K=1: 0.9403377110694184
K=2: 0.925328330206379
K=3: 0.9572232645403377
K=4: 0.9448405253283302
K=5: 0.9553470919324578
K=6: 0.9527204502814259
K=7: 0.9632270168855535
K=8: 0.9613508442776736
K=9: 0.9647279549718574
K=10: 0.9647279549718574

Highest Accuracy: 0.9647279549718574 at K=9


## Task 2: KNN from Scratch (Chi-squared Distance)

In [3]:
# Load Iris
iris = load_iris()
X = iris.data
y = iris.target

# Split 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Chi-squared distance function
def chi_squared_distance(x1, x2):
    # Add small epsilon to avoid division by zero
    return 0.5 * np.sum(((x1 - x2) ** 2) / (x1 + x2 + 1e-10))

# KNN Predict function
def knn_predict(X_train, y_train, X_test, k=3):
    y_pred = []
    for test_point in X_test:
        distances = []
        for i, train_point in enumerate(X_train):
            dist = chi_squared_distance(test_point, train_point)
            distances.append((dist, y_train[i]))
        
        distances.sort(key=lambda x: x[0])
        k_nearest = distances[:k]
        k_nearest_labels = [label for _, label in k_nearest]
        
        # Majority vote
        most_common = max(set(k_nearest_labels), key=k_nearest_labels.count)
        y_pred.append(most_common)
    return np.array(y_pred)

# Run KNN
k = 3
y_pred_scratch = knn_predict(X_train, y_train, X_test, k=k)

print("Task 2 Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_scratch)}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_scratch))

Task 2 Results:
Accuracy: 0.9666666666666667
Confusion Matrix:
[[11  0  0]
 [ 0 13  0]
 [ 0  1  5]]


## Task 3: Comprehensive ML Pipeline

In [4]:
# Load Dataset
df = pd.read_csv(r"C:\Ali\Programming\MLFall25\Lab03\cancer patient data sets.csv")

# EDA
print("--- EDA ---")
print("Missing Values:")
print(df.isnull().sum())
print("\nDuplicates:", df.duplicated().sum())

# Handle Duplicates (if any)
df = df.drop_duplicates()

# Check Balance
print("\nTarget Balance (Level):")
print(df['Level'].value_counts())

# Feature Selection (Pearson)
# Encode Target
le = LabelEncoder()
df['Level_Encoded'] = le.fit_transform(df['Level'])

corr = df.corr(numeric_only=True)
print("\nCorrelation with Target:")
print(corr['Level_Encoded'].sort_values(ascending=False))

# Select Features (Dropping non-predictive/ID columns)
# Dropping 'index', 'Patient Id', 'Level', 'Level_Encoded'
X = df.drop(columns=['index', 'Patient Id', 'Level', 'Level_Encoded'], errors='ignore')
y = df['Level_Encoded']

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split: Train 80%, Test 20%
X_train_full, X_test, y_train_full, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=0)

# Validation Split: Train 70%, Val 30% (from Train split)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.3, random_state=0)

print(f"\nSplit Sizes: Train={X_train.shape[0]}, Val={X_val.shape[0]}, Test={X_test.shape[0]}")

# KNN Training & Validation
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

print(f"Validation Accuracy: {knn.score(X_val, y_val)}")
print(f"Test Accuracy: {knn.score(X_test, y_test)}")

# Compare Metrics
print("\n--- Metric Comparison ---")
metrics = ['euclidean', 'manhattan', 'chebyshev']
for m in metrics:
    knn_m = KNeighborsClassifier(n_neighbors=5, metric=m)
    knn_m.fit(X_train, y_train)
    print(f"Metric: {m}, Test Accuracy: {knn_m.score(X_test, y_test)}")

print("\nAnalysis: Different metrics may perform differently based on the data distribution. Euclidean is standard, Manhattan often works well for high dimensions, and Chebyshev for specific grid-like data.")

--- EDA ---
Missing Values:
index                       0
Patient Id                  0
Age                         0
Gender                      0
Air Pollution               0
Alcohol use                 0
Dust Allergy                0
OccuPational Hazards        0
Genetic Risk                0
chronic Lung Disease        0
Balanced Diet               0
Obesity                     0
Smoking                     0
Passive Smoker              0
Chest Pain                  0
Coughing of Blood           0
Fatigue                     0
Weight Loss                 0
Shortness of Breath         0
Wheezing                    0
Swallowing Difficulty       0
Clubbing of Finger Nails    0
Frequent Cold               0
Dry Cough                   0
Snoring                     0
Level                       0
dtype: int64

Duplicates: 0

Target Balance (Level):
Level
High      365
Medium    332
Low       303
Name: count, dtype: int64

Correlation with Target:
Level_Encoded               1.000000
Wh