## K Nearest Neighbors Model

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Read the train and test splits from the data/clean directory.
train_data = pd.read_csv("data/clean/clean_train.csv")
test_data = pd.read_csv("data/clean/clean_test.csv")

# "cluster" is the label column; every other column is kept as a feature.
X_train, y_train = train_data.drop(columns=["cluster"]), train_data["cluster"]
X_test, y_test = test_data.drop(columns=["cluster"]), test_data["cluster"]

# Preview the first ten rows of the training features.
X_train.head(10)



Unnamed: 0,MedInc,Latitude,Longitude
0,3.2596,32.71,-117.03
1,3.8125,33.77,-118.16
2,4.1563,34.66,-120.48
3,1.9425,32.69,-117.11
4,3.5542,36.78,-119.8
5,6.6227,37.42,-121.86
6,2.5192,34.04,-117.97
7,7.9892,37.91,-122.53
8,1.5,34.13,-117.9
9,6.4266,34.02,-117.79


In [3]:
# Standardize the features. The scaler's statistics are learned from the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
# Build a KNN classifier with k=5 and fit it on the scaled training features
# (fit returns the estimator itself, so construction and fitting are chained).
model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Predicted labels for the scaled test features.
y_pred = model.predict(X_test_scaled)

# Accuracy of the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Confusion matrix and classification report, each printed under its heading.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")


Accuracy: 0.9922480620155039
Confusion Matrix:
[[ 555    1    0    0    0    5]
 [   0  977    0    6    0    0]
 [   0    6  293    0    0    0]
 [   0    3    0 1105    0    3]
 [   0    0    0    0   62    0]
 [   2    1    0    5    0 1104]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       561
           1       0.99      0.99      0.99       983
           2       1.00      0.98      0.99       299
           3       0.99      0.99      0.99      1111
           4       1.00      1.00      1.00        62
           5       0.99      0.99      0.99      1112

    accuracy                           0.99      4128
   macro avg       0.99      0.99      0.99      4128
weighted avg       0.99      0.99      0.99      4128



In [5]:
# Persist the fitted KNN classifier to disk with pickle.
from pickle import dump

# Open the target file in a context manager so the handle is flushed and
# closed deterministically, even if pickling raises part-way through.
# NOTE(review): only `model` is written here; the fitted `scaler` that produced
# its training inputs is not saved by this cell — confirm it is persisted
# elsewhere, otherwise the saved model cannot be fed correctly scaled data.
with open("models/supervised/knn_n_neighbors-5.sav", "wb") as model_file:
    dump(model, model_file)