In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sb 

from sklearn import preprocessing 
from sklearn.neighbors import KNeighborsClassifier 

from sklearn.metrics import accuracy_score, confusion_matrix  
from sklearn.model_selection import train_test_split, GridSearchCV  
from sklearn.preprocessing import StandardScaler 

# from sklearn.pipeline import Pipeline, make_pipeline 
# from sklearn.compose import ColumnTransformer 

import nbformat 
from IPython import get_ipython 


In [None]:
# %run "../Data_Preprocessing/data_preprocess.ipynb" 

# Read the preprocessing notebook and execute only those of its code cells that
# mention the cleaned house-price data, so that `hp_cleaned` is defined here.
with open("../Data_Preprocessing/data_preprocess.ipynb", "r", encoding="utf-8") as f:
    notebook1 = nbformat.read(f, as_version=4)

ipython = get_ipython()
if ipython is None:
    # get_ipython() returns None outside an IPython session; fail with a clear message.
    raise RuntimeError("This cell must be run inside an IPython/Jupyter kernel.")

for cell in notebook1.cells:
    if cell.cell_type != "code":
        continue
    print(cell.source)
    if "hp_cleaned" in cell.source or "hp_d" in cell.source:
        result = ipython.run_cell(cell.source, silent=True)
        # run_cell() does not raise when the executed code fails; the failure is
        # only recorded on the returned result object, so report it explicitly.
        if not result.success:
            failure = result.error_before_exec or result.error_in_exec
            print(f"Preprocessing cell failed: {failure!r}")

try:
    print("\nHouse Price Cleaned Data : ")
    print(hp_cleaned.head())     # type: ignore
except NameError as e:
    # None of the executed cells defined `hp_cleaned`.
    print(f"Variable not found: {e}")

In [None]:
# Display the cleaned house-price frame defined by the preprocessing cells executed earlier.
hp_cleaned        # type: ignore 

In [None]:
# Feature matrix: every column of the cleaned frame except the 'Sold' target.
is_feature_col = hp_cleaned.columns != 'Sold'      # type: ignore
X_multi = hp_cleaned.loc[:, is_feature_col]      # type: ignore
X_multi

In [None]:
# Target vector: the 'Sold' column of the cleaned frame.
y = hp_cleaned.loc[:, 'Sold']        # type: ignore
y

Train - Test Split :  

In [None]:
# Hold out 20% of the rows for testing (test_size = 0.2).
# random_state fixes the shuffling seed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_multi,
    y,
    test_size=0.2,
    random_state=0,
)
print(f" X_train size : {X_train.shape} \n X_test size : {X_test.shape} \n y_train size : {y_train.shape} \n y_test size : {y_test.shape}")

Standard scaler creation - In standardization, each feature is rescaled to have a mean of zero and a standard deviation of one. 

In [7]:
# Fit the scaler on the training split only and reuse those statistics for the
# test split. Fitting a second scaler on X_test would standardise the two sets
# with different means/variances (and use test-set statistics), so distances
# computed by KNN at prediction time would not be comparable with training.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

In [None]:
# Display the standardized training feature array.
X_train_s 

In [None]:
# Display the standardized test feature array.
X_test_s 

KNN classifications 

In [None]:
# Fit one classifier per neighbour count (fit() returns the estimator itself).
cls_knn_n1 = KNeighborsClassifier(n_neighbors=1).fit(X_train_s, y_train)
cls_knn_n3 = KNeighborsClassifier(n_neighbors=3).fit(X_train_s, y_train)

# Test-set predictions are kept because later cells build confusion matrices from them.
knn1_y_test_prd = cls_knn_n1.predict(X_test_s)
knn3_y_test_prd = cls_knn_n3.predict(X_test_s)

# Report train/test accuracy for each fitted model.
for k, model, test_pred in (
    (1, cls_knn_n1, knn1_y_test_prd),
    (3, cls_knn_n3, knn3_y_test_prd),
):
    print(f"For Neighbors_num = {k}")
    print(f" KNN Accuracy score (Train) : {accuracy_score(y_train, model.predict(X_train_s))} ")
    print(f" KNN Accuracy score (Test) : {accuracy_score(y_test, test_pred)} ")

In [None]:
# Training set // K == 1 
# Confusion matrix on the training set for the K == 1 model.
knn1_y_train_prd = cls_knn_n1.predict(X_train_s)
conf_mtx_n1 = confusion_matrix(y_train, knn1_y_train_prd)
conf_mtx_n1

In [None]:
# Training set // K == 3  
# Confusion matrix on the training set for the K == 3 model.
# Stored under its own name: the copy-pasted `conf_mtx_n1` mislabelled this
# K == 3 result and overwrote the K == 1 matrix.
conf_mtx_n3 = confusion_matrix(y_train, cls_knn_n3.predict(X_train_s))
conf_mtx_n3

In [None]:
# Testing set // K == 1  
# Confusion matrix on the test set for the K == 1 model, from its stored predictions.
conf_mtx_n1 = confusion_matrix(y_true=y_test, y_pred=knn1_y_test_prd)
conf_mtx_n1

In [None]:
# Testing set // K == 3 
# Confusion matrix on the test set for the K == 3 model, from its stored predictions.
# Stored under its own name: the copy-pasted `conf_mtx_n1` mislabelled this
# K == 3 result and overwrote the K == 1 matrix.
conf_mtx_n3 = confusion_matrix(y_test, knn3_y_test_prd)
conf_mtx_n3

In [None]:
# Ask for the neighbour count and validate it before any model is built, so a
# bad entry fails here with a clear message instead of inside scikit-learn.
k_text = input("Enter a random K-value for grouping - ")
try:
    n = int(k_text)
except ValueError:
    raise ValueError(f"K must be a whole number, got {k_text!r}") from None
if not 1 <= n <= len(X_train_s):
    # KNN cannot use fewer than one neighbour or more neighbours than training rows.
    raise ValueError(
        f"K must be between 1 and {len(X_train_s)} (the number of training rows), got {n}"
    )

cls_knn_n = KNeighborsClassifier(n_neighbors = n)
cls_knn_n.fit(X_train_s, y_train)

# Kept for the confusion-matrix cell that follows.
knn_y_test_prd = cls_knn_n.predict(X_test_s)

print(f"For Neighbors_num = {n}")
print(f" KNN Accuracy score (Train) : {accuracy_score(y_train, cls_knn_n.predict(X_train_s))} ")
print(f" KNN Accuracy score (Test) : {accuracy_score(y_test, knn_y_test_prd)} ")

In [None]:
# Training set 
# Confusion matrix on the training set for the user-chosen K.
knn_y_train_prd = cls_knn_n.predict(X_train_s)
conf_mtx_n = confusion_matrix(y_train, knn_y_train_prd)
conf_mtx_n

In [None]:
# Testing set 
# Confusion matrix on the test set for the user-chosen K, built from the
# test predictions already stored when the model was fitted.
conf_mtx_n = confusion_matrix(y_test, knn_y_test_prd)
conf_mtx_n

----->>>>  || Tuning K - grid search over multiple values of K to select a single best KNN model - 

In [None]:
# Candidate neighbour counts (1 through 14) for the grid search.
params = {'n_neighbors': list(range(1, 15))}
params

In [None]:
# Search the candidate neighbour counts with GridSearchCV's default settings.
grid_search_cv = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=params)
grid_search_cv.fit(X_train_s, y_train)

best_knn_params = grid_search_cv.best_params_
print(f"Best parameter(s) for KNN : {best_knn_params}")

# The estimator configured with the best parameters found by the search.
optimised_knn = grid_search_cv.best_estimator_
print(f"Best model for the classification of attributes : {optimised_knn}")

In [None]:
# Predictions of the tuned model; the test predictions are reused by a later cell.
optm_y_test_prd = optimised_knn.predict(X_test_s)
optm_y_train_prd = optimised_knn.predict(X_train_s)

optm_train_acc = accuracy_score(y_train, optm_y_train_prd)
optm_test_acc = accuracy_score(y_test, optm_y_test_prd)

print(f"OPtimised KNN accuracy score (Train) : {optm_train_acc}")
print(f"OPtimised KNN accuracy score (Test) : {optm_test_acc}")

In [None]:
# Training set 
# Confusion matrix on the training set for the tuned model.
conf_mtx = confusion_matrix(y_true=y_train, y_pred=optimised_knn.predict(X_train_s))
conf_mtx

In [None]:
# Testing set 
# Confusion matrix on the test set for the tuned model, from its stored predictions.
conf_mtx = confusion_matrix(y_true=y_test, y_pred=optm_y_test_prd)
conf_mtx