In [9]:
import numpy as np
import skfuzzy as fuzz
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


KNN Test Error

In [10]:

# Load dataset
data = pd.read_csv('dataset/kidney_disease.csv')


# Identify categorical columns (based on the description and data inspection)
categorical_cols = ["rbc", "pc", "pcc",	"ba", "htn", "dm", "cad", "appet", "pe", "ane"]
numerical_cols = ["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hemo", "pcv", "wc","rc"]

#select features
#select all column except classification and id
features = data.drop(columns = ['id', 'classification']) #x
target = data['classification'] #y

# Convert non-numeric placeholders to NaN
features.replace({'\t?': np.nan, '?': np.nan}, inplace=True)

# Handling missing values for numerical columns
# Convert features back to DataFrame after imputation to retain column names
int_imputer = SimpleImputer(strategy='mean')# You can choose 'median', 'most_frequent', etc.
for col in numerical_cols:
    features[[col]] = int_imputer.fit_transform(features[[col]])


# Initialize label encoder
le = LabelEncoder()
str_imputer = SimpleImputer(strategy="most_frequent")

# Apply LabelEncoder to each categorical column
for col in categorical_cols:
    # Replace NaN with a placeholder string, as LabelEncoder does not handle NaN
    features[[col]] = str_imputer.fit_transform(features[[col]])
    features[col] = le.fit_transform(features[col])


# Split the data
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, stratify=target)


# Evaluate the model
# Hyperparameter tuning using GridSearchCV
param_grid = {'n_neighbors': range(1, 31)}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Best parameters and model
best_k = grid_search.best_params_['n_neighbors']
knn = grid_search.best_estimator_

# Train and evaluate the model
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Best k: {best_k}')
print(f'Accuracy: {accuracy}')
print(y_pred)



Best k: 1
Accuracy: 0.7875
['ckd' 'ckd' 'ckd' 'ckd' 'ckd' 'ckd' 'notckd' 'ckd' 'notckd' 'ckd' 'ckd'
 'ckd' 'notckd' 'ckd' 'ckd' 'notckd' 'notckd' 'ckd' 'notckd' 'ckd' 'ckd'
 'ckd' 'notckd' 'ckd' 'notckd' 'notckd' 'notckd' 'ckd' 'notckd' 'ckd'
 'notckd' 'ckd' 'ckd' 'notckd' 'notckd' 'ckd' 'notckd' 'notckd' 'notckd'
 'ckd' 'notckd' 'notckd' 'notckd' 'notckd' 'notckd' 'notckd' 'ckd'
 'notckd' 'notckd' 'ckd' 'ckd' 'notckd' 'notckd' 'notckd' 'ckd' 'notckd'
 'ckd' 'notckd' 'ckd' 'ckd' 'notckd' 'notckd' 'notckd' 'ckd' 'notckd'
 'notckd' 'notckd' 'ckd' 'ckd' 'ckd' 'ckd' 'ckd' 'ckd' 'notckd' 'notckd'
 'ckd' 'ckd' 'notckd' 'notckd' 'notckd']
