In [15]:
import pandas as pd

# Load the cleaned loan dataset; the relative path is resolved against the
# kernel's current working directory.
CLEAN_LOANS_CSV = 'data/clean/cleaned_loan_data.csv'
df = pd.read_csv(CLEAN_LOANS_CSV)

In [16]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,graduate,no,9600000,29900000,12,778,2400000,17600000,22700000,8000000,approved
1,2,0,not-graduate,yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,rejected
2,3,3,graduate,no,9100000,29700000,20,506,7100000,4500000,33300000,12800000,rejected
3,4,3,graduate,no,8200000,30700000,8,467,18200000,3300000,23300000,7900000,rejected
4,5,5,not-graduate,yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,rejected


In [17]:
def _encode_binary(series, mapping):
    """Return `series` encoded as int64 using `mapping` (label -> 0/1).

    Labels are compared case-insensitively with surrounding whitespace removed.

    The function is idempotent: a column that is already numeric is returned
    unchanged, so re-running this cell does not turn previously encoded 0/1
    values into NaN (which is what a plain ``.map`` on the encoded column does).

    Raises:
        ValueError: if any value is not a key of `mapping`. Failing here is
            preferable to silently producing NaN that only surfaces later
            when a model is fitted.
    """
    if pd.api.types.is_numeric_dtype(series):
        # Already encoded (e.g. the cell was executed before) - nothing to do.
        return series

    normalized = series.astype(str).str.lower().str.strip()
    encoded = normalized.map(mapping)

    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(normalized[unmapped].unique())
        raise ValueError(
            f"Column {series.name!r} contains values with no encoding: {unexpected}"
        )
    return encoded.astype('int64')


df['education'] = _encode_binary(df['education'], {'graduate': 1, 'not-graduate': 0})
df['self_employed'] = _encode_binary(df['self_employed'], {'yes': 1, 'no': 0})
df['loan_status'] = _encode_binary(df['loan_status'], {'approved': 1, 'rejected': 0})

# Check results
print(df[['education', 'self_employed', 'loan_status']].head())

   education  self_employed  loan_status
0          1              0            1
1          0              1            0
2          1              0            0
3          1              0            0
4          0              1            0


In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import t
import time

# Build the feature matrix and target vector.
# `loan_id` is dropped together with the target so it is not used as a feature.
# NOTE(review): the df.head() preview shows an "Unnamed: 0" header. If that is a
# real column (a saved index) rather than a display artefact, it is still part
# of X here - confirm.
y = df["loan_status"]
X = df.drop(columns=["loan_id", "loan_status"])

# One-hot encode any remaining non-numeric columns (first level dropped).
X_encoded = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for the final evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=123,
)

# Standardise using statistics learned from the training split only, then
# apply the same transformation to the test split.
# Despite the `_df` suffix these are whatever StandardScaler returns
# (NumPy arrays unless set_output has been configured).
scaler = StandardScaler()
X_train_norm_df = scaler.fit(X_train).transform(X_train)
X_test_norm_df = scaler.transform(X_test)

# Parameter grid for KNN:
#   n_neighbors - even values 2, 4, ..., 30 (built with range() so the values
#                 are exact ints rather than truncated floats from linspace)
#   weights     - uniform vote vs. distance-weighted vote
#   p           - Minkowski power: 1 = Manhattan, 2 = Euclidean
parameter_grid = {
    "n_neighbors": list(range(2, 31, 2)),
    "weights": ['uniform', 'distance'],
    "p": [1, 2]
}

knn = KNeighborsClassifier()

# Shared with the confidence-interval computation further down.
folds = 5
confidence_level = 0.95

# Evaluate 16 randomly drawn candidates with 5-fold cross-validation.
# NOTE: X_train_norm_df was scaled on the full training split before CV, so
# each validation fold contributed to the scaling statistics. Wrapping
# StandardScaler + KNN in a Pipeline would remove this (mild) leakage.
rs = RandomizedSearchCV(
    knn,
    param_distributions=parameter_grid,
    n_iter=16,
    cv=folds,
    verbose=10,
    random_state=123,
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
rs.fit(X_train_norm_df, y_train)
end_time = time.perf_counter()

print("\n")
print(f"Time taken to find the best combination of hyperparameters: {end_time - start_time:.4f} seconds")
print("\n")
print(f"The best combination of hyperparameters has been: {rs.best_params_}")
print(f"The cross-validated accuracy is: {rs.best_score_:.4f}")

# Confidence interval for the cross-validated accuracy of the selected candidate.
#
# The sorted results table is kept for inspection. sort_values() preserves the
# original row labels, so the winning candidate is looked up by
# rs.best_index_ (guaranteed to match rs.best_params_, even when several
# candidates tie on mean score) and by column name rather than by position.
results_rs_df = pd.DataFrame(rs.cv_results_).sort_values(by="mean_test_score", ascending=False)
best_row = results_rs_df.loc[rs.best_index_]
rs_mean_score = best_row["mean_test_score"]

# Per-fold scores of the best candidate (split0_test_score, split1_test_score, ...).
fold_scores = best_row.filter(regex=r"^split\d+_test_score$").to_numpy(dtype=float)

# A t-interval with n-1 degrees of freedom needs the *sample* standard
# deviation (ddof=1). cv_results_["std_test_score"] is a population standard
# deviation (ddof=0) and would make the interval too narrow.
rs_sem = fold_scores.std(ddof=1) / np.sqrt(fold_scores.size)

rs_tc = t.ppf(1 - ((1 - confidence_level) / 2), df=fold_scores.size - 1)
rs_lower_bound = rs_mean_score - (rs_tc * rs_sem)
rs_upper_bound = rs_mean_score + (rs_tc * rs_sem)

# NOTE: fold scores share training data and are not independent, so this
# interval is an approximation.
print(
    "The accuracy confidence interval for the best combination of hyperparameters is: "
    f"({rs_lower_bound:.4f}, {rs_mean_score:.4f}, {rs_upper_bound:.4f})"
)

# Score the refit best estimator on the held-out test split.
best_model = rs.best_estimator_
y_pred_test_df = best_model.predict(X_test_norm_df)

# Compute both summaries up front, then print them.
test_report = classification_report(y_test, y_pred_test_df)
test_accuracy = accuracy_score(y_test, y_pred_test_df)

print("\n")
print("Classification Report on Test Set:")
print(test_report)
print(f"Test Accuracy: {test_accuracy:.4f}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5; 1/16] START n_neighbors=14, p=1, weights=uniform.......................
[CV 1/5; 1/16] END n_neighbors=14, p=1, weights=uniform;, score=0.918 total time=   0.0s
[CV 2/5; 1/16] START n_neighbors=14, p=1, weights=uniform.......................
[CV 2/5; 1/16] END n_neighbors=14, p=1, weights=uniform;, score=0.921 total time=   0.0s
[CV 3/5; 1/16] START n_neighbors=14, p=1, weights=uniform.......................
[CV 3/5; 1/16] END n_neighbors=14, p=1, weights=uniform;, score=0.900 total time=   0.0s
[CV 4/5; 1/16] START n_neighbors=14, p=1, weights=uniform.......................
[CV 4/5; 1/16] END n_neighbors=14, p=1, weights=uniform;, score=0.922 total time=   0.0s
[CV 5/5; 1/16] START n_neighbors=14, p=1, weights=uniform.......................
[CV 5/5; 1/16] END n_neighbors=14, p=1, weights=uniform;, score=0.906 total time=   0.0s
[CV 1/5; 2/16] START n_neighbors=20, p=2, weights=distance......................
[CV 1/5;