In [59]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.under_sampling import RandomUnderSampler

In [60]:
df = pd.read_csv('credit_clients.csv')

In [61]:
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_SEP,PAY_AUG,PAY_JUL,PAY_JUN,PAY_MAY,PAY_APR,PAY_AMT_SEP,PAY_AMT_AUG,PAY_AMT_JUL,PAY_AMT_JUN,PAY_AMT_MAY,PAY_AMT_APR,default payment next month,BILL_AMT
0,20000,2,2,1,24,2,2,-1,-1,-2,-2,0,689,0,0,0,0,1,1284.0
1,120000,2,2,2,26,-1,2,0,0,0,2,0,1000,1000,1000,0,2000,1,2846.17
2,90000,2,2,2,34,0,0,0,0,0,0,1518,1500,1000,1000,1000,5000,0,16942.17
3,50000,2,2,1,37,0,0,0,0,0,0,2000,2019,1200,1100,1069,1000,0,38555.67
4,50000,1,2,1,57,-1,0,-1,0,0,0,2000,36681,10000,9000,689,679,0,18223.17


In [62]:
# I am going to drop the Age, SEX, MARRIAGE, and EDUCATION columns because they have the lowest chi-squared scores, I moved this cell to the top of the notebook
df = df.drop(['SEX', 'MARRIAGE', 'EDUCATION', 'AGE'], axis=1)

In [63]:
df['default payment next month'].value_counts()

default payment next month
0    23364
1     6636
Name: count, dtype: int64

In [64]:
X = df.drop('default payment next month', axis=1)
y = df['default payment next month']

In [65]:
rus = RandomUnderSampler(random_state=42)
X, y = rus.fit_resample(X, y)

In [66]:
X_cat = X.astype(int)
X_cat = X_cat.clip(lower=0)  

chi_features = SelectKBest(chi2, k=len(X_cat.columns))

# fit our data to the SelectKBest
best_features = chi_features.fit(X_cat, y.astype(int))

# use decimal format in table
pd.options.display.float_format = '{:.2f}'.format

df_features = pd.DataFrame(best_features.scores_)
df_columns = pd.DataFrame(X_cat.columns)
f_scores = pd.concat([df_columns, df_features], axis=1)
f_scores.columns = ['Features', 'Score']
f_scores.sort_values(by='Score', ascending=False)

Unnamed: 0,Features,Score
0,LIMIT_BAL,49554958.57
8,PAY_AMT_AUG,8066474.97
7,PAY_AMT_SEP,4914280.8
9,PAY_AMT_JUL,4680222.56
12,PAY_AMT_APR,3661739.83
10,PAY_AMT_JUN,3374266.12
11,PAY_AMT_MAY,3348653.18
13,BILL_AMT,573738.64
1,PAY_SEP,3096.73
2,PAY_AUG,2619.89


In [67]:
x_train , x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [68]:
#knn_model = KNeighborsClassifier(n_neighbors=5) 
#knn_model.fit(x_train, y_train)

In [69]:
"""param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid, cv=5, scoring='balanced_accuracy')
grid.fit(x_train, y_train)

print(grid.best_params_)
print(grid.best_score_)"""

# I tried this datasets with many different models and the best one was the XGBoost model, even KNN with the best parameters had a lower accuracy score

"param_grid = {\n    'n_neighbors': [3, 5, 7, 9, 11],\n    'weights': ['uniform', 'distance'],\n    'metric': ['euclidean', 'manhattan']\n}\n\nknn = KNeighborsClassifier()\ngrid = GridSearchCV(knn, param_grid, cv=5, scoring='balanced_accuracy')\ngrid.fit(x_train, y_train)\n\nprint(grid.best_params_)\nprint(grid.best_score_)"

In [70]:
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsClassifier())
])

In [71]:
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan'],
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'knn__leaf_size': [20, 30, 40]
}

In [72]:
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,                 
    scoring='balanced_accuracy',
    n_jobs=-1               
)

grid.fit(x_train, y_train)

# 5. Evaluate
print("Best Params:", grid.best_params_)
print("Best CV Score:", grid.best_score_)

Best Params: {'knn__algorithm': 'ball_tree', 'knn__leaf_size': 30, 'knn__metric': 'euclidean', 'knn__n_neighbors': 21, 'knn__weights': 'uniform'}
Best CV Score: 0.6950611306497361


In [75]:
forecast = grid.predict(x_test)

In [77]:
accuracy = accuracy_score(y_test, forecast)
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("\nClassification Report:")
print(classification_report(y_test, forecast))

Accuracy: 68.68%

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.76      0.71      2010
           1       0.71      0.61      0.66      1972

    accuracy                           0.69      3982
   macro avg       0.69      0.69      0.68      3982
weighted avg       0.69      0.69      0.69      3982



## <span style="color: #20C997;">Personal Analysis</span>
KNN cannot get a higher score than other models.