In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
import numpy as np
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn import datasets, linear_model, metrics
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [2]:
loan = pd.read_csv('loan.csv',index_col=0)
loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 614 entries, LP001002 to LP002990
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 62.4+ KB


In [3]:
X = loan.drop('Loan_Status',axis=1)
y = loan['Loan_Status']

In [4]:
loan.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

#### Imputing

In [5]:
imp_cat = SimpleImputer(strategy='constant',fill_value="unknown").set_output(transform='pandas')
imp_num = SimpleImputer(strategy='median').set_output(transform='pandas')
trans_imp = make_column_transformer((imp_cat,make_column_selector(dtype_include=object)) ,
                                   (imp_num,make_column_selector(dtype_exclude=object)))
trans_imp = trans_imp.set_output(transform='pandas')
# X_imp = trans_imp.fit_transform(X)
# print(X_imp.isnull().sum().sum())
# print(X_imp.columns)

#### One Hot Encoding

In [7]:
ohe = OneHotEncoder(handle_unknown = 'ignore',sparse_output=False,drop='first').set_output(transform='pandas')
trans_ohe = make_column_transformer((ohe,make_column_selector(dtype_include=object)) ,
                                   ('passthrough',make_column_selector(dtype_exclude=object)))
trans_ohe = trans_ohe.set_output(transform='pandas')
X_imp_ohe = trans_ohe.fit_transform(X)
X_imp_ohe.dtypes

onehotencoder__Gender_Male                float64
onehotencoder__Gender_nan                 float64
onehotencoder__Married_Yes                float64
onehotencoder__Married_nan                float64
onehotencoder__Dependents_1               float64
onehotencoder__Dependents_2               float64
onehotencoder__Dependents_3+              float64
onehotencoder__Dependents_nan             float64
onehotencoder__Education_Not Graduate     float64
onehotencoder__Self_Employed_Yes          float64
onehotencoder__Self_Employed_nan          float64
onehotencoder__Property_Area_Semiurban    float64
onehotencoder__Property_Area_Urban        float64
passthrough__ApplicantIncome                int64
passthrough__CoapplicantIncome            float64
passthrough__LoanAmount                   float64
passthrough__Loan_Amount_Term             float64
passthrough__Credit_History               float64
dtype: object

#### Model

In [8]:
lr = LogisticRegression(random_state=24)
pipe = Pipeline([('IMP',trans_imp),('OHE',trans_ohe),('LR',lr)])
kfold = StratifiedKFold(n_splits=5,random_state=24,shuffle=True)
params = {'LR__solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga'] ,
         'LR__C':np.linspace(0.001,10,5)}
gcv = GridSearchCV(pipe,param_grid=params,scoring='roc_auc',cv=kfold,verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END .....LR__C=0.001, LR__solver=lbfgs;, score=0.431 total time=   0.0s
[CV 2/5] END .....LR__C=0.001, LR__solver=lbfgs;, score=0.495 total time=   0.0s
[CV 3/5] END .....LR__C=0.001, LR__solver=lbfgs;, score=0.440 total time=   0.0s
[CV 4/5] END .....LR__C=0.001, LR__solver=lbfgs;, score=0.603 total time=   0.0s
[CV 5/5] END .....LR__C=0.001, LR__solver=lbfgs;, score=0.564 total time=   0.0s
[CV 1/5] END .LR__C=0.001, LR__solver=liblinear;, score=0.437 total time=   0.0s
[CV 2/5] END .LR__C=0.001, LR__solver=liblinear;, score=0.636 total time=   0.0s
[CV 3/5] END .LR__C=0.001, LR__solver=liblinear;, score=0.458 total time=   0.0s
[CV 4/5] END .LR__C=0.001, LR__solver=liblinear;, score=0.629 total time=   0.0s
[CV 5/5] END .LR__C=0.001, LR__solver=liblinear;, score=0.554 total time=   0.0s
[CV 1/5] END .LR__C=0.001, LR__solver=newton-cg;, score=0.485 total time=   0.0s
[CV 2/5] END .LR__C=0.001, LR__solver=newton-cg

In [9]:
gcv.best_score_

0.759250955442906

In [10]:
gcv.best_params_

{'LR__C': 2.50075, 'LR__solver': 'lbfgs'}

In [11]:
bm = gcv.best_estimator_

In [12]:
bm

#### Inferencing

In [13]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [14]:
submit = pd.read_csv('sample_submission.csv')

In [15]:
submit['Loan_Status'] = bm.predict(test)
submit.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y


This `submit`'s can be exported to csv and uploaded in solutions checker

#### KNN

In [16]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
knn = KNeighborsClassifier()
scl_mm , scl_std = MinMaxScaler() , StandardScaler()
pipe = Pipeline([('IMP',trans_imp),('OHE',trans_ohe),('SCL',None),('KNN',knn)])
kfold = StratifiedKFold(n_splits=5,random_state=24,shuffle=True)
params = {'KNN__n_neighbors':np.arange(1,8),
         'SCL':[None,scl_mm,scl_std]}
gcv = GridSearchCV(pipe,param_grid=params,scoring='roc_auc',cv=kfold,verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 21 candidates, totalling 105 fits
[CV 1/5] END ......KNN__n_neighbors=1, SCL=None;, score=0.512 total time=   0.3s
[CV 2/5] END ......KNN__n_neighbors=1, SCL=None;, score=0.512 total time=   0.0s
[CV 3/5] END ......KNN__n_neighbors=1, SCL=None;, score=0.484 total time=   0.0s
[CV 4/5] END ......KNN__n_neighbors=1, SCL=None;, score=0.467 total time=   0.0s
[CV 5/5] END ......KNN__n_neighbors=1, SCL=None;, score=0.439 total time=   0.0s
[CV 1/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.668 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.646 total time=   0.0s
[CV 3/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.641 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.691 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.692 total time=   0.0s
[CV 1/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=0.586 total time=   0.0s
[CV 2/5] END KNN__n_n

In [17]:
gcv.best_params_

{'KNN__n_neighbors': 3, 'SCL': MinMaxScaler()}

In [18]:
gcv.best_score_

0.7246499733496637