In [1]:
import lightgbm as lgb
import xgboost as xgb 
import catboost as catb

import pandas as pd
import numpy as np
from sklearn.metrics import auc

# Label Encoding our target variable 
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [2]:
#loading the dataset 
df_train = pd.read_csv('../data/loan_prediction_train.csv')
df_test = pd.read_csv('../data/loan_prediction_test.csv')

In [3]:
df_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df_test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [5]:
#drop the ID column
df_train.drop(['Loan_ID'], inplace=True, axis=1)
df_test.drop(['Loan_ID'], inplace=True, axis=1)

objList_train = df_train.select_dtypes(include = "object").columns
print (objList_train)

objList_test = df_test.select_dtypes(include = "object").columns
print (objList_test)

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area', 'Loan_Status'],
      dtype='object')
Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area'],
      dtype='object')


In [6]:
#Label Encoding for object to numeric conversion
le = LabelEncoder()

for feat in objList_train:
    df_train[feat] = le.fit_transform(df_train[feat].astype(str))

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    int32  
 1   Married            614 non-null    int32  
 2   Dependents         614 non-null    int32  
 3   Education          614 non-null    int32  
 4   Self_Employed      614 non-null    int32  
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    int32  
 11  Loan_Status        614 non-null    int32  
dtypes: float64(4), int32(7), int64(1)
memory usage: 40.9 KB


In [7]:
for feat_test in objList_test:
    df_test[feat_test] = le.fit_transform(df_test[feat_test].astype(str))

df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             367 non-null    int32  
 1   Married            367 non-null    int32  
 2   Dependents         367 non-null    int32  
 3   Education          367 non-null    int32  
 4   Self_Employed      367 non-null    int32  
 5   ApplicantIncome    367 non-null    int64  
 6   CoapplicantIncome  367 non-null    int64  
 7   LoanAmount         362 non-null    float64
 8   Loan_Amount_Term   361 non-null    float64
 9   Credit_History     338 non-null    float64
 10  Property_Area      367 non-null    int32  
dtypes: float64(3), int32(6), int64(2)
memory usage: 23.1 KB


In [8]:
df_train.Loan_Status.value_counts()

1    422
0    192
Name: Loan_Status, dtype: int64

In [9]:
df_test.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0,0,0,5720,0,110.0,360.0,1.0,2
1,1,1,1,0,0,3076,1500,126.0,360.0,1.0,2
2,1,1,2,0,0,5000,1800,208.0,360.0,1.0,2
3,1,1,2,0,0,2340,2546,100.0,360.0,,2
4,1,0,0,1,0,3276,0,78.0,360.0,1.0,2


In [10]:
#Imputing missing values 
#df_train.fillna(df_train.mode()[0],inplace=True) 

In [11]:
# get the labels
y = df_train.Loan_Status.values
df_train.drop(['Loan_Status'], inplace=True, axis=1)
x = df_train.values

In [12]:
#Now splitting our dataset into test and train 
from sklearn.model_selection import train_test_split 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.3)

In [13]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

In [14]:
# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}


In [15]:
print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20)

print('Saving model...')
# save model to file
gbm.save_model('../models/model.txt')

Starting training...
Saving model...


<lightgbm.basic.Booster at 0x1d9963ca7b8>

In [16]:
print('Starting predicting...')
# predict
y_pred = gbm.predict(x_test)


Starting predicting...


In [17]:
y_pred[0:5]
y_pred

array([0.27759092, 0.78178269, 0.33037884, 0.60768412, 0.78527998,
       0.27264964, 0.64840428, 0.7831769 , 0.26759203, 0.73084741,
       0.25791245, 0.78897524, 0.77924882, 0.83049335, 0.70863638,
       0.63275467, 0.80389192, 0.81681431, 0.72460357, 0.78199879,
       0.2746668 , 0.80437475, 0.33037884, 0.26759203, 0.72251753,
       0.75093083, 0.83395345, 0.77727274, 0.87624494, 0.76472115,
       0.83578246, 0.7397864 , 0.7622595 , 0.26279776, 0.70259153,
       0.7657715 , 0.7831769 , 0.25791245, 0.71726057, 0.81458252,
       0.25851344, 0.75602425, 0.87379323, 0.71282338, 0.76906761,
       0.34299388, 0.823424  , 0.75918398, 0.80584762, 0.68870667,
       0.85438505, 0.2587831 , 0.7980222 , 0.70690345, 0.7397864 ,
       0.8223945 , 0.76670655, 0.84286675, 0.75860103, 0.72491873,
       0.27264964, 0.33746107, 0.70499502, 0.3304444 , 0.65289796,
       0.7942906 , 0.62751662, 0.85904908, 0.87383351, 0.79671528,
       0.78465381, 0.73183491, 0.72874822, 0.75743988, 0.77439

In [18]:
for i in range(0,len(y_pred)):
    if y_pred[i]>=.5:       # setting threshold to .5
        y_pred[i]=1
    else:  
        y_pred[i]=0

In [19]:
#calculating accuracy
from sklearn.metrics import accuracy_score 
accuracy_lgbm = accuracy_score(y_pred,y_test)
accuracy_lgbm

0.8

In [20]:
from sklearn.metrics import roc_auc_score

In [21]:
#calculating roc_auc_score for xgboost
auc_lgbm =  roc_auc_score(y_test,y_pred)
auc_lgbm

0.7176108051403095