In [1]:
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import xgboost as xgb 
import catboost as catb

import pandas as pd
import numpy as np
from sklearn.metrics import auc

# Label Encoding our target variable 
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [2]:
#loading the dataset 
df_train = pd.read_csv('../data/loan_prediction_train.csv')

In [3]:
df_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df_train['LoanAmount_log']= np.log(df_train['LoanAmount'])

df_train['Total_Income']= df_train['ApplicantIncome']+ df_train['CoapplicantIncome']
df_train['TotalIncome_log'] = np.log(df_train['Total_Income'])

In [5]:
df_train['Debt_Income_Ratio'] = df_train['Total_Income']/df_train['LoanAmount']

In [6]:
#drop the ID column
df_train.drop(['Loan_ID',"Total_Income","LoanAmount","ApplicantIncome","CoapplicantIncome"], inplace=True, axis=1)


objList_train = df_train.select_dtypes(include = "object").columns
objList_train

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area', 'Loan_Status'],
      dtype='object')

In [7]:
#Label Encoding for object to numeric conversion
le = LabelEncoder()

for feat in objList_train:
    df_train[feat] = le.fit_transform(df_train[feat].astype(str))

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    int32  
 1   Married            614 non-null    int32  
 2   Dependents         614 non-null    int32  
 3   Education          614 non-null    int32  
 4   Self_Employed      614 non-null    int32  
 5   Loan_Amount_Term   600 non-null    float64
 6   Credit_History     564 non-null    float64
 7   Property_Area      614 non-null    int32  
 8   Loan_Status        614 non-null    int32  
 9   LoanAmount_log     592 non-null    float64
 10  TotalIncome_log    614 non-null    float64
 11  Debt_Income_Ratio  592 non-null    float64
dtypes: float64(5), int32(7)
memory usage: 40.9 KB


In [8]:
df_train.Loan_Status.value_counts()

1    422
0    192
Name: Loan_Status, dtype: int64

In [9]:
#Imputing missing values 
#df_train.fillna(df_train.mode()[0],inplace=True) 

In [10]:
# get the labels
y = df_train.Loan_Status.values
df_train.drop(['Loan_Status'], inplace=True, axis=1)
x = df_train.values

In [11]:
#Now splitting our dataset into test and train 
from sklearn.model_selection import train_test_split 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.3)

In [12]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

In [13]:
# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}


In [14]:
print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20)

print('Saving model...')
# save model to file
gbm.save_model('../models/Lightgbm_model.txt')

Starting training...
Saving model...


<lightgbm.basic.Booster at 0x154c498a828>

In [15]:
print('Starting predicting...')
# predict
y_pred = gbm.predict(x_test)


Starting predicting...


In [16]:
y_pred[0:5]

array([0.76943831, 0.65795492, 0.87805432, 0.80552064, 0.66002149])

In [17]:
for i in range(0,len(y_pred)):
    if y_pred[i]>=.5:       # setting threshold to .5
        y_pred[i]=1
    else:  
        y_pred[i]=0

In [18]:
#calculating accuracy
from sklearn.metrics import accuracy_score 
accuracy_lgbm = accuracy_score(y_pred,y_test)
accuracy_lgbm

0.7945945945945946

In [19]:
from sklearn.metrics import roc_auc_score

In [20]:
#calculating roc_auc_score for xgboost
auc_lgbm =  roc_auc_score(y_test,y_pred)
auc_lgbm

0.7015473380540258

In [21]:
df_test = pd.read_csv('../data/loan_prediction_test.csv')

In [22]:
df_test['LoanAmount_log'] = np.log(df_test['LoanAmount'])

df_test['Total_Income'] = df_test['ApplicantIncome'] + df_test['CoapplicantIncome']
df_test['TotalIncome_log'] = np.log(df_test['Total_Income'])

In [23]:
df_test['Debt_Income_Ratio'] = df_test['Total_Income']/df_test['LoanAmount']

In [24]:
ids = df_test['Loan_ID'].values
df_test.drop(['Loan_ID',"Total_Income","LoanAmount","ApplicantIncome","CoapplicantIncome"], inplace=True, axis=1)

In [25]:
objList_test = df_test.select_dtypes(include = "object").columns
objList_test

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area'],
      dtype='object')

In [26]:
for feat_test in objList_test:
    df_test[feat_test] = le.fit_transform(df_test[feat_test].astype(str))

df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             367 non-null    int32  
 1   Married            367 non-null    int32  
 2   Dependents         367 non-null    int32  
 3   Education          367 non-null    int32  
 4   Self_Employed      367 non-null    int32  
 5   Loan_Amount_Term   361 non-null    float64
 6   Credit_History     338 non-null    float64
 7   Property_Area      367 non-null    int32  
 8   LoanAmount_log     362 non-null    float64
 9   TotalIncome_log    367 non-null    float64
 10  Debt_Income_Ratio  362 non-null    float64
dtypes: float64(5), int32(6)
memory usage: 23.1 KB


In [27]:
df_test.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Loan_Amount_Term,Credit_History,Property_Area,LoanAmount_log,TotalIncome_log,Debt_Income_Ratio
0,1,1,0,0,0,360.0,1.0,2,4.70048,8.651724,52.0
1,1,1,1,0,0,360.0,1.0,2,4.836282,8.428581,36.31746
2,1,1,2,0,0,360.0,1.0,2,5.337538,8.824678,32.692308
3,1,1,2,0,0,360.0,,2,4.60517,8.494129,48.86
4,1,0,0,1,0,360.0,1.0,2,4.356709,8.094378,42.0


In [28]:

df_test_values = df_test.values
y = gbm.predict(df_test_values)

output = pd.DataFrame({'Loan_ID': ids, 'target': y})


In [29]:
output.head()

Unnamed: 0,Loan_ID,target
0,LP001015,0.767831
1,LP001022,0.802181
2,LP001031,0.805137
3,LP001035,0.771195
4,LP001051,0.584956


In [30]:
output['Loan_Status'] = np.where(output['target'] >= 0.5 , 1, 0)

In [31]:
output.Loan_Status.value_counts()

1    308
0     59
Name: Loan_Status, dtype: int64

In [32]:
output.drop('target',inplace =True, axis =1)
output['Loan_Status'] = np.where(output['Loan_Status'] == 1 , 'Y', 'N')
output.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y


In [33]:
output.to_csv("../predictions/lgbm_predictions.csv", index=False)