In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw loan-approval records from the CSV that sits next to the notebook.
DATA_PATH = 'loan_approval_dataset.csv'
data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five raw rows.
data.head(n=5)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [4]:
# loan_id is only a row identifier, so it is removed before modelling.
data = data.drop(columns=['loan_id'])

In [5]:
# Inspect the column labels as read from the CSV (the output shows a leading space on each).
data.columns

Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [6]:
# Remove the surrounding whitespace from every column label.
data = data.rename(columns=str.strip)

In [7]:
# Re-inspect the column labels to confirm the whitespace has been stripped.
data.columns

Index(['no_of_dependents', 'education', 'self_employed', 'income_annum',
       'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'loan_status'],
      dtype='object')

In [8]:
# Collapse the four per-category asset values into one total per applicant.
data['Assets'] = (
    data['residential_assets_value']
    + data['luxury_assets_value']
    + data['commercial_assets_value']
    + data['bank_asset_value']
)

In [9]:
# The individual asset columns are now summarised by 'Assets', so drop them.
data = data.drop(
    columns=[
        'bank_asset_value',
        'residential_assets_value',
        'luxury_assets_value',
        'commercial_assets_value',
    ]
)

In [10]:
# Count missing values per column.
data.isna().sum()

no_of_dependents    0
education           0
self_employed       0
income_annum        0
loan_amount         0
loan_term           0
cibil_score         0
loan_status         0
Assets              0
dtype: int64

In [11]:
# List the distinct education labels (the output shows they carry a leading space).
data['education'].unique()

array([' Graduate', ' Not Graduate'], dtype=object)

In [12]:
def clean_data(st):
    """Return ``st`` with leading and trailing whitespace removed.

    Parameters
    ----------
    st : str
        Text value to tidy up.

    Returns
    -------
    str
        The stripped text.
    """
    return st.strip()

In [13]:
# Quick sanity check of clean_data on a padded string; the expected result is 'hello'.
clean_data('  hello ')

'hello'

In [14]:
# Strip the stray whitespace around the education labels.
# The vectorised .str accessor replaces the element-wise Python call and
# leaves missing values as NaN instead of raising on them.
data['education'] = data['education'].str.strip()

In [15]:
# Confirm the education labels no longer carry surrounding whitespace.
data['education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [16]:
# Encode education as 1 (Graduate) / 0 (Not Graduate).
# An explicit mapping yields an integer column directly; list-based replace()
# depends on pandas silently downcasting the object column, which recent
# pandas versions deprecate. Labels missing from the mapping become NaN.
data['education'] = data['education'].map({'Graduate': 1, 'Not Graduate': 0})

In [17]:
# Preview the frame after encoding the education column.
data.head(n=5)

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,loan_status,Assets
0,2,1,No,9600000,29900000,12,778,Approved,50700000
1,0,0,Yes,4100000,12200000,8,417,Rejected,17000000
2,3,1,No,9100000,29700000,20,506,Rejected,57700000
3,3,1,No,8200000,30700000,8,467,Rejected,52700000
4,5,0,Yes,9800000,24200000,20,382,Rejected,55000000


In [18]:
# Strip the stray whitespace around the self_employed labels.
# The vectorised .str accessor replaces the element-wise Python call and
# leaves missing values as NaN instead of raising on them.
data['self_employed'] = data['self_employed'].str.strip()

In [19]:
# Encode self_employed as 1 (Yes) / 0 (No).
# An explicit mapping yields an integer column directly; list-based replace()
# depends on pandas silently downcasting the object column, which recent
# pandas versions deprecate. Labels missing from the mapping become NaN.
data['self_employed'] = data['self_employed'].map({'No': 0, 'Yes': 1})

In [20]:
# Strip the stray whitespace around the loan_status labels.
# The vectorised .str accessor replaces the element-wise Python call and
# leaves missing values as NaN instead of raising on them.
data['loan_status'] = data['loan_status'].str.strip()

In [21]:
# Encode the target as 1 (Approved) / 0 (Rejected).
# An explicit mapping yields an integer column directly; list-based replace()
# depends on pandas silently downcasting the object column, which recent
# pandas versions deprecate, and an object-dtype target can break classifier
# fitting. Labels missing from the mapping become NaN.
data['loan_status'] = data['loan_status'].map({'Approved': 1, 'Rejected': 0})

In [22]:
# Preview the fully encoded frame.
data.head(n=5)

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,loan_status,Assets
0,2,1,0,9600000,29900000,12,778,1,50700000
1,0,0,1,4100000,12200000,8,417,0,17000000
2,3,1,0,9100000,29700000,20,506,0,57700000
3,3,1,0,8200000,30700000,8,467,0,52700000
4,5,0,1,9800000,24200000,20,382,0,55000000


In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Separate the target (loan_status) from the remaining feature columns.
output_data = data['loan_status']
input_data = data.drop(columns='loan_status')

In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    input_data,
    output_data,
    test_size=0.2,
    random_state=45,
)

In [26]:
# Show the shape of each split, in the order train/test features then train/test targets.
tuple(split.shape for split in (x_train, x_test, y_train, y_test))

((3415, 8), (854, 8), (3415,), (854,))

In [27]:
from sklearn.preprocessing import StandardScaler

In [28]:
# Feature scaler created with library-default settings; it is fitted on the training split below.
scaler = StandardScaler()

In [29]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test rows do not influence the scaling.
# NOTE: 'x_train_sclaed' is misspelled, but the name is kept because the
# model-fitting cell refers to it.
scaler.fit(x_train)
x_train_sclaed = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
# Logistic-regression classifier created with library-default hyperparameters.
model = LogisticRegression()

In [32]:
# Train the classifier on the scaled training features.
model.fit(X=x_train_sclaed, y=y_train)

In [33]:
# Score the fitted classifier on the held-out, scaled test split.
model.score(X=x_test_scaled, y=y_test)

0.9086651053864169

In [34]:
# Single hand-built applicant row used as a smoke test for the fitted pipeline.
# The values are numeric literals (not strings) so the frame has numeric dtypes
# and does not rely on implicit string-to-float coercion inside the scaler.
# Column order matches the feature columns used for training.
pred_data = pd.DataFrame(
    [[2, 1, 0, 9600000, 29900000, 12, 778, 50700000]],
    columns=[
        'no_of_dependents',
        'education',
        'self_employed',
        'income_annum',
        'loan_amount',
        'loan_term',
        'cibil_score',
        'Assets',
    ],
)

In [35]:
# Apply the already-fitted scaler to the hand-built row.
# NOTE: this rebinds pred_data from a DataFrame to the scaled array; the name
# is kept because the prediction cell reads pred_data.
pred_data = scaler.transform(X=pred_data)

In [36]:
# Predict the encoded loan status (1 = Approved, 0 = Rejected) for the scaled row.
model.predict(X=pred_data)

array([1], dtype=int64)

In [37]:
import pickle as pk

In [38]:
# Persist the fitted classifier to model.pkl.
# The context manager closes the file handle (flushing the pickle to disk)
# even if dumping raises; the bare open() call never closed it.
with open('model.pkl', 'wb') as model_file:
    pk.dump(model, model_file)

In [39]:
# Persist the fitted scaler to scaler.pkl so the same scaling can be reapplied later.
# The context manager closes the file handle (flushing the pickle to disk)
# even if dumping raises; the bare open() call never closed it.
with open('scaler.pkl', 'wb') as scaler_file:
    pk.dump(scaler, scaler_file)