In [1]:
import pandas as pd

In [2]:
loan_data  = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_train.csv" )

In [3]:
loan_data.head()

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,LP002305,Female,No,0,Graduate,No,4547,0.0,115.0,360.0,1.0,Semiurban,1
1,1,LP001715,Male,Yes,3+,Not Graduate,Yes,5703,0.0,130.0,360.0,1.0,Rural,1
2,2,LP002086,Female,Yes,0,Graduate,No,4333,2451.0,110.0,360.0,1.0,Urban,0
3,3,LP001136,Male,Yes,0,Not Graduate,Yes,4695,0.0,96.0,,1.0,Urban,1
4,4,LP002529,Male,Yes,2,Graduate,No,6700,1750.0,230.0,300.0,1.0,Semiurban,1


In [4]:
# drop Loan_ID, Unnamed: 0
loan_data.drop(['Unnamed: 0', 'Loan_ID'], axis=1, inplace=True)

In [5]:
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491 entries, 0 to 490
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             481 non-null    object 
 1   Married            490 non-null    object 
 2   Dependents         482 non-null    object 
 3   Education          491 non-null    object 
 4   Self_Employed      462 non-null    object 
 5   ApplicantIncome    491 non-null    int64  
 6   CoapplicantIncome  491 non-null    float64
 7   LoanAmount         475 non-null    float64
 8   Loan_Amount_Term   478 non-null    float64
 9   Credit_History     448 non-null    float64
 10  Property_Area      491 non-null    object 
 11  Loan_Status        491 non-null    int64  
dtypes: float64(4), int64(2), object(6)
memory usage: 46.2+ KB


In [6]:
categorical_variables = loan_data.select_dtypes(include=['object']).columns

In [7]:
categorical_variables

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area'],
      dtype='object')

In [8]:
# encode the categorical variables
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for i in categorical_variables:
    loan_data[i] = le.fit_transform(loan_data[i])

In [9]:
# check the data
loan_data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,0,0,0,0,4547,0.0,115.0,360.0,1.0,1,1
1,1,1,3,1,1,5703,0.0,130.0,360.0,1.0,0,1
2,0,1,0,0,0,4333,2451.0,110.0,360.0,1.0,2,0
3,1,1,0,1,1,4695,0.0,96.0,,1.0,2,1
4,1,1,2,0,0,6700,1750.0,230.0,300.0,1.0,1,1


In [10]:
# check for missing values
loan_data.isnull().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           16
Loan_Amount_Term     13
Credit_History       43
Property_Area         0
Loan_Status           0
dtype: int64

In [11]:
# drop the missing values in LoanAmount Loan_Amount_Term 
loan_data.dropna(subset=['LoanAmount', 'Loan_Amount_Term'], inplace=True)

In [12]:
# fill the missing Credit_History values with 0
loan_data['Credit_History'].fillna(0, inplace=True)

In [13]:
# split the data into X and y
X = loan_data.drop('Loan_Status', axis=1)
y = loan_data['Loan_Status']

In [19]:
# get y values counts
y.value_counts()

Loan_Status
1    330
0    132
Name: count, dtype: int64

In [14]:
# standardize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [15]:
# voting classifier
from sklearn.ensemble import VotingClassifier
# random forest
from sklearn.ensemble import RandomForestClassifier

# pipeline
from sklearn.pipeline import Pipeline

# create the pipeline
clf_pipeline = Pipeline(
    [   
        ('clf', VotingClassifier(
            estimators=[
                ('rf', RandomForestClassifier()),
            ],
            voting='soft'
        ))
    ]
)

# randomCV
from sklearn.model_selection import RandomizedSearchCV

params = {
    'clf__rf__n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900],
    'clf__rf__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
}

randomCV = RandomizedSearchCV(
    estimator=clf_pipeline,
    param_distributions=params,
    cv=5,
    n_jobs=-1,
    verbose=1
)

In [16]:
# fit the data
randomCV.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [17]:
test_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_test.csv')

In [18]:
# evaluate the model
from sklearn.metrics import accuracy_score
y_pred = randomCV.predict(X)
accuracy_score(y, y_pred)

0.8658008658008658

In [None]:
# save the model
import pickle
pickle.dump(randomCV, open('model.pkl', 'wb'))