In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
loan_data  = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_train.csv", index_col=0)
loan_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP002305,Female,No,0,Graduate,No,4547,0.0,115.0,360.0,1.0,Semiurban,1
1,LP001715,Male,Yes,3+,Not Graduate,Yes,5703,0.0,130.0,360.0,1.0,Rural,1
2,LP002086,Female,Yes,0,Graduate,No,4333,2451.0,110.0,360.0,1.0,Urban,0
3,LP001136,Male,Yes,0,Not Graduate,Yes,4695,0.0,96.0,,1.0,Urban,1
4,LP002529,Male,Yes,2,Graduate,No,6700,1750.0,230.0,300.0,1.0,Semiurban,1


In [3]:
test_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_test.csv')
test_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001116,Male,No,0,Not Graduate,No,3748,1668.0,110.0,360.0,1.0,Semiurban
1,LP001488,Male,Yes,3+,Graduate,No,4000,7750.0,290.0,360.0,1.0,Semiurban
2,LP002138,Male,Yes,0,Graduate,No,2625,6250.0,187.0,360.0,1.0,Rural
3,LP002284,Male,No,0,Not Graduate,No,3902,1666.0,109.0,360.0,1.0,Rural
4,LP002328,Male,Yes,0,Not Graduate,No,6096,0.0,218.0,360.0,0.0,Rural


In [4]:
loan_data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
count,491.0,491.0,475.0,478.0,448.0,491.0
mean,5401.189409,1589.730998,145.014737,341.297071,0.848214,0.698574
std,6419.427177,2919.320624,86.310534,66.964051,0.359214,0.459345
min,150.0,0.0,17.0,12.0,0.0,0.0
25%,2923.5,0.0,100.0,360.0,1.0,0.0
50%,3865.0,1229.0,126.0,360.0,1.0,1.0
75%,5705.5,2251.5,162.0,360.0,1.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0,1.0


In [5]:
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 491 entries, 0 to 490
Data columns (total 13 columns):
Loan_ID              491 non-null object
Gender               481 non-null object
Married              490 non-null object
Dependents           482 non-null object
Education            491 non-null object
Self_Employed        462 non-null object
ApplicantIncome      491 non-null int64
CoapplicantIncome    491 non-null float64
LoanAmount           475 non-null float64
Loan_Amount_Term     478 non-null float64
Credit_History       448 non-null float64
Property_Area        491 non-null object
Loan_Status          491 non-null int64
dtypes: float64(4), int64(2), object(7)
memory usage: 53.7+ KB


In [6]:
loan_data.isnull().sum()

Loan_ID               0
Gender               10
Married               1
Dependents            9
Education             0
Self_Employed        29
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           16
Loan_Amount_Term     13
Credit_History       43
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
X = loan_data.drop(['Loan_ID','Loan_Status'] , axis=1)
y = loan_data['Loan_Status']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

According to data description from [here](https://dphi.tech/practice/challenge/54#data). I will use different approach to impute the missing value. One using mode for the given columns:
* Gender
* Married
* Dependents
* Self_Employed
* Loan_Amount_Term
* Credit_History

Another method is using mean for continuous value:
* LoanAmount

## Transform training data

In [11]:
X_train['LoanAmount'] = X_train['LoanAmount'].fillna(X_train['LoanAmount'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [12]:
mode_col = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History']
for col in mode_col:
    X_train[col] = X_train[col].fillna(X_train[col].mode()[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
X_train.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [14]:
num_col = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
cat_col = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

In [15]:
X_train = pd.get_dummies(X_train, columns=cat_col)

In [16]:
X_train.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
58,5000,0.0,103.0,360.0,0.0,1,0,1,0,1,0,0,0,1,0,1,0,0,1,0
372,3033,1459.0,95.0,360.0,1.0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,1
78,3917,0.0,124.0,360.0,1.0,0,1,0,1,0,0,1,0,0,1,1,0,0,1,0
440,2400,2167.0,115.0,360.0,1.0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0
249,1963,0.0,53.0,360.0,1.0,1,0,1,0,1,0,0,0,0,1,1,0,0,1,0


In [17]:
scaler = MinMaxScaler()

In [18]:
X_train[num_col] = scaler.fit_transform(X_train[num_col])

In [19]:
X_train.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
58,0.059988,0.0,0.125915,0.74359,0.0,1,0,1,0,1,0,0,0,1,0,1,0,0,1,0
372,0.035659,0.035016,0.114202,0.74359,1.0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,1
78,0.046592,0.0,0.156662,0.74359,1.0,0,1,0,1,0,0,1,0,0,1,1,0,0,1,0
440,0.027829,0.052008,0.143485,0.74359,1.0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0
249,0.022424,0.0,0.052709,0.74359,1.0,1,0,1,0,1,0,0,0,0,1,1,0,0,1,0


In [20]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression()

In [21]:
print(f'F1 Score of training data: {f1_score(y_train, lr.predict(X_train))}')

F1 Score of training data: 0.8910569105691057


## Preprocess Validation Data

In [22]:
X_test['LoanAmount'] = X_test['LoanAmount'].fillna(X_test['LoanAmount'].mean())
for col in mode_col:
    X_test[col] = X_test[col].fillna(X_test[col].mode()[0])
    
X_test = pd.get_dummies(X_test, columns=cat_col)
X_test[num_col] = scaler.fit_transform(X_test[num_col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
y_pred = lr.predict(X_test)

In [30]:
print(f'F1 Score of validation data: {f1_score(y_test, y_pred)}')

F1 Score of validation data: 0.8400000000000001


Because the model is quite descent with F1 score over 80% and the margin between train data and validation data, we will use this model for prediction of test_data.

## Test Data Preprocessing

In [31]:
test_data['LoanAmount'] = test_data['LoanAmount'].fillna(test_data['LoanAmount'].mean())
for col in mode_col:
    test_data[col] = test_data[col].fillna(test_data[col].mode()[0])
    
test_data = pd.get_dummies(test_data, columns=cat_col)
test_data[num_col] = scaler.fit_transform(test_data[num_col])

In [33]:
test_data.head()

Unnamed: 0,Loan_ID,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,...,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,LP001116,0.105185,0.0834,0.180036,0.714286,1.0,0,1,1,0,...,0,0,0,0,1,1,0,0,1,0
1,LP001488,0.112677,0.3875,0.500891,0.714286,1.0,0,1,0,1,...,0,0,1,1,0,1,0,0,1,0
2,LP002138,0.071798,0.3125,0.317291,0.714286,1.0,0,1,0,1,...,0,0,0,1,0,1,0,1,0,0
3,LP002284,0.109763,0.0833,0.178253,0.714286,1.0,0,1,1,0,...,0,0,0,0,1,1,0,1,0,0
4,LP002328,0.174991,0.0,0.372549,0.714286,0.0,0,1,0,1,...,0,0,0,0,1,1,0,1,0,0


In [34]:
test_data = test_data.drop('Loan_ID', axis=1)

In [35]:
prediction = lr.predict(test_data)

In [36]:
prediction_df = pd.DataFrame(prediction, columns = ['prediction'])

In [37]:
prediction_df.head()

Unnamed: 0,prediction
0,1
1,1
2,1
3,1
4,0


In [38]:
prediction_df.to_csv('prediction.csv', index=False)

## Model Serialization

In [42]:
import pickle

In [43]:
with open('lr_model.pkl', 'wb') as file:
    pickle.dump(lr, file)

In [8]:
import json

columns = {
    'data_columns': [col.lower() for col in X.columns]
}

with open('columns.json', 'w') as file:
    file.write(json.dumps(columns))