In [20]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.linear_model import LogisticRegression

In [2]:
loan_data  = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_train.csv", index_col=0)
loan_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP002305,Female,No,0,Graduate,No,4547,0.0,115.0,360.0,1.0,Semiurban,1
1,LP001715,Male,Yes,3+,Not Graduate,Yes,5703,0.0,130.0,360.0,1.0,Rural,1
2,LP002086,Female,Yes,0,Graduate,No,4333,2451.0,110.0,360.0,1.0,Urban,0
3,LP001136,Male,Yes,0,Not Graduate,Yes,4695,0.0,96.0,,1.0,Urban,1
4,LP002529,Male,Yes,2,Graduate,No,6700,1750.0,230.0,300.0,1.0,Semiurban,1


In [3]:
test_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_test.csv')
test_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001116,Male,No,0,Not Graduate,No,3748,1668.0,110.0,360.0,1.0,Semiurban
1,LP001488,Male,Yes,3+,Graduate,No,4000,7750.0,290.0,360.0,1.0,Semiurban
2,LP002138,Male,Yes,0,Graduate,No,2625,6250.0,187.0,360.0,1.0,Rural
3,LP002284,Male,No,0,Not Graduate,No,3902,1666.0,109.0,360.0,1.0,Rural
4,LP002328,Male,Yes,0,Not Graduate,No,6096,0.0,218.0,360.0,0.0,Rural


In [5]:
loan_data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
count,491.0,491.0,475.0,478.0,448.0,491.0
mean,5401.189409,1589.730998,145.014737,341.297071,0.848214,0.698574
std,6419.427177,2919.320624,86.310534,66.964051,0.359214,0.459345
min,150.0,0.0,17.0,12.0,0.0,0.0
25%,2923.5,0.0,100.0,360.0,1.0,0.0
50%,3865.0,1229.0,126.0,360.0,1.0,1.0
75%,5705.5,2251.5,162.0,360.0,1.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0,1.0


In [6]:
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 491 entries, 0 to 490
Data columns (total 13 columns):
Loan_ID              491 non-null object
Gender               481 non-null object
Married              490 non-null object
Dependents           482 non-null object
Education            491 non-null object
Self_Employed        462 non-null object
ApplicantIncome      491 non-null int64
CoapplicantIncome    491 non-null float64
LoanAmount           475 non-null float64
Loan_Amount_Term     478 non-null float64
Credit_History       448 non-null float64
Property_Area        491 non-null object
Loan_Status          491 non-null int64
dtypes: float64(4), int64(2), object(7)
memory usage: 53.7+ KB


In [4]:
loan_data.isnull().sum()

Loan_ID               0
Gender               10
Married               1
Dependents            9
Education             0
Self_Employed        29
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           16
Loan_Amount_Term     13
Credit_History       43
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
X = loan_data.drop('Loan_Status', axis=1)
y = loan_data['Loan_Status']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

According to data description from [here](https://dphi.tech/practice/challenge/54#data). I will use different approach to impute the missing value. One using mode for the given columns:
* Gender
* Married
* Dependents
* Self_Employed
* Loan_Amount_Term
* Credit_History

Another method is using mean for continuous value:
* LoanAmount

In [41]:
pre_process = ColumnTransformer(remainder='passthrough',
                               transformers=[('impute_mode', SimpleImputer(strategy='most_frequent'), ['Gender',
                                                                                                      'Married',
                                                                                                      'Dependents',
                                                                                                      'Self_Employed',
                                                                                                      'Loan_Amount_Term',
                                                                                                      'Credit_History']),
                                            ('impute_mean', SimpleImputer(strategy='mean'), ['LoanAmount']),
                                            ('scaling', StandardScaler(), ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']),
                                            ('encoder', OneHotEncoder(handle_unknown='ignore'), ['Gender',
                                                                        'Married',
                                                                        'Dependents',
                                                                        'Self_Employed',
                                                                        'Loan_Amount_Term',
                                                                        'Credit_History',
                                                                        'Property_Area'])])

In [42]:
model_clf = Pipeline(steps=[('preprocess', pre_process),
                           ('classifier', LogisticRegression())])

In [43]:
model_clf.fit(X_train, y_train)

ValueError: Input contains NaN