In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


In [2]:
train_data= pd.read_csv("train_data.csv")
train_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
train_data.shape

(614, 13)

In [4]:
from sklearn.model_selection import train_test_split

X = train_data.iloc[:,1:13] # Leave out the Loan ID and the target column 
y= train_data.iloc[:,-1]
X_train, X_val , Y_train , Y_val = train_test_split(X,y, test_size=0.2)

In [5]:
# Missing values

missing_values_feature_list = [feature for feature in train_data.columns if train_data[feature].isnull().sum()>0]
print(missing_values_feature_list)

['Gender', 'Married', 'Dependents', 'Self_Employed', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']


In [6]:
for feature in missing_values_feature_list:
    print("the feature is {} and it has missing values {}%".format(feature,np.round(train_data[feature].isnull().sum()*100/len(train_data[feature]))))

the feature is Gender and it has missing values 2.0%
the feature is Married and it has missing values 0.0%
the feature is Dependents and it has missing values 2.0%
the feature is Self_Employed and it has missing values 5.0%
the feature is LoanAmount and it has missing values 4.0%
the feature is Loan_Amount_Term and it has missing values 2.0%
the feature is Credit_History and it has missing values 8.0%


In [7]:
categorical_features = [feature for feature in train_data.columns if train_data[feature].dtype == 'O']
def replace_cat_features(dataset, categorical_features):
    dataset[categorical_features] = dataset[categorical_features].fillna(data[categorical_features].mode())
    return data



In [8]:
data = train_data.copy()
replace_cat_features(data,categorical_features=categorical_features)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [9]:
missing_numerical_features = [feature for feature in train_data.columns if train_data[feature].dtype != 'O' and train_data[feature].isnull().sum()>0]
print(missing_numerical_features)

['LoanAmount', 'Loan_Amount_Term', 'Credit_History']


In [10]:
for feature in missing_numerical_features:
    # since there are outliers lets use median for replacing
    median_value = data[feature].median()
    #create new feature
    data[feature + "_nan"] = np.where(data[feature].isnull(),1,0)
    data[feature] = data[feature].fillna(median_value)

In [13]:
for feature in categorical_features:
    print(feature, len(data[feature].unique()))

Loan_ID 614
Gender 3
Married 3
Dependents 5
Education 2
Self_Employed 3
Property_Area 3
Loan_Status 2
