In [82]:
import numpy as np
import pandas as pd

In [96]:
# Load the raw loan records from the CSV (path is relative to the notebook's
# working directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='../data/LoanData.csv')
df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [97]:
# Remove the Loan_ID column from the working frame.
# Running this cell twice without reloading raises KeyError, because the
# column is already gone on the second run.
df = df.drop(columns='Loan_ID')

### Data Preprocessing ###

In [98]:
# Add the applicant and co-applicant incomes into a single `Income` column
# (appended as the last column), then discard the two source columns.
df = (
    df
    .assign(Income=df['ApplicantIncome'].add(df['CoapplicantIncome']))
    .drop(columns=['ApplicantIncome', 'CoapplicantIncome'])
)

### Correcting inconsistent values ###

In [99]:
# Swap the '3+' label for the integer 3. The remaining labels stay as strings,
# so the column holds mixed object values until it is cast to int further down.
df['Dependents'] = df['Dependents'].replace(to_replace='3+', value=3)

In [100]:
# Show the distinct values in Dependents (NaN included) to check the replacement.
pd.unique(df['Dependents'])

array(['0', '1', '2', 3, nan], dtype=object)

### Missing-value treatment ###

In [101]:
# Count the missing entries in each column.
df.isna().sum()

Gender              13
Married              3
Dependents          15
Education            0
Self_Employed       32
LoanAmount          22
Loan_Amount_Term    14
Credit_History      50
Property_Area        0
Loan_Status          0
Income               0
dtype: int64

In [102]:
# Missing entries per column, expressed as a percentage of the row count.
df.isna().sum().div(len(df)).mul(100)

Gender              2.117264
Married             0.488599
Dependents          2.442997
Education           0.000000
Self_Employed       5.211726
LoanAmount          3.583062
Loan_Amount_Term    2.280130
Credit_History      8.143322
Property_Area       0.000000
Loan_Status         0.000000
Income              0.000000
dtype: float64

In [103]:
# Discard every row that is missing any of the numeric fields listed below;
# these are dropped rather than imputed.
# .copy() turns the filtered result into an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning on
# pandas versions without copy-on-write.
df = df.dropna(
    subset=['Income', 'Credit_History', 'LoanAmount', 'Loan_Amount_Term']
).copy()

In [104]:
# Fill missing Dependents entries with the integer 0; other columns are untouched.
df = df.fillna({'Dependents': 0})

In [105]:
# Impute each of these categorical columns with its most frequent value
# (the first entry returned by mode()).
df = df.fillna({
    column: df[column].mode()[0]
    for column in ('Gender', 'Married', 'Self_Employed')
})

In [106]:
# Re-check the per-column missing counts after dropping and imputing.
df.isna().sum()

Gender              0
Married             0
Dependents          0
Education           0
Self_Employed       0
LoanAmount          0
Loan_Amount_Term    0
Credit_History      0
Property_Area       0
Loan_Status         0
Income              0
dtype: int64

In [107]:
# Cast both columns to int in one call. Dependents still contains string
# labels ('0', '1', '2') next to integers at this point; the cast unifies them.
df = df.astype({'Dependents': 'int', 'Loan_Amount_Term': 'int'})

In [108]:
# Skewness of the two monetary columns before any transformation.
df.loc[:, ['Income', 'LoanAmount']].skew()

Income        5.777628
LoanAmount    2.607945
dtype: float64

In [110]:
# Apply a Box-Cox transformation to Income and LoanAmount to reduce their skewness.
from scipy.stats import boxcox

# Called without an explicit lambda, boxcox fits one and returns
# (transformed_values, fitted_lambda): `a` is the lambda fitted for Income,
# `b` the one fitted for LoanAmount. boxcox only accepts strictly positive input.
# NOTE(review): `a` and `b` live only in the kernel and are not written anywhere
# in the cells shown; they would be needed to invert the transform or to apply it
# to new data. Re-running this cell transforms the already-transformed columns.
df['Income'], a = boxcox(df['Income'].to_numpy())
df['LoanAmount'], b = boxcox(df['LoanAmount'].to_numpy())

# Skewness after the transform, for comparison with the values before it.
df.loc[:, ['Income', 'LoanAmount']].skew()

Income        0.001568
LoanAmount    0.113040
dtype: float64

In [111]:
# Divide the loan term by 12 (presumably months -> years; TODO confirm the
# dataset's units). The result is a float column.
# Not re-run safe: each additional run divides by 12 again.
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].div(12)
df['Loan_Amount_Term']

1      30.0
2      30.0
3      30.0
4      30.0
5      30.0
       ... 
609    30.0
610    15.0
611    30.0
612    30.0
613    30.0
Name: Loan_Amount_Term, Length: 529, dtype: float64

In [114]:
# Encode the target column: 'N' -> 0, 'Y' -> 1.
# Any other value maps to NaN, which makes the int cast fail; this also happens
# if the cell is re-run on the already-encoded column.
df = df.assign(
    Loan_Status=df['Loan_Status'].map(dict(N=0, Y=1)).astype('int')
)

In [115]:
# Write the cleaned frame to CSV without the index column.
df.to_csv(path_or_buf='../data/cleaned_loan_approval_dataset.csv', index=False)