In [67]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

#### Data Loading

In [63]:
# Read the training set from a relative CSV path and preview the first rows.
df = pd.read_csv("train_dataset.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [64]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [65]:
# Per-column count of missing entries before any cleaning.
print(df.isna().sum())

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


#### Data Cleaning

In [66]:
# Dependents: fill gaps with the most frequent label, map the open-ended
# '3+' bucket to 3, then cast the whole column to integers.
df["Dependents"] = (
    df["Dependents"]
    .fillna(df["Dependents"].mode()[0])
    .replace('3+', 3)
    .astype(int)
)

In [68]:
# Impute missing Gender values so that the filled rows follow the Male/Female
# ratio observed in the non-missing data, rather than assigning the mode to all.
# A fixed seed makes the row-to-label assignment reproducible across
# Restart & Run All; a local generator is used so NumPy's global random state
# is left untouched.
GENDER_FILL_SEED = 42

gender_share = df['Gender'].value_counts(normalize=True)
male_percentage = gender_share.get('Male', 0)
female_percentage = gender_share.get('Female', 0)

gender_missing = df['Gender'].isna()
num_missing = int(gender_missing.sum())

# The male share is truncated to a whole number of rows; every remaining
# missing row is assigned 'Female'.
num_males_to_add = int(male_percentage * num_missing)
num_females_to_add = num_missing - num_males_to_add

fill_values = ['Male'] * num_males_to_add + ['Female'] * num_females_to_add

# Randomise which missing rows receive which label (deterministic given the seed).
np.random.default_rng(GENDER_FILL_SEED).shuffle(fill_values)

# Skip the assignment when there is nothing left to fill (e.g. the cell is re-run).
if num_missing:
    df.loc[gender_missing, 'Gender'] = fill_values

In [69]:
# Married: fill gaps with the most frequent value.
df["Married"] = df["Married"].fillna(df["Married"].mode()[0])

In [70]:
# Missing self-employment information is treated as 'No'.
df["Self_Employed"] = df["Self_Employed"].fillna('No')

In [71]:
# Loan amount and loan term: fill gaps with the column mean.
# NOTE(review): a mean-filled Loan_Amount_Term may not match any real term
# value in the data; consider mode/median - confirm before changing.
df["LoanAmount"] = df["LoanAmount"].fillna(df["LoanAmount"].mean())
df["Loan_Amount_Term"] = df["Loan_Amount_Term"].fillna(df["Loan_Amount_Term"].mean())

# Credit history: fill gaps with the most frequent value.
df["Credit_History"] = df["Credit_History"].fillna(df["Credit_History"].mode()[0])

In [72]:
# Re-count missing entries per column after the fills above.
print(df.isna().sum())

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


#### Data Preprocessing

In [73]:
# List the distinct Gender labels present before encoding them.
df.Gender.unique()

array(['Male', 'Female'], dtype=object)

In [74]:
# Two-level categorical columns and the 1/0 code assigned to each label.
binary_encodings = {
    "Education": {'Graduate': 1, 'Not Graduate': 0},
    "Self_Employed": {'Yes': 1, 'No': 0},
    "Gender": {'Male': 1, 'Female': 0},
    "Married": {'Yes': 1, 'No': 0},
}

# Swap each label for its code, then cast the column to integers.
for column_name, label_codes in binary_encodings.items():
    df[column_name] = df[column_name].replace(label_codes).astype(int)

In [75]:
# List the distinct Property_Area labels before one-hot encoding.
df.Property_Area.unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [76]:
# One-hot encode Property_Area; drop_first=True drops the first level so the
# remaining indicator columns are Property_Area_Semiurban and Property_Area_Urban.
# dtype is pinned to uint8 so the indicators are numeric 0/1 on every pandas
# version (the default dummy dtype became bool in pandas 2.0).
df = pd.get_dummies(df, columns=['Property_Area'], drop_first=True, dtype='uint8')

In [77]:
# Re-check dtypes and non-null counts after cleaning and encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Loan_ID                  614 non-null    object 
 1   Gender                   614 non-null    int32  
 2   Married                  614 non-null    int32  
 3   Dependents               614 non-null    int32  
 4   Education                614 non-null    int32  
 5   Self_Employed            614 non-null    int32  
 6   ApplicantIncome          614 non-null    int64  
 7   CoapplicantIncome        614 non-null    float64
 8   LoanAmount               614 non-null    float64
 9   Loan_Amount_Term         614 non-null    float64
 10  Credit_History           614 non-null    float64
 11  Loan_Status              614 non-null    object 
 12  Property_Area_Semiurban  614 non-null    uint8  
 13  Property_Area_Urban      614 non-null    uint8  
dtypes: float64(4), int32(5), i

In [78]:
# Preview the first rows of the cleaned, encoded frame.
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Semiurban,Property_Area_Urban
0,LP001002,1,0,0,1,0,5849,0.0,146.412162,360.0,1.0,Y,0,1
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,N,0,0
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,Y,0,1
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,Y,0,1
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,Y,0,1


In [79]:
# Credit_History was loaded as float; store it as an integer column.
df["Credit_History"] = df["Credit_History"].astype(int)

In [80]:
# Confirm which distinct values remain in Credit_History after the integer cast.
df.Credit_History.unique()

array([1, 0])