In [21]:
import pandas as pd

In [22]:
# Load the raw loan-approval data.
# NOTE(review): the input is 'xloan_approval_dataset.csv' while the file written at
# the end of the notebook is 'loan_approval_dataset_normalized.csv' -- confirm the
# leading 'x' in the input name is intended.
df = pd.read_csv('xloan_approval_dataset.csv')

# Strip surrounding whitespace from the header names ...
df.columns = df.columns.str.strip()

# ... and from every text column. Checking `dtype == "object"` alone misses columns
# that pandas stores with its dedicated string dtype (the default inference in
# pandas 3), so test for both object and string dtypes.
df = df.apply(
    lambda col: col.str.strip()
    if pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col)
    else col
)

df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [23]:
# Show the column labels after the whitespace strip applied when loading.
print(df.columns)

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')


In [24]:
# Drop the row identifier, then encode the three binary text columns as 0/1 integers.
df = df.drop(columns=['loan_id'])

# Explicit label -> integer encoding for each binary column.
binary_encodings = {
    'loan_status': {'Approved': 1, 'Rejected': 0},
    'education': {'Not Graduate': 0, 'Graduate': 1},
    'self_employed': {'No': 0, 'Yes': 1},
}

for column, encoding in binary_encodings.items():
    # Fail loudly on labels (or missing values) outside the encoding: `replace`
    # would silently leave them in place as strings next to the 0/1 values.
    unexpected = df.loc[~df[column].isin(list(encoding)), column].unique()
    if len(unexpected) > 0:
        raise ValueError(f"Unexpected values in {column!r}: {list(unexpected)}")
    # `map` + explicit `astype` yields an int64 column without relying on the
    # implicit object -> int downcast that `replace` is deprecating.
    df[column] = df[column].map(encoding).astype('int64')

df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,1,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,0,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,1,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,3,1,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,0,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


In [25]:
# Column-wise maximum of every column in the encoded frame.
max_values = df.max(axis=0)
print(max_values)

no_of_dependents                   5
education                          1
self_employed                      1
income_annum                 9900000
loan_amount                 39500000
loan_term                         20
cibil_score                      900
residential_assets_value    29100000
commercial_assets_value     19400000
luxury_assets_value         39200000
bank_asset_value            14700000
loan_status                        1
dtype: int64


In [26]:
import json

# Columns whose maximum is recorded as a scaling factor; this order is also the
# key order written to the JSON file.
scaled_columns = (
    'income_annum',
    'loan_amount',
    'residential_assets_value',
    'commercial_assets_value',
    'luxury_assets_value',
    'bank_asset_value',
    'loan_term',
    'cibil_score',
)

# int() turns each pandas/NumPy scalar into a built-in int before json.dump.
scaling_factors = {name: int(df[name].max()) for name in scaled_columns}

# Save the per-column maxima to scaling_factors.json.
with open('scaling_factors.json', 'w') as out_file:
    json.dump(scaling_factors, out_file)

In [27]:
# Divide each continuous feature by its own column maximum. loan_status,
# no_of_dependents, self_employed and education are left untouched.
for feature in (
    'income_annum',
    'loan_amount',
    'residential_assets_value',
    'commercial_assets_value',
    'luxury_assets_value',
    'bank_asset_value',
    'loan_term',
    'cibil_score',
):
    df[feature] = df[feature] / df[feature].max()

df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,1,0,0.969697,0.756962,0.6,0.864444,0.082474,0.907216,0.579082,0.544218,1
1,0,0,1,0.414141,0.308861,0.4,0.463333,0.092784,0.113402,0.22449,0.22449,0
2,3,1,0,0.919192,0.751899,1.0,0.562222,0.243986,0.231959,0.84949,0.870748,0
3,3,1,0,0.828283,0.777215,0.4,0.518889,0.62543,0.170103,0.594388,0.537415,0
4,5,0,1,0.989899,0.612658,1.0,0.424444,0.426117,0.42268,0.75,0.340136,0


In [29]:
# Write the processed frame to CSV; index=False keeps the row index out of the file.
df.to_csv('loan_approval_dataset_normalized.csv', index=False)