# Imports

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Load Loan Dataset

In [1]:
df = pd.read_csv('loan.csv')

In [2]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0,67,male,2,own,,little,1169,6,radio/TV
1,1,22,female,2,own,little,moderate,5951,48,radio/TV
2,2,49,male,1,own,little,,2096,12,education
3,3,45,male,2,free,little,little,7882,42,furniture/equipment
4,4,53,male,2,free,little,little,4870,24,car


In [3]:
df.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
count,1000.0,1000.0,1000,1000.0,1000,817,606,1000.0,1000.0,1000
unique,,,2,,3,4,3,,,8
top,,,male,,own,little,little,,,car
freq,,,690,,713,603,274,,,337
mean,499.5,35.546,,1.904,,,,3271.258,20.903,
std,288.819436,11.375469,,0.653614,,,,2822.736876,12.058814,
min,0.0,19.0,,0.0,,,,250.0,4.0,
25%,249.75,27.0,,2.0,,,,1365.5,12.0,
50%,499.5,33.0,,2.0,,,,2319.5,18.0,
75%,749.25,42.0,,2.0,,,,3972.25,24.0,


# Applying Onehat Encoding
- Sex
- Housing
- Purpose

In [51]:
# Step 1: Apply one-hot encoding to categorical columns, without dropping any category
df_encoded = pd.get_dummies(df, columns=['Sex', 'Housing', 'Purpose'], drop_first=False)

# Step 2: Convert only the one-hot encoded columns (those that start with 'Sex_', 'Housing_', 'Purpose_') to int
one_hot_columns = [col for col in df_encoded.columns if 'Sex_' in col or 'Housing_' in col or 'Purpose_' in col]
df_encoded[one_hot_columns] = df_encoded[one_hot_columns].astype(int)

# Applying Mapping on Saving and Checking Accounts

In [52]:
mapping = {'little':0 ,'moderate':1,'quite rich':2, 'rich':3}
df_encoded['saving_map'] = df_encoded['Saving accounts'].map(mapping)
df_encoded['checking_map'] = df_encoded['Checking account'].map(mapping)

In [53]:
df_encoded

Unnamed: 0.1,Unnamed: 0,Age,Job,Saving accounts,Checking account,Credit amount,Duration,Sex_female,Sex_male,Housing_free,...,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,saving_map,checking_map
0,0,67,2,,little,1169,6,0,1,0,...,0,0,0,0,0,1,0,0,,0.0
1,1,22,2,little,moderate,5951,48,1,0,0,...,0,0,0,0,0,1,0,0,0.0,1.0
2,2,49,1,little,,2096,12,0,1,0,...,0,0,0,1,0,0,0,0,0.0,
3,3,45,2,little,little,7882,42,0,1,1,...,0,0,0,0,1,0,0,0,0.0,0.0
4,4,53,2,little,little,4870,24,0,1,1,...,0,1,0,0,0,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,31,1,little,,1736,12,1,0,0,...,0,0,0,0,1,0,0,0,0.0,
996,996,40,3,little,little,3857,30,0,1,0,...,0,1,0,0,0,0,0,0,0.0,0.0
997,997,38,2,little,,804,12,0,1,0,...,0,0,0,0,0,1,0,0,0.0,
998,998,23,2,little,little,1845,45,0,1,1,...,0,0,0,0,0,1,0,0,0.0,0.0


# Using Model To Fill Missing Values
- Checking Account
- Saving Account

In [54]:
df_saving = df_encoded.copy().drop(['Saving accounts','Checking account','checking_map'], axis=1)
df_checking = df_encoded.copy().drop(['Checking account','Saving accounts','saving_map'], axis=1)

In [55]:
df_saving_na =df_saving[df_saving.isna().any(axis=1)]
df_checking_na =df_checking[df_checking.isna().any(axis=1)]

In [56]:
df_saving_na

Unnamed: 0.1,Unnamed: 0,Age,Job,Credit amount,Duration,Sex_female,Sex_male,Housing_free,Housing_own,Housing_rent,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,saving_map
0,0,67,2,1169,6,0,1,0,1,0,0,0,0,0,0,1,0,0,
5,5,35,1,9055,36,0,1,1,0,0,0,0,0,1,0,0,0,0,
16,16,53,2,2424,24,0,1,0,1,0,0,0,0,0,0,1,0,0,
17,17,25,2,8072,30,0,1,0,1,0,1,0,0,0,0,0,0,0,
24,24,26,2,2069,10,0,1,0,1,0,0,0,0,0,1,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,968,29,2,7166,42,0,1,0,0,1,0,0,0,0,0,1,0,0,
977,977,42,2,2427,18,0,1,0,1,0,1,0,0,0,0,0,0,0,
990,990,37,1,3565,12,0,1,0,1,0,0,0,0,1,0,0,0,0,
992,992,23,1,1936,18,0,1,0,0,1,0,0,0,0,0,1,0,0,


In [63]:
X_saving_na = df_saving_na.drop(['Unnamed: 0','saving_map'],axis=1).values
X_checking_na = df_checking_na.drop(['Unnamed: 0','checking_map'],axis=1).values

In [68]:
df_saving_pre = df_saving.dropna()
df_checking_pre = df_checking.dropna()

In [69]:
X_saving = df_saving_pre.drop(['Unnamed: 0','saving_map'], axis=1).values
y_saving = df_saving_pre['saving_map'].values

In [70]:
X_checking = df_checking_pre.drop(['Unnamed: 0','checking_map'], axis=1).values
y_checking = df_checking_pre['checking_map'].values

## Train and Predict on Saving Accounts

In [71]:
model = RandomForestClassifier()
model.fit(X_saving, y_saving)
y_pred_saving = model.predict(X_saving_na)

## Train and Predict on Checking Accounts

In [72]:
model = RandomForestClassifier()
model.fit(X_checking, y_checking)
y_pred_checking = model.predict(X_checking_na)

In [46]:
# df_saving_na.loc[:, 'saving_map'] = y_pred_saving
# df_checking_na.loc[:, 'checking_map'] = y_pred_checking

# Applying Reverse Mapping
- Checking Account
- Saving Account

In [74]:
df_final = df.copy()

In [76]:
rows_saving = df_saving_na['Unnamed: 0'].values
rows_checking = df_checking_na['Unnamed: 0'].values

In [77]:
mapping = {'little': 0, 'moderate': 1, 'quite rich': 2, 'rich': 3}

reverse_mapping = {v: k for k, v in mapping.items()}

rows_to_fill = rows_saving
values_to_fill = y_pred_saving

for row, value in zip(rows_to_fill, values_to_fill):
    df_final.at[row, 'Saving accounts'] = reverse_mapping[value]

In [80]:
mapping = {'little': 0, 'moderate': 1, 'quite rich': 2, 'rich': 3}

reverse_mapping = {v: k for k, v in mapping.items()}

rows_to_fill = rows_checking
values_to_fill = y_pred_checking

for row, value in zip(rows_to_fill, values_to_fill):
    df_final.at[row, 'Checking account'] = reverse_mapping[value]

In [81]:
df_final.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
count,1000.0,1000.0,1000,1000.0,1000,1000,1000,1000.0,1000.0,1000
unique,,,2,,3,4,3,,,8
top,,,male,,own,little,little,,,car
freq,,,690,,713,779,462,,,337
mean,499.5,35.546,,1.904,,,,3271.258,20.903,
std,288.819436,11.375469,,0.653614,,,,2822.736876,12.058814,
min,0.0,19.0,,0.0,,,,250.0,4.0,
25%,249.75,27.0,,2.0,,,,1365.5,12.0,
50%,499.5,33.0,,2.0,,,,2319.5,18.0,
75%,749.25,42.0,,2.0,,,,3972.25,24.0,


# Saving

In [83]:
df_final.to_csv('loan_cleaned.csv')