### Preprocessing the data

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

### Read the dataset

In [2]:
# Load the loan-approval dataset from the working directory and preview the first rows.
raw_data = pd.read_csv(filepath_or_buffer='loan_approval_dataset.csv')
raw_data.head(n=5)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


### Look at the details for the dataset

In [3]:
# Summary statistics for every column; include='all' adds the non-numeric columns as well.
raw_data.describe(include='all')

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
count,4269.0,4269.0,4269,4269,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269
unique,,,2,2,,,,,,,,,2
top,,,Graduate,Yes,,,,,,,,,Approved
freq,,,2144,2150,,,,,,,,,2656
mean,2135.0,2.498712,,,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0,
std,1232.498479,1.69591,,,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0,
min,1.0,0.0,,,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0,
25%,1068.0,1.0,,,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0,
50%,2135.0,3.0,,,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0,
75%,3202.0,4.0,,,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0,


In [4]:
# Distinct values of the dependents column. The raw column name carries a
# leading space, so the key must include it.
raw_data[' no_of_dependents'].unique()

array([2, 0, 3, 5, 4, 1])

In [5]:
# Largest number of dependents recorded in the raw data.
raw_data[' no_of_dependents'].max()

5

In [6]:
# Column dtypes and non-null counts, used to check for missing entries.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


From the data we can see there are no missing entries. The columns education, self_employed and loan_status need mapping to integer values.

### Preprocessing the data

Drop some columns based on correlation and relevance, scale all numeric features, then use one-hot encoding to map the categorical data.

In [7]:
# Work on a copy so that raw_data is left as loaded.
df = raw_data.copy()

In [8]:
# Remove surrounding whitespace from every column name, then list the cleaned names.
df = df.rename(columns=str.strip)
df.columns.to_numpy()

array(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype=object)

Add the payment-to-income ratio (PTI) feature: the per-term payment (loan_amount / loan_term) divided by income_annum.

In [9]:
# Derive PTI = (loan_amount / loan_term) / income_annum, then drop the columns
# that are not used as model inputs: loan_id, luxury_assets_value, and
# loan_amount (dropped only after PTI has been derived from it).
# PTI is appended as the last column, after loan_status.
df = (
    df.assign(
        PTI=lambda frame: (frame['loan_amount'] / frame['loan_term']) / frame['income_annum']
    )
    .drop(columns=['loan_id', 'luxury_assets_value', 'loan_amount'])
)

# Column groups consumed later by the correlation check, the outlier filter
# and the ColumnTransformer.
numeric_features = [
    'no_of_dependents',
    'PTI',
    'income_annum',
    'loan_term',
    'cibil_score',
    'residential_assets_value',
    'commercial_assets_value',
    'bank_asset_value',
]
categorical_features = ['education', 'self_employed']


In [10]:
# Order the rows from highest to lowest PTI and inspect the first 50.
df_sorted = df.sort_values('PTI', ascending=False)
df_sorted.iloc[:50]

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_term,cibil_score,residential_assets_value,commercial_assets_value,bank_asset_value,loan_status,PTI
332,2,Not Graduate,Yes,4600000,2,555,7700000,2400000,4200000,Approved,2.0
1067,2,Graduate,Yes,4200000,2,433,3300000,0,3700000,Approved,2.0
727,3,Not Graduate,Yes,2200000,2,877,5000000,1300000,3100000,Approved,2.0
2057,1,Graduate,Yes,6800000,2,702,4600000,2400000,6300000,Approved,2.0
1938,4,Graduate,Yes,1200000,2,752,2800000,900000,600000,Approved,2.0
1132,2,Not Graduate,No,3500000,2,355,8700000,1600000,2900000,Approved,2.0
4149,5,Not Graduate,No,700000,2,561,900000,400000,800000,Approved,2.0
3556,5,Graduate,No,5200000,2,538,3200000,2000000,3600000,Approved,1.990385
2656,3,Not Graduate,No,3300000,2,554,6000000,2500000,3800000,Approved,1.984848
3942,0,Graduate,No,7600000,2,743,20700000,8700000,9600000,Approved,1.980263


### Check whether any variables are highly correlated (correlation above 0.9), for the purpose of XGBoost

In [11]:
# Pairwise Pearson correlations between the numeric features.
corr_matrix = df.loc[:, numeric_features].corr(method='pearson')
print(corr_matrix)

                          no_of_dependents       PTI  income_annum  loan_term  \
no_of_dependents                  1.000000  0.006366      0.007266  -0.020111   
PTI                               0.006366  1.000000     -0.007520  -0.774125   
income_annum                      0.007266 -0.007520      1.000000   0.011488   
loan_term                        -0.020111 -0.774125      0.011488   1.000000   
cibil_score                      -0.009998 -0.004969     -0.023034   0.007810   
residential_assets_value          0.007376 -0.014898      0.636841   0.008016   
commercial_assets_value          -0.001531  0.002258      0.640328  -0.005478   
bank_asset_value                  0.011163 -0.014025      0.851093   0.017177   

                          cibil_score  residential_assets_value  \
no_of_dependents            -0.009998                  0.007376   
PTI                         -0.004969                 -0.014898   
income_annum                -0.023034                  0.636841   
lo

In [12]:
# Separate the model inputs from the target column.
X = df.drop(columns=['loan_status'])

# Encode the target as 1 (Approved) / 0 (Rejected). The labels are stripped
# first so the mapping does not depend on the leading space carried by the raw
# values; an unmapped label would otherwise silently become NaN.
y = df['loan_status'].str.strip().map({'Approved': 1, 'Rejected': 0})
if y.isna().any():
    unexpected_labels = df.loc[y.isna(), 'loan_status'].unique().tolist()
    raise ValueError(f"Unexpected loan_status values: {unexpected_labels}")

# Stratified 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
def remove_outliers(df, numeric_features):
    """Return a copy of ``df`` without rows that fall outside the 1.5 * IQR fence.

    For every column in ``numeric_features`` the first and third quartiles are
    computed on the frame passed in, and a row is kept only when its value lies
    within ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds inclusive) for all of the
    listed columns. Because every bound is derived from the same unfiltered
    data, the result does not depend on the order of ``numeric_features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    numeric_features : iterable of str
        Names of the numeric columns to check. An empty iterable returns an
        unfiltered copy.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the retained rows with their original index labels.
        Rows with a missing value in any checked column are dropped, because
        the comparison against the bounds is False for NaN.
    """
    keep = np.ones(len(df), dtype=bool)
    for col in numeric_features:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # Combine positionally so duplicate index labels cannot misalign the mask.
        keep &= df[col].between(lower_bound, upper_bound).to_numpy()
    return df.loc[keep].copy()

# NOTE(review): the outlier-filtered training set is computed here but the
# preprocessor below is fitted on the unfiltered X_train, and nothing visible
# in this notebook consumes X_train_clean / y_train_clean. Confirm whether
# outlier removal is meant to be applied before fitting.
X_train_clean = remove_outliers(X_train, numeric_features)
# .loc makes the label-based alignment with the retained rows explicit.
y_train_clean = y_train.loc[X_train_clean.index]

# Standardise the numeric columns and one-hot encode the categorical ones;
# any column not listed in either group is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ],
    remainder='drop',
)

# Fit on the training split only, then apply the fitted transform to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Output column names: numeric features first, then the encoder's one-hot
# names, matching the transformer order above.
cat_encoder = preprocessor.named_transformers_['cat']
cat_names = cat_encoder.get_feature_names_out(categorical_features)
final_feature_names = list(numeric_features) + list(cat_names)
final_feature_names


['no_of_dependents',
 'PTI',
 'income_annum',
 'loan_term',
 'cibil_score',
 'residential_assets_value',
 'commercial_assets_value',
 'bank_asset_value',
 'education_ Graduate',
 'education_ Not Graduate',
 'self_employed_ No',
 'self_employed_ Yes']

### Save the preprocessed train and test data

In [13]:
# Write the processed feature matrices and the target vectors to .npy files.
arrays_by_filename = {
    'X_train.npy': X_train_processed,
    'X_test.npy': X_test_processed,
    'y_train.npy': y_train.to_numpy(),
    'y_test.npy': y_test.to_numpy(),
}
for filename, array in arrays_by_filename.items():
    np.save(filename, array)

In [14]:
# Persist the fitted ColumnTransformer to 'preprocessor.joblib'.
joblib.dump(preprocessor, 'preprocessor.joblib')   

['preprocessor.joblib']