#### **1. Import Libraries and Load Data**

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Preprocessing liberaries
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_regression
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix

# statistical libraries
from scipy import stats
from scipy.stats import zscore, skew

# set style forbetter visualiztions
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

print("Libraries imported successfully!")

Libraries imported successfully!


In [57]:
# Load dataset from GitHub
# url = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_train.csv"
# df = pd.read_csv(url)

url = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_train.csv"

df = pd.read_csv(url)

# Basic info
print(df.shape)
print(df.dtypes)

# Preview first few rows
display(df.head())

# Check missing values
print("\nMissing values per column:")
print(df.isnull().sum())

(614, 13)
Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y



Missing values per column:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [58]:
# Creating a copy for preprocessing
df_copy = df.copy()
df_copy['LoanAmount'] = pd.to_numeric(df_copy['LoanAmount'], errors='coerce')

# 1. Checking for missing values (EDA showed on missing values)
print("\n1. Missing Values:")
missing_values = df_copy.isnull().sum()
if missing_values.sum() > 0:
    print(missing_values[missing_values > 0])
else:
    print("No missing values found (as expected from EDA)")

# 2. Checking for duplicates
print("\n2. Duplicate Rows:")
duplicates = df_copy.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
if duplicates > 0:
    print(f"Percentage of duplicates: {(duplicates/len(df))*100:.2f}%")



1. Missing Values:
Gender              13
Married              3
Dependents          15
Self_Employed       32
LoanAmount          22
Loan_Amount_Term    14
Credit_History      50
dtype: int64

2. Duplicate Rows:
Number of duplicate rows: 0


In [59]:
# Handling missing value
def impute_missing_value(data):
    """
    Imputing missing value for the Home loan dataset
    Categorical columns -> Mode
    Numerical columns -> Median or Mode (depending on type)
    """

    cat_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed']

    # Impute category columns with mode
    for col in cat_cols:
        data[col].fillna(data[col].mode()[0], inplace=True)

    # Impute Numerical columns
    # data['LoanAmount'] = pd.to_numeric(df_copy['LoanAmount'], errors='coerce')

    data['LoanAmount'].fillna(data['LoanAmount'].median(), inplace=True)
    data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mode()[0], inplace=True)
    data['Credit_History'].fillna(data['Credit_History'].mode()[0], inplace=True)

    return data

# Applying imputation function
df_copy = impute_missing_value(df_copy)

# verification
print("Missing values after imputation\n")
print(df_copy.isnull().sum())


Missing values after imputation

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [60]:
# 3. Checking skewness for variables indified in EDA as a right-skewed
print("\n3. Skewness Analysis (EDA identified right-skewed variable):")
skewed_vars = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
for var in skewed_vars:
    if var in df_copy.columns:
        skewness = skew(df_copy[var])
        print(f"{var}: skewness = {skewness:.3f} ({'right-skewed' if skewness > 0.5 else 'approximately normal'})")



3. Skewness Analysis (EDA identified right-skewed variable):
ApplicantIncome: skewness = 6.524 (right-skewed)
CoapplicantIncome: skewness = 7.473 (right-skewed)
LoanAmount: skewness = 2.736 (right-skewed)


**Encoding Categorical data**

In [68]:
encoded_df = df_copy.copy()

# Ordinal columns
ord_cols = ['Dependents', 'Education', 'Loan_Amount_Term_Cat']
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoded_df[ord_cols] = ordinal_encoder.fit_transform(encoded_df[ord_cols])

# Numerical columns
num_cols = ['Gender', 'Married', 'Self_Employed', 'Property_Area']
encoded_df = pd.get_dummies(encoded_df, columns=num_cols, drop_first=True)

# Encoding target variable
le = LabelEncoder()
encoded_df['Loan_Status'] = le.fit_transform(encoded_df['Loan_Status'])

print(encoded_df.shape)
print(encoded_df.head(2))

(614, 19)
   Dependents  Education  ApplicantIncome  CoapplicantIncome  LoanAmount  \
0         0.0        0.0          1418.75          -1148.625        68.0   
1         1.0        0.0          1418.75          -1148.625        68.0   

   Loan_Amount_Term  Credit_History  Loan_Status  ApplicantIncome_log  \
0             360.0             1.0            1             8.674026   
1             360.0             1.0            0             8.430109   

   CoapplicantIncome_log  LoanAmount_log  Loan_Amount_Term_Cat  TotalIncome  \
0               0.000000         4.85203                   1.0     2488.125   
1               7.319202         4.85203                   1.0     2488.125   

   TotalIncome_log  Gender_Male  Married_Yes  Self_Employed_Yes  \
0         8.674197         True        False              False   
1         8.714732         True         True              False   

   Property_Area_Semiurban  Property_Area_Urban  
0                    False                 True  
1

#### **Log-Transform Skewed Variables (EDA Recommedation)**

Based on EDA findings, transform the right-skewd variables identified

In [61]:
# Log-transform skewed variables as recommended by EDA
print("=== LOG-TRANSFORMING SKEWED VARIABLE ===")
print("EDA identied these variables as right-skewed and recommended log transformation:")

# Variable to log-transform based on EDA findings
skewed_vars = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

for var in skewed_vars:
    if var in df_copy.columns:
        # Check if variable has zero or negative values
        min_val = df_copy[var].min()
        if min_val <= 0:
            # use log1p for variable with zeros
            df_copy[f'{var}_log'] = np.log1p(df_copy[var])
            print(f"{var}: Applied log1p transformation (had {min_val:.3f} minimum value)")
        else:
            # Use log for positive values only
            df_copy[f'{var}_log'] = np.log(df_copy[var])
            print(f"{var}: Applied log transformation")

        # Check skewness before and after
        original_skew = skew(df_copy[var])
        transformed_skew = skew(df_copy[f'{var}_log'])
        print(f"Original skewness: {original_skew:.3f} -> Transformed skewness: {transformed_skew}")

print(f"\nDataset shape after log transformation: {df_copy.shape}")
print("New log-transformed columns:", [col for col in df_copy.columns if '_log' in col])

=== LOG-TRANSFORMING SKEWED VARIABLE ===
EDA identied these variables as right-skewed and recommended log transformation:
ApplicantIncome: Applied log transformation
Original skewness: 6.524 -> Transformed skewness: 0.47840756970181564
CoapplicantIncome: Applied log1p transformation (had 0.000 minimum value)
Original skewness: 7.473 -> Transformed skewness: -0.17265017128703458
LoanAmount: Applied log transformation
Original skewness: 2.736 -> Transformed skewness: -0.19487771381643657

Dataset shape after log transformation: (614, 16)
New log-transformed columns: ['ApplicantIncome_log', 'CoapplicantIncome_log', 'LoanAmount_log']


#### **Feature Engineering**

Implement the specific feature engineering recommendations from the EDA report

In [62]:
df_copy.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,ApplicantIncome_log,CoapplicantIncome_log,LoanAmount_log
0,LP001002,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y,8.674026,0.0,4.85203
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,8.430109,7.319202,4.85203
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,8.006368,0.0,4.189655
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,7.856707,7.765993,4.787492
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,8.699515,0.0,4.94876


In [63]:
print("FEATURE ENGINEERING BASED ON EDA RECOMMENDATION")

# 1. Loan Amount Term Cat: Grouping into categories (short <= 180, medium <= 360, long > 360)
# Rationale: Simplifies interpretation and may capture nonlinearity
df_copy['Loan_Amount_Term_Cat'] = pd.cut(df_copy['Loan_Amount_Term'], bins=[0, 180, 360, float('inf')], labels=['Short', 'Medium', 'Long'], include_lowest=True)

# 2. TotalIncome: Combing ApplicantIncome income with CoapplicantIncome
# Rationale: Combines household income, more realistic for credit evaluation.
df_copy['TotalIncome'] = df_copy['ApplicantIncome'] + df_copy['CoapplicantIncome']
df_copy['TotalIncome_log'] = np.log1p(df_copy['TotalIncome'])

# 4. Drop Unnecessary or Redundant column
# Rationale: Loan_ID not useful for modeling
if 'Loan_ID' in df_copy.columns:
    df_copy.drop(columns='Loan_ID', inplace=True)

FEATURE ENGINEERING BASED ON EDA RECOMMENDATION


In [64]:
df_copy.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'ApplicantIncome_log', 'CoapplicantIncome_log', 'LoanAmount_log',
       'Loan_Amount_Term_Cat', 'TotalIncome', 'TotalIncome_log'],
      dtype='object')

#### **Outlier Treatment**

In [65]:
# Outlier treatment
print("=== OUTLIER TREATMENT (IQR-CAPPING METHOD) ===")

# Define numerical column
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'TotalIncome']

# Applying IQR-Capping method
outliers_capped = 0
for col in num_cols:
    Q1 = df_copy[col].quantile(0.25)
    Q3 = df_copy[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 - 1.5 * IQR

    # Count outliers before capping
    outliers_before = ((df_copy[col] < lower_bound) | (df_copy[col] > upper_bound)).sum()

    if outliers_before > 0:
        # Cap outliers
        df_copy[col] = np.where(df_copy[col] < lower_bound, lower_bound, df_copy[col])
        df_copy[col] = np.where(df_copy[col] > upper_bound, upper_bound, df_copy[col])
        outliers_capped += outliers_before
        print(f"{col}: Capped {outliers_before} outliers")
    
    

print(f"\nTotal outliers capped: {outliers_capped}")
print(f"Dataset shape after outliers treatment: {df_copy.shape}")

=== OUTLIER TREATMENT (IQR-CAPPING METHOD) ===
ApplicantIncome: Capped 604 outliers
CoapplicantIncome: Capped 614 outliers
LoanAmount: Capped 566 outliers
TotalIncome: Capped 594 outliers

Total outliers capped: 2378
Dataset shape after outliers treatment: (614, 18)


**Encoding categorical value**

(614, 19)
   Dependents  Education  ApplicantIncome  CoapplicantIncome  LoanAmount  \
0         0.0        0.0          1418.75          -1148.625        68.0   
1         1.0        0.0          1418.75          -1148.625        68.0   

   Loan_Amount_Term  Credit_History  Loan_Status  ApplicantIncome_log  \
0             360.0             1.0            1             8.674026   
1             360.0             1.0            0             8.430109   

   CoapplicantIncome_log  LoanAmount_log  Loan_Amount_Term_Cat  TotalIncome  \
0               0.000000         4.85203                   1.0     2488.125   
1               7.319202         4.85203                   1.0     2488.125   

   TotalIncome_log  Gender_Male  Married_Yes  Self_Employed_Yes  \
0         8.674197         True        False              False   
1         8.714732         True         True              False   

   Property_Area_Semiurban  Property_Area_Urban  
0                    False                 True  
1

In [67]:
encoded_df.head(3)

Unnamed: 0,Dependents,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,ApplicantIncome_log,CoapplicantIncome_log,LoanAmount_log,Loan_Amount_Term_Cat,TotalIncome,TotalIncome_log,Gender_Male,Married_Yes,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,0.0,0.0,1418.75,-1148.625,68.0,360.0,1.0,1,8.674026,0.0,4.85203,1.0,2488.125,8.674197,True,False,False,False,True
1,1.0,0.0,1418.75,-1148.625,68.0,360.0,1.0,0,8.430109,7.319202,4.85203,1.0,2488.125,8.714732,True,True,False,False,False
2,0.0,0.0,1418.75,-1148.625,66.0,360.0,1.0,1,8.006368,0.0,4.189655,1.0,2488.125,8.006701,True,True,True,False,True
