In [1]:
# STEP 1: IMPORT LIBRARIES
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# STEP 2: IMPORT DATASET
# Load the CSV and discard 'customer_id' in one go; it is not useful for prediction.
dataset = pd.read_csv('Loan_approval_data_2025.csv').drop(columns=['customer_id'])

# Features (X) are every column except the last one; the target (y) is the last column.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [3]:
# STEP 3: HANDLING MISSING DATA
# We use SimpleImputer to replace missing incomes with the Mean (Average) Income.

# Positional index of 'annual_income' in X ('customer_id' was already dropped in
# Step 2). Re-check against dataset.columns if the CSV layout ever changes.
INCOME_COL = 3

# NOTE(review): the mean is computed over ALL rows here, i.e. before the
# train/test split in Step 5, so test rows influence the fill value. For a strict
# evaluation, split first and fit the imputer on the training rows only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# We apply the imputer ONLY to the numerical column with missing data.
# The 2-D slice (INCOME_COL:INCOME_COL + 1) keeps the (n_rows, 1) shape the imputer expects.
X[:, INCOME_COL:INCOME_COL + 1] = imputer.fit_transform(X[:, INCOME_COL:INCOME_COL + 1])

# Actually verify that no NaN values remain in the imputed column before reporting
# success. pd.isna is used because X is built from mixed-type columns, where
# np.isnan would not accept the array.
assert not pd.isna(X[:, INCOME_COL]).any(), "annual_income still contains missing values"
print("Missing values handled.")

Missing values handled.


In [4]:
# STEP 4: ENCODING CATEGORICAL DATA
# Columns with text:
# Index 1: occupation_status
# Index 11: product_type
# Index 12: loan_intent

# One-hot encode the three text columns; every other column is passed through
# unchanged and appended after the encoded columns.
#
# sparse_threshold=0 forces ColumnTransformer to always return a dense array.
# With the default threshold it may return a scipy sparse matrix when the stacked
# output is sparse enough, and np.array() on a sparse matrix produces a 0-d object
# array instead of the expected 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1, 11, 12])],
    remainder='passthrough',
    sparse_threshold=0,
)

# NOTE: this overwrites X, so the column indices above are only valid on the
# un-encoded X. Re-run from Step 2 before re-running this cell.
X = np.array(ct.fit_transform(X))


In [5]:
# STEP 5: SPLITTING THE DATASET
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [6]:
# STEP 6: FEATURE SCALING
# The scaler learns its statistics from the training rows only; the same fitted
# scaler is then applied to both sets (fit on Train, transform on Test, to avoid
# data leakage).
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Report completion and the number of rows in each split.
print("Preprocessing Complete. Ready for Model Training.")
print(f"Training Data Size: {len(X_train)}")
print(f"Testing Data Size: {len(X_test)}")

Preprocessing Complete. Ready for Model Training.
Training Data Size: 518
Testing Data Size: 130


In [7]:
# SANITY CHECK: count the missing values in each column of the encoded feature matrix X
print("Missing values after cleaning:")
missing_per_column = pd.DataFrame(X).isna().sum()
print(missing_per_column)
# Every count should be zero (0), proving the matrix is clean.

Missing values after cleaning:
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
dtype: int64
