# Loan Prediction - End-to-End ML Pipeline

### Libraries to be imported for the project

In [19]:
import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

**1. Load Dataset**

In [4]:
# Read the training CSV into a DataFrame.
# NOTE(review): the path is absolute and points into /content; the notebook
# only runs where the file exists at exactly this location.
data = pd.read_csv(
    filepath_or_buffer='/content/train_u6lujuX_CVtuZ9i (1).csv',
)
# Preview the first ten rows.
data.head(n=10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


# The data analysis process


In [5]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(614, 13)

In [6]:
# Number of missing (NaN) values in each column.
data.isna().sum()

Unnamed: 0,0
Loan_ID,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14


In [7]:
# Inspect 20 randomly chosen rows. The fixed seed makes the displayed sample
# identical on every Restart & Run All instead of changing with each run.
data.sample(20, random_state=42)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
203,LP001688,Male,Yes,1,Not Graduate,No,3500,1083.0,135.0,360.0,1.0,Urban,Y
518,LP002683,Male,No,0,Graduate,No,4683,1915.0,185.0,360.0,1.0,Semiurban,N
325,LP002067,Male,Yes,1,Graduate,Yes,8666,4983.0,376.0,360.0,0.0,Rural,N
63,LP001213,Male,Yes,1,Graduate,No,4945,0.0,,360.0,0.0,Rural,N
215,LP001720,Male,Yes,3+,Not Graduate,No,3850,983.0,100.0,360.0,1.0,Semiurban,Y
12,LP001028,Male,Yes,2,Graduate,No,3073,8106.0,200.0,360.0,1.0,Urban,Y
365,LP002181,Male,No,0,Not Graduate,No,6216,0.0,133.0,360.0,1.0,Rural,N
328,LP002086,Female,Yes,0,Graduate,No,4333,2451.0,110.0,360.0,1.0,Urban,N
385,LP002237,Male,No,1,Graduate,,3667,0.0,113.0,180.0,1.0,Urban,Y
104,LP001357,Male,,,Graduate,No,3816,754.0,160.0,360.0,1.0,Urban,Y


In [8]:
# One line per column: name, dtype, and number of missing values.
for col_name, series in data.items():
    print(col_name, series.dtype, series.isna().sum())


Loan_ID object 0
Gender object 13
Married object 3
Dependents object 15
Education object 0
Self_Employed object 32
ApplicantIncome int64 0
CoapplicantIncome float64 0
LoanAmount float64 22
Loan_Amount_Term float64 14
Credit_History float64 50
Property_Area object 0
Loan_Status object 0


In [9]:
# Print the frame summary: index range, per-column non-null counts and dtypes,
# and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


**2. Basic Cleaning**

Convert the `'3+'` value in the `Dependents` column to a number so the whole column can be treated as numeric.

In [10]:
# Map the '3+' label to 3, then cast the whole column to float
# (missing entries stay NaN and are imputed later in the pipeline).
data['Dependents'] = (
    data['Dependents']
    .replace({'3+': 3})
    .astype(float)
)

**3. Separate Features and Target**

In [11]:
# Binary target: 'Y' -> 1, 'N' -> 0.
y = data['Loan_Status'].map({'Y': 1, 'N': 0})

# Feature matrix: every column except the target and the Loan_ID column.
X = data.drop(columns=['Loan_Status', 'Loan_ID'])

**4. Define Feature Groups**

In [12]:
# Column groups routed to the different preprocessing pipelines.
# The order inside each list fixes the column order of the transformed matrix,
# so it is kept stable.

# Columns handled by the numeric pipeline
num_features = [
    'ApplicantIncome', 'CoapplicantIncome',
    'LoanAmount', 'Loan_Amount_Term',
    'Credit_History', 'Dependents',
]

# Categorical columns handled by the ordinal-encoding pipeline
bin_features = ['Gender', 'Married', 'Education', 'Self_Employed']

# Categorical columns handled by the one-hot-encoding pipeline
ohe_features = [
    'Property_Area',
]


**5. Build Transformers**

In [13]:
# Numeric pipeline: median imputation, then standard scaling
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Binary categorical pipeline: mode imputation, then ordinal encoding.
# Categories not seen during fit are encoded as -1 instead of raising, so the
# fitted pipeline does not fail at prediction time on an unexpected label.
# This mirrors handle_unknown='ignore' on the one-hot encoder below.
bin_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Nominal categorical pipeline: mode imputation, then one-hot encoding.
# Unknown categories are ignored (all-zero row); output is a dense array.
ohe_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


**6. ColumnTransformer**

In [14]:
# Route each column group through its own pipeline. Columns not listed in any
# group are dropped (the ColumnTransformer default, spelled out here).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('bin', bin_transformer, bin_features),
        ('ohe', ohe_transformer, ohe_features),
    ],
    remainder='drop',
)

**7. Train/Test Split**

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


**8. Logistic Regression Pipeline**

In [16]:
# Preprocessing + logistic regression in a single estimator.
# The pipeline gets its own copy of the preprocessing steps: Pipeline does not
# copy the estimators it is given, so passing `preprocessor` directly would
# share one ColumnTransformer instance with any other pipeline built from it,
# and fitting that other pipeline would refit the preprocessing of this model.
logreg_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Fit on the training split, then predict the held-out split.
logreg_pipeline.fit(X_train, y_train)
y_pred_lr = logreg_pipeline.predict(X_test)

print("LOGISTIC REGRESSION RESULTS:\n")
print(classification_report(y_test, y_pred_lr))

LOGISTIC REGRESSION RESULTS:

              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123



**9. Random Forest Pipeline**

In [17]:
# Preprocessing + random forest in a single estimator.
# The pipeline gets its own copy of the preprocessing steps: Pipeline does not
# copy the estimators it is given, so passing `preprocessor` directly would
# share one ColumnTransformer instance with any other pipeline built from it,
# and fitting this pipeline would refit the preprocessing of those models too.
rf_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit on the training split, then predict the held-out split.
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

print("RANDOM FOREST RESULTS:\n")
print(classification_report(y_test, y_pred_rf))

RANDOM FOREST RESULTS:

              precision    recall  f1-score   support

           0       0.78      0.42      0.55        43
           1       0.75      0.94      0.83        80

    accuracy                           0.76       123
   macro avg       0.77      0.68      0.69       123
weighted avg       0.76      0.76      0.73       123



In [20]:
# Persist the fitted logistic-regression pipeline (preprocessing + classifier)
# to disk in the current working directory.
# NOTE(review): "predication" in the filename looks like a typo for
# "prediction"; it is kept as-is because other code may load the file by name.
joblib.dump(
    value=logreg_pipeline,
    filename='loan_predication_model.joblib',
)

['loan_predication_model.joblib']