# 🎯 Objective:
Train a model to predict loan approval (binary classification).

Create a pipeline with preprocessing and modeling.

Serialize using joblib, load the model, and simulate deployment.

# 📌 Dataset Used:
We'll use a synthetic loan application dataset that is generated inside this notebook (see Step 1), so no external download is required.



# ✅ Step-by-Step Practical

# 🔧 Step 0: Install Required Libraries

In [1]:
# Step 0: Setup Google Colab Environment
!pip install scikit-learn pandas numpy matplotlib seaborn



# 📥 Step 1: Create a Synthetic Loan Application Dataset

We'll generate a simple dataset to simulate loan application features and an approval status.

In [2]:
# Step 1: Build a reproducible synthetic table of loan applications

import numpy as np
import pandas as pd

# Seed NumPy's global generator so every run draws the same values
np.random.seed(42)

# Row count of the generated table
n_samples = 1000

# Draw one column after another; the draw order determines the generated values,
# so the columns must stay in this sequence.
data = dict(
    ApplicantIncome=np.random.randint(20000, 100000, size=n_samples),
    LoanAmount=np.random.randint(50000, 500000, size=n_samples),
    # 1 = good credit history (probability 0.8), 0 = bad (probability 0.2)
    CreditHistory=np.random.choice([0, 1], size=n_samples, p=[0.2, 0.8]),
    Education=np.random.choice(['Graduate', 'Not Graduate'], size=n_samples),
    SelfEmployed=np.random.choice(['Yes', 'No'], size=n_samples),
    PropertyArea=np.random.choice(['Urban', 'Semiurban', 'Rural'], size=n_samples),
)
df = pd.DataFrame(data)



In [3]:
# Derive the target column 'Loan_Status' (1 = approved, 0 = not approved).
# A row is approved when it satisfies at least one of the two rules below;
# every other row stays at 0.
good_credit = df['CreditHistory'] == 1

# Rule 1: good credit history, income above 40000, loan below 300000
rule_income = (
    good_credit
    & (df['ApplicantIncome'] > 40000)
    & (df['LoanAmount'] < 300000)
)

# Rule 2: self-employed graduates with good credit history and a loan below 200000
rule_self_employed_graduate = (
    good_credit
    & (df['Education'] == 'Graduate')
    & (df['SelfEmployed'] == 'Yes')
    & (df['LoanAmount'] < 200000)
)

# Boolean OR of the rules, stored as 0/1 integers
df['Loan_Status'] = (rule_income | rule_self_employed_graduate).astype('int64')



In [4]:
# Introduce noise: pick 10% of the rows at random (without repeats) and invert
# their label, so 0 becomes 1 and 1 becomes 0.
noise_indices = np.random.choice(df.index, size=int(n_samples * 0.1), replace=False)
flipped_status = 1 - df.loc[noise_indices, 'Loan_Status']
df.loc[noise_indices, 'Loan_Status'] = flipped_status

# Summarise the finished dataset: first rows, class balance, column dtypes
print("Synthetic Loan Application Dataset Head:", df.head(), sep="\n")
print("\nLoan Status Distribution:", df['Loan_Status'].value_counts(), sep="\n")
print("\nDataset Info:")
df.info()

Synthetic Loan Application Dataset Head:
   ApplicantIncome  LoanAmount  CreditHistory     Education SelfEmployed  \
0            35795      289690              1  Not Graduate          Yes   
1            20860      189649              1  Not Graduate          Yes   
2            96820       58070              1  Not Graduate           No   
3            74886      176161              1  Not Graduate          Yes   
4            26265       58666              0      Graduate          Yes   

  PropertyArea  Loan_Status  
0        Urban            0  
1        Rural            1  
2        Rural            1  
3        Rural            1  
4        Rural            1  

Loan Status Distribution:
Loan_Status
0    629
1    371
Name: count, dtype: int64

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ApplicantIncome  1000 no

# 📂 Step 2: Build a Machine Learning Pipeline
We'll build a scikit-learn pipeline that scales the numerical features, one-hot encodes the categorical features, and then feeds the result to a classifier.

In [5]:
# Step 2: Build a Machine Learning Pipeline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib # For serialization

# Separate the label column (y) from the predictor columns (X)
y = df['Loan_Status']
X = df.drop(columns=['Loan_Status'])




In [6]:
# Identify numerical and categorical features by dtype.
# Text columns are matched under both the classic 'object' dtype and the
# dedicated pandas string dtype, so the selection does not depend on how the
# installed pandas version stores strings.
numerical_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include=['object', 'string']).columns

# ColumnTransformer drops any column it is not told about (remainder='drop' is
# the default), so fail loudly if a feature was not routed to a transformer.
unassigned_features = set(X.columns) - set(numerical_features) - set(categorical_features)
assert not unassigned_features, (
    f"Columns not assigned to any transformer: {sorted(unassigned_features)}"
)

print(f"\nNumerical Features: {list(numerical_features)}")
print(f"Categorical Features: {list(categorical_features)}")

# One transformer per feature group
numerical_transformer = StandardScaler()  # Scale numerical features
# handle_unknown='ignore': categories unseen during fit do not raise at predict time
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a preprocessor using ColumnTransformer
# This applies each transformer only to its own list of columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])



Numerical Features: ['ApplicantIncome', 'LoanAmount', 'CreditHistory']
Categorical Features: ['Education', 'SelfEmployed', 'PropertyArea']


In [7]:
# Chain preprocessing and the classifier into a single estimator so that both
# are fitted, and later applied, together.
log_reg = LogisticRegression(solver='liblinear', random_state=42)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', log_reg),
])

print("\nMachine Learning Pipeline created successfully.", model_pipeline, sep="\n")

# Hold out 20% of the rows for testing; the split is seeded and stratified on y
train_test_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_parts

# Fit the preprocessing steps and the classifier on the training rows only
print("\nTraining the model pipeline...")
model_pipeline.fit(X_train, y_train)
print("Model pipeline training complete.")




Machine Learning Pipeline created successfully.
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  Index(['ApplicantIncome', 'LoanAmount', 'CreditHistory'], dtype='object')),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['Education', 'SelfEmployed', 'PropertyArea'], dtype='object'))])),
                ('classifier',
                 LogisticRegression(random_state=42, solver='liblinear'))])

Training the model pipeline...
Model pipeline training complete.


In [8]:
# Score the fitted pipeline on the held-out test rows
y_pred = model_pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print(f"\nPipeline Accuracy on Test Set: {test_accuracy:.4f}")
print("\nClassification Report on Test Set:", test_report, sep="\n")


Pipeline Accuracy on Test Set: 0.8750

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.87      0.94      0.90       126
           1       0.88      0.77      0.82        74

    accuracy                           0.88       200
   macro avg       0.88      0.85      0.86       200
weighted avg       0.88      0.88      0.87       200



# Step 3: Serialize (Save) the Model Pipeline with joblib
Saving the entire trained pipeline ensures that both the preprocessing logic and the trained model weights are preserved.

In [9]:
# Step 3: Persist the fitted pipeline to disk with joblib

# Target file name, relative to the current working directory
pipeline_filename = 'loan_approval_pipeline.pkl'

# joblib.dump() writes the whole pipeline object (preprocessing + classifier)
joblib.dump(model_pipeline, filename=pipeline_filename)

print(
    f"\nModel pipeline successfully serialized and saved as '{pipeline_filename}'",
    "You can verify its presence in the Colab file browser (left sidebar).",
    sep="\n",
)


Model pipeline successfully serialized and saved as 'loan_approval_pipeline.pkl'
You can verify its presence in the Colab file browser (left sidebar).


# Step 4: Deploy (Load and Use) the Model Pipeline
This step simulates the "deployment" scenario. We'll load the saved pipeline and use it to make predictions on new, unseen loan applications.

In [13]:
# Step 4: Deploy (Load and Use) the Model Pipeline

# Define the filename of the saved pipeline
pipeline_filename = 'loan_approval_pipeline.pkl'

# Load the serialized pipeline using joblib.load().
# SECURITY: joblib files are pickle-based, and unpickling can execute arbitrary
# code. Only load files you created yourself or otherwise trust.
try:
    loaded_pipeline = joblib.load(pipeline_filename)
    print(f"\nModel pipeline successfully loaded from '{pipeline_filename}'")
except FileNotFoundError:
    print(f"Error: Pipeline file '{pipeline_filename}' not found. Make sure Step 3 was executed.")
    loaded_pipeline = None  # Sentinel: the prediction demo below is skipped

# Compare against None explicitly. Pipeline defines __len__, so a bare
# `if loaded_pipeline:` would be decided by the number of steps rather than by
# whether loading succeeded.
if loaded_pipeline is not None:
    # Simulate new, unseen loan application data.
    # IMPORTANT: it must provide the same feature columns as the training data.
    new_loan_applicants = pd.DataFrame({
        'ApplicantIncome': [60000, 35000, 80000, 25000],
        'LoanAmount': [200000, 100000, 450000, 70000],
        'CreditHistory': [1, 0, 1, 1],
        'Education': ['Graduate', 'Not Graduate', 'Graduate', 'Graduate'],
        'SelfEmployed': ['No', 'Yes', 'No', 'No'],
        'PropertyArea': ['Urban', 'Rural', 'Semiurban', 'Urban']
    })

    print("\nNew Loan Applicants Data:")
    print(new_loan_applicants)

    # Make predictions using the loaded pipeline.
    # The pipeline preprocesses new_loan_applicants itself before the data
    # reaches the classifier, so the raw frame is passed in directly.
    predictions = loaded_pipeline.predict(new_loan_applicants)
    prediction_probabilities = loaded_pipeline.predict_proba(new_loan_applicants)

    # Map numerical predictions to meaningful labels
    loan_status_map = {0: 'Not Approved', 1: 'Approved'}
    predicted_statuses = [loan_status_map[pred] for pred in predictions]

    print("\nLoan Approval Predictions for New Applicants:")
    # Walk the input columns and the model outputs in lockstep instead of
    # doing a positional row lookup for every applicant.
    applicant_results = zip(
        new_loan_applicants['ApplicantIncome'],
        new_loan_applicants['LoanAmount'],
        predicted_statuses,
        prediction_probabilities,
    )
    for applicant_number, (applicant_income, loan_amount, status, probabilities) in enumerate(
        applicant_results, start=1
    ):
        # Each probability row holds [P(class 0), P(class 1)]; convert to percent
        prob_not_approved = probabilities[0] * 100
        prob_approved = probabilities[1] * 100

        print(f"  Applicant {applicant_number} (Income: {applicant_income}, Loan: {loan_amount}):")
        print(f"    -> Predicted Status: {status}")
        print(f"    -> Probability (Not Approved): {prob_not_approved:.2f}%")
        print(f"    -> Probability (Approved): {prob_approved:.2f}%")



Model pipeline successfully loaded from 'loan_approval_pipeline.pkl'

New Loan Applicants Data:
   ApplicantIncome  LoanAmount  CreditHistory     Education SelfEmployed  \
0            60000      200000              1      Graduate           No   
1            35000      100000              0  Not Graduate          Yes   
2            80000      450000              1      Graduate           No   
3            25000       70000              1      Graduate           No   

  PropertyArea  
0        Urban  
1        Rural  
2    Semiurban  
3        Urban  

Loan Approval Predictions for New Applicants:
  Applicant 1 (Income: 60000, Loan: 200000):
    -> Predicted Status: Approved
    -> Probability (Not Approved): 39.08%
    -> Probability (Approved): 60.92%
  Applicant 2 (Income: 35000, Loan: 100000):
    -> Predicted Status: Not Approved
    -> Probability (Not Approved): 71.52%
    -> Probability (Approved): 28.48%
  Applicant 3 (Income: 80000, Loan: 450000):
    -> Predicted Status