# Milestone 2: Model Enhancement & Tuning

Loan Approval Prediction - Milestone 2: Model Enhancement & Tuning

This covers:

- Hyperparameter Tuning: Finding optimal model configuration for best performance.
- Cross-Validation: Robust evaluation technique that helps detect overfitting and, when applied correctly, guards against data leakage during model selection.
- Regularization: Techniques to prevent overfitting by penalizing complex models.
- Model Architectures: Experimenting with different algorithms to find the best performer.
- Ensemble Methods: Combining multiple models to improve predictive performance.

In [None]:
# Importing the libraries
import pandas as pd #Used for data manipulation and analysis, especially for working with tabular data (DataFrames and Series).
import numpy as np #Provides support for numerical operations, including arrays, matrices, and mathematical functions.
import matplotlib.pyplot as plt #Used for creating visualizations, such as line plots, bar charts, scatter plots, etc.
import seaborn as sns #It provides attractive statistical graphics like heatmaps, violin plots, and boxplots.
from sklearn.preprocessing import LabelEncoder, StandardScaler # Imports two tools from Scikit-learn's preprocessing module
#LabelEncoder: Converts categorical string labels into numeric form.
#StandardScaler: Standardizes features by removing the mean and scaling to unit variance.
from sklearn.feature_selection import SelectKBest, f_classif #Imports tools for feature selection
from sklearn.decomposition import PCA #Imports Principal Component Analysis (PCA) from Scikit-learn.
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Hold out 20% of the samples as a test set and keep the remaining 80% for
# training. The fixed seed (random_state=42) makes the partition identical on
# every run, so results are reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Task 1: Hyperparameter Tuning using GridSearchCV
#
# Find optimal hyperparameters for the random forest model.
#
# Why we do this:
# - Default parameters are rarely optimal for specific datasets
# - Proper tuning can significantly improve model performance
# - Different problems require different model configurations
# - Helps balance bias-variance tradeoff
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter. The grid search evaluates every
# combination (2 * 3 * 2 = 12 configurations).
param_grid = {
    # Number of trees: more trees are usually more stable but slower to fit.
    'n_estimators': [100, 200],
    # Maximum tree depth: None leaves depth unlimited, 5/10 restrict it.
    'max_depth': [None, 5, 10],
    # Minimum samples needed to split a node: 2 is the library default,
    # 5 is more conservative.
    'min_samples_split': [2, 5],
}

# Exhaustive search scored by accuracy with 5-fold cross-validation. The
# fixed random_state keeps the forest reproducible between runs.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
)

# Fit the grid search to the training data only, so the held-out test split
# stays unseen during tuning.
grid.fit(X_train, y_train)

print('Best parameters:', grid.best_params_)
print('Best cross-validation accuracy:', grid.best_score_)

In [None]:
# Task 2: Cross-Validation
# Re-score the tuned random forest with 5-fold cross-validation over the
# complete dataset (X, y) rather than only the training split.
from sklearn.model_selection import cross_val_score

# GridSearchCV exposes the winning configuration as best_estimator_.
best_rf = grid.best_estimator_

# One score per fold; the mean summarises them in a single number.
cv_scores = cross_val_score(
    best_rf,
    X,
    y,
    cv=5,
)
print(f'Cross-Validation Accuracy Scores: {cv_scores}')
print(f'Average CV Accuracy: {cv_scores.mean()}')

In [None]:
# Task 3: Model Architecture Comparison
# Purpose: compare how different machine learning algorithms perform on the
# loan approval dataset, using the same train/test split for each.
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Candidate models, keyed by the label used in the printed report.
models = {}

# 1. Logistic Regression: simple, interpretable linear baseline.
#    max_iter=1000 gives the solver more iterations to converge.
models['Logistic Regression'] = LogisticRegression(max_iter=1000)

# 2. Random Forest: the tuned estimator selected by GridSearchCV earlier;
#    an ensemble of decision trees that handles non-linear relationships.
models['Random Forest'] = best_rf

# 3. XGBoost: gradient boosting, evaluated with log loss.
#    NOTE(review): use_label_encoder is deprecated in newer XGBoost releases;
#    confirm it is still accepted by the installed version.
models['XGBoost'] = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
)

# Train every candidate on the training split and report test-set accuracy.
for name, model in models.items():
    # fit() returns the fitted estimator, so scoring can be chained onto it.
    acc = model.fit(X_train, y_train).score(X_test, y_test)
    # .4f rounds the printed accuracy to four decimal places.
    print(f'{name} Accuracy: {acc:.4f}')


"""
    Evaluate models using cross-validation and apply regularization.
    
    Why we do this:
    - Cross-validation provides more reliable performance estimates
    - Helps detect overfitting by evaluating on multiple validation sets
    - Regularization prevents overfitting by penalizing complex models
    - Ensures model generalizes well to unseen data
"""