In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import mlflow
import mlflow.sklearn

In [2]:
df = pd.read_csv("cleaned_EMI.csv")
df.head(5)

Unnamed: 0,age,gender,marital_status,education,monthly_salary,employment_type,years_of_employment,company_type,house_type,monthly_rent,...,existing_loans,current_emi_amount,credit_score,bank_balance,emergency_fund,emi_scenario,requested_amount,requested_tenure,emi_eligibility,max_monthly_emi
0,38,Female,Married,Professional,82600.0,Private,0.9,Mid-size,Rented,20000.0,...,Yes,23700.0,660.0,303200.0,70200.0,Personal Loan EMI,850000.0,15,Not_Eligible,500.0
1,38,Female,Married,Graduate,21500.0,Private,7.0,MNC,Family,0.0,...,Yes,4100.0,714.0,92500.0,26900.0,E-commerce Shopping EMI,128000.0,19,Not_Eligible,700.0
2,38,Male,Married,Professional,86100.0,Private,5.8,Startup,Own,0.0,...,No,0.0,650.0,672100.0,324200.0,Education EMI,306000.0,16,Eligible,27775.0
3,58,Female,Married,High School,66800.0,Private,2.2,Mid-size,Own,0.0,...,No,0.0,685.0,440900.0,178100.0,Vehicle EMI,304000.0,83,Eligible,16170.0
4,48,Female,Married,Professional,57300.0,Private,3.4,Mid-size,Family,0.0,...,No,0.0,770.0,97300.0,28200.0,Home Appliances EMI,252000.0,7,Not_Eligible,500.0


In [3]:
df.shape

(392899, 27)

In [4]:
# categorical columns  -> Encoding
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392899 entries, 0 to 392898
Data columns (total 27 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   age                     392899 non-null  int64  
 1   gender                  392899 non-null  int64  
 2   marital_status          392899 non-null  int64  
 3   education               392899 non-null  int64  
 4   monthly_salary          392899 non-null  float64
 5   employment_type         392899 non-null  int64  
 6   years_of_employment     392899 non-null  float64
 7   company_type            392899 non-null  int64  
 8   house_type              392899 non-null  int64  
 9   monthly_rent            392899 non-null  float64
 10  family_size             392899 non-null  int64  
 11  dependents              392899 non-null  int64  
 12  school_fees             392899 non-null  float64
 13  college_fees            392899 non-null  float64
 14  travel_expenses     

In [6]:
#Linear Regression Model
# --- Step 1: Prepare features and target ---
X = df.drop('max_monthly_emi', axis=1)
y = df['max_monthly_emi']

# --- Step 2: Split data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Step 3: Set MLflow experiment (optional) ---
mlflow.set_experiment("EMI_Prediction")

# --- Step 4: Start MLflow Run ---
with mlflow.start_run(run_name="LinearRegression_Model"):

    # Train model
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    # Predict
    y_pred = lr.predict(X_test)

    # Evaluate
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    # --- Log parameters (if any) ---
    mlflow.log_param("model_type", "Linear Regression")
    mlflow.log_param("test_size", 0.2)

    # --- Log metrics ---
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("mape", mape)

    # --- Log the model ---
    mlflow.sklearn.log_model(lr, "linear_regression_model")

    # --- Print summary ---
    print(f"MSE   : {mse:.2f}")
    print(f"RMSE  : {rmse:.2f}")
    print(f"R²    : {r2:.3f}")
    print(f"MAPE  : {mape:.3f}")

# --- Optional: View actual vs predicted ---
comparison = pd.DataFrame({
    "Actual": y_test.values,
    "Predicted": y_pred
})
print(comparison.head(10))

2025/11/06 22:27:56 INFO mlflow.tracking.fluent: Experiment with name 'EMI_Prediction' does not exist. Creating a new experiment.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



MSE   : 14823741.59
RMSE  : 3850.16
R²    : 0.751
MAPE  : 1.684
    Actual     Predicted
0    500.0    662.622373
1  37422.0  26130.744198
2   4080.0   6629.652840
3   2485.0   2917.542988
4  22500.0  16875.085657
5    500.0   7621.962966
6   7459.2   8286.044021
7    500.0    105.564101
8    500.0    876.261592
9   2240.0   3413.479618


In [7]:
# Linear Regression with Hyperparameter Tuning (Ridge Regression)
# # --- Features and target ---
X = df.drop('max_monthly_emi', axis=1)
y = df['max_monthly_emi']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MLflow experiment
mlflow.set_experiment("EMI_Prediction")

# Hyperparameter grid
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100]
}

# Grid search with Ridge
ridge = Ridge()
grid = GridSearchCV(ridge, param_grid, cv=5, scoring='r2', n_jobs=-1)

with mlflow.start_run(run_name="Linear_Regression_Ridge_Regression_Tuning"):

    # Fit GridSearchCV
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # Predict
    y_pred = best_model.predict(X_test)

    # Metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    # Log best hyperparameters
    mlflow.log_param("model_type", "Linear Regression with Ridge")
    mlflow.log_param("test_size", 0.2)
    mlflow.log_params(grid.best_params_)

    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("mape", mape)

    # Log model
    mlflow.sklearn.log_model(best_model, "linear_regression_model with Ridge_tuning")

    # Print summary
    print("Best Hyperparameters:", grid.best_params_)
    print(f"MSE   : {mse:.2f}")
    print(f"RMSE  : {rmse:.2f}")
    print(f"R²    : {r2:.3f}")
    print(f"MAPE  : {mape:.3f}")

# Optional: compare actual vs predicted
comparison = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
print(comparison.head(10))



Best Hyperparameters: {'alpha': 10}
MSE   : 14823749.10
RMSE  : 3850.16
R²    : 0.751
MAPE  : 1.684
    Actual     Predicted
0    500.0    662.861856
1  37422.0  26130.662713
2   4080.0   6629.587724
3   2485.0   2917.787665
4  22500.0  16875.300940
5    500.0   7621.743534
6   7459.2   8286.337577
7    500.0    105.509808
8    500.0    876.355203
9   2240.0   3413.468685


In [8]:
# Random Forest Feature Importances and train it and get results
# --- Features and target ---
X = df.drop('max_monthly_emi', axis=1)
y = df['max_monthly_emi']

# --- Split data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- MLflow experiment ---
mlflow.set_experiment("EMI_Prediction")

# --- Use your best hyperparameters ---
best_params = {
    'max_depth': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 200
}

# --- Initialize model ---
rf_best = RandomForestRegressor(random_state=42, **best_params)

# --- Start MLflow run ---
with mlflow.start_run(run_name="RandomForest_FinalModel_BestParams"):

    # Train model
    rf_best.fit(X_train, y_train)

    # Predict
    y_pred = rf_best.predict(X_test)

    # Evaluate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    # Log parameters and metrics to MLflow
    mlflow.log_param("model_type", "Random Forest Regressor - Final Best Model")
    mlflow.log_params(best_params)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("mape", mape)

    # Log model
    mlflow.sklearn.log_model(rf_best, "random_forest_final_model")

    # --- Feature Importances ---
    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_best.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    print("\nTop 10 Important Features:")
    print(feature_importances.head(10))

    # Save & log feature importances
    feature_importances.to_csv("feature_importances.csv", index=False)
    mlflow.log_artifact("feature_importances.csv")

    # --- Print Performance ---
    print("\nModel Performance:")
    print(f"MSE   : {mse:.2f}")
    print(f"RMSE  : {rmse:.2f}")
    print(f"R²    : {r2:.3f}")
    print(f"MAPE  : {mape:.3f}")

# Optional: view sample predictions
comparison = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
print("\nSample Predictions:")
print(comparison.head(10))




Top 10 Important Features:
                Feature  Importance
25      emi_eligibility    0.343610
4        monthly_salary    0.236287
8            house_type    0.087683
13         college_fees    0.059211
18   current_emi_amount    0.056742
17       existing_loans    0.044075
12          school_fees    0.041905
19         credit_score    0.028555
15  groceries_utilities    0.020518
20         bank_balance    0.020498

Model Performance:
MSE   : 2028240.70
RMSE  : 1424.16
R²    : 0.966
MAPE  : 0.227

Sample Predictions:
    Actual   Predicted
0    500.0    510.0250
1  37422.0  38197.3820
2   4080.0   5898.0650
3   2485.0   1800.6438
4  22500.0  20283.5295
5    500.0    505.8300
6   7459.2   7812.6608
7    500.0    502.3000
8    500.0    500.0000
9   2240.0   2444.4420


In [9]:
import joblib

joblib.dump(rf_best, "random_forest.pkl")

['random_forest.pkl']

In [10]:
# Random Forest with Top 10 Features
top_features = [
    'emi_eligibility',
    'monthly_salary',
    'house_type',
    'college_fees',
    'current_emi_amount',
    'existing_loans',
    'school_fees',
    'credit_score',
    'groceries_utilities',
    'bank_balance'
]

# --- Select features & target ---
X = df[top_features]
y = df['max_monthly_emi']

# --- Split data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Best hyperparameters ---
best_params = {
    'n_estimators': 200,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1
}

# --- MLflow experiment ---
mlflow.set_experiment("EMI_Prediction")

with mlflow.start_run(run_name="RandomForest_Top10Features"):
    
    # Initialize & train model
    rf_light = RandomForestRegressor(random_state=42, n_jobs=-1, **best_params)
    rf_light.fit(X_train, y_train)
    
    # Predict
    y_pred = rf_light.predict(X_test)
    
    # Evaluate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    
    # Log parameters & metrics
    mlflow.log_param("model_type", "Random Forest - Top 10 Features")
    mlflow.log_params(best_params)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("mape", mape)
    
    # Log model
    mlflow.sklearn.log_model(rf_light, "rf_light_model")
    
    # Feature importances
    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_light.feature_importances_
    }).sort_values(by='Importance', ascending=False)
    print("\nTop 10 Feature Importances:")
    print(feature_importances)
    
    # Save & log feature importances
    feature_importances.to_csv("feature_importances_top10.csv", index=False)
    mlflow.log_artifact("feature_importances_top10.csv")
    
    # Print metrics
    print("\nModel Performance:")
    print(f"MSE   : {mse:.2f}")
    print(f"RMSE  : {rmse:.2f}")
    print(f"R²    : {r2:.3f}")
    print(f"MAPE  : {mape:.3f}")

# Optional: compare actual vs predicted
comparison = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
print("\nSample Comparison:")
print(comparison.head(10))




Top 10 Feature Importances:
               Feature  Importance
0      emi_eligibility    0.343405
1       monthly_salary    0.245411
2           house_type    0.089529
3         college_fees    0.062095
4   current_emi_amount    0.057183
6          school_fees    0.048323
5       existing_loans    0.046233
7         credit_score    0.040120
9         bank_balance    0.034404
8  groceries_utilities    0.033296

Model Performance:
MSE   : 4118667.22
RMSE  : 2029.45
R²    : 0.931
MAPE  : 0.339

Sample Comparison:
    Actual    Predicted
0    500.0    510.45500
1  37422.0  34439.14200
2   4080.0   5739.91500
3   2485.0   1366.64175
4  22500.0  18950.68250
5    500.0    505.83000
6   7459.2   8057.21420
7    500.0    503.70000
8    500.0    500.00000
9   2240.0   2314.52410


In [11]:
#Normal Random Forest Regression
# --- Features & target ---
X = df.drop('max_monthly_emi', axis=1)
y = df['max_monthly_emi']

# --- Split data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- MLflow experiment ---
mlflow.set_experiment("EMI_Prediction")

with mlflow.start_run(run_name="RandomForest_AllFeatures"):
    
    # Initialize Random Forest with default parameters
    rf = RandomForestRegressor(random_state=42, n_jobs=-1)
    
    # Train model
    rf.fit(X_train, y_train)
    
    # Predict
    y_pred = rf.predict(X_test)
    
    # Evaluate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    
    # Log parameters & metrics
    mlflow.log_param("model_type", "Random Forest Regressor - All Features")
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("mape", mape)
    
    # Log model
    mlflow.sklearn.log_model(rf, "rf_default_model")
      
    # Print metrics
    print("\nModel Performance:")
    print(f"MSE   : {mse:.2f}")
    print(f"RMSE  : {rmse:.2f}")
    print(f"R²    : {r2:.3f}")
    print(f"MAPE  : {mape:.3f}")

# Optional: compare actual vs predicted
comparison = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
print("\nSample Comparison:")
print(comparison.head(10))




Model Performance:
MSE   : 2043374.91
RMSE  : 1429.47
R²    : 0.966
MAPE  : 0.227

Sample Comparison:
    Actual   Predicted
0    500.0    503.0000
1  37422.0  38253.1780
2   4080.0   5869.1720
3   2485.0   1903.4283
4  22500.0  20426.3100
5    500.0    502.8000
6   7459.2   7753.1440
7    500.0    504.6000
8    500.0    500.0000
9   2240.0   2461.7236


In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['emi_eligibility'] = le.fit_transform(df['emi_eligibility'])
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mapping)

{np.int64(0): np.int64(0), np.int64(1): np.int64(1), np.int64(2): np.int64(2)}
