In [None]:
import sys
import os

# Root directory of the project. It is put on sys.path so the `src` package
# imported in the next cell can be resolved.
project_root = '' # Give path of your root directory

# Guarded so that re-running this cell does not keep stacking duplicate
# entries onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.data_loader import load_data # To load the data from previous step
from src.model import train_risk_model, predict_risk_scores, evaluate_risk_model, assign_recovery_strategy, save_model # Import functions from src/model.py

In [None]:
# Load the segmented and flagged dataset saved by the previous step.
# The path is assembled with os.path.join (no hard-coded backslashes) and
# anchored at project_root; with an empty project_root it stays relative to
# the current working directory, exactly as before.
processed_data_path = os.path.join(project_root, 'data', 'processed', 'segmented_and_flagged_data.csv')
df = load_data(processed_data_path)

# Quick sanity check of what was loaded: first rows and the full column list.
print("DataFrame loaded for model training. Head:")
print(df.head())
print("\nColumns after load:", df.columns.tolist())

DataFrame loaded for model training. Head:
  Borrower_ID  Age  Gender  Monthly_Income  Num_Dependents Loan_ID  \
0       BRW_1   59       0          215422               0    LN_1   
1       BRW_2   49       1           60893               0    LN_2   
2       BRW_3   35       0          116520               1    LN_3   
3       BRW_4   63       1          140818               2    LN_4   
4       BRW_5   28       0           76272               1    LN_5   

   Loan_Amount  Loan_Tenure  Interest_Rate  Loan_Type  ...  \
0      1445796           60          12.39          0  ...   
1      1044620           12          13.47          1  ...   
2      1923410           72           7.74          0  ...   
3      1811663           36          12.23          0  ...   
4        88578           48          16.13          2  ...   

   Employment_Type_Self-Employed  Collection_Method_Debt Collectors  \
0                          False                              False   
1                    

In [None]:
# Candidate input columns for the Random Forest risk model, in the order they
# are handed to the model.
X_features_for_rf = [
    # Borrower and loan attributes plus derived ratios
    'Age',
    'Monthly_Income',
    'Num_Dependents',
    'Loan_Amount',
    'Loan_Tenure',
    'Interest_Rate',
    'Collateral_Value',
    'Outstanding_Loan_Amount',
    'Monthly_EMI',
    'Num_Missed_Payments',
    'Days_Past_Due',
    'Collection_Attempts',
    'Loan_to_income_ratio',
    'EMI_to_income_ratio',
    'Has_Collateral',
    # Single-column categorical fields
    'Gender',
    'Payment_History',
    'Loan_Type',
    'Legal_Action_Taken',
    # Indicator columns for employment type
    'Employment_Type_Salaried',
    'Employment_Type_Self-Employed',
    # Indicator columns for collection method
    'Collection_Method_Debt Collectors',
    'Collection_Method_Legal Notice',
    'Collection_Method_Settlement Offer',
]

In [None]:
# Report every requested feature that is absent from df, then drop them all.
# The missing names are collected first: removing items from a list while
# iterating over that same list skips the element following each removal, so
# two consecutive missing features would leave the second one in place.
missing_features = [feature for feature in X_features_for_rf if feature not in df.columns]
for feature in missing_features:
    print(f"ERROR: Feature '{feature}' not found in DataFrame for Random Forest. Please check your preprocessing.")

# Slice assignment updates the existing list object in place, so the name
# X_features_for_rf keeps referring to the same (now filtered) list.
X_features_for_rf[:] = [feature for feature in X_features_for_rf if feature in df.columns]

In [6]:
# Feature matrix: only the columns listed in X_features_for_rf.
X = df[X_features_for_rf]
# Target: the High_Risk_Flag column (defined by the K-Means segments).
y = df['High_Risk_Flag']

# Summarise the modelling inputs: feature names, target name, class balance.
print(
    f"\nFeatures selected for Random Forest (X): {X.columns.tolist()}",
    f"Target variable (y): {y.name}",
    f"Target distribution:\n{y.value_counts()}",
    sep="\n",
)


Features selected for Random Forest (X): ['Age', 'Monthly_Income', 'Num_Dependents', 'Loan_Amount', 'Loan_Tenure', 'Interest_Rate', 'Collateral_Value', 'Outstanding_Loan_Amount', 'Monthly_EMI', 'Num_Missed_Payments', 'Days_Past_Due', 'Collection_Attempts', 'Loan_to_income_ratio', 'EMI_to_income_ratio', 'Has_Collateral', 'Gender', 'Payment_History', 'Loan_Type', 'Legal_Action_Taken', 'Employment_Type_Salaried', 'Employment_Type_Self-Employed', 'Collection_Method_Debt Collectors', 'Collection_Method_Legal Notice', 'Collection_Method_Settlement Offer']
Target variable (y): High_Risk_Flag
Target distribution:
High_Risk_Flag
1    335
0    165
Name: count, dtype: int64


In [7]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# parts keep the same class proportions, and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(
    f"\nTrain set size: {len(X_train)} samples",
    f"Test set size: {len(X_test)} samples",
    sep="\n",
)


Train set size: 400 samples
Test set size: 100 samples


In [None]:

# Show the exact columns (and their order) the model will be trained on.
print(list(X_train.columns))

['Age', 'Monthly_Income', 'Num_Dependents', 'Loan_Amount', 'Loan_Tenure', 'Interest_Rate', 'Collateral_Value', 'Outstanding_Loan_Amount', 'Monthly_EMI', 'Num_Missed_Payments', 'Days_Past_Due', 'Collection_Attempts', 'Loan_to_income_ratio', 'EMI_to_income_ratio', 'Has_Collateral', 'Gender', 'Payment_History', 'Loan_Type', 'Legal_Action_Taken', 'Employment_Type_Salaried', 'Employment_Type_Self-Employed', 'Collection_Method_Debt Collectors', 'Collection_Method_Legal Notice', 'Collection_Method_Settlement Offer']


In [9]:
# Fit the risk model on the training split via the project helper,
# using 100 estimators and a fixed seed.
rf_model = train_risk_model(
    X_train,
    y_train,
    n_estimators=100,
    random_state=42,
)
print("\nRandom Forest model trained.")


Random Forest model trained.


In [None]:
# Destination for the trained model. Built with os.path.join (no hard-coded
# backslashes) and anchored at project_root; with an empty project_root it
# stays relative to the current working directory, exactly as before.
model_path = os.path.join(project_root, 'outputs', 'models', 'random_forest_risk_model.joblib') # save your model
save_model(rf_model, model_path)

Model saved to C:\Loan Recovery ML Project\outputs\models\random_forest_risk_model.joblib


In [11]:
# Score the held-out rows with the trained model via the project helper.
risk_scores_test = predict_risk_scores(
    rf_model,
    X_test,
)
print("\nRisk scores predicted on test set.")


Risk scores predicted on test set.


In [None]:
# Work on a fresh frame (assign returns a new DataFrame, leaving X_test
# untouched) carrying the test features plus the predicted risk score.
df_test_results = X_test.assign(Risk_Score=risk_scores_test)

# Binary prediction: 1 when the score is strictly above 0.5, otherwise 0.
df_test_results['Predicted_High_Risk'] = df_test_results['Risk_Score'].gt(0.5).astype(int)

In [13]:
# Report the model's metrics on the test split via the project helper,
# using a 0.5 decision threshold on the predicted scores.
print("\n--- Model Performance Evaluation ---")
evaluate_risk_model(
    y_test,
    risk_scores_test,
    threshold=0.5,
)


--- Model Performance Evaluation ---
--- Model Evaluation ---
Accuracy: 0.9500
ROC AUC Score: 0.9959

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        33
           1       0.93      1.00      0.96        67

    accuracy                           0.95       100
   macro avg       0.97      0.92      0.94       100
weighted avg       0.95      0.95      0.95       100


Confusion Matrix:
[[28  5]
 [ 0 67]]


In [None]:
# Context columns from the full dataset to show alongside each test-set prediction.
columns_to_merge = ['Borrower_ID', 'Segment_Name', 'Recovery_Status', 'Collection_Method', 'Collection_Attempts', 'Legal_Action_Taken']

# Keep only the requested columns that df actually has.
existing_cols_to_merge = [col for col in columns_to_merge if col in df.columns]

# Skip columns df_test_results already carries (e.g. 'Collection_Attempts' and
# 'Legal_Action_Taken' are model features). Merging them again would produce
# '_x'/'_y' suffixed duplicates, and would compound on every re-run of this cell.
new_cols_to_merge = [col for col in existing_cols_to_merge if col not in df_test_results.columns]

# Index-aligned left join: test rows keep their original df index, so each
# prediction picks up the context columns of the same row.
df_test_results = df_test_results.join(df[new_cols_to_merge], how='left')

# Map each risk score to a recovery strategy using the project helper.
df_test_results['Recovery_Strategy'] = df_test_results['Risk_Score'].apply(assign_recovery_strategy)

print("\n--- Sample of Test Results with Recovery Strategy ---")
# Show only the sample columns that are really present, so an optional context
# column missing from df does not raise a KeyError here.
sample_columns = ['Borrower_ID', 'Segment_Name', 'Risk_Score', 'Predicted_High_Risk', 'Recovery_Strategy', 'Recovery_Status']
print(df_test_results[[col for col in sample_columns if col in df_test_results.columns]].head())

# Save the report under project_root instead of a machine-specific absolute
# path; create the target folder first so the write cannot fail on a missing directory.
results_path = os.path.join(project_root, 'outputs', 'reports', 'test_results_with_strategies.csv')
os.makedirs(os.path.dirname(results_path), exist_ok=True)
df_test_results.to_csv(results_path, index=False) # save this one too.


--- Sample of Test Results with Recovery Strategy ---
    Borrower_ID                              Segment_Name  Risk_Score  \
327     BRW_328     High Missed Payments, Behavioral Risk        0.91   
270     BRW_271           High Loan Burden, Critical Risk        0.91   
477     BRW_478           High Loan Burden, Critical Risk        0.99   
177     BRW_178     High Income, Large Loan, Managed Risk        0.09   
1         BRW_2  Stable Income, Manageable Loan, Low Risk        0.17   

     Predicted_High_Risk                                  Recovery_Strategy  \
327                    1  Immediate legal notices & aggressive recovery ...   
270                    1  Immediate legal notices & aggressive recovery ...   
477                    1  Immediate legal notices & aggressive recovery ...   
177                    0                   Automated reminders & monitoring   
1                      0                   Automated reminders & monitoring   

     Recovery_Status  
327     