In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import shap

In [2]:
data = pd.read_csv('loan_approval_dataset.csv')
data.columns = data.columns.str.strip()
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,age,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,3,1,1,79,1471000,18700,13,652,81000,15200,47700,960700,0
1,2,3,1,0,24,933000,34880,16,806,41300,20500,23200,499500,1
2,3,5,1,1,41,800000,36290,17,524,47200,37100,32100,241600,1
3,4,1,0,1,28,1281000,6240,28,788,9100,21000,8700,449400,1
4,5,2,1,1,45,1171000,7870,13,355,74100,78400,13500,35100,0


In [3]:
data.drop(columns=['loan_id'], inplace=True)

In [4]:
data['total_assets'] = (
    data['residential_assets_value'] +
    data['commercial_assets_value'] +
    data['luxury_assets_value'] +
    data['bank_asset_value']
)
data.drop(columns=['residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value'], inplace=True)

In [5]:
X = data.drop(columns=['loan_status'])
y = data['loan_status']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

In [7]:
model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=12)
model.fit(X_train, y_train)

In [8]:
y_pred = model.predict(X_test)
print(f"Model accuracy: {accuracy_score(y_test, y_pred):.2f}")

Model accuracy: 0.95


In [9]:
confusion_matrix(y_test,y_pred)

array([[1431,  153],
       [   0, 1216]], dtype=int64)

In [10]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

In [11]:
def get_user_input():
    user_input = {}
    user_input['no_of_dependents'] = int(input("Enter number of dependents: "))
    user_input['education'] = int(input("Enter education level (0 for others, 1 for higher education): "))
    user_input['self_employed'] = int(input("Enter employment status (0 for not self-employed, 1 for self-employed): "))
    user_input['age'] = int(input("Enter age: "))
    user_input['income_annum'] = int(input("Enter annual income in Rs: "))
    user_input['loan_amount'] = int(input("Enter loan amount in Rs: "))
    user_input['loan_term'] = int(input("Enter loan term in Months: "))
    user_input['cibil_score'] = int(input("Enter CIBIL score: "))
    user_input['total_assets'] = int(input("Enter total assets value in Rs: "))
    
    return pd.Series(user_input)

def explain_rejection_based_on_shap(feature_importance, user_input):
    rejection_reasons = []

    for index, row in feature_importance.iterrows():
        feature = row['Feature']
        feature_value = row['Feature Value']
        shap_value = row['SHAP Value']
        
        if shap_value < 0:  
            if feature == 'no_of_dependents' and user_input['no_of_dependents'] > 1:
                rejection_reasons.append(f"High number of dependents ({user_input['no_of_dependents']}) reduced the approval chances.")
            elif feature == 'education' and user_input['education'] == 0:
                rejection_reasons.append("Lower education level negatively impacted the loan approval.")
            elif feature == 'loan_amount' and user_input['loan_amount'] > 161600:
                rejection_reasons.append(f"Loan amount ({user_input['loan_amount']}) is higher than expected, contributing to rejection.")
            elif feature == 'age' and user_input['age'] < 33:
                rejection_reasons.append(f"Young age ({user_input['age']}) contributed negatively to loan rejection.")
            elif feature == 'cibil_score' and user_input['cibil_score'] < 600:
                rejection_reasons.append(f"Low CIBIL score ({user_input['cibil_score']}) reduced approval chances.")
            else:
                rejection_reasons.append(f"{feature} with value {user_input[feature]} contributed negatively to the decision.")

    if not rejection_reasons:
        rejection_reasons.append("No significant negative contributions to rejection were detected.")
    
    return rejection_reasons

user_input = get_user_input()
user_input_df = pd.DataFrame([user_input])
shap_values_user = explainer.shap_values(user_input_df)
shap_values_for_rejected = shap_values_user[0][:, 0]


feature_importance_user = pd.DataFrame({
    'Feature': X.columns,          
    'Feature Value': user_input_df.values.flatten(),  
    'SHAP Value': shap_values_for_rejected 
})

prediction = model.predict(user_input_df)[0]
probabilities = model.predict_proba(user_input_df)[0]

print(f"Predicted probabilities: Rejected: {probabilities[0]:.2f}, Approved: {probabilities[1]:.2f}")

if prediction == 0:
    print("Loan Rejected")
    
    rejection_reasons = explain_rejection_based_on_shap(feature_importance_user, user_input)

    print("Reasons for Loan Rejection:")
    for reason in rejection_reasons:
        print(reason)
else:
    print("Loan Approved")

Enter number of dependents: 0
Enter education level (0 for others, 1 for higher education): 0
Enter employment status (0 for not self-employed, 1 for self-employed): 0
Enter age: 24
Enter annual income in Rs: 1200000
Enter loan amount in Rs: 2000000
Enter loan term in Months: 23
Enter CIBIL score: 550
Enter total assets value in Rs: 100000
Predicted probabilities: Rejected: 0.67, Approved: 0.33
Loan Rejected
Reasons for Loan Rejection:
no_of_dependents with value 0 contributed negatively to the decision.
Lower education level negatively impacted the loan approval.
Young age (24) contributed negatively to loan rejection.
income_annum with value 1200000 contributed negatively to the decision.
Loan amount (2000000) is higher than expected, contributing to rejection.
Low CIBIL score (550) reduced approval chances.


In [12]:
import joblib
joblib.dump(model, 'random_forest_model.pkl')
print("Model saved successfully to 'random_forest_model.pkl'")

Model saved successfully to 'random_forest_model.pkl'
