Random Forest Model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
# Reads "loanapproval.csv" (a path relative to the current working directory)
# into data_train, the raw frame that is later passed to clean_data().
data_train = pd.read_csv("loanapproval.csv")

# Function to clean and preprocess the data
def clean_data(df):
    """Return a cleaned version of the raw loan DataFrame.

    The work is done on the frame returned by ``drop``, so the caller's
    ``df`` is not modified.

    Steps, in order:
      1. Drop the ``Id``, ``CITY`` and ``STATE`` columns.
      2. Map ``Risk_Flag`` from 1/0 to the labels "YES"/"NO". As a label
         column it is not selected by the numeric outlier filter in step 4.
      3. Fill missing values: numeric columns with their median, every other
         column (text, categorical, ...) with its most frequent value. A
         non-numeric column with no observed value at all has no mode and is
         left untouched instead of raising.
      4. Remove outliers with the 1.5 * IQR rule, one numeric column at a
         time; each column's bounds are computed on the rows that survived
         the previous columns' filters.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least ``Id``, ``CITY``, ``STATE`` and
        ``Risk_Flag``.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows; surviving rows keep their original index labels.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Drop columns that are not needed for prediction
    df = df.drop(columns=["Id", "CITY", "STATE"])

    # Convert Risk_Flag to categorical values
    df["Risk_Flag"] = df["Risk_Flag"].map({1: "YES", 0: "NO"})

    # Handle missing values. Branch on "is this column numeric?" rather than
    # "is its dtype object?" so categorical / dedicated string dtypes are
    # filled with the mode instead of being sent to median(), which raises.
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].median())
        else:
            modes = df[col].mode()
            # mode() is empty when the column holds no observed value;
            # there is nothing sensible to fill with in that case.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    # Remove outliers using the IQR method for numerical columns
    num_cols = df.select_dtypes(include=[np.number]).columns
    for col in num_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

    return df

# Run the raw frame through clean_data; everything below works on the result
data_train_cleaned = clean_data(data_train)

# Feature groups, split by the preprocessing each group receives
continuous_cols = ["Income", "Age", "Experience", "CURRENT_JOB_YRS", "CURRENT_HOUSE_YRS"]
categorical_cols = ["Married/Single", "House_Ownership", "Car_Ownership", "Profession"]

# Continuous features: replace each value x with log(1 + x) to reduce skewness
for numeric_col in continuous_cols:
    logged = np.log1p(data_train_cleaned[numeric_col])
    data_train_cleaned[numeric_col] = logged

# Categorical features: one LabelEncoder per column, kept in label_encoders
# so the same fitted mapping can be re-applied to new inputs later
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in label_encoders.items():
    data_train_cleaned[name] = encoder.fit_transform(data_train_cleaned[name])

# Target: fit a dedicated encoder on Risk_Flag, then store the integer codes
target_encoder = LabelEncoder().fit(data_train_cleaned["Risk_Flag"])
data_train_cleaned["Risk_Flag"] = target_encoder.transform(data_train_cleaned["Risk_Flag"])

# Define features (X) and target (y)
X = data_train_cleaned.drop(columns=['Risk_Flag'])
y = data_train_cleaned['Risk_Flag']

# Check class distribution in the target variable
print("Class distribution:")
print(y.value_counts(normalize=True))

# Split the data into training and test sets (80% / 20%).
# The target is imbalanced (roughly 88% / 12% in the recorded output), so the
# split is stratified on y: both partitions keep the same class proportions
# and the minority-class metrics are not skewed by an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Fit a Random Forest on the training partition; random_state=0 fixes the seed
model_rf = RandomForestClassifier(random_state=0)
model_rf = model_rf.fit(X_train, y_train)  # fit() returns the estimator itself

# Class predictions for the held-out partition
y_pred_rf = model_rf.predict(X_test)

# Report per-class precision/recall/F1, then overall accuracy to 4 decimals
report_text = classification_report(y_test, y_pred_rf)
print("\nRandom Forest Model Evaluation:")
print(report_text)
print(f"Accuracy Score: {accuracy_score(y_test, y_pred_rf):.4f}")

# Function to predict loan risk for a single input
def predict_loan_risk(input_data):
    """Predict the loan-risk label and probability for a single applicant.

    The input goes through the same preprocessing as the training data:
    log1p on ``continuous_cols`` and the fitted ``label_encoders`` on
    ``categorical_cols``. It is then scored by ``model_rf``.

    Parameters
    ----------
    input_data : dict
        One record mapping every training feature name (``X.columns``) to its
        raw, un-encoded value. Extra keys are ignored.

    Returns
    -------
    tuple
        ``(prediction, prob)`` where ``prob`` is column 1 of
        ``model_rf.predict_proba`` for this record and ``prediction`` is
        'Yes' when ``prob >= 0.5``, otherwise 'No'.

    Raises
    ------
    KeyError
        If ``input_data`` lacks one or more training features.
    ValueError
        If a categorical value was not present when its encoder was fitted.
    """
    # Report every missing field at once instead of failing on the first one
    missing = [col for col in X.columns if col not in input_data]
    if missing:
        raise KeyError(f"input_data is missing required field(s): {missing}")

    # Build a one-row frame holding exactly the training features, in the
    # training column order
    input_encoded = pd.DataFrame([{col: input_data[col] for col in X.columns}])

    # Apply log transformation to continuous variables
    for col in continuous_cols:
        input_encoded[col] = np.log1p(input_encoded[col])

    # Label encode categorical variables
    for col in categorical_cols:
        le = label_encoders[col]
        value = input_encoded[col].iloc[0]
        # An encoder can only map labels it saw during fit; name the offending
        # column and value so the caller can correct the input
        if value not in list(le.classes_):
            raise ValueError(
                f"Unknown value {value!r} for '{col}'; "
                f"expected one of {list(le.classes_)}"
            )
        input_encoded[col] = le.transform(input_encoded[col])

    # Predict probability and make the final prediction
    prob = model_rf.predict_proba(input_encoded)[0, 1]
    prediction = 'Yes' if prob >= 0.5 else 'No'
    return prediction, prob



Class distribution:
Risk_Flag
0    0.877
1    0.123
Name: proportion, dtype: float64

Random Forest Model Evaluation:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94     44259
           1       0.60      0.52      0.56      6141

    accuracy                           0.90     50400
   macro avg       0.77      0.74      0.75     50400
weighted avg       0.89      0.90      0.90     50400

Accuracy Score: 0.8994


In [None]:
# Try predict_loan_risk on one hand-written applicant record
sample_input = {
    # Numeric fields
    'Income': 4866477,
    'Age': 62,
    'Experience': 13,
    'CURRENT_JOB_YRS': 11,
    'CURRENT_HOUSE_YRS': 14,
    # Text fields
    'Married/Single': 'married',
    'House_Ownership': 'rented',
    'Car_Ownership': 'no',
    'Profession': 'Chef',
}

# The call returns a (label, probability) pair; show both
prediction, probability = predict_loan_risk(sample_input)
print(f"\nPredicted Loan Risk: {prediction}")
print(f"Probability of Yes: {probability:.4f}")


Predicted Loan Risk: No
Probability of Yes: 0.0838


    'Income': 6256451,
    'Age': 41,
    'Experience': 2,
    'Married/Single': 'single',
    'House_Ownership': 'rented',
    'Car_Ownership': 'yes',
    'Profession': 'Software_Developer',
    'CURRENT_JOB_YRS': 2,
    'CURRENT_HOUSE_YRS': 12


    'Income': 251994,
    'Age': 22,
    'Experience': 3,
    'Married/Single': 'single',
    'House_Ownership': 'rented',
    'Car_Ownership': 'no',
    'Profession': 'Mechanical_engineer',
    'CURRENT_JOB_YRS': 3,
    'CURRENT_HOUSE_YRS': 13