In [1]:
import pandas as pd
from Methods import Methods
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from RandomForest import RandomForest
import pickle
from sklearn.model_selection import train_test_split
import numpy as np

# Feature Engineering

In [2]:
# Start again from the encoded dataset that still contains the outliers
df = pd.read_csv(filepath_or_buffer = "encoded_final.csv")

In [3]:
# These variables showed high correlations, so we create interaction columns to capture those trends more explicitly.
# Interaction 1: Annual_Income * Monthly_Inhand_Salary
df['Annual_Income_Monthly_Inhand'] = df['Annual_Income'] * df['Monthly_Inhand_Salary']

# Interaction 2: Annual_Income / Amount_invested_monthly
# A zero denominator would turn the ratio into +/-inf, which scikit-learn estimators reject.
# Mask zeros as NaN first so an undefined ratio is stored as a missing value instead,
# which the usual pandas tools (isna / fillna / dropna) can detect and handle.
df['Annual_Income_to_Investment_Ratio'] = df['Annual_Income'] / df['Amount_invested_monthly'].replace(0, np.nan)

# Interaction 3: Interest_Rate * Num_Credit_Inquiries
df['Interest_Rate_Credit_Inquiries'] = df['Interest_Rate'] * df['Num_Credit_Inquiries']

In [4]:
# Preview the first ten rows to confirm the interaction columns were appended
df.head(n = 10)

Unnamed: 0,Customer_ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,...,Credit_Mix_Good,Credit_Mix_Standard,Payment_of_Min_Amount_NM,Payment_of_Min_Amount_No,Payment_of_Min_Amount_Yes,Payment_Behaviour_encoded,Credit_Score_encoded,Annual_Income_Monthly_Inhand,Annual_Income_to_Investment_Ratio,Interest_Rate_Credit_Inquiries
0,3392,1,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,1,0,0,1,0,3,2,34880270.0,890.462678,12.0
1,3392,2,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,1,0,0,1,0,2,2,34880270.0,890.462678,12.0
2,3392,3,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,1,0,0,1,0,1,2,34880270.0,890.462678,12.0
3,3392,4,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,1,0,0,1,0,0,2,34880270.0,890.462678,12.0
4,3392,5,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,1,0,0,1,0,4,2,34880270.0,890.462678,12.0
5,3392,6,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,1,0,0,1,0,4,2,34880270.0,890.462678,12.0
6,3392,7,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,1,0,0,1,0,0,2,34880270.0,890.462678,12.0
7,3392,8,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,1,0,0,1,0,4,1,34880270.0,890.462678,12.0
8,8625,1,28.0,6672,34847.84,3037.986667,2.0,4.0,6.0,1.0,...,1,0,0,1,0,0,1,105867300.0,878.132845,12.0
9,8625,2,28.0,6672,34847.84,3037.986667,2.0,4.0,6.0,1.0,...,1,0,0,1,0,5,2,105867300.0,878.132845,12.0


In [5]:
# Group-level aggregate features
# NOTE(review): these statistics are computed on the full DataFrame, i.e. before any
# train/validation/test split further down — confirm that this is intended.

# Per-'Occupation' mean, median and standard deviation of 'Monthly_Inhand_Salary'
occupation_salary_stats = (
    df.groupby('Occupation')['Monthly_Inhand_Salary']
    .agg(['mean', 'median', 'std'])
    .rename(columns = {
        'mean': 'Occupation_MonthlySalary_Mean',
        'median': 'Occupation_MonthlySalary_Median',
        'std': 'Occupation_MonthlySalary_Std',
    })
)

# Mean 'Annual_Income' over all rows that share the same 'Num_Bank_Accounts' value
income_by_bank_accounts = (
    df.groupby('Num_Bank_Accounts')['Annual_Income']
    .mean()
    .reset_index()
    .rename(columns = {'Annual_Income': 'Avg_Annual_Income_by_Bank_Accounts'})
)

# Left-join both lookup tables onto the main DataFrame (every original row is kept)
df = (
    df.merge(occupation_salary_stats, on = 'Occupation', how = 'left')
    .merge(income_by_bank_accounts, on = 'Num_Bank_Accounts', how = 'left')
)

In [6]:
# Preview the first ten rows to confirm the aggregate columns were merged in
df.head(n = 10)

Unnamed: 0,Customer_ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,...,Payment_of_Min_Amount_Yes,Payment_Behaviour_encoded,Credit_Score_encoded,Annual_Income_Monthly_Inhand,Annual_Income_to_Investment_Ratio,Interest_Rate_Credit_Inquiries,Occupation_MonthlySalary_Mean,Occupation_MonthlySalary_Median,Occupation_MonthlySalary_Std,Avg_Annual_Income_by_Bank_Accounts
0,3392,1,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,0,3,2,34880270.0,890.462678,12.0,4233.648483,3235.254167,3172.708354,57785.393603
1,3392,2,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,0,2,2,34880270.0,890.462678,12.0,4233.648483,3235.254167,3172.708354,57785.393603
2,3392,3,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,0,1,2,34880270.0,890.462678,12.0,4233.648483,3235.254167,3172.708354,57785.393603
3,3392,4,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,0,0,2,34880270.0,890.462678,12.0,4233.648483,3235.254167,3172.708354,57785.393603
4,3392,5,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,0,4,2,34880270.0,890.462678,12.0,4233.648483,3235.254167,3172.708354,57785.393603
5,3392,6,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,0,4,2,34880270.0,890.462678,12.0,4233.648483,3235.254167,3172.708354,57785.393603
6,3392,7,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,0,0,2,34880270.0,890.462678,12.0,4233.648483,3235.254167,3172.708354,57785.393603
7,3392,8,23.0,6744,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,0,4,1,34880270.0,890.462678,12.0,4233.648483,3235.254167,3172.708354,57785.393603
8,8625,1,28.0,6672,34847.84,3037.986667,2.0,4.0,6.0,1.0,...,0,0,1,105867300.0,878.132845,12.0,4145.441555,3018.723333,3204.884553,73272.127454
9,8625,2,28.0,6672,34847.84,3037.986667,2.0,4.0,6.0,1.0,...,0,5,2,105867300.0,878.132845,12.0,4145.441555,3018.723333,3204.884553,73272.127454


In [7]:
# Grouping customers by 'Age'
# Bins and labels
# Intervals are left-closed, right-open ([low, high)) because right = False is passed below.
# The last edge is open-ended so that '65+' really covers every age >= 65; with a hard
# upper edge of 100, an age of 100 or more would fall outside all bins and become NaN.
age_bins = [0, 25, 35, 45, 55, 65, np.inf]
age_labels = ['<25', '25-34', '35-44', '45-54', '55-64', '65+']

# New column 'Age_Group' by categorizing 'Age' into age bins
df['Age_Group'] = pd.cut(df['Age'], bins = age_bins, labels = age_labels, right = False)

In [8]:
# Manual ordinal encoding of 'Age_Group': youngest bracket -> 0, oldest bracket -> 5
age_groups = dict(zip(['<25', '25-34', '35-44', '45-54', '55-64', '65+'], range(6)))

df['Age_Group_encoded'] = df['Age_Group'].map(age_groups)

In [9]:
# 'Age_Group_encoded' now carries this information, so remove the label column
df = df.drop(columns = ['Age_Group'])

In [10]:
# Feature cross: combine several features into one column to surface joint patterns.
# Here the product of 'Annual_Income', 'Num_Bank_Accounts' and 'Num_Credit_Card' is used;
# such a composite may capture relationships between the three variables that help classification.

df['Income_Bank_Accounts_Credit_Card'] = (
    df['Annual_Income']
    .mul(df['Num_Bank_Accounts'])
    .mul(df['Num_Credit_Card'])
)

In [11]:
# Persist the engineered features; the row index is not written to the file
df.to_csv(path_or_buf = "After_Engineering.csv", index = False)

# Random Forest on Engineered Data

In [12]:
# Reload the engineered dataset written by the feature-engineering section
Engineered = pd.read_csv(filepath_or_buffer = "After_Engineering.csv")

In [13]:
# Remove the income-to-investment ratio before modelling
# (the notebook's note further down explains why this column is excluded)
Engineered = Engineered.drop(columns = ['Annual_Income_to_Investment_Ratio'])

In [14]:
# Separate the target from the features, then split 80 / 10 / 10 into
# train / validation / test (20% is held out first and then halved).
# NOTE(review): 'Customer_ID' stays in the feature matrix and each customer has several
# monthly rows, so a row-level random split can place the same customer in more than one
# split — confirm this is acceptable, or consider a group-aware split (e.g. GroupShuffleSplit).
target_column = 'Credit_Score_encoded'
y_eng = Engineered[target_column]
X_eng = Engineered.drop(target_column, axis = 1)

X_train_eng, X_temp_eng, y_train_eng, y_temp_eng = train_test_split(
    X_eng, y_eng, test_size = 0.2, random_state = 42
)
X_val_eng, X_test_eng, y_val_eng, y_test_eng = train_test_split(
    X_temp_eng, y_temp_eng, test_size = 0.5, random_state = 42
)

In [45]:
# Count infinite entries per feature in the training split
np.isinf(X_train_eng).sum(axis = 0)

Customer_ID                                0
Month                                      0
Age                                        0
Occupation                                 0
Annual_Income                              0
Monthly_Inhand_Salary                      0
Num_Bank_Accounts                          0
Num_Credit_Card                            0
Interest_Rate                              0
Num_of_Loan                                0
Delay_from_due_date                        0
Num_of_Delayed_Payment                     0
Changed_Credit_Limit                       0
Num_Credit_Inquiries                       0
Outstanding_Debt                           0
Credit_Utilization_Ratio                   0
Credit_History_Age                         0
Total_EMI_per_month                        0
Amount_invested_monthly                    0
Monthly_Balance                            0
Auto Loan                                  0
Credit-Builder Loan                        0
Personal L

In [46]:
# Count infinite entries per feature in the validation split
np.isinf(X_val_eng).sum(axis = 0)

Customer_ID                               0
Month                                     0
Age                                       0
Occupation                                0
Annual_Income                             0
Monthly_Inhand_Salary                     0
Num_Bank_Accounts                         0
Num_Credit_Card                           0
Interest_Rate                             0
Num_of_Loan                               0
Delay_from_due_date                       0
Num_of_Delayed_Payment                    0
Changed_Credit_Limit                      0
Num_Credit_Inquiries                      0
Outstanding_Debt                          0
Credit_Utilization_Ratio                  0
Credit_History_Age                        0
Total_EMI_per_month                       0
Amount_invested_monthly                   0
Monthly_Balance                           0
Auto Loan                                 0
Credit-Builder Loan                       0
Personal Loan                   

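- The 'Annual_Income_to_Investment_Ratio' column is dropped because it is not finite for every row (the ratio is undefined where 'Amount_invested_monthly' is 0).
- The drop is performed in an earlier cell, before the data is split.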

In [15]:
# Random Forest configured with the best hyperparameters found in the preceding steps
rf_best_params = {
    'max_depth': None,
    'max_features': 'sqrt',
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 300,
    'random_state': 42,
}
Random_Forest_eng = RandomForestClassifier(**rf_best_params)
Random_Forest_eng.fit(X_train_eng, y_train_eng)

# Predict on both held-out splits
pred_val_eng = Random_Forest_eng.predict(X_val_eng)
pred_test_eng = Random_Forest_eng.predict(X_test_eng)

# Validation metrics: overall accuracy and per-class report
accuracy_val_eng = accuracy_score(y_val_eng, pred_val_eng)
report_val_eng = classification_report(y_val_eng, pred_val_eng)

# Test metrics: overall accuracy and per-class report
accuracy_test_eng = accuracy_score(y_test_eng, pred_test_eng)
report_test_eng = classification_report(y_test_eng, pred_test_eng)

# Print the validation results first, then the test results, separated by rules
results_to_print = [
    f"Random Forest Classifier's validation accuracy is {accuracy_val_eng}",
    "-"*70,
    f"Random Forest Classifier's validation classification report is: \n {report_val_eng}",
    "="*100,
    f"Random Forest Classifier's testing accuracy is {accuracy_test_eng}",
    "-"*70,
    f"Random Forest Classifier's testing classification report is: \n {report_test_eng}",
]
for result_line in results_to_print:
    print(result_line)

Random Forest Classifier's validation accuracy is 0.8388
----------------------------------------------------------------------
Random Forest Classifier's validation classification report is: 
               precision    recall  f1-score   support

           0       0.82      0.87      0.84      2894
           1       0.86      0.83      0.85      5317
           2       0.81      0.81      0.81      1789

    accuracy                           0.84     10000
   macro avg       0.83      0.84      0.83     10000
weighted avg       0.84      0.84      0.84     10000

Random Forest Classifier's testing accuracy is 0.8424
----------------------------------------------------------------------
Random Forest Classifier's testing classification report is: 
               precision    recall  f1-score   support

           0       0.82      0.87      0.85      2980
           1       0.86      0.83      0.85      5282
           2       0.82      0.81      0.81      1738

    accuracy       

In [16]:
# Baseline run through the project's RandomForest helper class.
# NOTE(review): per the original comment this runs the model with default parameters;
# the helper's implementation is not visible here — confirm in the RandomForest module.
engineered_splits = (X_train_eng, y_train_eng, X_val_eng, y_val_eng, X_test_eng, y_test_eng)
RF = RandomForest(*engineered_splits)
# Keep whatever Random_Forest_Classic returns for later use
model = RF.Random_Forest_Classic()

Random Forest Classifier's validation accuracy is 0.8367
----------------------------------------------------------------------
Random Forest Classifier's validation classification report is: 
               precision    recall  f1-score   support

           0       0.81      0.87      0.84      2894
           1       0.86      0.83      0.84      5317
           2       0.82      0.81      0.81      1789

    accuracy                           0.84     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.84      0.84      0.84     10000

Random Forest Classifier's testing accuracy is 0.8409
----------------------------------------------------------------------
Random Forest Classifier's testing classification report is: 
               precision    recall  f1-score   support

           0       0.82      0.87      0.85      2980
           1       0.86      0.83      0.85      5282
           2       0.82      0.81      0.81      1738

    accuracy       

In [17]:
# Saving the modelling dataset. 'Annual_Income_to_Investment_Ratio' was already dropped
# from `Engineered` in an earlier cell, so the saved file does NOT contain that column.
Engineered.to_csv(path_or_buf = "Engineered_final.csv", index = False)