In [1]:
from credit_score.ml_logic.data import clean_data
from credit_score.ml_logic.preprocessor import preprocess_features, preprocess_target

import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("../raw_data/train.csv")

In [3]:
df.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [4]:
cleaned_df = clean_data(df)

🧹 Cleaning data ...
🔧 Data types changed
🔧 Credit_History_Age converted to months
🔧 Credit_Mix missing values re-assigned
🔧 Interpolating Credit_History_Age
🔧 Reassigning missing values for Age
🔧 Reassigning missing values for Annual_Income
🔧 Reassigning missing values for Monthly_Inhand_Salary
🔧 Reassigning missing values for Num_Bank_Accounts
🔧 Reassigning missing values for Num_Credit_Card
🔧 Reassigning missing values for Interest_Rate
🔧 Reassigning missing values for Num_of_Loan
🔧 Reassigning missing values for Delay_from_due_date
🔧 Reassigning missing values for Num_of_Delayed_Payment
🔧 Reassigning missing values for Changed_Credit_Limit
🔧 Reassigning missing values for Num_Credit_Inquiries
🔧 Reassigning missing values for Outstanding_Debt
🔧 Reassigning missing values for Monthly_Balance
🔧 Compressing DataFrame
✅ Data cleaned
(100000, 28)


In [5]:
columns_to_drop = ['ID', 'Customer_ID', 'Month', 'Name', 'SSN', 'Type_of_Loan', 
                   'Total_EMI_per_month', 'Amount_invested_monthly', 'Credit_Utilization_Ratio', 
                   'Payment_Behaviour', 'Occupation', 'Credit_Score']

In [6]:
X = cleaned_df.drop(columns=columns_to_drop)
y = cleaned_df['Credit_Score']

In [7]:
X_processed = preprocess_features(X)


🛠️ Preprocessing features ...

Processing column: ['Payment_of_Min_Amount', 'Credit_Mix'] with ordinal encoding ...

Processing column: ['Age', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Num_Credit_Inquiries', 'Num_of_Delayed_Payment'] with min-max-scaler ...

Processing column: ['Changed_Credit_Limit', 'Credit_History_Age'] with standard-scaler ...

Processing column: ['Annual_Income', 'Monthly_Inhand_Salary', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Outstanding_Debt', 'Monthly_Balance'] with robust-scaler ...

✅ X_processed, with shape (100000, 16)


In [8]:
y_processed = preprocess_target(y)


🛠️ Preprocessing target ...
✅ y_encoded, with shape (100000,)
Classes: ['Good' 'Poor' 'Standard']


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.3, random_state=42)

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],  # Number of trees in the forest
    'max_depth': [2, 3],  # Maximum depth of the tree
    'class_weight': ['balanced'],  # Weights associated with classes
    'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
    'min_impurity_decrease': [0]
}

rf_classifier = RandomForestClassifier()
grid_search_rfc = GridSearchCV(rf_classifier, param_grid, cv = 5, scoring = 'accuracy', n_jobs = -1)
grid_search_rfc.fit(X_train, y_train)

In [21]:
grid_search_rfc.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 3,
 'min_impurity_decrease': 0,
 'n_estimators': 200}

In [22]:
grid_search_rfc.best_score_

0.6685428571428572

In [23]:
# from sklearn.svm import SVC

# param_grid = {'C': [0.1, 1, 10, 100], 
#               'gamma': [10, 1, 0.1, 0.01], 
#               'kernel': ['rbf', 'sigmoid', 'poly']
#               }

# svc = SVC()
# grid_search_svc = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5)
# grid_search_svc.fit(X_train, y_train)

In [None]:
# print("Best Parameters: ", grid_search_svc.best_params_)
# print("Best Score: ", grid_search_svc.best_score_)