# Import Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import PowerTransformer
import pickle

# Read Data

In [None]:
# Load the credit-score CSV from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="credit_score_data.csv")
# Pass None as the column-display limit so previews are not truncated.
pd.set_option("display.max.columns", None)
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score,Annual-cat,history-age-cat,age_cat,Auto Loan,Credit-Builder Loan,Personal Loan,Not Specified,No Data,Debt Consolidation Loan,Payday Loan,Student Loan,Home Equity Loan,Mortgage Loan,delay_by_customer_mean,monthlyBalance_by_customer_mean
0,3392,1,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3.0,7.0,11.27,4.0,Good,809.98,26.82262,265.0,No,49.574949,21.46538,High_spent_Small_value_payments,312.494089,Good,Lower-Class,greater_than_10 years,18-23,1,1,1,0,0,0,0,0,1,0,4.25,304.555294
1,3392,2,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3.0,4.0,11.27,4.0,Good,809.98,31.94496,266.0,No,49.574949,21.46538,Low_spent_Large_value_payments,284.629162,Good,Lower-Class,greater_than_10 years,18-23,1,1,1,0,0,0,0,0,1,0,4.25,304.555294
2,3392,3,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3.0,7.0,11.27,4.0,Good,809.98,28.609352,267.0,No,49.574949,21.46538,Low_spent_Medium_value_payments,331.209863,Good,Lower-Class,greater_than_10 years,18-23,1,1,1,0,0,0,0,0,1,0,4.25,304.555294
3,3392,4,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5.0,4.0,6.27,4.0,Good,809.98,31.377862,268.0,No,49.574949,21.46538,Low_spent_Small_value_payments,223.45131,Good,Lower-Class,greater_than_10 years,18-23,1,1,1,0,0,0,0,0,1,0,4.25,304.555294
4,3392,5,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6.0,4.0,11.27,4.0,Good,809.98,24.797347,269.0,No,49.574949,21.46538,High_spent_Medium_value_payments,341.489231,Good,Lower-Class,greater_than_10 years,18-23,1,1,1,0,0,0,0,0,1,0,4.25,304.555294


# Encoding

In [None]:
def get_dummy(data, col_name):
    """Replace a categorical column with its one-hot indicator columns.

    The first category level is dropped (``drop_first=True``) and every
    indicator column is prefixed with ``'_'``. The input frame itself is not
    modified; a new frame is returned with ``col_name`` removed and the
    indicator columns appended on the right.
    """
    indicators = pd.get_dummies(data[col_name], drop_first=True, prefix='_')
    remaining = data.drop(columns=col_name)
    return pd.concat([remaining, indicators], axis=1)

In [None]:
# One-hot encode the nominal columns, one after the other.
for nominal_col in ('Occupation', 'Payment_Behaviour'):
    data = get_dummy(data, nominal_col)

In [None]:
def leve_mapping(data, col_name, dic):
    """Recode the values of one column according to a lookup dictionary.

    Values found as keys in ``dic`` are replaced by the mapped value; other
    values are left as they are. The column is overwritten on the frame that
    was passed in, and that same frame is returned.
    """
    recoded = data[col_name].replace(dic)
    data[col_name] = recoded
    return data

In [None]:
# Ordinal columns and the integer code assigned to each of their levels.
ordinal_levels = {
    'Credit_Mix': {'Good':2,'Standard':1,'Bad':0},
    'Credit_Score': {'Good':2,'Standard':1,'Poor':0},
    'Payment_of_Min_Amount': {'No':2,'NM':1,'Yes':0},
}
for ordinal_col, level_codes in ordinal_levels.items():
    data = leve_mapping(data, ordinal_col, level_codes)

# processing age 

**Ages below 18 are set to 18.**

In [None]:
# Raise every age below 18 up to 18; other ages are left untouched.
data["Age"] = data["Age"].clip(lower=18)

# Drop irrelevant columns

In [None]:
# Columns excluded from modelling.
irrelevant_columns = [
    'Customer_ID', 'Name', 'SSN', 'Monthly_Inhand_Salary', 'Type_of_Loan',
    'Annual-cat', 'history-age-cat', 'age_cat',
]
data = data.drop(columns=irrelevant_columns)

# Select one month of data for every customer for training

In [None]:
# Keep only the rows whose Month equals 1.
is_first_month = data['Month'].eq(1)
data_month = data.loc[is_first_month]
# Target is the encoded credit score; features are every other column except Month.
y = data_month['Credit_Score']
X = data_month.drop(columns=['Credit_Score','Month'])

# Feature Selection

In [None]:
# Keep the feature column labels so positional indices can be mapped back to names.
column=X.columns

**Using mutual_info_classif**

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif


def _seeded_mutual_info(features, target):
    """Score features by mutual information with a fixed random seed.

    mutual_info_classif uses random noise internally, so without a fixed
    random_state the set of selected features can differ between runs.
    """
    return mutual_info_classif(features, target, random_state=0)


# Only the fitted selector is needed to list the chosen features, so fit()
# replaces fit_transform(), whose transformed matrix was being discarded.
kbest = SelectKBest(_seeded_mutual_info, k=17)
kbest.fit(X, y)
kbest.get_feature_names_out()

array(['Annual_Income', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_History_Age', 'Payment_of_Min_Amount',
       'Amount_invested_monthly', 'Monthly_Balance',
       'delay_by_customer_mean', 'monthlyBalance_by_customer_mean'],
      dtype=object)

**Using chi2**

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Rank the features with the chi-squared score and keep the best 17.
kbest = SelectKBest(score_func=chi2, k=17)
kbest.fit(X, y)
kbest.get_feature_names_out()

array(['Annual_Income', 'Num_Bank_Accounts', 'Interest_Rate',
       'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment',
       'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Credit_Mix',
       'Outstanding_Debt', 'Credit_History_Age', 'Payment_of_Min_Amount',
       'Total_EMI_per_month', 'Amount_invested_monthly',
       'Monthly_Balance', 'delay_by_customer_mean',
       'monthlyBalance_by_customer_mean'], dtype=object)

**Using f_regression — we chose this scoring function because it achieved the best model results**

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Keep the 17 best-scoring features according to f_regression.
selected_features = SelectKBest(f_regression, k=17).fit(X, y)

# Collect and print the name and p-value of each selected feature.
# 'Credit_Mix' is skipped, so `names` ends up with one entry fewer than
# the number of features the selector kept.
names = []
for position in selected_features.get_support(indices=True):
    feature_name = column[position]
    if feature_name != 'Credit_Mix':
        names.append(feature_name)
        print(f"{feature_name}: {selected_features.pvalues_[position]}")

Annual_Income: 5.29851835344353e-163
Num_Bank_Accounts: 0.0
Num_Credit_Card: 0.0
Interest_Rate: 0.0
Num_of_Loan: 0.0
Delay_from_due_date: 0.0
Num_of_Delayed_Payment: 0.0
Changed_Credit_Limit: 4.520970154899447e-100
Num_Credit_Inquiries: 0.0
Outstanding_Debt: 0.0
Credit_History_Age: 0.0
Payment_of_Min_Amount: 0.0
Amount_invested_monthly: 4.562457108482512e-109
Monthly_Balance: 5.999278540491398e-162
delay_by_customer_mean: 0.0
monthlyBalance_by_customer_mean: 8.467272973303519e-219


In [None]:
# Turn the collected feature names into a NumPy array and display it.
names = np.asarray(names)
names

array(['Annual_Income', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Outstanding_Debt', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Amount_invested_monthly',
       'Monthly_Balance', 'delay_by_customer_mean',
       'monthlyBalance_by_customer_mean'], dtype='<U31')

# Split Data after feature selection

In [None]:
# Hold out 20% of the rows for testing; the split is stratified on the
# target and uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [None]:
# Restrict the training features to the selected columns.
X_train_new = X_train[names]

In [None]:
# Restrict the test features to the same selected columns.
X_test_new = X_test[names]

# Standardization

In [None]:
# Yeo-Johnson power transform with standardization of the transformed output.
scalar = PowerTransformer(method='yeo-johnson', standardize=True)
# Learn the transform parameters from the training split only.
X_train_scaled = scalar.fit_transform(X_train_new)
# Apply the training-fitted transform to the test split. Re-fitting on the
# test data (as this cell did before) scales it differently from the data
# the model was trained on and from a scaler fitted on the training split.
X_test_scaled = scalar.transform(X_test_new)

In [None]:
# Separate scaler instance, fitted on the training features only.
scalar_save = PowerTransformer(method='yeo-johnson', standardize=True)
scalar_save.fit(X_train_new)

# Models

In [None]:
# Stratified 3-fold splitter.
cv = StratifiedKFold(n_splits=3)

In [None]:
class Model:
    """Bundle a train/test split with an estimator and its evaluation.

    ``self.model`` starts as ``None``; a builder method such as ``gbM``
    assigns the estimator before ``fit`` and ``predict`` are used.
    """

    def __init__(self, X_train, X_test, y_train, y_test):
        """Store the four pieces of the train/test split."""
        self.X_train, self.y_train = X_train, y_train
        self.X_test, self.y_test = X_test, y_test
        self.model = None

    def fit(self):
        """Fit ``self.model`` on the training split and return what ``fit`` returns."""
        estimator = self.model
        return estimator.fit(self.X_train, self.y_train)

    def predict(self):
        """Return the predictions of ``self.model`` for the test split."""
        estimator = self.model
        return estimator.predict(self.X_test)

    def classification_Report(self, y_pred):
        """Return the classification report of ``y_pred`` against the test labels."""
        return classification_report(self.y_test, y_pred)

    def gbM(self):
        """Train and evaluate a gradient boosting classifier (``random_state=0``).

        Returns a tuple ``(report, model, y_pred)``. The original author
        noted a score of 0.764 for this model.
        """
        self.model = GradientBoostingClassifier(random_state=0)
        self.fit()
        predictions = self.predict()
        return self.classification_Report(predictions), self.model, predictions

# Calling class

In [None]:
# Train and evaluate the baseline gradient boosting model on the scaled split.
gradient = Model(
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
report, model, y_pred = gradient.gbM()
# Show the test-set report followed by the estimator's repr.
print(report, model, sep='\n')

              precision    recall  f1-score   support

           0       0.78      0.72      0.75       716
           1       0.79      0.82      0.80      1389
           2       0.66      0.67      0.66       395

    accuracy                           0.77      2500
   macro avg       0.74      0.73      0.74      2500
weighted avg       0.77      0.77      0.77      2500

GradientBoostingClassifier(random_state=0)


In [None]:
# Confusion matrix of the current y_pred against the test labels.
# y_pred is reassigned by later cells, so run this right after the baseline fit.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 513,  177,   26],
       [ 141, 1138,  110],
       [   1,  130,  264]], dtype=int64)

# Classification Report for training 

In [None]:
# Score the baseline model on the training data it was fitted on.
y_pred = model.predict(X_train_scaled)
train_report = classification_report(y_true=y_train, y_pred=y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.83      0.74      0.78      2866
           1       0.81      0.85      0.83      5554
           2       0.69      0.68      0.69      1580

    accuracy                           0.79     10000
   macro avg       0.78      0.76      0.77     10000
weighted avg       0.80      0.79      0.79     10000



In [None]:
# Importances of the baseline model, rounded to 4 decimals and labelled by feature name.
feature_importances = np.round(model.feature_importances_, decimals=4)
df = pd.Series(feature_importances, index=names, name='importance').to_frame()
# Show the most important features first.
df.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
Outstanding_Debt,0.3207
Interest_Rate,0.1949
Payment_of_Min_Amount,0.1569
Num_Credit_Card,0.0755
Changed_Credit_Limit,0.0563
Delay_from_due_date,0.0428
Num_Credit_Inquiries,0.0371
Num_of_Delayed_Payment,0.034
Num_Bank_Accounts,0.0281
Credit_History_Age,0.0133


# Fine Tuning

In [None]:
def randomSearch(X_train,y_train,estimator, parameters, cv,
                 n_iter=300, scoring='accuracy', n_jobs=-1, random_state=0):
    """Run a randomized hyperparameter search and return the fitted search object.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    estimator : the estimator to tune.
    parameters : passed to ``RandomizedSearchCV`` as ``param_distributions``.
    cv : cross-validation splitter or fold count.
    n_iter : number of sampled parameter settings (default 300, as before).
    scoring : metric used to rank candidates (default 'accuracy', as before).
    n_jobs : parallel jobs (default -1, as before).
    random_state : seed for candidate sampling (default 0, as before).

    Returns
    -------
    The ``RandomizedSearchCV`` instance after ``fit`` has been called.
    """
    random_search = RandomizedSearchCV(
        estimator,
        param_distributions=parameters,
        cv=cv,
        scoring=scoring,
        n_iter=n_iter,
        n_jobs=n_jobs,
        random_state=random_state,
    )
    random_search.fit(X_train, y_train)
    return random_search

**Hyperparameter fine-tuning for the gradient boosting model**

In [None]:
# Create the parameter grid
# 'auto' is intentionally absent from max_features: for this classifier it
# was an alias of 'sqrt', has been deprecated, and is rejected by newer
# scikit-learn releases, where every sampled candidate using it fails to fit.
param_grid = {
    'n_estimators': [100, 200,300,400,500],
    'learning_rate': [0.01,0.05,0.1],
    'criterion': ['friedman_mse','squared_error'],
    'max_features': [None,'sqrt','log2'],
    'max_depth':[3,4,6,8]}

# Randomized search over the grid using the scaled training data and the
# cross-validation splitter `cv`.
gbm = GradientBoostingClassifier(random_state=0)
gbm_search=randomSearch(X_train_scaled,y_train,gbm,param_grid,cv)

# Best candidate found by the search, its parameters, and its mean CV score.
best_estimator=gbm_search.best_estimator_
best_params= gbm_search.best_params_
best_score=gbm_search.best_score_ 

print(best_estimator)
print(best_params)
print(best_score)

GradientBoostingClassifier(criterion='squared_error', learning_rate=0.01,
                           max_depth=6, max_features='log2', n_estimators=300,
                           random_state=0)
{'n_estimators': 300, 'max_features': 'log2', 'max_depth': 6, 'learning_rate': 0.01, 'criterion': 'squared_error'}
0.7615008051495012


# Model After fine Tuning on testing data

In [None]:
# Score the fitted search object on the held-out test split.
y_pred = gbm_search.predict(X_test_scaled)
tuned_test_report = classification_report(y_true=y_test, y_pred=y_pred)
print(tuned_test_report)

              precision    recall  f1-score   support

           0       0.80      0.70      0.74       716
           1       0.78      0.83      0.80      1389
           2       0.65      0.65      0.65       395

    accuracy                           0.76      2500
   macro avg       0.74      0.73      0.73      2500
weighted avg       0.76      0.76      0.76      2500



# Model After fine Tuning on training data

In [None]:
# Score the fitted search object on the training split.
y_pred = gbm_search.predict(X_train_scaled)
tuned_train_report = classification_report(y_true=y_train, y_pred=y_pred)
print(tuned_train_report)

              precision    recall  f1-score   support

           0       0.86      0.76      0.81      2866
           1       0.83      0.87      0.85      5554
           2       0.72      0.74      0.73      1580

    accuracy                           0.82     10000
   macro avg       0.80      0.79      0.80     10000
weighted avg       0.82      0.82      0.82     10000



# Saving model to disk

In [None]:
# Persist the training-fitted scaler. The context manager closes the file
# handle even if pickling raises; the bare open() call never closed it.
with open('final_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scalar_save, scaler_file)

In [None]:
# Saving model to disk
# NOTE(review): this pickles `model`, the baseline classifier returned by
# gbM(), not the tuned `gbm_search.best_estimator_` — confirm that is intended.
# The context manager closes the file handle even if pickling raises.
with open('final_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)