---
**<center><h1>Credit score classification</h1></center>**
<center><h3>Learning ML, DL through 100 Practical Projects</h3></center>

---

This project develops a machine learning model that classifies credit scores from a person's credit-related information. The goal is to help a global finance company automatically sort individuals into credit score brackets, reducing manual effort and improving efficiency.

# **Import Libraries and Data**
---

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Notebook-wide settings: silence every warning and let pandas display all columns.
warnings.simplefilter('ignore')
pd.options.display.max_columns = None

In [None]:
!pip install category-encoders

In [None]:
from category_encoders import BinaryEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d parisrohan/credit-score-classification
!unzip credit-score-classification.zip

In [None]:
# read_csv already returns a fresh DataFrame, so the extra .copy() only
# duplicated the data in memory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# **Take a look at the data**
---

In [None]:
# (rows, columns) of the train frame, then of the test frame.
for frame in (df_train, df_test):
    print(frame.shape)

In [None]:
# First five rows of the training data.
df_train.head(n=5)

In [None]:
# Last five rows of the training data.
df_train.tail(n=5)

In [None]:
# Column names, non-null counts and dtypes of the training frame.
df_train.info()

In [None]:
# Summary statistics of the training frame (pandas' default column selection).
df_train.describe()

In [None]:
# count / unique / top / freq for the object-typed columns.
df_train.describe(include=['object'])

In [None]:
# Class balance of the target column.
df_train.loc[:, 'Credit_Score'].value_counts()

# **Exploratory Data Analysis**
---

**Univariate Analysis**

In [None]:
# Share of each Credit_Score class; the counts are computed once and reused for the labels.
score_counts = df_train['Credit_Score'].value_counts()
plt.pie(score_counts, labels=score_counts.index, autopct='%1.1f%%')
plt.title('Credit Score')
plt.show()

In [None]:
# Histogram of every numeric column; `numerical_data` is reused by the box plot cell.
numerical_data = df_train.select_dtypes(include=['number'])
numerical_data.hist(figsize=(10, 8))
plt.show()

In [None]:
# One box per numeric column, drawn on a 10x8 figure with slanted x labels.
plt.figure(figsize=(10, 8))
sns.boxplot(data=numerical_data)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Count plots for the non-numeric columns.
# This cell runs before the identifier columns (ID, Customer_ID, Name, SSN, ...)
# are dropped, and a count plot draws one bar per distinct value. Columns with
# more than MAX_COUNTPLOT_CATEGORIES distinct values are reported and skipped.
MAX_COUNTPLOT_CATEGORIES = 30

categorical_data = df_train.select_dtypes(exclude='number')
for column in categorical_data.columns:
    n_unique = categorical_data[column].nunique()
    if n_unique > MAX_COUNTPLOT_CATEGORIES:
        print(f"Skipping countplot of {column}: {n_unique} distinct values")
        continue
    sns.countplot(data=categorical_data, x=column, palette="Set1")
    plt.title(f"Countplot of {column}")
    plt.show()

**Multivariate Analysis**

In [None]:
# Pairwise scatter plots; corner=True draws only the lower triangle.
sns.pairplot(data=df_train, corner=True)
plt.show()

In [None]:
# Correlation matrix of the non-object columns, annotated with coefficients.
plt.figure(figsize=(10, 8))
corr = df_train.select_dtypes(exclude=['object']).corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

# **Data Cleaning**
---

**Drop Unnecessary Columns**

In [None]:
# Remove the same set of columns from both frames, in place.
columns_to_drop = ['ID', 'Customer_ID', 'Name', 'SSN', 'Occupation', 'Payment_Behaviour', 'Type_of_Loan']
for frame in (df_train, df_test):
    frame.drop(columns=columns_to_drop, inplace=True)

**Handling Duplicate Rows**

In [None]:
# Count fully duplicated rows in each frame before reporting them.
train_duplicates = df_train.duplicated().sum()
test_duplicates = df_test.duplicated().sum()
print(f"Number of duplicate rows in df_train: {train_duplicates}")
print(f"Number of duplicate rows in df_test: {test_duplicates}")


In [None]:
# Keep only the first occurrence of each fully duplicated row in both frames.
df_train, df_test = (frame.drop_duplicates() for frame in (df_train, df_test))

**Formatting**

In [None]:
# Cell values that are exactly one of these tokens are turned into missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
placeholder_values = ['', 'nan', '!@9#%8', '#F%$D@*&8', '_', 'NM']
df_train = df_train.replace(placeholder_values, np.nan)
df_test = df_test.replace(placeholder_values, np.nan)

In [None]:
def convert_rating(num):
    """Convert a raw cell value to a float, ignoring underscore characters.

    The value is stringified, every '_' is removed, and the remainder is
    parsed with float().

    Args:
        num: Any value (string, number, None, NaN, ...).

    Returns:
        The parsed float, or NaN when the cleaned text is not a valid number.
    """
    try:
        return float(str(num).replace('_', ''))
    except (TypeError, ValueError):
        # Only parsing failures become NaN; np.nan replaces the np.NaN alias
        # that was removed in NumPy 2.0.
        return np.nan

In [None]:
# Columns that are parsed to float with convert_rating in both frames.
columns_to_convert = [
    'Amount_invested_monthly',
    'Monthly_Balance',
    'Num_of_Delayed_Payment',
    'Monthly_Inhand_Salary',
    'Num_Credit_Inquiries',
    'Age',
    'Annual_Income',
    'Num_of_Loan',
    'Changed_Credit_Limit',
    'Outstanding_Debt',
]

for frame in (df_train, df_test):
    for col in columns_to_convert:
        frame[col] = frame[col].map(convert_rating)


In [None]:
import re

def convert_credit_history(age_str):
    """Convert text such as '22 Years and 1 Months' into a number of months.

    Args:
        age_str: Text containing '<n> Years' and '<m> Months'; any other
            value (NaN, None, non-string) is treated as missing.

    Returns:
        years * 12 + months as an int, or NaN when the input is not a string
        or either part cannot be found.
    """
    # Explicit checks replace the former bare `except:`, which hid every error.
    if not isinstance(age_str, str):
        return np.nan
    years_match = re.search(r'(\d+) Years', age_str)
    months_match = re.search(r'(\d+) Months', age_str)
    if years_match is None or months_match is None:
        return np.nan
    return int(years_match.group(1)) * 12 + int(months_match.group(1))

# Replace the textual credit history age with a month count in both frames.
for frame in (df_train, df_test):
    frame['Credit_History_Age'] = frame['Credit_History_Age'].map(convert_credit_history)


**Handling Missing Data**

In [None]:
# Missing-value count and fraction per training column, largest first.
train_nulls = df_train.isnull()
total = train_nulls.sum().sort_values(ascending=False)
percent = (train_nulls.sum() / train_nulls.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

In [None]:
# Missing-value count and fraction per test column, largest first.
test_nulls = df_test.isnull()
total = test_nulls.sum().sort_values(ascending=False)
percent = (test_nulls.sum() / test_nulls.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

In [None]:
def fill_missing_values(df, columns):
    """Fill missing values column by column and return the same frame.

    Text (object or string dtype) columns are filled with their most frequent
    value, numeric columns with their mean. Columns of any other dtype, and
    text columns with no non-missing value at all, are left unchanged.

    Args:
        df: DataFrame to fill; it is modified in place.
        columns: Names of the columns to process.

    Returns:
        The same DataFrame object that was passed in.
    """
    for col in columns:
        series = df[col]
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series.dtype):
            modes = series.mode()
            if modes.empty:
                # No non-missing value exists, so there is nothing to fill with.
                continue
            fill_value = modes.iloc[0]
        elif pd.api.types.is_numeric_dtype(series):
            fill_value = series.mean()
        else:
            continue
        # Assign the result back: `df[col].fillna(..., inplace=True)` acts on a
        # temporary object and does not update `df` under pandas copy-on-write.
        df[col] = series.fillna(fill_value)
    return df

# Fill every column that exists in the test frame, in both frames.
columns = list(df_test.columns)

df_train, df_test = (fill_missing_values(frame, columns) for frame in (df_train, df_test))

In [None]:
# Re-check the training frame after filling: count and fraction of missing values.
train_nulls = df_train.isnull()
total = train_nulls.sum().sort_values(ascending=False)
percent = (train_nulls.sum() / train_nulls.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

In [None]:
# Re-check the test frame after filling: count and fraction of missing values.
test_nulls = df_test.isnull()
total = test_nulls.sum().sort_values(ascending=False)
percent = (test_nulls.sum() / test_nulls.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

In [None]:
# Grand total of missing cells left in the training frame.
total = df_train.isna().sum().sum()
print('Total Null values =', total)

# **Data Preprocessing**
---

**Encoding Categorical Variables**

In [None]:
# Columns that are still object-typed and therefore need encoding.
df_train.select_dtypes(include=['object']).head(n=5)

In [None]:
# Binary encoding: 'No' -> 0, every other value -> 1.
for frame in (df_train, df_test):
    frame['Payment_of_Min_Amount'] = frame['Payment_of_Min_Amount'].ne('No').astype('int64')

In [None]:
# Target encoding: Poor -> 0, Standard -> 1, Good -> 2 (training frame only).
ordinal_mapping = dict(zip(('Poor', 'Standard', 'Good'), range(3)))
df_train['Credit_Score'] = df_train['Credit_Score'].map(ordinal_mapping)

In [None]:
# Ordinal encoding of Credit_Mix, ordered from worst to best (Bad < Standard < Good)
# so the integer codes follow the category order, as the Credit_Score mapping does.
# The previous codes (Standard=0, Good=1, Bad=2) were not in order.
ordinal_mapping = {'Bad': 0, 'Standard': 1, 'Good': 2}
df_train['Credit_Mix'] = df_train['Credit_Mix'].map(ordinal_mapping)
df_test['Credit_Mix'] = df_test['Credit_Mix'].map(ordinal_mapping)

In [None]:
# Month names in calendar order; the mapping assigns January -> 1 ... December -> 12.
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
month_mapping = {name: number for number, name in enumerate(month_names, start=1)}

for frame in (df_train, df_test):
    frame['Month'] = frame['Month'].map(month_mapping)

In [None]:
# Check the training frame after encoding.
df_train.head(n=5)

**Dealing with Outliers**

In [None]:

# Per-column count of values outside the 1.5 * IQR fences.
Q1, Q3 = (df_train.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Values inside the fences become NaN, so count() tallies only the outliers.
outside_fences = df_train.lt(lower_bound) | df_train.gt(upper_bound)
outliers = df_train.where(outside_fences)
print(outliers.count())

In [None]:
def cap_age(value, min_age=10, max_age=120):
    """Clamp an age to the closed range [min_age, max_age].

    Args:
        value: Age to clamp. NaN compares false against both bounds and is
            therefore returned unchanged.
        min_age: Lower bound; defaults to the previously hard-coded 10.
        max_age: Upper bound; defaults to the previously hard-coded 120.

    Returns:
        min_age if value is below it, max_age if value is above it,
        otherwise value itself.
    """
    if value < min_age:
        return min_age
    if value > max_age:
        return max_age
    return value

# Clamp Age in both frames with cap_age.
for frame in (df_train, df_test):
    frame['Age'] = frame['Age'].map(cap_age)

In [None]:
from scipy import stats

# Box-Cox needs strictly positive input, so every value that is not > 0
# (including NaN) is replaced by POSITIVE_FLOOR first.
POSITIVE_FLOOR = 1e-6

# Fit one lambda per feature on the training frame and remember it, so the test
# frame gets the SAME transform. Fitting a separate lambda on the test frame, as
# before, transformed the two frames differently.
boxcox_lambdas = {}
for col in df_train.drop('Credit_Score', axis=1).columns:
    positive_values = df_train[col].where(df_train[col] > 0, POSITIVE_FLOOR)
    df_train[col], boxcox_lambdas[col] = stats.boxcox(positive_values)

for col in df_test.select_dtypes(include=['number']).columns:
    positive_values = df_test[col].where(df_test[col] > 0, POSITIVE_FLOOR)
    if col in boxcox_lambdas:
        # With an explicit lmbda, boxcox returns only the transformed array.
        df_test[col] = stats.boxcox(positive_values, lmbda=boxcox_lambdas[col])
    else:
        # Column has no training counterpart: fall back to fitting on the test data.
        df_test[col], _ = stats.boxcox(positive_values)

In [None]:
# Alternative to the Box-Cox step above (kept for reference, not executed): log1p transform.
#np.log1p(df_train.drop('Credit_Score',axis=1))
#np.log1p(df_test)

**Data splitting**

In [None]:
# 90% train; the remaining 10% is halved into test and validation.
# Both splits are stratified on the target.
y = df_train['Credit_Score']
X = df_train.drop(columns=['Credit_Score'])
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=44
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=44
)

**Data Normalization**

In [None]:
# Fit the scaler on the training split only, then apply it to all three splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test, X_val = (scaler.transform(part) for part in (X_train, X_test, X_val))

# **Modeling**
-----

In [None]:
from sklearn.metrics import classification_report , f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.svm import SVC
import xgboost as xgb

In [None]:
# (display name, estimator) pairs compared in the evaluation loop.
# XGBoost now gets the same explicit seed as the other models; it was the only
# estimator created without one.
classifiers = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Support Vector Machine', SVC(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=500, bootstrap=True, random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('xgboost', xgb.XGBClassifier(random_state=42)),
]

In [None]:
# For each model: fit on the training split, score on the held-out test split,
# run 5-fold cross-validation on the training split, then print the metrics
# and draw the confusion matrix.
for clf_name, clf in classifiers:
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
    report = classification_report(y_test, y_pred)

    print(f'{clf_name}:')
    print(f"Cross Val Score: {cv_scores.mean():.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"Classification Report:\n{report}")

    test_confusion = confusion_matrix(y_test, y_pred)
    sns.heatmap(test_confusion, annot=True, fmt='d', cmap='Blues')
    plt.show()
    print('---------------------------------------------------')

# **DNN**
-----

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import EarlyStopping , ReduceLROnPlateau
from tensorflow.keras.metrics import F1Score

In [None]:
# Four 512-unit ReLU layers, each followed by 50% dropout, then a 3-way softmax.
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax'),
])

In [None]:
# Both callbacks watch the validation loss.
# ES: stop after 10 epochs without improvement and restore the best weights.
ES = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
# RL: multiply the learning rate by 0.2 after 5 stagnant epochs, never below 1e-6.
RL = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=1e-6,
)

In [None]:
class SparseF1Score(F1Score):
    """F1Score that accepts integer class labels instead of one-hot targets.

    Keras' F1Score compares y_true and y_pred element-wise and so expects both
    to have shape (batch, num_classes). The labels here are integer class ids
    (matching the sparse_categorical_crossentropy loss), so they are one-hot
    encoded before being passed to the parent metric.
    """

    def update_state(self, y_true, y_pred, sample_weight=None):
        labels = tf.cast(tf.reshape(y_true, [-1]), tf.int32)
        # The last dimension of y_pred is the number of classes.
        one_hot_labels = tf.one_hot(labels, depth=y_pred.shape[-1])
        return super().update_state(one_hot_labels, y_pred, sample_weight)


# Pass a metric INSTANCE (the class itself was passed before). average='weighted'
# yields one scalar per epoch, and the name keeps the 'f1_score' / 'val_f1_score'
# keys in the training history.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=[SparseF1Score(average='weighted', name='f1_score')],
)

In [None]:
# Train for up to 150 epochs, validating on the validation split each epoch;
# the ES and RL callbacks are passed in.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=150,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[ES, RL],
)

In [None]:
# Training curves: F1 score on the left, loss on the right.
fig, (ax_f1, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

# The tracked metric is the F1 score, so the panel is labelled as such
# (it was previously titled and labelled "accuracy").
ax_f1.plot(history.history['f1_score'])
ax_f1.plot(history.history['val_f1_score'])
ax_f1.set_title('Model F1 score')
ax_f1.set_ylabel('F1 score')
ax_f1.set_xlabel('Epoch')
ax_f1.legend(['Train', 'Validation'], loc='upper left')

# Training and validation loss per epoch.
ax_loss.plot(history.history['loss'])
ax_loss.plot(history.history['val_loss'])
ax_loss.set_title('Model loss')
ax_loss.set_ylabel('Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.legend(['Train', 'Validation'], loc='upper left')

fig.tight_layout()
plt.show()

In [None]:
# Loss and compiled metric on the held-out test split.
model.evaluate(x=X_test, y=y_test)