# Corporate Credit Rating

In [None]:
#Import necessary packages

#For importing data
import pandas as pd

#For data preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

#For hyperparameter optimization
from sklearn.model_selection import GridSearchCV

#For mathematical operations
import numpy as np

#For data visualization
import matplotlib.pyplot as plt

#For dataset split
from sklearn.model_selection import KFold

#For model evaluation
from sklearn.metrics import accuracy_score, f1_score, r2_score

#For Random Forest model
from sklearn.ensemble import RandomForestClassifier

#For SVM model
from sklearn.svm import SVC

#For GBDT
import xgboost

#For Logistic Regression
from sklearn.linear_model import LogisticRegression

#For Neural Network
from sklearn.neural_network import MLPClassifier

#For KNN
from sklearn.neighbors import KNeighborsClassifier



## Data Preprocessing (Part 1)

In [None]:
# Data import and cleansing

# Load the raw table from the CSV file
df = pd.read_csv("corporateCreditRatingWithFinancialRatios.csv")

# Report the original row count, whether anything is missing, and the
# distinct rating labels present in the file
num = len(df)
print("===== Data Cleansing =====")
print("Number of original data:", num)
print("Any missing values? ", df.isnull().values.any())
print(df["Rating"].unique())

# Discard the columns that are not used further on
print("Dropping unnecessary columns ...")
df = df.drop(
    columns=["Rating Agency", "Corporation", "Ticker", "Binary Rating", "CIK", "SIC Code"]
)

# Remove duplicated rows; the difference to the original count is the
# number of rows that were dropped
df = df.drop_duplicates()
print("Duplicated rows dropped: ", num - len(df))

# Show the first 5 rows of the cleansed dataset
df.head(5)

In [None]:
# Rating mapping: collapse the fine-grained rating labels into four buckets
rating_dict = {
    **dict.fromkeys(['AAA', 'AA+', 'AA', 'AA-', 'A+', 'A', 'A-'], 'A_rank'),
    **dict.fromkeys(
        ['BBB+', 'BBB', 'BBB-', 'BB+', 'BB', 'BB-', 'B+', 'B', 'B-'], 'B_rank'
    ),
    **dict.fromkeys(
        ['CCC+', 'CCC', 'CCC-', 'CC+', 'CC', 'CC-', 'C+', 'C', 'C-'], 'C_rank'
    ),
    'D': 'D_rank',
}

# Replace each rating by its bucket, then print the per-column null counts
# (a label that is not a key of rating_dict would show up as a null here)
df["Rating"] = df["Rating"].map(rating_dict)
print(df.isnull().sum())

In [None]:
# Format conversion

# Keep the distinct category labels before the columns are one-hot encoded
sector_classes = df["Sector"].unique()
rating_classes = df["Rating"].unique()

# One-hot encode both categorical columns in one pass. The dummy columns are
# appended after the remaining columns, Sector first and Rating second:
# use df.iloc[:,17:29] for the Sector dummies and df.iloc[:,29:] for the
# Rating dummies.
df = pd.get_dummies(df, columns=["Sector", "Rating"], dtype=float)

# Parse the Rating Date strings as datetimes and store them as int64 timestamps
df["Rating Date"] = (
    pd.to_datetime(df["Rating Date"], format="%Y-%m-%d").values.astype(np.int64)
)

# Show the first 5 rows of the transformed dataset
df.head(5)

## Data Exploration

In [None]:
#Plotting the piecharts for visualizing sector & rating data
# Draws two side-by-side pie charts whose wedge sizes are the column sums of
# the one-hot dummy columns of df (i.e. the number of rows per category).

#Define the subplots
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,15))

#Obtain sector & rating data
# The positions 17:29 and 29: are hard-coded; they are the ranges the
# get_dummies comments in the format-conversion cell give for the Sector and
# Rating dummy columns.
sector_distribution = df.iloc[:,17:29].sum()
rating_distribution = df.iloc[:,29:].sum()

#Sort the labels
# In-place sort of the arrays of unique labels collected before encoding.
# NOTE(review): labels are matched to wedges purely by position, so this
# assumes the dummy columns appear in the same sorted order -- confirm
# against df.columns.
sector_classes.sort()
rating_classes.sort()

#Define the pie chart for sector distribution
ax1.pie(sector_distribution, labels=sector_classes, autopct='%1.1f%%',pctdistance=0.8)
ax1.title.set_text('sector distribution')

#Define the pie chart for rating distribution
ax2.pie(rating_distribution, labels=rating_classes, autopct='%1.1f%%',pctdistance=0.9, textprops={'fontsize': 7})
ax2.title.set_text('rating distribution')

#Display the graph
plt.show()

In [None]:
#Plotting the correlation matrix for features
# Pairwise correlation (DataFrame.corr) of the first 29 columns of df --
# the same column range later used as the model inputs -- rendered as a
# grayscale image with a colorbar.
feature_corr_matrix = df.iloc[:,:29].corr()
corr_plot = plt.imshow(feature_corr_matrix, cmap='gray')
# Both axes show the positional index of the column, not its name
plt.xlabel("feature index")
plt.ylabel("feature index")
plt.title("correlation matrix for all features")
plt.colorbar()
plt.show()

## Data Preprocessing (Part 2)

In [None]:
# Separate input features from labels
X = df.iloc[:, :29]
y = df.iloc[:, 29:]

# Only the first 17 columns are min-max scaled; the remaining feature columns
# are left untouched.
numeric_columns = X.columns[:17]

# One independent scaler per scaled column, fitted on a single-column
# DataFrame so that fit() and transform() see the same kind of input.
# (The previous `[MinMaxScaler(...)] * n` built a list holding one shared
# instance n times, and fitting on bare numpy arrays mismatched the
# DataFrame passed to transform().)
feature_scalers = [
    MinMaxScaler(feature_range=(0, 1)).fit(X[[column]])
    for column in numeric_columns
]
# Unfitted scaler kept only because the original cell defined this name
scaler = MinMaxScaler()

# Scale into an explicit copy: `X_scaled = X` merely aliased X (itself a slice
# of df), so the scaling silently overwrote X and relied on chained assignment.
X_scaled = X.copy()
for column, column_scaler in zip(numeric_columns, feature_scalers):
    X_scaled[column] = column_scaler.transform(X[[column]]).ravel()

# Display top 5 rows of scaled dataset
print(X_scaled.head(5))

# Transform into nparrays
X_scaled = X_scaled.values
y = y.values

## PCA Decomposition

In [None]:
# Perform PCA with the desired number of components
# X_scaled is the numpy array built from the first 29 columns of df, so this
# reduces 29 input columns to 24 components.
n_components = 24 # Specify the number of components you want to keep
pca = PCA(n_components=n_components)
# Fit the projection on the full scaled matrix and apply it in one step
X_pca = pca.fit_transform(X_scaled)

# Use X_pca for further modeling or analysis
# (the k-fold evaluation cell trains on X_pca, while the grid-search cell
# is run on X_scaled)

In [None]:
def grid_search(model, params, X, y, fold):
    """Run an accuracy-scored grid search and print the best parameters.

    Parameters
    ----------
    model : estimator passed to ``GridSearchCV`` as the base estimator.
    params : dict or list of dicts
        Parameter grid(s) to search.
    X, y : training inputs and targets handed to ``GridSearchCV.fit``.
    fold : value forwarded as ``cv`` (the callers in this file pass an int).

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can read ``best_params_``,
        ``best_score_`` and the other fitted attributes instead of only
        seeing the printed parameters. Previously nothing was returned,
        so ignoring the return value keeps the old behaviour.
    """
    gd_sr = GridSearchCV(model, params, cv=fold, scoring='accuracy')
    gd_sr.fit(X, y)
    print(gd_sr.best_params_)
    return gd_sr

In [None]:
def kf_train_test(model, df, X, y, fold):
    """Evaluate ``model`` with shuffled K-fold cross-validation.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is refitted in place on every fold, so the object returned holds
        the fit from the last fold only.
    df : unused
        Kept so existing calls keep working. The folds used to be derived
        from ``df``, which only worked while it had exactly as many rows as
        ``X``; they are now derived from ``X`` itself.
    X : array indexable by an integer index array (rows are samples).
    y : 2-D array with one column per class; the class index of a row is
        the position of its largest entry (``np.argmax`` along axis 1).
    fold : int
        Number of folds.

    Returns
    -------
    list
        ``[average accuracy, average R2 score, model]``. The R2 score is
        computed on the integer class indices.
    """
    # Shuffled split with a fixed seed so repeated calls use the same folds
    kf = KFold(n_splits=fold, shuffle=True, random_state=1234)

    # Convert the one-hot rows to class indices once instead of once per fold
    labels = np.argmax(y, axis=1)

    # per-fold scores
    acc_scores = []
    r2_scores = []

    for train_index, test_index in kf.split(X):
        # fit on the training part of this fold
        model.fit(X[train_index], labels[train_index])

        # score the predictions on the held-out part
        y_test = labels[test_index]
        y_pred = model.predict(X[test_index])
        acc_scores.append(accuracy_score(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

    # average over all folds
    avg_acc_score = sum(acc_scores) / len(acc_scores)
    avg_r2_score = sum(r2_scores) / len(r2_scores)

    return [avg_acc_score, avg_r2_score, model]

In [None]:
#


#               

In [21]:
# Hyperparameter grids handed to GridSearchCV, keyed by the same model names
# as the `models` dictionary. Every grid is a one-element list of dicts and,
# apart from Logistic Regression, lists a single candidate per parameter.
param_grids = {
    "Random Forest": [
        dict(
            bootstrap=[True],
            max_depth=[40],
            max_features=["sqrt"],
            n_estimators=[600],
        )
    ],
    "GBDT": [
        dict(
            max_depth=[6],
            n_estimators=[180],
            learning_rate=[0.1],
        )
    ],
    "SVM": [
        dict(
            C=[100],
            gamma=["auto"],
            decision_function_shape=["ovr"],
            kernel=["rbf"],
        )
    ],
    # 20 values of C, logarithmically spaced from 1e-3 to 1e3
    "Logistic Regression": [dict(C=10 ** np.linspace(-3, 3, 20))],
    "MLP": [
        dict(
            learning_rate=["constant"],
            hidden_layer_sizes=[(25, 25, 25, 25, 25)],  # 10, 20, 30, 40
            alpha=[0.05],
            activation=["tanh"],
        )
    ],
    "KNN": [
        dict(
            leaf_size=[1],  # 10, 50, 100 -> 3, 5, 7, 10 -> 1, 2, 3
            p=[1],
            weights=["distance"],
            metric=["minkowski"],
        )
    ],
}

In [22]:
# Estimators evaluated in this notebook, keyed by the same names as
# `param_grids`. Each one is configured with the values listed in its grid.
models = {
    "Random Forest": RandomForestClassifier(
        bootstrap=True,
        max_depth=40,
        max_features='sqrt',
        n_estimators=600,
        random_state=1234,
    ),
    # random_state added so every seedable estimator uses the same seed
    "GBDT": xgboost.XGBClassifier(
        max_depth=6,
        n_estimators=180,
        learning_rate=0.1,
        random_state=1234,
    ),
    # Previously kernel='sigmoid' with the default C and an unused degree=5,
    # which contradicted the SVM grid (C=100, rbf kernel). It now matches the
    # configuration that the grid search actually evaluates.
    "SVM": SVC(
        C=100,
        kernel='rbf',
        decision_function_shape='ovr',
        gamma='auto',
        max_iter=1000,
        random_state=1234,
    ),
    "Logistic Regression": LogisticRegression(
        random_state=1234,
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=1000,
    ),
    # random_state added: without it the network weights are initialised
    # differently on every run, so the reported scores were not reproducible
    "MLP": MLPClassifier(
        hidden_layer_sizes=(25, 25, 25, 25, 25),
        activation='tanh',
        learning_rate='constant',
        alpha=0.05,
        max_iter=5000,
        random_state=1234,
    ),
    "KNN": KNeighborsClassifier(
        n_neighbors=5,
        leaf_size=1,
        p=1,
        weights='distance',
        metric='minkowski',
    ),
}

In [23]:
# Run the grid search for every model on the scaled features.
# The original loop special-cased "SVM" and "Logistic Regression", but both
# branches performed exactly the same calls, so the conditional is removed.

# Class index per row, derived once from the one-hot label matrix
class_labels = np.argmax(y, axis=1)

for name, clf in models.items():
    print(name)
    grid_search(clf, param_grids[name], X_scaled, class_labels, 10)
    print()

Random Forest




{'bootstrap': True, 'max_depth': 40, 'max_features': 'sqrt', 'n_estimators': 600}

GBDT




{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 180}

SVM




{'C': 100, 'decision_function_shape': 'ovr', 'gamma': 'auto', 'kernel': 'rbf'}

MLP
{'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (25, 25, 25, 25, 25), 'learning_rate': 'constant'}

KNN




{'leaf_size': 1, 'metric': 'minkowski', 'p': 1, 'weights': 'distance'}



In [None]:
# 10-fold evaluation of every model on the PCA-transformed features;
# prints the average accuracy and R2 score per model, followed by a blank line
for name, clf in models.items():
    avg_acc, avg_r2, res_clf = kf_train_test(clf, df, X_pca, y, 10)
    print(name)
    print("Average Accuracy:", avg_acc)
    print("Average R2 Square:", avg_r2, end="\n\n")