# Long Term Credit Rating Projection 

## Import of all the packages needed

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, cross_validate, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
import statsmodels.api as sm
from statsmodels.imputation.mice import MICE, MICEData
# import fancyimpute
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.impute import SimpleImputer
# from imblearn.over_sampling import SMOTE
import time
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer

## Importing and Preprocessing Data

In [2]:
# Load the semicolon-separated raw export; the listed date columns are parsed
# into datetimes while reading.
# NOTE(review): this is an absolute path on the author's machine — adjust it to
# wherever your copy of the file lives.
RAW_CSV = "/Users/Simon/Documents/UZH/Machine Learning/Clean Mapping/Clean Mapping.csv"
DATE_COLUMNS = ['adate', 'qdate', 'public_date', 'datadate']

df = pd.read_csv(RAW_CSV, sep=';', parse_dates=DATE_COLUMNS)

Deleting all the rows where splticrm has NA values.

In [3]:
# Keep only the rows that carry a rating label in splticrm.
df = df.dropna(subset=['splticrm'])

In [4]:
# Encode the rating labels as integer codes; factorize returns the codes
# together with the label belonging to each code.
rating_codes, rating_labels = pd.factorize(df["splticrm"])
print(np.bincount(rating_codes))  # number of observations per code
print(rating_labels)              # label for code 0, 1, 2, ...
Y = pd.DataFrame(rating_codes, columns=["Rating as Factor"])
print(Y)

[3656 1943  990  296 5455 3209  489 3882 4682  199   50  143  382 1069
 1648 1096  220    4    2]
Index(['A', 'A+', 'AA-', 'AAA', 'BBB', 'BBB-', 'AA', 'A-', 'BBB+', 'AA+',
       'CCC+', 'B-', 'B+', 'BB-', 'BB+', 'BB', 'B', 'D', 'CCC'],
      dtype='object')
       Rating as Factor
0                     0
1                     0
2                     0
3                     0
4                     0
...                 ...
29410                 8
29411                 8
29412                 8
29413                 8
29414                 8

[29415 rows x 1 columns]


A first look at the data shows that the ratings D and CCC have only 4 and 2 observations, respectively. We therefore delete these observations, because the two classes are too small to learn from.

In [5]:
# Remove the two ratings that have too few observations.
RARE_RATINGS = ['D', 'CCC']
df = df[~df['splticrm'].isin(RARE_RATINGS)]

# Re-encode the target from the filtered frame so that Y has exactly one row
# per remaining row of df. The Y built before this filter still contains the
# dropped observations, so it would no longer match the row count of X and
# train_test_split would fail on inconsistent sample counts.
Y = pd.DataFrame(pd.factorize(df['splticrm'])[0], columns=["Rating as Factor"])

Assign to X all the columns but splticrm. Then we drop some columns which will not be relevant for the algorithms.

In [10]:
# Feature matrix: every column except the target splticrm, minus the columns
# listed below, which are not relevant for the algorithms.
NON_FEATURE_COLUMNS = [
    "permno", "CUSIP", "NCUSIP", "adate", "qdate", "public_date", "TICKER",
    "COMNAM", "PERMCO", "NWPERM", "gvkey", "datadate", "tic", "cusip", "conm", "PRC",
]
X = df.loc[:, df.columns != "splticrm"].drop(columns=NON_FEATURE_COLUMNS)
X.head()

Unnamed: 0,CAPEI,bm,evm,pe_op_basic,pe_op_dil,pe_exi,pe_inc,ps,pcf,dpr,...,rd_sale,adv_sale,staff_sale,accrual,ptb,PEG_trailing,DIVYIELD,PEG_1yrforward,PEG_ltgforward,Unnamed: 88
0,26.386,0.252,10.11,19.217,19.378,20.052,20.052,4.976,13.353,0.129,...,0.12,0.003,0.0,0.058,4.145,1.045,0.00867,2.221,1.554,
1,28.226,0.252,10.11,20.542,20.714,21.435,21.435,5.323,14.285,0.129,...,0.12,0.003,0.0,0.058,4.434,1.117,0.00811,2.058,1.52,
2,29.464,0.252,10.11,21.425,21.605,22.357,22.357,5.556,14.911,0.129,...,0.12,0.003,0.0,0.058,4.628,1.165,0.00778,2.146,1.586,
3,28.783,0.233,10.983,21.378,21.556,23.096,23.096,5.381,15.909,0.177,...,0.121,0.003,0.0,0.049,4.515,1.545,0.00773,1.848,1.642,
4,25.096,0.233,10.983,18.653,18.808,20.152,20.152,4.692,13.871,0.177,...,0.121,0.003,0.0,0.049,3.937,1.348,0.00886,1.612,1.432,


### Handling missing data
We decided to drop every column that contains more than 10'000 NA values or more than 10'000 zero values (10'000 corresponds to roughly a third of the observations).

In [11]:
# A column is dropped when it has more than MAX_BAD_VALUES missing entries or
# more than MAX_BAD_VALUES zeros.
MAX_BAD_VALUES = 10000
NAs = X.isnull().sum() > MAX_BAD_VALUES
Zeros = (X == 0).sum() > MAX_BAD_VALUES
delNAs = X.columns[NAs]      # drops PEG_trailing
delZeros = X.columns[Zeros]  # drops rd_sale, adv_sale, staff_sale

# Drop the union of both selections in a single call. Two successive drops
# raise a KeyError as soon as one column exceeds both limits, because that
# column is already gone when the second drop looks for it.
X = X.drop(columns=X.columns[NAs | Zeros])

Now we split the data into a train and a test set. The test set consists of 20% of the whole dataset.

In [12]:
# Hold out 20% of the observations as the test set. The split is stratified on
# Y and uses a fixed random_state so that it is reproducible.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# Show the first training row as a quick sanity check of the remaining columns.
X_train.iloc[:1]


Unnamed: 0,CAPEI,bm,evm,pe_op_basic,pe_op_dil,pe_exi,pe_inc,ps,pcf,dpr,...,rect_turn,pay_turn,sale_invcap,sale_equity,sale_nwc,accrual,ptb,DIVYIELD,PEG_1yrforward,PEG_ltgforward
7104,36.03,0.235,12.83,11.986,12.131,13.943,6.876,1.884,13.644,0.334,...,4.891,4.585,1.665,2.794,4.008,0.062,4.444,0.0133,0.978,1.281


In [14]:
# Number of missing values per training column.
X_train.isnull().sum()

CAPEI               47
bm                 529
evm                 76
pe_op_basic        274
pe_op_dil          278
                  ... 
accrual             28
ptb                529
DIVYIELD          4003
PEG_1yrforward     799
PEG_ltgforward    1286
Length: 67, dtype: int64

We see that there are still missing values that have to be imputed. We do this in the next section.

#### Iterative Imputer
The iterative imputer imputes missing values by modeling each feature containing missing values as a function of the other features. It is fitted on the training set only; the fitted imputer is then used to fill the gaps in both the training and the test set.

In [17]:
# Impute the remaining missing values with scikit-learn's IterativeImputer.
## this can be deleted once the functions file is imported

# Numeric feature columns that are passed to the imputer.
num_cols = ['CAPEI', 'bm', 'evm', 'pe_op_basic', 'pe_op_dil', 'pe_exi', 'pe_inc', 'ps', 'pcf', 
            'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe', 'roce', 'efftax', 'aftret_eq',
            'aftret_invcapx', 'aftret_equity', 'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap',
            'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt', 'int_totdebt', 'cash_lt', 'invt_act',
            'rect_act', 'debt_at', 'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct', 'ocf_lct',
            'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets', 'debt_capital', 'de_ratio', 'intcov',
            'intcov_ratio', 'cash_ratio', 'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn',
            'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc', 'accrual', 'ptb',
            'DIVYIELD', 'PEG_1yrforward', 'PEG_ltgforward']

# Initialize IterativeImputer with a fixed seed
mice_imputer = IterativeImputer(random_state=0)

# Fit the imputer on the training features and impute them. The result is
# wrapped in a new DataFrame that keeps the original row index and column
# names, instead of being written in place into a copy of X_train.
X_train_imputed = pd.DataFrame(
    mice_imputer.fit_transform(X_train[num_cols]),
    columns=num_cols,
    index=X_train.index,
)

# Impute the test features with the imputer fitted on the training set, so
# nothing is learned from the test data.
X_test_imputed = pd.DataFrame(
    mice_imputer.transform(X_test[num_cols]),
    columns=num_cols,
    index=X_test.index,
)



In [None]:
# At the latest here import the 'Functions' file
# TODO: the import statement itself is still missing. The cells below call
# my_iterative_imputer, feature_selection, LogReg, SVM_poly, SVM_rbf and
# SVM_rbf_bal, none of which are defined in the visible part of this notebook.

In [None]:
# apply IterativeImputer
# my_iterative_imputer is not defined in this notebook; presumably it comes
# from the 'Functions' file — TODO confirm.
# NOTE(review): these two assignments overwrite any X_train_imputed /
# X_test_imputed created earlier in the notebook. If the helper fits a new
# imputer on every call, the test set is imputed from its own data instead of
# from the training fit — confirm against the helper's implementation.

X_train_imputed = my_iterative_imputer(X_train)
X_test_imputed = my_iterative_imputer(X_test)

#### Feature Selection
To find out which variables are most important we run the 'feature_selection' function and select all variables which explain more than 1.5% (`thres=0.015`).

In [18]:
# feature_selection is defined outside this notebook; presumably it returns the
# features whose importance exceeds `thres` — TODO confirm.
# The label frame is flattened to a 1-D array before it is passed on.
y_train_flat = y_train.values.ravel()
important_features = feature_selection(x=X_train_imputed, y=y_train_flat, thres=0.015)

NameError: name 'feature_selection' is not defined

In [None]:
# Restrict both imputed feature matrices to the selected columns before they
# are handed to the ML algorithms.
X_train_imputed, X_test_imputed = (
    X_train_imputed.loc[:, important_features],
    X_test_imputed.loc[:, important_features],
)

In [None]:
# First five rows of the imputed, feature-selected training matrix.
X_train_imputed.head()

In [None]:
# Write the prepared splits to CSV files in the working directory; the row
# index is written as the first column of each file.
for csv_name, frame in (
    ('X_train.csv', X_train_imputed),
    ('X_test.csv', X_test_imputed),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
):
    frame.to_csv(csv_name)

## Machine Learning Algorithms

### Logistic Regression

In [None]:
## Logistic Regression

# Reload the prepared feature matrices; the first CSV column is skipped and
# only the remaining columns are kept as features.
X_train = pd.read_csv("X_train.csv").iloc[:, 1:]
X_test = pd.read_csv("X_test.csv").iloc[:, 1:]

# Reload the targets: take the label column and treat it as categorical.
Y_train = pd.read_csv("y_train.csv")['Rating as Factor'].astype('category')  # factorize trainset
Y_test = pd.read_csv("y_test.csv")['Rating as Factor'].astype('category')    # factorize testset

In [None]:
import datetime

# LogReg is defined outside this notebook; the attributes used below
# (best_params_, best_score_, score) are read from the object it returns.
print(datetime.datetime.now())  # timestamp before the search starts
grid = LogReg(X_train, Y_train)

# best parameters are C=7 & ratio=0 -> l2 penalty function
print('Best parameters:', grid.best_params_)
print('Best CV accuracy:', grid.best_score_)

test_accuracy = grid.score(X_test, Y_test)  # 31%
print('Test score:', test_accuracy)
print(datetime.datetime.now())  # timestamp after the search; about 10 minutes

In [None]:
# Predict the rating class of every test observation
y_pred = grid.predict(X_test)

# Manual confusion matrix as pandas DataFrame: count every (True, Predicted)
# pair and move the predicted class into the columns.
confm = pd.DataFrame({'Predicted': y_pred, 'True': Y_test})
pair_counts = confm.groupby(['True', 'Predicted'], sort=True).size()
print(pair_counts.unstack('Predicted'))

### Support Vector Machines

In [None]:
def _fit_and_report(search_fn, kernel_label, X_train, Y_train, X_test, Y_test):
    """Run one SVM search, print its scores and print its confusion matrix.

    Parameters
    ----------
    search_fn : callable
        Called as ``search_fn(X_train, Y_train)``. The returned object must
        provide ``best_score_``, ``best_params_``, ``score`` and ``predict``.
    kernel_label : str
        Text printed in front of ' yields the following confusion matrix:'.
    X_train, Y_train
        Training features and labels handed to ``search_fn``.
    X_test, Y_test
        Test features and labels used for the test score and the confusion
        matrix.

    Returns
    -------
    tuple
        ``(search, y_pred, confm)``: the object returned by ``search_fn``, the
        predictions for ``X_test`` and the frame of predicted/true labels.

    Notes
    -----
    Uses the ``datetime`` module imported in the logistic regression cell.
    """
    print(datetime.datetime.now()) #computation time
    search = search_fn(X_train, Y_train)
    print('Best CV accuracy: {:.2f}'.format(search.best_score_))
    print('Test score:       {:.2f}'.format(search.score(X_test, Y_test)))
    print('Best parameters: {}'.format(search.best_params_))
    print(datetime.datetime.now())

    # Predict classes
    y_pred = search.predict(X_test)

    # Manual confusion matrix as pandas DataFrame
    confm = pd.DataFrame({'Predicted': y_pred,
                          'True': Y_test})
    print('{} yields the following confusion matrix:'.format(kernel_label))
    print(confm.groupby(['True','Predicted'], sort=True).size().unstack('Predicted'))
    return search, y_pred, confm


#######Polynomial Kernel Function####### (20min)
poly, y_pred, confm = _fit_and_report(
    SVM_poly, 'Polynomial Kernel Function', X_train, Y_train, X_test, Y_test)


#######Radial Basis Kernel Function(rbf)####### (10min)
rbf, y_pred, confm = _fit_and_report(
    SVM_rbf, 'Radial Basis Function Kernel', X_train, Y_train, X_test, Y_test)


#######Radial Basis Kernel Function(rbf) with Balanced class weights####### (10min)
#looking at the confusion matrix of non-balanced rbf we see that the smaller classes don't get more wrong classification. Therefore balancing the weights should not influence the outcome greatly which it doesn't
rbf_bal, y_pred, confm = _fit_and_report(
    SVM_rbf_bal, 'Radial Basis Function Kernel with Balanced class weights',
    X_train, Y_train, X_test, Y_test)

#According to "https://stackoverflow.com/questions/21390570/scikit-learn-svc-coef0-parameter-range" the Sigmoid function does not fulfill the definition of a kernel as it is not positive semidefinite. Therefore we will not use it with Support Vector Machines.