In [1]:
# Python Version 3.11.4, conda version 23.1.0

In [52]:
import pandas as pd # v 1.4.4
import statsmodels as st # v 0.13.2
import sklearn as sk #v 1.0.2
import time
import pandas_profiling as pp # v 3.6.6
import statsmodels.api as sm #v 0.13.2
import matplotlib.pyplot as plt
import numpy as np # v 1.21.5
import seaborn as sns # v 0.11.2


from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.tree import export_text


# EDA, Class Imbalance

In [27]:
df = pd.read_csv("/Users/maiavachon/Documents/GitHub/DATA-300-Statistical-Machine-Learning-Fall-2023-/Data/company_bankruptcy_data.csv")

In [28]:
df.dropna(inplace=True) #drops rows with missing values

In [29]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6819 entries, 0 to 6818
Data columns (total 96 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Bankrupt                                                  6819 non-null   int64  
 1    ROA(C) before interest and depreciation before interest  6819 non-null   float64
 2    ROA(A) before interest and % after tax                   6819 non-null   float64
 3    ROA(B) before interest and depreciation after tax        6819 non-null   float64
 4    Operating Gross Margin                                   6819 non-null   float64
 5    Realized Sales Gross Margin                              6819 non-null   float64
 6    Operating Profit Rate                                    6819 non-null   float64
 7    Pre-tax net Interest Rate                                6819 non-null   float64
 8    After-tax net Int

Unnamed: 0,Bankrupt,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
count,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,...,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0
mean,0.032263,0.50518,0.558625,0.553589,0.607948,0.607929,0.998755,0.79719,0.809084,0.303623,...,0.80776,18629420.0,0.623915,0.607946,0.840402,0.280365,0.027541,0.565358,1.0,0.047578
std,0.17671,0.060686,0.06562,0.061595,0.016934,0.016916,0.01301,0.012869,0.013601,0.011163,...,0.040332,376450100.0,0.01229,0.016934,0.014523,0.014463,0.015668,0.013214,0.0,0.050014
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.0,0.476527,0.535543,0.527277,0.600445,0.600434,0.998969,0.797386,0.809312,0.303466,...,0.79675,0.0009036205,0.623636,0.600443,0.840115,0.276944,0.026791,0.565158,1.0,0.024477
50%,0.0,0.502706,0.559802,0.552278,0.605997,0.605976,0.999022,0.797464,0.809375,0.303525,...,0.810619,0.002085213,0.623879,0.605998,0.841179,0.278778,0.026808,0.565252,1.0,0.033798
75%,0.0,0.535563,0.589157,0.584105,0.613914,0.613842,0.999095,0.797579,0.809469,0.303585,...,0.826455,0.005269777,0.624168,0.613913,0.842357,0.281449,0.026913,0.565725,1.0,0.052838
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,9820000000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


This is a large dataset, with 96 variables (columns), 6,819 instances, and no missing values according to the description of the dataset. I'm sure that there is multicollinearity, but the dataset will not work with pandas_profiling() so I will continue to check if the classes are imbalanced.

In [7]:
#checking for balanced classes

class_counts = df['Bankrupt'].value_counts()
print(class_counts)


0    6599
1     220
Name: Bankrupt, dtype: int64


This code shows the approximate class imbalance in the dataset when split by the Bankrupt variable. It is clear from the output of this code that the classes are imbalanced (6599 vs 220). As a result, I will undersample.


In [11]:
#undersampling the data due to imbalance

#group by Bankrupt, separate into 2 datasets
grouped_df = df.groupby('Bankrupt')
not_bankrupt = grouped_df.get_group(0) #6599 observations
bankrupt = grouped_df.get_group(1) #220 observations

#undersample the majority class (not_bankrupt) to match the length of bankrupt, allows for replacement and reproduceability
undersampled_not_bankrupt = not_bankrupt.sample(n=len(bankrupt), replace=True, random_state=10)

#concatenate the new undersampled majority class to the minority dataset
new_df = pd.concat([bankrupt, undersampled_not_bankrupt])


In [12]:
#splitting data into training and testing sets - PART 2 DATA

new_y = new_df['Bankrupt'] #create subset of data, only Bankrupt
new_X = new_df.drop('Bankrupt', axis=1) #create subset of data, everything but Def_ind
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_X, new_y, test_size=0.3, random_state=10) #split the data!


In [30]:
#FOR PART 1

y = df['Bankrupt'] #create subset of data, only Bankrupt
X = df.drop('Bankrupt', axis=1) #create subset of data, everything but Def_ind
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10) #split the data!

## PART 1, Random Forest

In [14]:
#1

start_cpu_time = time.time()
rf_classifier = RandomForestClassifier(random_state=10) #create a classifier with default parameters
rf_classifier.fit(X_train, y_train) #fit the model on the training set
end_cpu_time = time.time()

total_cpu_time = end_cpu_time - start_cpu_time

rf_y_pred = rf_classifier.predict(X_test) #predict values in the test set

#calculate evaluation metrics for this random forest
print("Accuracy:", accuracy_score(y_test, rf_y_pred))
print("Precision:", precision_score(y_test, rf_y_pred))
print("Recall:", recall_score(y_test, rf_y_pred))


Accuracy: 0.9657869012707723
Precision: 0.5882352941176471
Recall: 0.136986301369863


Overall, according to the accuracy score, 96.6% of predictions were correct using the model. This is very good but could indicate overfitting. According to the precision score, 58.8% of values predicted as positive were actually positive. According to the recall score, 13.7% of the actual positive values were correctly predicted by the model. This is not so good. These evaluation metrics indicate that this random forest classifier may have done a good job, but we should be very careful of imbalance since the recall and precision scores were very low.

In [15]:
#2

start_cpu_time2 = time.time()
rf_classifier_2 = RandomForestClassifier(criterion = 'entropy', n_estimators=100, max_depth=10, min_samples_split=2, random_state=10) #create a classifier with non-default parameters
rf_classifier_2.fit(X_train, y_train) #fit this custom model on the training set
end_cpu_time2 = time.time()

total_cpu_time2 = end_cpu_time2 - start_cpu_time2

rf_y_pred_2 = rf_classifier_2.predict(X_test) # predict values from the test set using this second random forest classifier

#calculate evaluation metrics for this random forest
print("Accuracy:", accuracy_score(y_test, rf_y_pred_2))
print("Precision:", precision_score(y_test, rf_y_pred_2))
print("Recall:", recall_score(y_test, rf_y_pred_2))


Accuracy: 0.9672531769305963
Precision: 0.6875
Recall: 0.1506849315068493


The differences in evaluation metric scores between these two random forest classifiers indicates that this one predicts values correctly a little bit more of the time (97%), and the imbalance was handled a little bit better (precision: 69%, recall: 15%). Overall, I prefer these parameters to the defaulted ones, even though there is clearly an issue with how this model is dealing with the imbalanced data.


In [16]:
#3

#I added the calculation of CPU time into the code above, but here are the results.

print("Total CPU time(1):", total_cpu_time, "seconds")
print("Total CPU time(2):", total_cpu_time2, "seconds")


Total CPU time(1): 3.077683925628662 seconds
Total CPU time(2): 2.422775983810425 seconds


According to these CPU times, the first model (with the defaulted parameters) performed almost one second slower than the second model (with unique parameters).


In [17]:
#4

#I used critierion = 'entropy' above, I will try "gini" here.

start_cpu_time3 = time.time()
rf_classifier_3 = RandomForestClassifier(criterion = 'gini', n_estimators=100, max_depth=10, min_samples_split=2, random_state=10) #create a classifier with non-default parameters
rf_classifier_3.fit(X_train, y_train) #fit this custom model on the training set
end_cpu_time3 = time.time()

total_cpu_time3 = end_cpu_time3 - start_cpu_time3

rf_y_pred_3 = rf_classifier_3.predict(X_test) # predict values from the test set using this second random forest classifier

print("Accuracy:", accuracy_score(y_test, rf_y_pred_3))
print("Precision:", precision_score(y_test, rf_y_pred_3))
print("Recall:", recall_score(y_test, rf_y_pred_3))
print("Total CPU time(3):", total_cpu_time3, "seconds")


Accuracy: 0.9672531769305963
Precision: 0.6666666666666666
Recall: 0.1643835616438356
Total CPU time(3): 2.852857828140259 seconds


According to these results compared to the results from above, I think that I would choose the random forest classifier with the criterion = 'entropy' parameter. Although it may mot provide the best results in terms of prediction (using accuracy, precision, and recall scores), it has the fastest CPU time which is a useful quality. Plus, using the 'entropy' parameter resulted in a much lower total CPU time.


In [18]:
#5

rf_first_tree = rf_classifier.estimators_[0] #find the first decision tree from the random forest
rf_tree_rules = export_text(rf_first_tree, feature_names=list(X.columns)) #export just the first tree's worth of rules
print(rf_tree_rules)


|---  Net profit before tax/Paid-in capital <= 0.11
|   |---  Quick Ratio <= 0.01
|   |   |--- class: 1.0
|   |---  Quick Ratio >  0.01
|   |   |--- class: 0.0
|---  Net profit before tax/Paid-in capital >  0.11
|   |---  Current Liability to Current Assets <= 0.08
|   |   |---  Debt ratio % <= 0.22
|   |   |   |---  Net worth/Assets <= 0.84
|   |   |   |   |---  Allocation rate per person <= 0.02
|   |   |   |   |   |---  Total income/Total expense <= 0.00
|   |   |   |   |   |   |---  Working capitcal Turnover Rate <= 0.59
|   |   |   |   |   |   |   |---  Pre-tax net Interest Rate <= 0.80
|   |   |   |   |   |   |   |   |---  Gross Profit to Sales <= 0.58
|   |   |   |   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |   |   |   |---  Gross Profit to Sales >  0.58
|   |   |   |   |   |   |   |   |   |--- class: 1.0
|   |   |   |   |   |   |   |---  Pre-tax net Interest Rate >  0.80
|   |   |   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |   |---  Working capitca

From what I can tell, these decision rules do make sense for classification. For example, if cashflow is over 0, it should not predict bankrupcy.

In [19]:
#6

feature_importance_df = pd.DataFrame({'Importance': rf_classifier.feature_importances_}) #create a dataframe with the feature importance values for each variable, labeling them as "importance"
print(feature_importance_df.sort_values(by='Importance', ascending=False)) #sort this dataframe by "importance" so that we can see the top 5 feature importance scores!

    Importance
29    0.040877
89    0.031350
22    0.022515
42    0.021618
92    0.020828
..         ...
40    0.004271
66    0.004002
14    0.002183
84    0.000000
93    0.000000

[95 rows x 1 columns]


According to this output, features 29, 89, 22, 42, and 92 were the most important for classification of this random forest.

#7.

I think that the tradeoff between model fitting time and performance has to do with the complexity of the model as well as the size of the data. It is a common misconception that increased model complexity guarantees better performance. But issues like overfitting and overgeneralization prevent this from being the case. As a result, the choice of algorithm soley depends on the characteristics of the problem at hand, the available resources, and what would be deemed as "acceptable" in terms of performance for the problem.


# PART 1, SVM

In [32]:
#1, #3

svm_start_cpu_time = time.time()
svm_classifier = svm.SVC(kernel='rbf',random_state=10) #create an SVM classifier with default parameters
svm_classifier.fit(X_train, y_train) #fit the SVM classifier onto the training data
svm_y_pred = svm_classifier.predict(X_test) #make predictions on the testing data
svm_end_cpu_time = time.time()

svm_total_cpu_time = svm_end_cpu_time - svm_start_cpu_time

print("SVM Accuracy:", accuracy_score(y_test, svm_y_pred))
print("SVM Precision:", precision_score(y_test, svm_y_pred))
print("SVM Recall:", recall_score(y_test, svm_y_pred))

SVM Accuracy: 0.9643206256109482
SVM Precision: 0.0
SVM Recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


The SVM classifier with default parameters did not seem to make predictions with as much accuracy as we would have liked. According to the accuracy metric, the model correctly predicted 54% instances in the test set. The warning from running this code indicates that there are no predicted positive samples (TP or FP) in the test set. This seems unlikely, but can be explained by the use of highly unbalanced data. The model may be predicting towards the majority class, which is MUCH larger than the minority class.

In [None]:
#2, #3

svm_classifier2 = svm.SVC(kernel='linear',random_state=10) #create an SVM classifier with the kernel = 'linear' parameter
svm_classifier2.fit(X_train, y_train) #fit the SVM classifier onto the training data
svm_y_pred2 = svm_classifier2.predict(X_test) #make predictions on the testing data

print("SVM Accuracy (2):", accuracy_score(y_test, svm_y_pred2))
print("SVM Precision (2):", precision_score(y_test, svm_y_pred2))
print("SVM Recall (2):", recall_score(y_test, svm_y_pred2))


It seems that when we introduce the kernel = 'linear' parameter to the SVM classifier, it does not converge. This means that the model has failed to reach an optimal solution within a certain number of tires.

In [36]:
#4

print("Total SVM CPU time:", svm_total_cpu_time, "seconds")
print("Total SVM CPU time (linear): Forever")

Total SVM CPU time: 0.7720298767089844 seconds
Total SVM CPU time (linear): Forever


According to the CPU times above, it is clear that we would choose the first SVM model. For one, this total CPU time is pretty good! But also, in comparison to the model using kernel='linear', this time is exponentially better because it converges!

# PART 2 - Undersampled Data

In [37]:
#Random Forest - Undersampled Data

rf2_start_cpu_time = time.time()
rf2_classifier = RandomForestClassifier(random_state=10) #create a classifier with default parameters
rf2_classifier.fit(new_X_train, new_y_train) #fit the model on the training set
rf2_end_cpu_time = time.time()

rf2_total_cpu_time = rf2_end_cpu_time - rf2_start_cpu_time

rf2_y_pred = rf2_classifier.predict(new_X_test) #predict values in the test set


Accuracy: 0.8712121212121212
Precision: 0.8985507246376812
Recall: 0.8611111111111112


In [39]:
#SVM - Undersampled Data

svm2_start_cpu_time = time.time()
svm2_classifier = svm.SVC(kernel='rbf',random_state=10) #create an SVM classifier with default parameters
svm2_classifier.fit(new_X_train, new_y_train) #fit the SVM classifier onto the training data
svm2_end_cpu_time = time.time()

svm2_total_cpu_time = svm2_end_cpu_time - svm2_start_cpu_time

svm2_y_pred = svm2_classifier.predict(new_X_test) #make predictions on the testing data


SVM Accuracy: 0.5378787878787878
SVM Precision: 0.6222222222222222
SVM Recall: 0.3888888888888889


In [40]:
#1

print("Total RF CPU time:", rf2_total_cpu_time, "seconds")
print("Total SVM CPU time:", svm2_total_cpu_time, "seconds")

Total RF CPU time: 0.2516050338745117 seconds
Total SVM CPU time: 0.013772964477539062 seconds


When using the undersampled data, the CPU times went down for both models. Although both times are pretty good, the total CPU time for the SVM model using this undersampled data was the lowest at 0.014 seconds. This is great!

In [42]:
#2

#calculate and print the evaluation metrics for the new random forest
print("RF Accuracy:", accuracy_score(new_y_test, rf2_y_pred))
print("RF Precision:", precision_score(new_y_test, rf2_y_pred))
print("RF Recall:", recall_score(new_y_test, rf2_y_pred))

print("")

#calculate and print the evaluation metrics for the new SVM
print("SVM Accuracy:", accuracy_score(new_y_test, svm2_y_pred))
print("SVM Precision:", precision_score(new_y_test, svm2_y_pred))
print("SVM Recall:", recall_score(new_y_test, svm2_y_pred))


RF Accuracy: 0.8712121212121212
RF Precision: 0.8985507246376812
RF Recall: 0.8611111111111112

SVM Accuracy: 0.5378787878787878
SVM Precision: 0.6222222222222222
SVM Recall: 0.3888888888888889


While the SVM model was way faster, it is clear from the evaluation metrics that the random forest model was better at predicting values in the test set. The scores for the random forest are as follows. 87% of the predictions made by the model were correct (accuracy). When the model predicts a positive, it is correct 90% of the time (precision). Lastly, 86% of the actual positives were identified (recall). In my opinion, the CPU time for this random forest, 0.25 seconds, is fine for this dataset. However, if we had a more complex problem or more data, I might choose the SVM for speed.