In [None]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [None]:
# Directory (relative path) that holds the input CSV read in the next cell.
data_path = "data/"

In [None]:
# Load the raw dataset from `data_path`.
df = pd.read_csv(os.path.join(data_path, 'data.csv'))

# Strip padding whitespace from the header names so that the plain column
# names used in later cells (e.g. `important_cols`) resolve. This is a no-op
# when the header is already clean.
# NOTE(review): this CSV's header names appear to carry a leading space --
# confirm against data.csv.
df.columns = df.columns.str.strip()

# Cast the target column to bool, then print the frame's dtype/null summary.
df['Bankrupt?'] = df['Bankrupt?'].astype(bool)
df.info()

In [None]:
# Display summary statistics of the raw frame.
df.describe()

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Pick the target and the features by name instead of by position, so the
# result stays correctly labelled even if 'Bankrupt?' is not the first column.
target_col = 'Bankrupt?'
feature_cols = df.columns.drop(target_col)

# Standardize every feature column.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed further down, so test-row statistics influence the scaled
# training features. Consider fitting the scaler on the training rows only.
scaler = StandardScaler()
X = scaler.fit_transform(df[feature_cols])
y = df[target_col].to_numpy()

# Labelled frame for exploration: target first (stored as float, matching the
# feature dtype), followed by the standardized features.
df_normalized = pd.DataFrame(X, columns=feature_cols)
df_normalized.insert(0, target_col, y.astype(float))

In [None]:
# Display summary statistics of the standardized frame built in the previous cell.
df_normalized.describe()

In [None]:
# Heatmap of the pairwise correlations between all columns of the
# standardized frame, drawn on an explicitly created 12x12 axes.
fig, ax = plt.subplots(figsize=(12, 12))
corr_all = df_normalized.corr(numeric_only=True)
sns.heatmap(corr_all, ax=ax)
ax.set_title('Correlation plot between columns')
plt.show()

In [None]:
# Fit a 10-component PCA on the standardized frame.
# random_state pins any randomized SVD solver scikit-learn may select for this
# problem size, so the components are reproducible across Restart & Run All.
# NOTE(review): df_normalized still contains the 'Bankrupt?' target column, so
# the target takes part in the decomposition -- confirm this is intended.
pca = PCA(n_components=10, random_state=42)
pca.fit(df_normalized)

In [None]:
# Loadings table: one row per input column, one column per principal component.
# Rows are labelled from df_normalized because that is the frame PCA was
# fitted on; the component columns are named PC1..PCn for readability.
pd.DataFrame(
    pca.components_.T,
    index=df_normalized.columns,
    columns=[f'PC{i + 1}' for i in range(pca.n_components_)],
)

In [None]:
# Target column plus the subset of feature columns used for the focused
# correlation plot in the following cell.
# NOTE(review): 'Working capitcal Turnover Rate' is spelled exactly as given;
# presumably it mirrors the CSV header -- confirm before "fixing" the typo.
important_cols = [
    "Bankrupt?",
    "Current Liability to Equity",
    "Net Income to Stockholder's Equity",
    "Debt ratio %",
    "ROA(A) before interest and % after tax",
    "Persistent EPS in the Last Four Seasons",
    "Equity to Long-term Liability",
    "Current Assets/Total Assets",
    "Working Capital to Total Assets",
    "Current Liability to Current Assets",
    "Pre-tax net Interest Rate",
    "Cash Reinvestment %",
    "CFO to Assets",
    "Net worth/Assets",
    "Operating Gross Margin",
    "Gross Profit to Sales",
    "Working capitcal Turnover Rate",
    "Cash Flow to Sales",
    "After-tax Net Profit Growth Rate",
    "Net Income to Total Assets",
    "Net Value Per Share (A)",
]

In [None]:
# Annotated heatmap of the pairwise correlations restricted to `important_cols`,
# with each cell showing the coefficient to two decimals.
fig, ax = plt.subplots(figsize=(12, 12))
corr_important = df_normalized[important_cols].corr(numeric_only=True)
sns.heatmap(corr_important, annot=True, fmt='.2f', ax=ax)
ax.set_title('Correlation plot between columns')
plt.show()

In [None]:
# Hold out 10% of the rows for evaluation.
# random_state makes the split (and every metric computed from it) reproducible
# across Restart & Run All; stratify=y keeps the class proportions the same in
# the train and test parts, which matters if one class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

## Random Forest Classifier

In [None]:
# Random forest with default hyper-parameters. random_state fixes the bootstrap
# sampling and feature sub-sampling so the fitted model and its metrics are
# reproducible from run to run.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

In [None]:
# Hard class predictions of the random forest on the held-out rows.
y_pred = rf_classifier.predict(X_test)

# normalize='pred' divides each column by the number of samples predicted as
# that class, so every predicted-label column sums to 1.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, normalize='pred')
disp = ConfusionMatrixDisplay(cm).plot()
plt.title("Random Forest Confusion Matrix")
plt.show()

In [None]:
# predict_proba returns one column per class; column 1 is kept as the score
# for the ROC curve.
proba_by_class = rf_classifier.predict_proba(X_test)
y_proba = proba_by_class[:, 1]

RocCurveDisplay.from_predictions(y_test, y_proba)
plt.title("Random Forest ROC Curve")
plt.show()

In [None]:
# Text report of the per-class metrics for the predictions currently in `y_pred`.
print(classification_report(y_true=y_test, y_pred=y_pred))

## Support Vector Machine

In [None]:
# Support vector classifier with default kernel settings.
# probability=True enables predict_proba (needed for the ROC curve); it is
# obtained through an internal cross-validated calibration that shuffles the
# data, so random_state is fixed to make the probabilities reproducible.
svc = SVC(probability=True, random_state=42)
svc.fit(X_train, y_train)

In [None]:
# Hard class predictions of the SVM on the held-out rows.
y_pred = svc.predict(X_test)

# normalize='pred' divides each column by the number of samples predicted as
# that class, so every predicted-label column sums to 1.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, normalize='pred')
disp = ConfusionMatrixDisplay(cm).plot()
plt.title("Support Vector Machine Confusion Matrix")
plt.show()

In [None]:
# predict_proba returns one column per class; column 1 is kept as the score
# for the ROC curve.
proba_by_class = svc.predict_proba(X_test)
y_proba = proba_by_class[:, 1]

RocCurveDisplay.from_predictions(y_test, y_proba)
plt.title("Support Vector Machine ROC Curve")
plt.show()

In [None]:
# Text report of the per-class metrics for the predictions currently in `y_pred`.
print(classification_report(y_true=y_test, y_pred=y_pred))

## Logistic Regression

In [None]:
# Logistic regression with the iteration cap raised to 1000. fit() returns the
# estimator itself, so construction and training are chained.
logr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class predictions on the held-out rows, used by the following cells.
y_pred = logr.predict(X_test)

In [None]:
# normalize='pred' divides each column by the number of samples predicted as
# that class, so every predicted-label column sums to 1.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, normalize='pred')
disp = ConfusionMatrixDisplay(cm).plot()
plt.title("Logistic Regression Confusion Matrix")
plt.show()

In [None]:
# predict_proba returns one column per class; column 1 is kept as the score
# for the ROC curve.
proba_by_class = logr.predict_proba(X_test)
y_proba = proba_by_class[:, 1]

RocCurveDisplay.from_predictions(y_test, y_proba)
plt.title("Logistic Regression ROC Curve")
plt.show()

In [None]:
# Text report of the per-class metrics for the predictions currently in `y_pred`.
print(classification_report(y_true=y_test, y_pred=y_pred))

Of the three models, logistic regression seems to perform best.

# XAI