# Tutorial

### Instructions

In this tutorial, you will create a classification model. The data were collected from the Taiwan Economic Journal for the years 1999 to 2009. Company bankruptcy was defined based on the business regulations of the Taiwan Stock Exchange. Apply what you have learnt and come up with the optimal model to predict whether a company goes bankrupt.

Bonus: Determine the key features that influence your model.

PS: If you find a technique/method that was not shared in class but that you believe is useful, you can and should use it. No points will be deducted for trying! Do not delete your trial scripts; they will be important for understanding how you improved on your previous attempts.



Given your model scores, why did you choose this specific model to present to us? Is it because of the precision score, the accuracy score, the t-score, or the F1 score?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, r2_score, f1_score, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn import tree

from scipy.stats import zscore
from scipy import optimize

In [None]:
# Load the raw dataset from the working directory into a DataFrame.
df = pd.read_csv("data.csv")
# Last expression of the cell: the notebook displays (n_rows, n_columns).
df.shape

In [None]:
# Preview the first five rows (5 is also the default for head()).
df.head(n=5)

In [None]:
# List every column label. The feature names referenced later in this notebook
# (e.g. ' Working Capital to Total Assets') start with a leading space, while
# the target 'Bankrupt?' does not, so copy labels exactly as shown here.
df.columns

In [None]:
# First-pass Altman-style Z-score assembled from the ratios this dataset offers.
# Core ratios for Z-score calculation
df['Z_score'] = (
    1.2 * df[' Working Capital to Total Assets'] +
    1.4 * df[' Retained Earnings to Total Assets'] +
    3.3 * df[' Net Income to Total Assets'] +  # Approximation for EBIT/Total Assets
    0.6 * df[' Net worth/Assets'] +  # Substitute for Market Value of Equity/Book Liabilities
    1.0 * df[' Revenue Per Share (Yuan ¥)'] / df[' Total Asset Growth Rate']  # Scaled for Sales/Total Assets
)

# Model inputs, listed once so the feature matrix and the outlier scan agree.
feature_columns = [
    'Z_score',
    ' Debt ratio %',
    ' Cash Flow to Total Assets',
    " Net Income to Stockholder's Equity",
]

# Target variable: Bankrupt? (1 = bankrupt, 0 = not bankrupt)
x = df[feature_columns]
y = df['Bankrupt?']

from scipy.stats import zscore

# The last Z-score term divides by ' Total Asset Growth Rate'. A zero there
# produces +/-inf (or NaN for 0/0), and a single non-finite entry makes the
# column mean/std non-finite, so every standardized value in that column turns
# into NaN and `> 3` never fires. Map inf to NaN on a copy (df itself is left
# untouched) and let zscore ignore NaN when computing mean and std.
finite_features = df[feature_columns].replace([np.inf, -np.inf], np.nan)

# Calculate Z-scores for each feature
z_scores = np.abs(zscore(finite_features.to_numpy(dtype=float), nan_policy='omit'))

# Set a threshold (e.g., 3 standard deviations); NaN entries compare False
outliers = (z_scores > 3)

# Print outlier positions as (row positions, feature-column positions)
print(np.where(outliers))

In [None]:
# Data-quality check: per-column counts of missing and infinite entries.
nan_counts = df.isna().sum()
print(nan_counts)  # NaN values per column

inf_counts = np.isinf(df).sum()
print(inf_counts)  # Inf values per column

In [None]:
# Impute missing entries with the mean of their own column.
column_means = df.mean()
df.fillna(column_means, inplace=True)

# Columns that feed the Z-score formula. Exact zeros are swapped for a tiny
# epsilon so the division by ' Total Asset Growth Rate' in the Z-score
# calculation never hits a zero denominator.
z_input_columns = [
    ' Working Capital to Total Assets',
    ' Retained Earnings to Total Assets',
    ' Net Income to Total Assets',
    ' Net worth/Assets',
    ' Revenue Per Share (Yuan ¥)',
    ' Total Asset Growth Rate',
]
df[z_input_columns] = df[z_input_columns].replace(0, 1e-10)



In [None]:
# Recompute the Altman-style Z-score on the current contents of df, one named
# term per ratio.
working_capital_term = 1.2 * df[' Working Capital to Total Assets']
retained_earnings_term = 1.4 * df[' Retained Earnings to Total Assets']
# Net income stands in for EBIT / Total Assets.
net_income_term = 3.3 * df[' Net Income to Total Assets']
# Net worth / assets stands in for Market Value of Equity / Book Liabilities.
net_worth_term = 0.6 * df[' Net worth/Assets']
# Revenue per share over asset growth stands in for Sales / Total Assets.
sales_term = 1.0 * df[' Revenue Per Share (Yuan ¥)'] / df[' Total Asset Growth Rate']

# Summed left to right in the same order as the weights are listed above.
df['Z_score'] = (
    working_capital_term
    + retained_earnings_term
    + net_income_term
    + net_worth_term
    + sales_term
)

# Feature matrix and target (Bankrupt?: 1 = bankrupt, 0 = not bankrupt).
feature_columns = [
    'Z_score',
    ' Debt ratio %',
    ' Cash Flow to Total Assets',
    " Net Income to Stockholder's Equity",
]
x = df[feature_columns]
y = df['Bankrupt?']

In [None]:
# Check for Inf values in the Z-score column
print(np.isinf(df['Z_score']).sum())

# Optionally, clip values that are still too large (if desired)
df['Z_score'] = df['Z_score'].clip(lower=-1e10, upper=1e10)

# `x` was selected from df with a list of columns before the clip, which gives
# a copy, so the clipped values would otherwise never reach the model.
# Re-select the feature matrix so training uses the clipped Z-score.
x = df[['Z_score', ' Debt ratio %', ' Cash Flow to Total Assets', " Net Income to Stockholder's Equity"]]

In [None]:
# Train-test split
# Stratify on the target so both splits keep the same class proportions
# (bankruptcies are presumably the rare class -- confirm with y.value_counts()).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

# Feature scaling: fit on the training split only, then apply to the test split,
# so no test-set statistics leak into training.
scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='roc_auc', verbose=2, n_jobs=-1)
# The forest is trained on the unscaled features; the scaled copies are only
# used by the logistic-regression baseline below.
grid_search.fit(x_train, y_train)

# Best model
best_rf = grid_search.best_estimator_

# Baseline for comparison: L2-penalised logistic regression on scaled features.
# regularization constant (strength)
REG_CONST = 0.01

# C := inverse of regularization strength, so C = 1 / 0.01 = 100 (weak penalty)
model = LogisticRegression(penalty='l2', C=1./REG_CONST, max_iter=300)
model.fit(x_train_scaled, y_train)

baseline_pred = model.predict(x_test_scaled)
baseline_prob = model.predict_proba(x_test_scaled)[:, 1]
baseline_fpr, baseline_tpr, _ = roc_curve(y_test, baseline_prob)
baseline_auc = auc(baseline_fpr, baseline_tpr)
baseline_f1 = f1_score(y_test, baseline_pred)
print(f'Logistic Regression baseline - AUC: {baseline_auc:.3f}, F1: {baseline_f1:.3f}')

# Predictions and evaluation of the tuned Random Forest
y_pred = best_rf.predict(x_test)
y_prob = best_rf.predict_proba(x_test)[:, 1]  # Get probabilities for ROC

# Computed once and reused for both the printout and the plot below.
cm = confusion_matrix(y_test, y_pred)

print("Best Hyperparameters:", grid_search.best_params_)
print("\nConfusion Matrix:")
print(cm)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.grid()
plt.show()

# Plot the confusion matrix; labels come from the model that produced y_pred.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_rf.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Print AUC and F1 for the tuned Random Forest
print(f'AUC Score: {roc_auc:.3f}')
print(f'F1 Score: {f1:.3f}')