# CS559 Project

Import Dataset

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from itertools import combinations
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

file_path = './Desktop/train_data.csv'
data = pd.read_csv(file_path)

X = data.drop(columns=['Index', 'Bankrupt?'])
y = data['Bankrupt?']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

data.head()

Unnamed: 0,Index,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,0,0,0.450397,0.504034,0.506986,0.59464,0.59464,0.998906,0.797293,0.809239,...,0.780554,0.004919,0.623634,0.594641,0.838869,0.279036,0.026788,0.565144,1,0.032464
1,1,0,0.530005,0.572885,0.574763,0.605695,0.605558,0.999058,0.797512,0.809399,...,0.819963,0.005968,0.624171,0.60569,0.841869,0.27904,0.026801,0.565205,1,0.032442
2,2,0,0.57115,0.620148,0.624177,0.612275,0.612282,0.999163,0.797654,0.809533,...,0.839128,0.006022,0.625306,0.612271,0.843294,0.278927,0.026816,0.565276,1,0.033034
3,3,0,0.483401,0.556694,0.536164,0.602445,0.602445,0.999035,0.797458,0.80938,...,0.806477,0.002177,0.62161,0.602444,0.841891,0.293391,0.027063,0.56619,1,0.015406
4,4,0,0.510359,0.537287,0.552546,0.600023,0.600023,0.999009,0.797406,0.809313,...,0.799277,0.001124,0.623993,0.600019,0.840313,0.279878,0.02688,0.565549,1,0.028858


Check Assumptions

In [4]:
def calculate_vif(X):
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif

vif = calculate_vif(pd.DataFrame(X_scaled, columns=X.columns))
print(vif)

  vif = 1. / (1. - r_squared_i)


                                            variables           VIF
0    ROA(C) before interest and depreciation befor...  7.063760e+01
1              ROA(A) before interest and % after tax  5.067655e+01
2    ROA(B) before interest and depreciation after...  1.071441e+02
3                              Operating Gross Margin  7.183786e+07
4                         Realized Sales Gross Margin  1.097341e+03
..                                                ...           ...
90                                Liability to Equity  4.656354e+02
91                 Degree of Financial Leverage (DFL)  1.010536e+00
92   Interest Coverage Ratio (Interest expense to ...  1.011676e+00
93                                    Net Income Flag           NaN
94                                Equity to Liability  3.251059e+00

[95 rows x 2 columns]


  return 1 - self.ssr/self.uncentered_tss


Define models and k

In [8]:
k_values = range(3, 11)
base_models = [
    LogisticRegression(max_iter=1000),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    SVC(probability=True) 
]

Iterate through k and m

In [9]:
model_combinations = sum([list(combinations(base_models, i)) for i in range(1, len(base_models)+1)], [])

best_k = None
best_model_combination = None
best_accuracy = 0

for k in k_values:
    for model_comb in model_combinations:
        # Initialize KFold
        kf = KFold(n_splits=k, shuffle=True, random_state=42)
        splits = list(kf.split(X_scaled))
        
        # Number of models in the current combination
        num_models = len(model_comb)
        
        # Initialize an array to store predictions for stacking
        stacked_predictions = np.zeros((X_scaled.shape[0], num_models))
        
        # Iterate over each fold
        for fold_index, (train_index, test_index) in enumerate(splits):
            X_train, X_test = X_scaled[train_index], X_scaled[test_index]
            y_train, y_test = y[train_index], y[test_index]
            
            # Train and test each model
            for model_index, model in enumerate(model_comb):
                model.fit(X_train, y_train)
                preds = model.predict(X_test)
                stacked_predictions[test_index, model_index] = preds
        
        # Define meta-learner
        meta_learner = LogisticRegression(max_iter=1000)
        
        # Define stacking model
        estimators = [(f'model_{i}', model) for i, model in enumerate(model_comb)]
        
        stacking_model = StackingClassifier(estimators=estimators, final_estimator=meta_learner)
        
        # Split data for training and testing the stacking model
        X_train_meta, X_test_meta, y_train_meta, y_test_meta = train_test_split(stacked_predictions, y, test_size=0.2, random_state=42)
        
        # Fit the stacking model
        stacking_model.fit(X_train_meta, y_train_meta)
        
        # Generate final predictions
        final_predictions = stacking_model.predict(X_test_meta)
        
        # Evaluate the final model
        accuracy = accuracy_score(y_test_meta, final_predictions)
        print(f'k={k}, Models={estimators}, Accuracy={accuracy}')
        
        # Update best k value and model combination
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_k = k
            best_model_combination = model_comb

print(f'Best k value: {best_k} with accuracy: {best_accuracy}')
print('Best model combination:', [(f'model_{i}', model.__class__.__name__) for i, model in enumerate(best_model_combination)])

k=3, Models=[('model_0', LogisticRegression(max_iter=1000))], Accuracy=0.9672977624784854
k=3, Models=[('model_0', DecisionTreeClassifier())], Accuracy=0.9672977624784854
k=3, Models=[('model_0', RandomForestClassifier())], Accuracy=0.9672977624784854
k=3, Models=[('model_0', GradientBoostingClassifier())], Accuracy=0.9672977624784854
k=3, Models=[('model_0', SVC(probability=True))], Accuracy=0.9672977624784854
k=3, Models=[('model_0', LogisticRegression(max_iter=1000)), ('model_1', DecisionTreeClassifier())], Accuracy=0.9672977624784854
k=3, Models=[('model_0', LogisticRegression(max_iter=1000)), ('model_1', RandomForestClassifier())], Accuracy=0.9690189328743546
k=3, Models=[('model_0', LogisticRegression(max_iter=1000)), ('model_1', GradientBoostingClassifier())], Accuracy=0.9698795180722891
k=3, Models=[('model_0', LogisticRegression(max_iter=1000)), ('model_1', SVC(probability=True))], Accuracy=0.9672977624784854
k=3, Models=[('model_0', DecisionTreeClassifier()), ('model_1', Rand

Possible changes: Class imbalance, bias & variance, different models? 