In [1]:
from ucimlrepo import fetch_ucirepo 
  
# Download the dataset registered under id 365 in the UCI ML repository.
polish_companies_bankruptcy = fetch_ucirepo(id=365)

# The fetched object exposes the predictors and the targets separately
# (as pandas dataframes); unpack both from its `.data` attribute.
bankruptcy_data = polish_companies_bankruptcy.data
X, y = bankruptcy_data.features, bankruptcy_data.targets
  

In [4]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [5]:
# Put features and labels side by side so that a row with any missing value
# is removed from both at once.
df = pd.concat([X, y], axis=1).dropna()

# Split the cleaned frame back into predictors and the 'class' label column.
X = df.drop(columns='class')
y = df['class']

In [6]:
# Temporal hold-out: rows from years 1-4 form the training set, year 5 is the test set.
#
# The labels are selected with the very same boolean masks as the features, so each
# feature row stays paired with its own label through the shared index. Slicing `y`
# by position (first len(X_train) rows / last len(X_test) rows) was only correct if
# the frame happened to be sorted with years 1-4 first and year 5 last.
train_mask = X['year'].isin([1, 2, 3, 4])
test_mask = X['year'] == 5

X_train = X[train_mask]
X_test = X[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

In [7]:
import numpy as np
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [8]:
# Baseline: logistic regression with default settings on every column of X_train.
glm_fit = LogisticRegression()
glm_fit.fit(X_train, y_train)

# Second column of predict_proba -> probability of the second class
# (presumably the positive 'class' == 1 label; confirm via glm_fit.classes_).
logit_pr = glm_fit.predict_proba(X_test)[:, 1]
# Hard 0/1 prediction using a 0.5 probability cut-off.
logit_est = (logit_pr > 0.5).astype(int)

# Share of test rows whose predicted label differs from the true label.
misclassification_error = (logit_est != y_test).mean()
print(misclassification_error)

0.03365225998020455


In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def processSubset(feature_set):
    """Cross-validate a logistic regression restricted to ``feature_set``.

    Uses the module-level ``X_train`` / ``y_train`` and scores the model with
    5-fold cross-validation on the F1 metric.

    Parameters
    ----------
    feature_set : iterable
        Column labels of ``X_train`` to use as predictors.

    Returns
    -------
    dict
        ``"model"``: the LogisticRegression instance handed to cross_val_score
        (this function never calls ``fit`` on it directly),
        ``"features"``: ``feature_set`` as passed in,
        ``"f1_score"``: mean F1 over the 5 folds.
    """
    columns = list(feature_set)
    estimator = LogisticRegression()
    fold_scores = cross_val_score(estimator, X_train[columns], y_train, cv=5, scoring='f1')
    return {"model": estimator, "features": feature_set, "f1_score": fold_scores.mean()}

def forward(predictors):
    """Run one step of forward selection.

    Every column of the module-level ``X_train`` that is not yet in
    ``predictors`` is tried as an additional feature; each candidate set is
    scored with ``processSubset`` and the best-scoring one is returned.

    Parameters
    ----------
    predictors : list
        Column labels already selected in earlier steps.

    Returns
    -------
    pandas.Series
        The row ("model", "features", "f1_score") with the highest ``f1_score``.
    """
    candidates = [col for col in X_train.columns if col not in predictors]

    started = time.time()
    # One cross-validated fit per candidate column added to the current set.
    models = pd.DataFrame([processSubset(predictors + [col]) for col in candidates])

    top_label = models['f1_score'].idxmax()
    best_model = models.loc[top_label]
    finished = time.time()

    print("Processed ", models.shape[0], "models on", len(predictors)+1, "predictors in", (finished-started), "seconds.")

    return best_model

# Forward stepwise selection: grow the predictor set one column at a time and keep
# a step only if its cross-validated F1 beats the previously accepted step.
models_fwd = pd.DataFrame(columns=["f1_score", "model", "features"])
predictors = []

tic = time.time()

for i in range(1, X_train.shape[1] + 1):
    best_model = forward(predictors)

    # The first step is always accepted; later ones must strictly improve the F1-score.
    improved = models_fwd.empty or best_model['f1_score'] > models_fwd.iloc[-1]['f1_score']
    if not improved:
        break  # Break if no improvement

    models_fwd.loc[i] = best_model
    predictors = list(best_model["features"])

toc = time.time()
print("Total elapsed time:", (toc-tic), "seconds.")

Processed  65 models on 1 predictors in 2.5166401863098145 seconds.
Processed  64 models on 2 predictors in 9.600405931472778 seconds.
Processed  63 models on 3 predictors in 9.993138790130615 seconds.
Processed  62 models on 4 predictors in 12.689826965332031 seconds.
Processed  61 models on 5 predictors in 13.63638710975647 seconds.
Processed  60 models on 6 predictors in 13.936256885528564 seconds.
Processed  59 models on 7 predictors in 15.975390195846558 seconds.
Processed  58 models on 8 predictors in 12.820421934127808 seconds.
Processed  57 models on 9 predictors in 14.159792184829712 seconds.
Total elapsed time: 105.34145402908325 seconds.


In [11]:
# Refit the model chosen by forward selection on the full training set and score it
# on the held-out test set.
#
# Both the feature list and the estimator are taken from the LAST accepted step.
# Looking the estimator up under the hard-coded label 2 raised a KeyError whenever
# selection stopped after a single step, and paired the final feature set with an
# estimator object recorded for a different step.
best_step = models_fwd.iloc[-1]
best_features = best_step['features']
best_model = best_step['model'].fit(X_train[best_features], y_train)

predictions = best_model.predict(X_test[best_features])

# Test-set metrics for the selected model.
f1 = f1_score(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)
recall = recall_score(y_test, predictions)
precision = precision_score(y_test, predictions)

In [13]:
# Report the test-set metrics of the selected model, one per line.
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy: 0.9574397888485648
Precision: 0.09090909090909091
Recall: 0.029411764705882353
F1 Score: 0.04444444444444444
