In [41]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report, precision_recall_curve, auc, f1_score
from pathlib import Path
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV 
from scipy.stats import randint, uniform
import pickle

# Directory from which the train.csv / test.csv splits are read below.
# The path is relative, so it resolves against the notebook's working directory.
DATA_PATH = Path("data/processed")

In [None]:
# Load the training split. The target is `into_default`; every remaining column
# except `Unnamed: 0` (presumably a saved row index -- confirm) is a predictor.
df = pd.read_csv(DATA_PATH / 'train.csv')
y = df['into_default']
X = df.drop(['Unnamed: 0', 'into_default'], axis='columns')

# Logistic Regression

In [43]:
# Baseline logit on all predictors. sm.Logit does not add an intercept on its
# own, so a constant column is added to the design matrix first.
X_const = sm.add_constant(X)

# Build the model specification, then fit it (the summary reports Method: MLE)
logit_model = sm.Logit(endog=y, exog=X_const)
result = logit_model.fit()

# Coefficient table and fit statistics
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.228838
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:           into_default   No. Observations:                33986
Model:                          Logit   Df Residuals:                    33974
Method:                           MLE   Df Model:                           11
Date:                Thu, 29 May 2025   Pseudo R-squ.:                 0.03875
Time:                        15:59:24   Log-Likelihood:                -7777.3
converged:                       True   LL-Null:                       -8090.8
Covariance Type:            nonrobust   LLR p-value:                2.246e-127
                                                  coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------
const                                     

### General To Specific Approach 

In [44]:
# General-to-specific, step 1: refit without the predictor that had the
# highest p-value in the previous fit.
y = df['into_default']
X = df.drop(columns=[
    'Unnamed: 0',
    'into_default',
    'Wskaźnik rotacji kapitału obrotowego (x)',
])
X_const = sm.add_constant(X)

# Refit the reduced logit
logit_model = sm.Logit(endog=y, exog=X_const)
result = logit_model.fit()

# Coefficient table and fit statistics for the reduced model
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.228838
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:           into_default   No. Observations:                33986
Model:                          Logit   Df Residuals:                    33975
Method:                           MLE   Df Model:                           10
Date:                Thu, 29 May 2025   Pseudo R-squ.:                 0.03875
Time:                        15:59:26   Log-Likelihood:                -7777.3
converged:                       True   LL-Null:                       -8090.8
Covariance Type:            nonrobust   LLR p-value:                2.811e-128
                                                  coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------
const                                     

In [45]:
# General-to-specific, step 2: additionally remove the predictor that had the
# highest p-value in the step-1 fit.
y = df['into_default']
X = df.drop(columns=[
    'Unnamed: 0',
    'into_default',
    'Wskaźnik rotacji kapitału obrotowego (x)',
    'Obrót aktywów bieżących (x)',
])
X_const = sm.add_constant(X)

# Refit the reduced logit
logit_model = sm.Logit(endog=y, exog=X_const)
result = logit_model.fit()

# Coefficient table and fit statistics for the reduced model
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.228841
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:           into_default   No. Observations:                33986
Model:                          Logit   Df Residuals:                    33976
Method:                           MLE   Df Model:                            9
Date:                Thu, 29 May 2025   Pseudo R-squ.:                 0.03874
Time:                        15:59:28   Log-Likelihood:                -7777.4
converged:                       True   LL-Null:                       -8090.8
Covariance Type:            nonrobust   LLR p-value:                3.600e-129
                                                  coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------
const                                     

In [46]:
# General-to-specific, step 3: additionally remove the predictor that had the
# highest p-value in the step-2 fit. The `result` produced here is the one
# evaluated in the AUC-ROC cells that follow.
y = df['into_default']
X = df.drop(columns=[
    'Unnamed: 0',
    'into_default',
    'Wskaźnik rotacji kapitału obrotowego (x)',
    'Obrót aktywów bieżących (x)',
    'Wskaźnik rotacji aktywów (x)',
])
X_const = sm.add_constant(X)

# Refit the reduced logit
logit_model = sm.Logit(endog=y, exog=X_const)
result = logit_model.fit()

# Coefficient table and fit statistics for the reduced model
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.228845
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:           into_default   No. Observations:                33986
Model:                          Logit   Df Residuals:                    33977
Method:                           MLE   Df Model:                            8
Date:                Thu, 29 May 2025   Pseudo R-squ.:                 0.03872
Time:                        15:59:30   Log-Likelihood:                -7777.5
converged:                       True   LL-Null:                       -8090.8
Covariance Type:            nonrobust   LLR p-value:                4.531e-130
                                                  coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------
const                                     

### AUC-ROC Calculation 

In [47]:
# In-sample check: score the same design matrix the model was fitted on
y_pred_prob = result.predict(X_const)

# Area under the ROC curve on the training data
auc_roc = roc_auc_score(y_true=y, y_score=y_pred_prob)

print(f'AUC-ROC: {auc_roc:.4f}')

AUC-ROC: 0.6946


### Testing the Model 

In [48]:
# Load the hold-out split and rebuild the design matrix with the same
# predictors that were dropped in the final general-to-specific step.
df_test = pd.read_csv(DATA_PATH /'test.csv')
X_test = df_test.drop(columns=['Unnamed: 0', 'into_default', 'Wskaźnik rotacji kapitału obrotowego (x)', 
                          'Obrót aktywów bieżących (x)', 'Wskaźnik rotacji aktywów (x)'])
y_test = df_test['into_default']

# has_constant='add' always inserts the intercept column. The default ('skip')
# silently omits it when any existing column happens to be constant in this
# split, which would shift every coefficient onto the wrong column.
X_test_const = sm.add_constant(X_test, has_constant='add')

# predict() on a non-formula model does not realign columns by name, so put
# the test columns in the training design's order. A column missing from the
# test file raises KeyError here instead of producing silently wrong scores.
X_test_const = X_test_const[X_const.columns]

In [49]:
# Out-of-sample check: score the hold-out design matrix with the fitted logit
y_test_pred_prob = result.predict(X_test_const)

# Area under the ROC curve on the test data
auc_roc = roc_auc_score(y_true=y_test, y_score=y_test_pred_prob)

print(f'AUC-ROC: {auc_roc:.4f}')

AUC-ROC: 0.6906


# XGBoost

In [50]:
# Reload the training split so XGBoost starts from the full predictor set
# (the logistic-regression section rebound X to a reduced set of columns).
df = pd.read_csv(DATA_PATH / 'train.csv')
y = df['into_default']
X = df.drop(['Unnamed: 0', 'into_default'], axis='columns')

In [51]:
# Polish -> English feature names. Keys have to match the CSV headers exactly,
# including the 'własnyj' spelling in the first key; labels that do not match
# a column are ignored by rename().
column_translation = {
    'Aktywa razem/Kapitał własnyj (%)': 'Total assets / Equity (%)',
    'Wskaźnik rotacji kapitału obrotowego (x)': 'Working capital turnover ratio (x)',
    'Kapitał obrotowy': 'Working capital',
    'Obrót aktywów bieżących (x)': 'Current assets turnover (x)',
    'Gotówka netto': 'Net cash',
    'Stopa zwrotu z kapitału własnego (ROE) (%)': 'Return on equity (ROE) (%)',
    'Kapitał własny ogółem': 'Total equity',
    'Wskaźnik rotacji aktywów (x)': 'Asset turnover ratio (x)',
    'Operacyjny wskaźnik rentowności aktywów (%)': 'Operating return on assets (%)',
    'Zwrot z kapitału zaangażowanego (%)': 'Return on invested capital (%)',
    'Wiek firmy (lata)': 'Company age (years)'
}

# Rebind X to the renamed frame rather than mutating it in place
X = X.rename(columns=column_translation)

# Show the resulting feature names
print(X.columns)

Index(['Total assets / Equity (%)', 'Working capital turnover ratio (x)',
       'Working capital', 'Current assets turnover (x)', 'Net cash',
       'Return on equity (ROE) (%)', 'Total equity',
       'Asset turnover ratio (x)', 'Operating return on assets (%)',
       'Return on invested capital (%)', 'Company age (years)'],
      dtype='object')


### Hyperparameter Tuning - Grid Search

In [52]:
# Exhaustive grid: 3 values for each of 5 hyperparameters -> 3**5 = 243 candidates
param_grid_xgb = {
    'n_estimators': [10, 50, 100],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Base classifier with a fixed seed. The deprecated `use_label_encoder`
# argument is no longer passed, which matches the classifier built for the
# random search further down.
xgb = XGBClassifier(random_state=37, eval_metric='logloss', verbosity=0)

# 5-fold cross-validated grid search ranked by ROC-AUC; refit=True retrains the
# best configuration on the whole of (X, y) and exposes it as best_estimator_.
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring='roc_auc',  
    cv=5,
    verbose=1,
    n_jobs=-1,
    refit=True  
)

# Fit on the training features and target as loaded (no resampling is applied
# anywhere in this notebook)
grid_search_xgb.fit(X, y)

# Report the winning configuration and its mean cross-validated ROC-AUC
print("Best parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best ROC-AUC score for XGBoost:", grid_search_xgb.best_score_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters for XGBoost: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.8}
Best ROC-AUC score for XGBoost: 0.8042725923551991


In [53]:
# Keep the refitted grid-search winner and its in-sample scores.
# Column 1 of predict_proba is the second class in classes_ (presumably
# into_default == 1 -- confirm).
best_xgb = grid_search_xgb.best_estimator_
train_proba_grid = best_xgb.predict_proba(X)
y_pred_proba_xgb = train_proba_grid[:, 1]

### Hyperparameter Tuning - Random Search

In [54]:
# Sampling distributions for the random search.
# scipy's uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from low up to high - 1.
param_distributions_xgb = {
    'n_estimators': randint(10, 100),        # 10 .. 99 trees
    'max_depth': randint(3, 20),             # 3 .. 19
    'learning_rate': uniform(0.01, 0.3),     # 0.01 .. 0.31
    'subsample': uniform(0.6, 0.4),          # 0.6 .. 1.0
    'colsample_bytree': uniform(0.6, 0.4),   # 0.6 .. 1.0
    'gamma': uniform(0, 5),                  # 0 .. 5
    'reg_alpha': uniform(0, 10),             # 0 .. 10
    'reg_lambda': uniform(0, 10)             # 0 .. 10
}

# Base classifier with a fixed seed
xgb = XGBClassifier(eval_metric='logloss', random_state=37)

# 200 sampled configurations, each scored by 5-fold cross-validated ROC-AUC
random_search_xgb = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions_xgb,
    n_iter=200,
    cv=5,
    scoring='roc_auc',
    random_state=37,
    n_jobs=-1,
    verbose=2
)

# Run the search on the training features and target
random_search_xgb.fit(X, y)

# Report the winning configuration and its mean cross-validated ROC-AUC
print("Best parameters for XGBoost (Random Search):", random_search_xgb.best_params_)
print("Best ROC-AUC for XGBoost (Random Search):", random_search_xgb.best_score_)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best parameters for XGBoost (Random Search): {'colsample_bytree': 0.6169224222124968, 'gamma': 0.6798372968803684, 'learning_rate': 0.07940525119049888, 'max_depth': 8, 'n_estimators': 77, 'reg_alpha': 7.052251765229659, 'reg_lambda': 8.414441820976549, 'subsample': 0.8902303305151654}
Best ROC-AUC for XGBoost (Random Search): 0.8034535741784572


In [55]:
# Keep the refitted random-search winner and its in-sample scores.
# Column 1 of predict_proba is the second class in classes_ (presumably
# into_default == 1 -- confirm).
best_random_xgb = random_search_xgb.best_estimator_
train_proba_random = best_random_xgb.predict_proba(X)
y_pred_proba_random_xgb = train_proba_random[:, 1]

### Testing the Models

In [56]:
# Load the hold-out split with the full predictor set, then apply the same
# Polish -> English renaming as for training so the feature names line up
# with the ones the XGBoost models were fitted on.
df_test = pd.read_csv(DATA_PATH / 'test.csv')
y_test = df_test['into_default']
X_test = df_test.drop(['Unnamed: 0', 'into_default'], axis='columns')
X_test = X_test.rename(columns=column_translation)

In [57]:
# Hold-out ROC-AUC of the grid-search winner
# (this rebinds y_pred_proba_xgb from training scores to test scores)
test_proba_grid = best_xgb.predict_proba(X_test)
y_pred_proba_xgb = test_proba_grid[:, 1]

auc_roc = roc_auc_score(y_true=y_test, y_score=y_pred_proba_xgb)
print(f'AUC-ROC: {auc_roc:.4f}')

AUC-ROC: 0.7849


In [58]:
# Hold-out ROC-AUC of the random-search winner
# (this rebinds y_pred_proba_random_xgb from training scores to test scores)
test_proba_random = best_random_xgb.predict_proba(X_test)
y_pred_proba_random_xgb = test_proba_random[:, 1]
auc_roc = roc_auc_score(y_true=y_test, y_score=y_pred_proba_random_xgb)
print(f'AUC-ROC: {auc_roc:.4f}')

AUC-ROC: 0.7879


# Saving Models

In [59]:
# Make sure the output directory exists; open(..., "wb") does not create
# missing parent directories.
Path("models").mkdir(parents=True, exist_ok=True)

# Logistic Regression
# Persist the fitted results object (`result`), not `logit_model`: the latter
# is only the unfitted sm.Logit specification and carries no estimated
# coefficients, so it could not be used for prediction after loading.
with open("models/logistic_model.pkl", "wb") as f:
    pickle.dump(result, f)

# XGBoost
# Persist the random-search winner, the better of the two on the test split.
# NOTE(review): only unpickle these files from trusted locations --
# pickle.load can execute arbitrary code.
with open("models/xgb_model.pkl", "wb") as f:
    pickle.dump(best_random_xgb, f)