# Libraries

In [32]:
import pandas as pd
import numpy as np

# Dataset

In [46]:
df_original = pd.read_csv("Data/train.csv").drop("Unnamed: 0",axis=1)
df = df_original.copy()

In [47]:
df.head()

Unnamed: 0,id,stmt_date,HQ_city,legal_struct,ateco_sector,def_date,fs_year,asst_intang_fixed,asst_tang_fixed,asst_fixed_fin,...,inc_extraord,taxes,profit,days_rec,ebitda,roa,roe,wc_net,margin_fin,cf_operations
0,520288,2011-12-31,28.0,SRL,14.0,NaT,2011,67537.0,1137566.0,1908.0,...,-309.0,-80959.0,-81288.0,,6318.0,-3.81,-28.03,496258.0,-917029.0,-849.0
1,520288,2008-12-31,28.0,SRL,14.0,NaT,2008,256438.0,1181416.0,860.0,...,-678.0,-94622.0,-107382.0,,46088.0,-2.76,,97952.0,,-3881.0
2,520288,2009-12-31,28.0,SRL,14.0,NaT,2009,194046.0,1152014.0,860.0,...,4224.0,-74235.0,-77819.0,,67611.0,-2.17,,-210671.0,,32618.0
3,520288,2012-12-31,28.0,SRL,14.0,NaT,2012,15195.0,1116938.0,2023.0,...,3634.0,-250786.0,-250786.0,,-161478.0,-12.99,,367892.0,-1094962.0,-168907.0
4,520288,2007-12-31,28.0,SRL,14.0,NaT,2007,126603.0,1127807.0,620.0,...,820.0,92192.0,36733.0,,153060.0,6.2,52.43,-317007.0,-1184970.0,80039.0


## Target Labels

In [48]:
df['default_year'] = pd.to_datetime(df['def_date'], format="%d/%m/%Y", errors='coerce').dt.year

df['target'] = ((df['default_year'].notna()) & (df['fs_year'] + 1 >= df['default_year'])).astype(int)

# Features selection

## Null values

In [49]:
df['roa'] = df['roa'].fillna(df['profit'] / df['asst_tot'])
df['roe'] = df['roe'].fillna(df['profit'] / df['eqty_tot'])
df['exp_financing'] = df['exp_financing'].fillna(df['inc_financing'] - df['prof_financing'])
df['eqty_tot'] = df['eqty_tot'].fillna(df['asst_tot'] - (df['liab_lt'] + df['debt_bank_st'] + df['debt_bank_lt'] +
                                        df['debt_fin_st'] + df['debt_fin_lt'] + df['AP_st'] +
                                        df['AP_lt']))


## Features engineering

In [50]:
df['def_date'] = pd.to_datetime(df['def_date'], dayfirst=True)
df['stmt_date'] = pd.to_datetime(df['stmt_date'])
# size
# df['asst_tot']
# leverage
df['td_ta'] = (df['asst_tot'] - df['eqty_tot']) / df['asst_tot']
df['td_te'] = (df['asst_tot'] - df['eqty_tot']) / df['eqty_tot']
df['td_ebitda'] = (df['asst_tot'] - df['eqty_tot']) / df['ebitda']
# profitability
# df['roa']
df['operating_margin'] = df['prof_operations'] / df['rev_operating']
df['earning_power'] = df['ebitda'] / df['asst_tot']
# liquidity
df['Liquidity'] = df['cash_and_equiv'] / (df['asst_tot'])
df['current_ratio'] = df['asst_tot']/(df['asst_tot'] - df['wc_net'])
df['cash_ratio'] = df['cash_and_equiv'] / (df['asst_tot'] - df['wc_net'])
# debt coverage
df['Debt_coverage'] = df['cf_operations'] / df['exp_financing']

In [38]:
# train = df[['id', 'stmt_date', 'fs_year', 'def_date', 'legal_struct', 'ateco_sector', 
#              'roa','operating_margin','earning_power','td_ta', 'td_te',
#             'td_ebitda','current_ratio','cash_ratio','Debt_coverage', 'Liquidity', 'asst_tot']].copy()

# train.head()

Unnamed: 0,id,stmt_date,fs_year,def_date,legal_struct,ateco_sector,roa,operating_margin,earning_power,td_ta,td_te,td_ebitda,current_ratio,cash_ratio,Debt_coverage,Liquidity,asst_tot,default_year,target
0,520288,2011-12-31,2011,NaT,SRL,14.0,-3.81,-0.137751,0.003245,0.851058,5.714003,262.259892,1.342086,0.001436,-0.054598,0.00107,1946940.0,,0
1,520288,2008-12-31,2008,NaT,SRL,14.0,-2.76,-0.059348,0.022186,1.017968,-56.65562,45.883332,1.049486,0.002697,-0.077998,0.00257,2077346.0,,0
2,520288,2009-12-31,2009,NaT,SRL,14.0,-2.17,-0.067053,0.034218,1.022845,-44.773101,29.891778,0.903651,0.004798,0.925622,0.00531,1975874.0,,0
3,520288,2012-12-31,2012,NaT,SRL,14.0,-12.99,-0.76099,-0.086219,0.979073,46.784916,-11.355652,1.244448,0.004351,-12.015864,0.003496,1872882.0,,0
4,520288,2007-12-31,2007,NaT,SRL,14.0,6.2,0.053771,0.086517,0.960398,24.251527,11.100627,0.848041,0.006845,5.898666,0.008071,1769122.0,,0


In [53]:
final_train = df[["fs_year",'target','roa','td_ta','current_ratio','Debt_coverage', 'asst_tot']].copy()
final_train = final_train.replace([float('inf'), -float('inf')], float('nan')).dropna()
final_train     

Unnamed: 0,fs_year,target,roa,td_ta,current_ratio,Debt_coverage,asst_tot
0,2011,0,-3.81,0.851058,1.342086,-0.054598,1946940.0
1,2008,0,-2.76,1.017968,1.049486,-0.077998,2077346.0
2,2009,0,-2.17,1.022845,0.903651,0.925622,1975874.0
3,2012,0,-12.99,0.979073,1.244448,-12.015864,1872882.0
4,2007,0,6.20,0.960398,0.848041,5.898666,1769122.0
...,...,...,...,...,...,...,...
1023546,2009,0,6.03,0.874221,0.939924,6.128956,5108231.0
1023547,2011,0,2.16,0.918389,1.027497,6.999581,7648851.0
1023548,2008,0,12.97,0.897331,0.929462,4.469833,6223862.0
1023549,2007,0,6.15,0.897084,0.987401,8.157203,6146110.0


In [54]:
final_train.isna().sum()

fs_year          0
target           0
roa              0
td_ta            0
current_ratio    0
Debt_coverage    0
asst_tot         0
dtype: int64

# Logistic test

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as sm


df_train, df_test = train_test_split(final_train, test_size=0.3, random_state=42, stratify=final_train['target'])
formula = f"target ~ roa + td_ta + current_ratio + Debt_coverage + asst_tot"
model = sm.logit(formula=formula, data=df_train).fit()

X_test = df_test[['roa','td_ta','current_ratio','Debt_coverage', 'asst_tot']]
y_test = df_test['target']
prob = model.predict(X_test)
roc_auc = roc_auc_score(y_test, prob)
print(model.summary())
print(f"ROC AUC: {roc_auc:.4f}")

Optimization terminated successfully.
         Current function value: 0.053666
         Iterations 11
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:               690461
Model:                          Logit   Df Residuals:                   690455
Method:                           MLE   Df Model:                            5
Date:                Sun, 10 Nov 2024   Pseudo R-squ.:                  0.1299
Time:                        15:35:42   Log-Likelihood:                -37054.
converged:                       True   LL-Null:                       -42586.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -5.4147      0.032   -166.966      0.000      -5.478      -5.351
roa              -0

# Walk-forward analysis

In [None]:
def walk_forward_harness(data, preprocessor, estimator, predictor_harness):

    df = data.copy()

    predictions = []
    model_list = []
    stats_list = []

    df['fs_year'] = df['fs_year'].astype(int)
    df = df.sort_values(by="fs_year",ascending=True)
    years = df['fs_year'].unique()

    print(years)
    
    for i in range(0, len(years)-1):

        year = years[i]
        print("Test year:", year)

        train_data = df[df['fs_year'] <= year]
        test_data = df[df['fs_year'] == year + 1]

        # pre-processing
        df_train = preprocessor(train_data)

        # estimator
        column_names = list(df_train.drop(["target","fs_year"], axis=1).columns)
        my_formula = "target ~ " + " + ".join(column_names)
        model = estimator(df_train, my_formula)
        model_list.append(model)

        # predictor
        result_dict = predictor_harness(test_data, model, preprocessor)
        predictions.append(result_dict)

        # performance metrics
        stats = metrics(result_dict, year)
        stats_list.append(stats)              

    return predictions, model_list, stats_list

# Test

In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as sm


def target_label(df):
    """
    Create target label for the dataset.
    If the default year within the fiscal year and the next year of the fiscal year (fis_year + 1), return 1
    Else (longer or NaT) return 0
    """

    df['default_year'] = pd.to_datetime(df['def_date'], format="%d/%m/%Y", errors='coerce').dt.year
    df['target'] = ((df['default_year'].notna()) & (df['fs_year'] + 1 >= df['default_year'])).astype(int)

    return df

def null_imputation(df):
    """
    Null values imputation for features engineering
    """

    df['roa'] = df['roa'].fillna(df['profit'] / df['asst_tot'])
    df['roe'] = df['roe'].fillna(df['profit'] / df['eqty_tot'])
    df['exp_financing'] = df['exp_financing'].fillna(df['inc_financing'] - df['prof_financing'])
    df['eqty_tot'] = df['eqty_tot'].fillna(df['asst_tot'] - (df['liab_lt'] + df['debt_bank_st'] + df['debt_bank_lt'] +
                                            df['debt_fin_st'] + df['debt_fin_lt'] + df['AP_st'] +
                                            df['AP_lt']))
    
    return df


def features_engineering(df):

    df['def_date'] = pd.to_datetime(df['def_date'], dayfirst=True)
    df['stmt_date'] = pd.to_datetime(df['stmt_date'])
 
    # leverage
    df['td_ta'] = (df['asst_tot'] - df['eqty_tot']) / df['asst_tot']
    df['td_te'] = (df['asst_tot'] - df['eqty_tot']) / df['eqty_tot']
    df['td_ebitda'] = (df['asst_tot'] - df['eqty_tot']) / df['ebitda']
    # profitability
    df['operating_margin'] = df['prof_operations'] / df['rev_operating']
    df['earning_power'] = df['ebitda'] / df['asst_tot']
    # liquidity
    df['Liquidity'] = df['cash_and_equiv'] / (df['asst_tot'])
    df['current_ratio'] = df['asst_tot']/(df['asst_tot'] - df['wc_net'])
    df['cash_ratio'] = df['cash_and_equiv'] / (df['asst_tot'] - df['wc_net'])
    # debt coverage
    df['Debt_coverage'] = df['cf_operations'] / df['exp_financing']

    return df



def preprocessor(data):

    df = data.copy()

    df_labeled = target_label(df)

    df_imputed = null_imputation(df_labeled)

    df_engineered = features_engineering(df_imputed)

    final_columns = ["fs_year",'target','roa','td_ta','current_ratio','Debt_coverage', 'asst_tot']
    final_df = df_engineered[final_columns]
    final_df = final_df.replace([float('inf'), -float('inf')], float('nan')).dropna()

    return final_df


def estimator(df, formula):
    #f: "target ~ roa + td_ta + current_ratio + Debt_coverage + asst_tot"
    model = sm.logit(formula, data=df).fit()
    return model


def predictor(test_df, model):
    prob = model.predict(test_df)
    return prob


original_train = pd.read_csv("Data/train.csv").drop("Unnamed: 0",axis=1)
df = original_train.copy()

In [67]:
def predictor_harness(new_df, model, preprocessor):

    df_test = preprocessor(new_df)
    
    X_test = df_test.drop(columns = ['target', 'fs_year'], axis=1)
    y_test = df_test['target']
    
    prob = predictor(X_test, model)
    predictions = {
        'Actual': y_test,
        'Predicted': prob
    }   
    
    return predictions 

In [68]:
def metrics(result_dict, year):
    y_actual = result_dict["Actual"]
    prob = result_dict["Predicted"]
    roc_auc = roc_auc_score(y_actual, prob)
    
    print(f"Year: {year}  ROC AUC: {roc_auc:.4f}")

    return {"Year": year, "AUC":roc_auc}

In [None]:
def walk_forward_harness(data, preprocessor, estimator, predictor_harness):

    df = data.copy()

    predictions = []
    model_list = []
    stats_list = []

    df['fs_year'] = df['fs_year'].astype(int)
    df = df.sort_values(by="fs_year",ascending=True)
    years = df['fs_year'].unique()

    print(years)
    
    for i in range(0, len(years)-1):

        year = years[i]
        print("Test year:", year)

        train_data = df[df['fs_year'] <= year]
        test_data = df[df['fs_year'] == year + 1]

        # pre-processing
        df_train = preprocessor(train_data)

        # estimator
        column_names = list(df_train.drop(["target","fs_year"], axis=1).columns)
        my_formula = "target ~ " + " + ".join(column_names)
        model = estimator(df_train, my_formula)
        model_list.append(model)

        # predictor
        result_dict = predictor_harness(test_data, model, preprocessor)
        predictions.append(result_dict)

        # performance metrics
        stats = metrics(result_dict, year)
        stats_list.append(stats)              

    return predictions, model_list, stats_list

In [75]:
predictions, model_list, stats_list = walk_forward_harness(df, preprocessor, estimator, predictor_harness)

[2007 2008 2009 2010 2011 2012]
Test year: 2007
Optimization terminated successfully.
         Current function value: 0.033446
         Iterations 12
Year: 2007  ROC AUC: 0.8332
Test year: 2008
Optimization terminated successfully.
         Current function value: 0.041061
         Iterations 11
Year: 2008  ROC AUC: 0.8377
Test year: 2009
Optimization terminated successfully.
         Current function value: 0.047362
         Iterations 11
Year: 2009  ROC AUC: 0.8138
Test year: 2010
Optimization terminated successfully.
         Current function value: 0.051755
         Iterations 11
Year: 2010  ROC AUC: 0.8350
Test year: 2011
Optimization terminated successfully.
         Current function value: 0.053306
         Iterations 11
Year: 2011  ROC AUC: 0.8579


In [77]:
all_actuals = pd.concat([d['Actual'] for d in predictions], ignore_index=True).values
all_predictions = pd.concat([d['Predicted'] for d in predictions], ignore_index=True).values

# Step 2: Calculate AUC
auc_score = roc_auc_score(all_actuals, all_predictions)
print(f"Overall AUC: {auc_score:.4f}")

Overall AUC: 0.8184


In [78]:
# train the final model on the entire train.csv
import pickle
df_train = preprocessor(df)

X_train = df_train.drop(columns = ['target', 'fs_year'], axis=1)
y_train = df_train['target']

variables = list(X_train.columns)
my_formula = "target ~ " + " + ".join(variables)
model = estimator(df_train, my_formula)

# save the model
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

Optimization terminated successfully.
         Current function value: 0.053445
         Iterations 11
