NACE industry code listing (European Commission merger case index): https://ec.europa.eu/competition/mergers/cases/index/nace_all.html

In [1]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import sklearn.metrics as metrics
import statsmodels.formula.api as smf
from plotnine import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    LogisticRegressionCV,
)
from sklearn.metrics import (
    auc,
    brier_score_loss,
    confusion_matrix,
    mean_squared_error,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.tools.eval_measures import rmse

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas FutureWarning / SettingWithCopyWarning
# messages that can point at real problems in the cells below.
warnings.filterwarnings("ignore")

In [2]:
# Load the firm panel; the path is resolved relative to the working directory.
panel_path = "cs_bisnode_panel.csv"
data = pd.read_csv(panel_path)

In [3]:
# Balance the panel: make sure every (year, comp_id) pair has a row.
# Pairs that are absent from the raw file are added with NaN in all other columns.
#
# Reindexing onto the full year x comp_id grid inserts real NaN directly, so no
# string sentinel has to be written into the frame (which would turn every
# column into object dtype) and then scrubbed out again.
# Both levels are sorted, so rows come out ordered by year, then comp_id.
all_pairs = pd.MultiIndex.from_product(
    [np.sort(data["year"].unique()), np.sort(data["comp_id"].unique())],
    names=["year", "comp_id"],
)
data = data.set_index(["year", "comp_id"]).reindex(all_pairs).reset_index()

In [4]:
# status_alive: 1 when the firm reports strictly positive, non-missing sales.
# Each condition is parenthesised because `&` binds tighter than `>`;
# without the parentheses the expression parses as sales > (0 & ...).
data["status_alive"] = ((data["sales"] > 0) & data["sales"].notna()).astype(int)

# default: 1 when the firm is alive in this row but not alive in the same
# firm's next row (shift(-1) within comp_id). With one row per firm and year,
# ordered by year, the "next row" is the following year. A firm's last row has
# no successor, so the comparison is False and default is 0 there.
# NOTE(review): this step was previously described as a two-year horizon
# (shift(-2)), but the code looks one row ahead — confirm the intended horizon.
alive_next_row = data.groupby("comp_id")["status_alive"].shift(-1)
data["default"] = ((data["status_alive"] == 1) & (alive_next_row == 0)).astype(int)

In [6]:
# Column overview of the panel: row count, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 556944 entries, 0 to 556943
Data columns (total 50 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   year                  556944 non-null  int64  
 1   comp_id               556944 non-null  float64
 2   begin                 287829 non-null  object 
 3   end                   287829 non-null  object 
 4   COGS                  18257 non-null   float64
 5   amort                 279789 non-null  float64
 6   curr_assets           287698 non-null  float64
 7   curr_liab             287698 non-null  float64
 8   extra_exp             269300 non-null  float64
 9   extra_inc             269300 non-null  float64
 10  extra_profit_loss     270626 non-null  float64
 11  finished_prod         17485 non-null   float64
 12  fixed_assets          287698 non-null  float64
 13  inc_bef_tax           280392 non-null  float64
 14  intang_assets         287689 non-null  float64
 15  

In [9]:
# Modelling sample: ind2 == 26, year 2014, sales between 1,000 and 10,000,000
# (both bounds inclusive; rows with missing sales are excluded).
# NOTE(review): ind2 is presumably the 2-digit NACE industry code — confirm.
in_sample = (
    (data["ind2"] == 26)
    & (data["year"] == 2014)
    & data["sales"].between(1000, 10000000)
)
# .copy() makes data_2014 an independent frame, so later column assignments
# on it do not operate on a slice of `data`.
data_2014 = data.loc[in_sample].copy()

# Class balance of the target within the sample.
print(data_2014["default"].value_counts())

default
0    981
1     56
Name: count, dtype: int64
