In [13]:
import pandas as pd
import seaborn as sns

In [4]:
# Load the raw bankruptcy dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv('../data/company-bankruptcy-prediction.csv')

# First rows, to eyeball the column headers and the scale of the values.
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# bare: wrapping it in print() would append a stray "None" line to the output.
df.info()
# Summary statistics (count, mean, std, min, quartiles, max) per column.
print(df.describe())

   Bankrupt?   ROA(C) before interest and depreciation before interest  \
0          1                                           0.370594          
1          1                                           0.464291          
2          1                                           0.426071          
3          1                                           0.399844          
4          1                                           0.465022          

    ROA(A) before interest and % after tax  \
0                                 0.424389   
1                                 0.538214   
2                                 0.499019   
3                                 0.451265   
4                                 0.538432   

    ROA(B) before interest and depreciation after tax  \
0                                           0.405750    
1                                           0.516730    
2                                           0.472295    
3                                           0.4577

In [20]:
# Lookup table from the raw CSV headers to the column names used in the rest of
# the notebook. Every raw header except "Bankrupt?" starts with a single space,
# and the odd spelling "capitcal" is part of one key, so keys must be kept
# character-for-character.
# NOTE(review): "R&D_Expense_Rate" is the only target name containing a
# non-identifier character ("&"); confirm downstream code before changing it.
column_renaming_map = {
    "Bankrupt?": "Target_Bankruptcy",  # Changed to a more descriptive target name
    " ROA(C) before interest and depreciation before interest": "ROA_C_BeforeInterestDepreciation",
    " ROA(A) before interest and % after tax": "ROA_A_BeforeInterestAfterTax",
    " ROA(B) before interest and depreciation after tax": "ROA_B_BeforeInterestDepreciationAfterTax",
    " Operating Gross Margin": "Operating_Gross_Margin",
    " Realized Sales Gross Margin": "Realized_Sales_Gross_Margin",
    " Operating Profit Rate": "Operating_Profit_Rate",
    " Pre-tax net Interest Rate": "PreTax_Net_Interest_Rate",
    " After-tax net Interest Rate": "AfterTax_Net_Interest_Rate",
    " Non-industry income and expenditure/revenue": "NonIndustry_Income_Expenditure_Revenue",
    " Continuous interest rate (after tax)": "Continuous_Interest_Rate_AfterTax",
    " Operating Expense Rate": "Operating_Expense_Rate",
    " Research and development expense rate": "R&D_Expense_Rate",
    " Cash flow rate": "Cash_Flow_Rate",
    " Interest-bearing debt interest rate": "Interest_Bearing_Debt_Interest_Rate",
    " Tax rate (A)": "Tax_Rate_A",
    " Net Value Per Share (B)": "Net_Value_Per_Share_B",
    " Net Value Per Share (A)": "Net_Value_Per_Share_A",
    " Net Value Per Share (C)": "Net_Value_Per_Share_C",
    " Persistent EPS in the Last Four Seasons": "Persistent_EPS_Last_4_Seasons",
    " Cash Flow Per Share": "Cash_Flow_Per_Share",
    " Revenue Per Share (Yuan ¥)": "Revenue_Per_Share",
    " Operating Profit Per Share (Yuan ¥)": "Operating_Profit_Per_Share",
    " Per Share Net profit before tax (Yuan ¥)": "Pretax_Net_Profit_Per_Share",
    " Realized Sales Gross Profit Growth Rate": "Realized_Sales_Gross_Profit_Growth_Rate",
    " Operating Profit Growth Rate": "Operating_Profit_Growth_Rate",
    " After-tax Net Profit Growth Rate": "AfterTax_Net_Profit_Growth_Rate",
    " Regular Net Profit Growth Rate": "Regular_Net_Profit_Growth_Rate",
    " Continuous Net Profit Growth Rate": "Continuous_Net_Profit_Growth_Rate",
    " Total Asset Growth Rate": "Total_Asset_Growth_Rate",
    " Net Value Growth Rate": "Net_Value_Growth_Rate",
    " Total Asset Return Growth Rate Ratio": "Total_Asset_Return_Growth_Rate_Ratio",
    " Cash Reinvestment %": "Cash_Reinvestment_Percentage",
    " Current Ratio": "Current_Ratio",
    " Quick Ratio": "Quick_Ratio",
    " Quick Ratio: Acid Test": "Quick_Ratio_Acid_Test",
    " Interest Expense Ratio": "Interest_Expense_Ratio",
    " Total debt/Total net worth": "Total_Debt_Total_Net_Worth_Ratio",
    " Debt ratio %": "Debt_Ratio_Percentage",
    " Net worth/Assets": "Net_Worth_Assets_Ratio",
    " Long-term fund suitability ratio (A)": "LongTerm_Fund_Suitability_Ratio_A",
    " Borrowing dependency": "Borrowing_Dependency",
    " Contingent liabilities/Net worth": "Contingent_Liabilities_Net_Worth",
    " Operating profit/Paid-in capital": "Operating_Profit_Paid_in_Capital",
    " Net profit before tax/Paid-in capital": "Net_Profit_BeforeTax_Paid_in_Capital",
    " Inventory and accounts receivable/Net value": "Inventory_Accounts_Receivable_Net_Value",
    " Total Asset Turnover": "Total_Asset_Turnover",
    " Accounts Receivable Turnover": "Accounts_Receivable_Turnover",
    " Average Collection Days": "Average_Collection_Days",
    " Inventory Turnover Rate (times)": "Inventory_Turnover_Rate",
    " Fixed Assets Turnover Frequency": "Fixed_Assets_Turnover_Frequency",
    " Net Worth Turnover Rate (times)": "Net_Worth_Turnover_Rate",
    " Revenue per person": "Revenue_Per_Person",
    " Operating profit per person": "Operating_Profit_Per_Person",
    " Allocation rate per person": "Allocation_Rate_Per_Person",
    " Working Capital to Total Assets": "Working_Capital_to_Total_Assets",
    " Quick Assets/Total Assets": "Quick_Assets_Total_Assets",
    " Current Assets/Total Assets": "Current_Assets_Total_Assets",
    " Cash/Total Assets": "Cash_Total_Assets",
    " Quick Assets/Current Liability": "Quick_Assets_Current_Liability",
    " Cash/Current Liability": "Cash_Current_Liability",
    " Current Liability to Assets": "Current_Liability_to_Assets",
    " Operating Funds to Liability": "Operating_Funds_to_Liability",
    " Inventory/Working Capital": "Inventory_Working_Capital",
    " Inventory/Current Liability": "Inventory_Current_Liability",
    " Current Liabilities/Liability": "Current_Liabilities_Liability",
    " Working Capital/Equity": "Working_Capital_Equity",
    " Current Liabilities/Equity": "Current_Liabilities_Equity",
    " Long-term Liability to Current Assets": "LongTerm_Liability_to_Current_Assets",
    " Retained Earnings to Total Assets": "Retained_Earnings_to_Total_Assets",
    " Total income/Total expense": "Total_Income_Total_Expense",
    " Total expense/Assets": "Total_Expense_Assets",
    " Current Asset Turnover Rate": "Current_Asset_Turnover_Rate",
    " Quick Asset Turnover Rate": "Quick_Asset_Turnover_Rate",
    " Working capitcal Turnover Rate": "Working_Capital_Turnover_Rate",
    " Cash Turnover Rate": "Cash_Turnover_Rate",
    " Cash Flow to Sales": "Cash_Flow_to_Sales",
    " Fixed Assets to Assets": "Fixed_Assets_to_Assets",
    " Current Liability to Liability": "Current_Liability_to_Liability",
    " Current Liability to Equity": "Current_Liability_to_Equity",
    " Equity to Long-term Liability": "Equity_to_LongTerm_Liability",
    " Cash Flow to Total Assets": "Cash_Flow_to_Total_Assets",
    " Cash Flow to Liability": "Cash_Flow_to_Liability",
    " CFO to Assets": "CFO_to_Assets",
    " Cash Flow to Equity": "Cash_Flow_to_Equity",
    " Current Liability to Current Assets": "Current_Liability_to_Current_Assets",
    " Liability-Assets Flag": "Liability_Assets_Flag",  # 1 if Total Liability exceeds Total Assets, 0 otherwise
    " Net Income to Total Assets": "Net_Income_to_Total_Assets",
    " Total assets to GNP price": "Total_Assets_to_GNP_Price",
    " No-credit Interval": "No_Credit_Interval",
    " Gross Profit to Sales": "Gross_Profit_to_Sales",
    " Net Income to Stockholder's Equity": "Net_Income_to_Stockholders_Equity",
    " Liability to Equity": "Liability_to_Equity",
    " Degree of Financial Leverage (DFL)": "Degree_of_Financial_Leverage",
    " Interest Coverage Ratio (Interest expense to EBIT)": "Interest_Coverage_Ratio",
    " Net Income Flag": "Net_Income_Flag",  # 1 if Net Income is Negative for the last two years, 0 otherwise
    " Equity to Liability": "Equity_to_Liability",
}

In [21]:
 df = df.rename(columns=column_renaming_map)

In [22]:
# Re-inspect the frame to confirm the new column names are in place.
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# bare: wrapping it in print() would append a stray "None" line to the output.
df.info()
# Summary statistics (count, mean, std, min, quartiles, max) per column.
print(df.describe())

   Target_Bankruptcy  ROA_C_BeforeInterestDepreciation  \
0                  1                          0.370594   
1                  1                          0.464291   
2                  1                          0.426071   
3                  1                          0.399844   
4                  1                          0.465022   

   ROA_A_BeforeInterestAfterTax  ROA_B_BeforeInterestDepreciationAfterTax  \
0                      0.424389                                  0.405750   
1                      0.538214                                  0.516730   
2                      0.499019                                  0.472295   
3                      0.451265                                  0.457733   
4                      0.538432                                  0.522298   

   Operating_Gross_Margin  Realized_Sales_Gross_Margin  Operating_Profit_Rate  \
0                0.601457                     0.601457               0.998969   
1                0.610235 

In [23]:
# Class balance of the target: absolute counts first, then percentage shares.
target_counts = df['Target_Bankruptcy'].value_counts()
print(target_counts)

# normalize=True yields fractions; scale them to percentages for readability.
target_share_pct = df['Target_Bankruptcy'].value_counts(normalize=True) * 100
print(target_share_pct)

Target_Bankruptcy
0    6599
1     220
Name: count, dtype: int64
Target_Bankruptcy
0    96.77372
1     3.22628
Name: proportion, dtype: float64


In [15]:
# Pairwise correlation matrix over every column, target included.
# 'pearson' is the pandas default and is spelled out here only for clarity.
# NOTE(review): the Net_Income_Flag row/column comes back as NaN, presumably
# because that column is constant (zero variance) - confirm with nunique().
df.corr(method='pearson')

Unnamed: 0,Target_Bankruptcy,ROA_C_BeforeInterestDepreciation,ROA_A_BeforeInterestAfterTax,ROA_B_BeforeInterestDepreciationAfterTax,Operating_Gross_Margin,Realized_Sales_Gross_Margin,Operating_Profit_Rate,PreTax_Net_Interest_Rate,AfterTax_Net_Interest_Rate,NonIndustry_Income_Expenditure_Revenue,...,Net_Income_to_Total_Assets,Total_Assets_to_GNP_Price,No_Credit_Interval,Gross_Profit_to_Sales,Net_Income_to_Stockholders_Equity,Liability_to_Equity,Degree_of_Financial_Leverage,Interest_Coverage_Ratio,Net_Income_Flag,Equity_to_Liability
Target_Bankruptcy,1.000000,-0.260807,-0.282941,-0.273051,-0.100043,-0.099445,-0.000230,-0.008517,-0.008857,-0.016593,...,-0.315457,0.035104,-0.005547,-0.100044,-0.180987,0.166812,0.010508,-0.005509,,-0.083048
ROA_C_BeforeInterestDepreciation,-0.260807,1.000000,0.940124,0.986849,0.334719,0.332755,0.035725,0.053419,0.049222,0.020501,...,0.887670,-0.071725,0.008135,0.334721,0.274287,-0.143629,-0.016575,0.010573,,0.052416
ROA_A_BeforeInterestAfterTax,-0.282941,0.940124,1.000000,0.955741,0.326969,0.324956,0.032053,0.053518,0.049474,0.029649,...,0.961552,-0.098900,0.011463,0.326971,0.291744,-0.141039,-0.011515,0.013372,,0.057887
ROA_B_BeforeInterestDepreciationAfterTax,-0.273051,0.986849,0.955741,1.000000,0.333749,0.331755,0.035212,0.053726,0.049952,0.022366,...,0.912040,-0.089088,0.007523,0.333750,0.280617,-0.142838,-0.014663,0.011473,,0.056430
Operating_Gross_Margin,-0.100043,0.334719,0.326969,0.333749,1.000000,0.999518,0.005745,0.032493,0.027175,0.051438,...,0.300143,0.022672,0.004205,1.000000,0.075304,-0.085434,-0.011806,-0.001167,,0.120029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Liability_to_Equity,0.166812,-0.143629,-0.141039,-0.142838,-0.085434,-0.085407,0.001541,-0.004043,-0.004390,-0.011899,...,-0.159697,0.021982,-0.003724,-0.085434,-0.791836,1.000000,0.002119,0.001487,,-0.159654
Degree_of_Financial_Leverage,0.010508,-0.016575,-0.011515,-0.014663,-0.011806,-0.011268,0.000935,0.000855,0.000927,-0.000556,...,-0.010463,-0.001881,-0.008812,-0.011806,-0.000093,0.002119,1.000000,0.016513,,-0.016739
Interest_Coverage_Ratio,-0.005509,0.010573,0.013372,0.011473,-0.001167,-0.001158,0.000393,0.000984,0.000957,0.001024,...,0.012746,0.000239,0.001027,-0.001169,0.005147,0.001487,0.016513,1.000000,,-0.008339
Net_Income_Flag,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# Element-wise missing-value mask: True where a cell is NaN/None.
# NOTE(review): the rendered preview elides most rows and columns, so it cannot
# confirm the absence of missing values; df.isna().sum() would give per-column totals.
pd.isna(df)

Unnamed: 0,Target_Bankruptcy,ROA_C_BeforeInterestDepreciation,ROA_A_BeforeInterestAfterTax,ROA_B_BeforeInterestDepreciationAfterTax,Operating_Gross_Margin,Realized_Sales_Gross_Margin,Operating_Profit_Rate,PreTax_Net_Interest_Rate,AfterTax_Net_Interest_Rate,NonIndustry_Income_Expenditure_Revenue,...,Net_Income_to_Total_Assets,Total_Assets_to_GNP_Price,No_Credit_Interval,Gross_Profit_to_Sales,Net_Income_to_Stockholders_Equity,Liability_to_Equity,Degree_of_Financial_Leverage,Interest_Coverage_Ratio,Net_Income_Flag,Equity_to_Liability
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6814,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6815,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6816,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6817,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [27]:
# Load the training feature matrix from disk; the path is relative to the
# notebook's working directory.
# NOTE(review): the previewed values look standardized (centred, including
# negatives) - confirm against the notebook that wrote this file.
X_train = pd.read_csv('../data/X_train.csv')

# First rows, to check the column names and value scales.
print(X_train.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# bare: wrapping it in print() would append a stray "None" line to the output.
X_train.info()
# Summary statistics (count, mean, std, min, quartiles, max) per column.
print(X_train.describe())

   ROA_C_BeforeInterestDepreciation  ROA_A_BeforeInterestAfterTax  \
0                         -0.187788                     -0.129121   
1                          0.481421                      0.489832   
2                          0.349668                      0.430845   
3                         -0.584654                     -0.331838   
4                         -1.872459                     -1.759169   

   ROA_B_BeforeInterestDepreciationAfterTax  Operating_Gross_Margin  \
0                                 -0.145581               -0.556560   
1                                  0.322064               -0.510593   
2                                  0.209933               -0.071356   
3                                 -0.528042               -0.742555   
4                                 -1.966616                3.339539   

   Realized_Sales_Gross_Margin  Operating_Profit_Rate  \
0                    -0.556046               0.017291   
1                    -0.517700              

In [28]:
# Load the training labels; read_csv returns a one-column DataFrame
# (Target_Bankruptcy), not a Series.
y_train = pd.read_csv('../data/y_train.csv')

# Preview, structure report, then summary statistics - evaluated and printed
# one at a time so the output order is head -> info -> describe.
# info() writes its report itself and returns None, which is why the output
# also contains a literal "None" line after the info block.
for summarize in (y_train.head, y_train.info, y_train.describe):
    print(summarize())

   Target_Bankruptcy
0                  0
1                  0
2                  0
3                  0
4                  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10558 entries, 0 to 10557
Data columns (total 1 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Target_Bankruptcy  10558 non-null  int64
dtypes: int64(1)
memory usage: 82.6 KB
None
       Target_Bankruptcy
count       10558.000000
mean            0.500000
std             0.500024
min             0.000000
25%             0.000000
50%             0.500000
75%             1.000000
max             1.000000
