In [20]:
import pandas as pd

# Install imblearn module using the command: pip install imblearn
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, ClusterCentroids

In [21]:
data = pd.read_csv(r'..\data\taiwanese-bankruptcy-prediction.csv')

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6819 entries, 0 to 6818
Data columns (total 96 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Bankrupt?                                                 6819 non-null   int64  
 1    ROA(C) before interest and depreciation before interest  6819 non-null   float64
 2    ROA(A) before interest and % after tax                   6819 non-null   float64
 3    ROA(B) before interest and depreciation after tax        6819 non-null   float64
 4    Operating Gross Margin                                   6819 non-null   float64
 5    Realized Sales Gross Margin                              6819 non-null   float64
 6    Operating Profit Rate                                    6819 non-null   float64
 7    Pre-tax net Interest Rate                                6819 non-null   float64
 8    After-tax net Int

In [23]:
data.describe()

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
count,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,...,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0
mean,0.032263,0.50518,0.558625,0.553589,0.607948,0.607929,0.998755,0.79719,0.809084,0.303623,...,0.80776,18629420.0,0.623915,0.607946,0.840402,0.280365,0.027541,0.565358,1.0,0.047578
std,0.17671,0.060686,0.06562,0.061595,0.016934,0.016916,0.01301,0.012869,0.013601,0.011163,...,0.040332,376450100.0,0.01229,0.016934,0.014523,0.014463,0.015668,0.013214,0.0,0.050014
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.0,0.476527,0.535543,0.527277,0.600445,0.600434,0.998969,0.797386,0.809312,0.303466,...,0.79675,0.0009036205,0.623636,0.600443,0.840115,0.276944,0.026791,0.565158,1.0,0.024477
50%,0.0,0.502706,0.559802,0.552278,0.605997,0.605976,0.999022,0.797464,0.809375,0.303525,...,0.810619,0.002085213,0.623879,0.605998,0.841179,0.278778,0.026808,0.565252,1.0,0.033798
75%,0.0,0.535563,0.589157,0.584105,0.613914,0.613842,0.999095,0.797579,0.809469,0.303585,...,0.826455,0.005269777,0.624168,0.613913,0.842357,0.281449,0.026913,0.565725,1.0,0.052838
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,9820000000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
def sampler(data, features, label, type, output_file_name='', random_state=0):
    X = data[features]
    Y = data[label]
    X_resampled, Y_resampled = pd.DataFrame(), pd.DataFrame()

    sampling_types = ['RandomOverSampling', 'SMOTE', 'ADASYN', 'RandomUnderSampling', 'NearMiss', 'ClusterCentroids']

    if(type>=len(sampling_types)):
        print(f"Invalid Option")
        return
    

    print(f"Balance before {sampling_types[type]} sampling:\nBankrupt: {len(Y.loc[Y['Bankrupt?'] == 1, 'Bankrupt?'])} | Not Bankrupt: {len(Y.loc[Y['Bankrupt?'] == 0, 'Bankrupt?'])}\n")

    match type:
        case 0: # Random Over Sampling
            ros = RandomOverSampler(random_state=random_state)
            X_resampled, Y_resampled = ros.fit_resample(X, Y)
            
        case 1: # SMOTE
            smote = SMOTE(random_state=random_state)
            X_resampled, Y_resampled = smote.fit_resample(X, Y)

        case 2: # ADASYN
            adasyn = ADASYN(random_state=random_state) 
            X_resampled, Y_resampled = adasyn.fit_resample(X, Y)
        
        case 3: # RandomUnderSampling
            rus = RandomUnderSampler(random_state=random_state)
            X_resampled, Y_resampled = rus.fit_resample(X, Y)

        case 4: # NearMiss
            rus = NearMiss()
            X_resampled, Y_resampled = rus.fit_resample(X, Y)

        case 5: # ClusterCentroids
            cc = ClusterCentroids(random_state=random_state)
            X_resampled, Y_resampled = cc.fit_resample(X, Y)
            
        case _:
            print("Invalid Option")
            return 
        
    print(f"Balance after {sampling_types[type]} sampling:\nBankrupt: {len(Y_resampled.loc[Y_resampled['Bankrupt?'] == 1, 'Bankrupt?'])} | Not Bankrupt: {len(Y_resampled.loc[Y_resampled['Bankrupt?'] == 0, 'Bankrupt?'])}\n")
    df = pd.concat([X_resampled, Y_resampled], axis=1)

    if(output_file_name!=''):
        df.to_csv(f"..\\data\\{output_file_name}.csv")

    return df
    



sampler(data, list(set(data.columns) - set(['Bankrupt?'])), list(['Bankrupt?']), 0)
sampler(data, list(set(data.columns) - set(['Bankrupt?'])), list(['Bankrupt?']), 1)
sampler(data, list(set(data.columns) - set(['Bankrupt?'])), list(['Bankrupt?']), 2)
sampler(data, list(set(data.columns) - set(['Bankrupt?'])), list(['Bankrupt?']), 3)
sampler(data, list(set(data.columns) - set(['Bankrupt?'])), list(['Bankrupt?']), 4)
sampler(data, list(set(data.columns) - set(['Bankrupt?'])), list(['Bankrupt?']), 5)

Balance before RandomOverSampling sampling:
Bankrupt: 220 | Not Bankrupt: 6599

Balance after RandomOverSampling sampling:
Bankrupt: 6599 | Not Bankrupt: 6599

Balance before SMOTE sampling:
Bankrupt: 220 | Not Bankrupt: 6599

Balance after SMOTE sampling:
Bankrupt: 6599 | Not Bankrupt: 6599

Balance before ADASYN sampling:
Bankrupt: 220 | Not Bankrupt: 6599

Balance after ADASYN sampling:
Bankrupt: 6523 | Not Bankrupt: 6599

Balance before RandomUnderSampling sampling:
Bankrupt: 220 | Not Bankrupt: 6599

Balance after RandomUnderSampling sampling:
Bankrupt: 220 | Not Bankrupt: 220

Balance before NearMiss sampling:
Bankrupt: 220 | Not Bankrupt: 6599

Balance after NearMiss sampling:
Bankrupt: 220 | Not Bankrupt: 220

Balance before ClusterCentroids sampling:
Bankrupt: 220 | Not Bankrupt: 6599



  super()._check_params_vs_input(X, default_n_init=10)


Balance after ClusterCentroids sampling:
Bankrupt: 220 | Not Bankrupt: 220



Unnamed: 0,Fixed Assets Turnover Frequency,Tax rate (A),Operating Expense Rate,Working capitcal Turnover Rate,Cash Reinvestment %,Current Liability to Current Assets,Total assets to GNP price,Operating Profit Per Share (Yuan ¥),Research and development expense rate,Cash flow rate,...,Total Asset Growth Rate,Interest Expense Ratio,Net Worth Turnover Rate (times),Current Assets/Total Assets,Degree of Financial Leverage (DFL),Liability to Equity,Operating Profit Growth Rate,Interest-bearing debt interest rate,Quick Asset Turnover Rate,Bankrupt?
0,5.013462e+07,0.111935,8.582692e+07,0.593985,0.380424,0.035812,0.008482,0.094356,5.422288e+08,0.464460,...,5.617115e+09,0.630719,0.032584,0.530107,0.027165,0.281275,0.847950,4.457571e-04,1.168462e+08,0
1,2.696991e-03,0.189535,6.564118e+09,0.593928,0.381223,0.035639,0.003425,0.114256,6.246176e+08,0.469450,...,7.231176e+09,0.631044,0.075247,0.507862,0.028193,0.279893,0.848017,5.705882e+07,5.264118e+09,0
2,8.849057e+07,0.158640,5.037925e+09,0.593967,0.379741,0.028425,0.010242,0.105451,4.722962e+08,0.468694,...,6.185849e+09,0.630640,0.037456,0.527675,0.027716,0.279601,0.847953,2.113208e+07,1.427925e+08,0
3,8.069444e+07,0.098410,2.086163e-04,0.593963,0.379192,0.030007,0.002264,0.105898,7.748056e+09,0.465396,...,6.734722e+09,0.630758,0.033002,0.503695,0.027778,0.279831,0.848482,3.365353e-04,1.925556e+07,0
4,8.960976e+07,0.131765,1.278049e+08,0.593928,0.381853,0.039091,0.009242,0.104703,5.403220e+08,0.465235,...,6.800976e+09,0.631603,0.049347,0.441282,0.027296,0.281055,0.848024,1.341463e+07,7.950244e+09,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,1.596210e-04,0.000000,4.117816e-04,0.593930,0.359404,0.044960,0.000487,0.089895,1.163030e-04,0.459854,...,4.780000e+09,0.630167,0.035968,0.737306,0.026666,0.292648,0.848117,5.610561e-04,3.019010e-04,1
436,1.739945e-04,0.000000,1.068481e-04,0.593895,0.432426,0.059374,0.017588,0.091931,1.880000e+09,0.472751,...,2.590000e+09,0.630536,0.158871,0.434591,0.026769,0.259280,0.847908,2.340234e-04,1.085847e-04,1
437,9.720996e-04,0.000000,5.530000e+09,0.593912,0.445615,0.050780,0.000847,0.074098,0.000000e+00,0.465222,...,2.570000e+08,0.630567,0.090161,0.739236,0.026777,0.336515,0.847953,2.760276e-04,4.620000e+08,1
438,2.103064e-04,0.000000,4.027619e-04,0.593880,0.355128,0.060766,0.000376,0.050566,0.000000e+00,0.457965,...,3.130000e+09,0.630378,0.038226,0.430385,0.026722,0.337315,0.848051,3.820382e-04,8.480000e+09,1
