In [1]:
from scipy.io import arff
import pandas as pd

In [2]:
# Preprocessing: read the ARFF file into a DataFrame and add a boolean label.
# data[0] holds the records; the 'class' values are bytes, so the comparison
# is made against b'1' to derive the 'bankruptcy' column.
data = arff.loadarff('4year.arff')
df = pd.DataFrame(data[0]).assign(
    bankruptcy=lambda frame: frame['class'] == b'1'
)
df.head()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64,class,bankruptcy
0,0.15929,0.4624,0.07773,1.1683,-44.853,0.46702,0.18948,0.82895,1.1223,0.3833,...,0.41557,0.89101,0.001422,7.7928,4.9914,119.81,3.0465,3.056,b'0',False
1,-0.12743,0.46243,0.26917,1.7517,7.597,0.000925,-0.12743,1.1625,1.2944,0.53757,...,-0.23704,1.0625,0.15041,5.4327,3.4629,100.97,3.615,3.4725,b'0',False
2,0.070488,0.2357,0.52781,3.2393,125.68,0.16367,0.086895,2.8718,1.0574,0.67689,...,0.10413,0.94571,0.0,7.107,3.3808,76.076,4.7978,4.7818,b'0',False
3,0.13676,0.40538,0.31543,1.8705,19.115,0.50497,0.13676,1.4539,1.1144,0.58938,...,0.23203,0.89737,0.073024,6.1384,4.2241,88.299,4.1337,4.6484,b'0',False
4,-0.11008,0.69793,0.18878,1.2713,-15.344,0.0,-0.11008,0.43282,1.735,0.30207,...,-0.3644,0.57153,0.0,18.801,2.7925,146.39,2.4934,15.036,b'0',False


In [3]:
# Number of bankrupt companies. 'bankruptcy' is a boolean column, so its
# vectorised sum counts the True entries directly (no Python-level loop and
# no redundant `== True`); int() keeps the displayed value a plain integer.
int(df['bankruptcy'].sum())

515

In [4]:
# Build a smaller frame: keep four attributes plus the label and give them
# shorter names via an explicit old-name -> new-name mapping.
old_col = ['Attr1', 'Attr2', 'Attr7', 'Attr10', 'bankruptcy']
new_col = ['X1', 'X2', 'X7','X10','Bankruptcy']
df_new = df[old_col].rename(columns=dict(zip(old_col, new_col)))
df_new

Unnamed: 0,X1,X2,X7,X10,Bankruptcy
0,0.159290,0.46240,0.189480,0.38330,False
1,-0.127430,0.46243,-0.127430,0.53757,False
2,0.070488,0.23570,0.086895,0.67689,False
3,0.136760,0.40538,0.136760,0.58938,False
4,-0.110080,0.69793,-0.110080,0.30207,False
...,...,...,...,...,...
9787,0.004676,0.54949,0.013002,0.43205,True
9788,-0.027610,0.60748,-0.027610,0.33509,True
9789,-0.238290,0.62708,-0.240360,0.17760,True
9790,0.097188,0.75300,0.104280,0.24700,True


In [5]:
# Impute missing entries: each NaN is replaced by the mean of its own column
column_means = df_new.mean()
df_new = df_new.fillna(column_means)

In [6]:
# Per-column flag: True if that column of df_new still has a missing value
df_new.isna().any()

X1            False
X2            False
X7            False
X10           False
Bankruptcy    False
dtype: bool

In [7]:
# Mean and standard deviation of the columns X1 through X10 (label slice),
# computed over every company and shown side by side in one table.
features_all = df_new.loc[:, 'X1':'X10']
df1 = features_all.mean().to_frame('mean')
df2 = features_all.std().to_frame('std')
df_stat = df1.join(df2)
print("mean and std of all companies")
df_stat

mean and std of all companies


Unnamed: 0,mean,std
X1,0.043019,0.359303
X2,0.596404,4.586887
X7,0.059446,0.533317
X10,0.38904,4.590064


In [8]:
# Mean and standard deviation of the columns X1 through X10 (label slice),
# restricted to the rows whose 'Bankruptcy' flag equals 1.
is_bankrupt = df_new['Bankruptcy'] == 1
features_bankrupt = df_new.loc[is_bankrupt, 'X1':'X10']
df1 = features_bankrupt.mean().to_frame('mean')
df2 = features_bankrupt.std().to_frame('std')
df_stat = df1.join(df2)
print("mean and std of bankrupt companies")
df_stat

mean and std of bankrupt companies


Unnamed: 0,mean,std
X1,-0.068873,0.568076
X2,0.878355,1.945596
X7,-0.061538,0.568432
X10,0.103367,1.946747


In [9]:
# Mean and standard deviation of the columns X1 through X10 (label slice),
# restricted to the rows whose 'Bankruptcy' flag equals 0.
is_operating = df_new['Bankruptcy'] == 0
features_operating = df_new.loc[is_operating, 'X1':'X10']
df1 = features_operating.mean().to_frame('mean')
df2 = features_operating.std().to_frame('std')
df_stat = df1.join(df2)
print("mean and std of still-operating companies")
df_stat

mean and std of still-operating companies


Unnamed: 0,mean,std
X1,0.049231,0.343002
X2,0.580752,4.689694
X7,0.066162,0.530524
X10,0.404899,4.692934


In [10]:
# Select the companies whose X1 AND X10 are both more than one standard
# deviation below the respective column mean (statistics taken over df_new).
x1_cutoff = df_new['X1'].mean() - df_new['X1'].std()
x10_cutoff = df_new['X10'].mean() - df_new['X10'].std()
df_select = df_new.loc[(df_new['X1'] < x1_cutoff) & (df_new['X10'] < x10_cutoff)]
df_select

Unnamed: 0,X1,X2,X7,X10,Bankruptcy
2312,-1.0927,5.6368,-1.0927,-4.6368,False
2608,-3.7231,11.53,-3.6424,-10.53,False
3017,-1.948,25.005,-1.948,-24.005,False
3739,-0.72685,6.9334,-0.72685,-5.9334,False
4767,-5.9655,6.6818,-5.9655,-5.6818,False
5001,-3.2845,20.403,-3.2845,-19.403,False
5259,-0.44,16.487,-0.44,-15.487,False
5859,-0.32841,6.1187,-0.32841,-5.1187,False
6264,-0.72755,5.2632,-0.72755,-4.2632,False
7846,-1.9841,13.063,-1.9841,-12.473,False


In [11]:
# Report how many rows the sub-group selection contains
n_selected = len(df_select)
print("the number of companies have X1 and X10 values 1 std below the mean:",n_selected)

the number of companies have X1 and X10 values 1 std below the mean: 15


In [12]:
# Share of bankrupt companies inside the selected sub-group.
# The mean of the boolean mask equals count(True) / number of rows, computed
# vectorised in pandas. Unlike sum(...)/len(...), it yields NaN instead of
# raising ZeroDivisionError when df_select has no rows.
ratio_bcp = (df_select['Bankruptcy'] == 1).mean()
print("ratio of the bankrupted companies among the sub-groups:",ratio_bcp)

ratio of the bankrupted companies among the sub-groups: 0.2
