In [80]:
#Import the Intel sklearn accelerater
from sklearnex import patch_sklearn
patch_sklearn()
import os
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import plotly.graph_objects as go
from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, recall_score, f1_score

#classification models used:
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [50]:
directory = 'data/'

#store each dataframe in a list
dfs = []

#iterate through each .arff file in the directory
for filename in os.listdir(directory):
    if filename.endswith('.arff'):
        #load the arff file
        data = arff.loadarff(os.path.join(directory, filename))
        #convert it to dataframe
        df = pd.DataFrame(data[0])
        #append the dataframe to the list
        dfs.append(df)

#concat all dataframes in 1 df
all_years = pd.concat(dfs, ignore_index=True)
all_years.head()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr56,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64,class
0,0.20055,0.37951,0.39641,2.0472,32.351,0.38825,0.24976,1.3305,1.1389,0.50494,...,0.12196,0.39718,0.87804,0.001924,8.416,5.1372,82.658,4.4158,7.4277,b'0'
1,0.20912,0.49988,0.47225,1.9447,14.786,0.0,0.25834,0.99601,1.6996,0.49788,...,0.1213,0.42002,0.853,0.0,4.1486,3.2732,107.35,3.4,60.987,b'0'
2,0.24866,0.69592,0.26713,1.5548,-1.1523,0.0,0.30906,0.43695,1.309,0.30408,...,0.24114,0.81774,0.76599,0.69484,4.9909,3.951,134.27,2.7185,5.2078,b'0'
3,0.081483,0.30734,0.45879,2.4928,51.952,0.14988,0.092704,1.8661,1.0571,0.57353,...,0.054015,0.14207,0.94598,0.0,4.5746,3.6147,86.435,4.2228,5.5497,b'0'
4,0.18732,0.61323,0.2296,1.4063,-7.3128,0.18732,0.18732,0.6307,1.1559,0.38677,...,0.13485,0.48431,0.86515,0.12444,6.3985,4.3158,127.21,2.8692,7.898,b'0'


In [51]:

print(all_years.describe())
print(len(all_years))

              Attr1         Attr2         Attr3         Attr4         Attr5  \
count  43397.000000  43397.000000  43397.000000  43271.000000  4.331600e+04   
mean       0.035160      0.590212      0.114431      6.314702 -3.853466e+02   
std        2.994109      5.842748      5.439429    295.434425  6.124303e+04   
min     -463.890000   -430.870000   -479.960000     -0.403110 -1.190300e+07   
25%        0.003429      0.268980      0.021521      1.049500 -4.908000e+01   
50%        0.049660      0.471900      0.196610      1.569800 -1.034500e+00   
75%        0.129580      0.688320      0.403390      2.787450  5.063425e+01   
max       94.280000    480.960000     28.336000  53433.000000  1.250100e+06   

              Attr6         Attr7         Attr8         Attr9        Attr10  \
count  43397.000000  43397.000000  43311.000000  43396.000000  43397.000000   
mean      -0.056107      0.093478     12.640779      2.652166      0.626868   
std        7.201326      5.713075    505.894281    

#### Check for missing values

In [52]:
#get the count of nas by column
na_counts = all_years.isna().sum()
print(na_counts)
#find the columns with nas
columns_with_na = all_years.columns[all_years.isna().any()].tolist()
print(len(columns_with_na))

#Sort the na_counts in descending order
sorted_na_counts = na_counts.sort_values(ascending=False)

#Display the top 10 of those and their counts
top_10_na_counts = sorted_na_counts.head(10)
print(top_10_na_counts)
print(100*top_10_na_counts/len(all_years))

Attr1       8
Attr2       8
Attr3       8
Attr4     134
Attr5      89
         ... 
Attr61    102
Attr62    127
Attr63    134
Attr64    812
class       0
Length: 65, dtype: int64
64
Attr37    18984
Attr21     5854
Attr27     2764
Attr60     2152
Attr45     2147
Attr24      922
Attr64      812
Attr53      812
Attr28      812
Attr54      812
dtype: int64
Attr37    43.736897
Attr21    13.486925
Attr27     6.367930
Attr60     4.957954
Attr45     4.946435
Attr24     2.124179
Attr64     1.870752
Attr53     1.870752
Attr28     1.870752
Attr54     1.870752
dtype: float64


#### Begin to prep data for imputation

In [53]:
#Redo the class labels from b'0' for bankruptcy and b'1' for not bankruptcy, to just 0 and 1
all_years['class'] = all_years['class'].apply(lambda y: int(y.decode('utf-8')))

#print the head to make sure it converted properly
all_years.head()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr56,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64,class
0,0.20055,0.37951,0.39641,2.0472,32.351,0.38825,0.24976,1.3305,1.1389,0.50494,...,0.12196,0.39718,0.87804,0.001924,8.416,5.1372,82.658,4.4158,7.4277,0
1,0.20912,0.49988,0.47225,1.9447,14.786,0.0,0.25834,0.99601,1.6996,0.49788,...,0.1213,0.42002,0.853,0.0,4.1486,3.2732,107.35,3.4,60.987,0
2,0.24866,0.69592,0.26713,1.5548,-1.1523,0.0,0.30906,0.43695,1.309,0.30408,...,0.24114,0.81774,0.76599,0.69484,4.9909,3.951,134.27,2.7185,5.2078,0
3,0.081483,0.30734,0.45879,2.4928,51.952,0.14988,0.092704,1.8661,1.0571,0.57353,...,0.054015,0.14207,0.94598,0.0,4.5746,3.6147,86.435,4.2228,5.5497,0
4,0.18732,0.61323,0.2296,1.4063,-7.3128,0.18732,0.18732,0.6307,1.1559,0.38677,...,0.13485,0.48431,0.86515,0.12444,6.3985,4.3158,127.21,2.8692,7.898,0


#### Using KNNImputer to impute missing values

In [54]:
#create a KNNImputer instance and set n_neighbors as 10
imputer = KNNImputer(n_neighbors=10)
all_years_imputed = imputer.fit_transform(all_years)
all_years_imputed = pd.DataFrame(all_years_imputed, columns=all_years.columns)


In [55]:

#check again for missing values
na_counts = all_years_imputed.isna().sum()
print(na_counts)

Attr1     0
Attr2     0
Attr3     0
Attr4     0
Attr5     0
         ..
Attr61    0
Attr62    0
Attr63    0
Attr64    0
class     0
Length: 65, dtype: int64


#### View Correlation between columns

In [56]:
#calculate correlation matrix
corr_matrix = all_years.corr()

#make a heatmap
heatmap = go.Figure(data=go.Heatmap(
                   z=corr_matrix,
                   x=list(corr_matrix.columns),
                   y=list(corr_matrix.index),
                   colorscale='Bluered_r',
                   zmin=-1,  # Ensure the colorscale covers the full range
                   zmax=1))

#annotations for correlations over 0.9 or under -0.9
for i in range(len(corr_matrix.columns)):
    for j in range(len(corr_matrix.index)):
        value = corr_matrix.iloc[i, j]
        if value > 0.9 or value < -0.9:
            heatmap.add_annotation(dict(
                x=corr_matrix.columns[j],
                y=corr_matrix.index[i],
                text=str(round(value, 2)),
                showarrow=False,
                font=dict(
                    size=4,
                    color="white"
                ),
            ))

heatmap.update_layout(
    title='Correlation Heatmap',
    width=1000,
    height=1000,
    xaxis=dict(
        title="Attributes",
        tickfont=dict(
            size=7,  # adjust as needed
        ),
    ),
    yaxis=dict(
        title="Attributes",
        tickfont=dict(
            size=7,  # adjust as needed
        ),
    ),
)

heatmap.show()


In [57]:
# Set the diagonal values to NaN
for i in range(corr_matrix.shape[0]):
    corr_matrix.iloc[i, i] = np.nan

# Unstack and filter correlations that are above 0.9 or below -0.9
high_corr = corr_matrix.unstack()
high_corr = high_corr[(high_corr > 0.9) | (high_corr < -0.9)].reset_index()

# Rename the columns
high_corr.columns = ['Attribute 1', 'Attribute 2', 'Correlation']
print(high_corr)


    Attribute 1 Attribute 2  Correlation
0         Attr2       Attr3    -0.926983
1         Attr2      Attr51     0.926773
2         Attr3       Attr2    -0.926983
3         Attr3      Attr51    -0.998219
4         Attr4      Attr46     0.999920
..          ...         ...          ...
123      Attr63      Attr35     0.909285
124      Attr63      Attr36     0.931282
125      Attr63      Attr48     0.901437
126      Attr64      Attr53     0.924560
127      Attr64      Attr54     0.925560

[128 rows x 3 columns]


In [58]:
class_counts = all_years_imputed['class'].value_counts()

#make  bar chart
fig = go.Figure(data=[
    go.Bar(x=class_counts.index, y=class_counts.values,
           marker_color=['lightblue' if x == 0 else 'lightcoral' for x in class_counts.index])
])
#put in annotations
for x, y in zip(class_counts.index, class_counts.values):
    fig.add_annotation(
        x=x,
        y=y,
        text=str(y),
        showarrow=False,
        font=dict(
            size=14,
            color="black"
        )
    )
#update layout for title and labels
fig.update_layout(
    title='Class Distribution',
    xaxis_title='Class',
    xaxis=dict(
        tickmode='array',
        tickvals=[0, 1],
        ticktext=['Did Not File Bankruptcy', 'Filed Bankruptcy']
    ),
    yaxis_title='Count',
)

fig.show()

#### Handle the class imbalance using imbalanced learn

In [59]:
#split into X and y
X = all_years_imputed.iloc[:,0:64]
y = all_years_imputed.iloc[:,64]
print(X.head())
print(y.head())

      Attr1    Attr2    Attr3   Attr4    Attr5    Attr6     Attr7    Attr8  \
0  0.200550  0.37951  0.39641  2.0472  32.3510  0.38825  0.249760  1.33050   
1  0.209120  0.49988  0.47225  1.9447  14.7860  0.00000  0.258340  0.99601   
2  0.248660  0.69592  0.26713  1.5548  -1.1523  0.00000  0.309060  0.43695   
3  0.081483  0.30734  0.45879  2.4928  51.9520  0.14988  0.092704  1.86610   
4  0.187320  0.61323  0.22960  1.4063  -7.3128  0.18732  0.187320  0.63070   

    Attr9   Attr10  ...    Attr55    Attr56   Attr57   Attr58    Attr59  \
0  1.1389  0.50494  ...  348690.0  0.121960  0.39718  0.87804  0.001924   
1  1.6996  0.49788  ...    2304.6  0.121300  0.42002  0.85300  0.000000   
2  1.3090  0.30408  ...    6332.7  0.241140  0.81774  0.76599  0.694840   
3  1.0571  0.57353  ...   20545.0  0.054015  0.14207  0.94598  0.000000   
4  1.1559  0.38677  ...    3186.6  0.134850  0.48431  0.86515  0.124440   

   Attr60  Attr61   Attr62  Attr63   Attr64  
0  8.4160  5.1372   82.658  4.4158

#### Create pipelines with sampling

In [71]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

##### XGBoost

In [76]:
#create a dict for scores for each model
scoring = {'accuracy': make_scorer(accuracy_score),
           'recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score)
           }

xg_clf = make_pipeline_with_sampler(
    RandomOverSampler(random_state=123),
    XGBClassifier()
)
#allow the cross_validate function to handle the train test split
scores = cross_validate(xg_clf, X_scaled, y, scoring=scoring, cv=5, return_train_score=False)

mean_scores = {metric: np.mean(values) for metric, values in scores.items()}

print(mean_scores)


{'fit_time': 2.177486705780029, 'score_time': 0.007503604888916016, 'test_accuracy': 0.9663633221979036, 'test_recall': 0.6093147274782748, 'test_f1_score': 0.6364681609637282}


##### Support Vector Machine Classifier

In [77]:
#create a dict for scores for each model
scoring = {'accuracy': make_scorer(accuracy_score),
           'recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score)
           }

svc_clf = make_pipeline_with_sampler(
    RandomOverSampler(random_state=123),
    SVC()
)
#allow the cross_validate function to handle the train test split
scores = cross_validate(svc_clf, X_scaled, y, scoring=scoring, cv=5, return_train_score=False)

mean_scores = {metric: np.mean(values) for metric, values in scores.items()}

print(mean_scores)

{'fit_time': 4.645179033279419, 'score_time': 0.11526618003845215, 'test_accuracy': 0.7221979034673425, 'test_recall': 0.5948795834237363, 'test_f1_score': 0.171906165296509}


##### Random Forest Classifier

In [79]:
#create a dict for scores for each model
scoring = {'accuracy': make_scorer(accuracy_score),
           'recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score)
           }

rf_clf = make_pipeline_with_sampler(
    RandomOverSampler(random_state=123),
    RandomForestClassifier()
)
#allow the cross_validate function to handle the train test split
scores = cross_validate(rf_clf, X_scaled, y, scoring=scoring, cv=5, return_train_score=False)

mean_scores = {metric: np.mean(values) for metric, values in scores.items()}

print(mean_scores)

{'fit_time': 0.2938295841217041, 'score_time': 0.010000658035278321, 'test_accuracy': 0.9559728141919134, 'test_recall': 0.18126091971086317, 'test_f1_score': 0.2832907832405359}


##### Decision Tree Classifier

In [81]:
#create a dict for scores for each model
scoring = {'accuracy': make_scorer(accuracy_score),
           'recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score)
           }

dt_clf = make_pipeline_with_sampler(
    RandomOverSampler(random_state=123),
    DecisionTreeClassifier()
)
#allow the cross_validate function to handle the train test split
scores = cross_validate(dt_clf, X_scaled, y, scoring=scoring, cv=5, return_train_score=False)

mean_scores = {metric: np.mean(values) for metric, values in scores.items()}

print(mean_scores)

{'fit_time': 2.2094356060028075, 'score_time': 0.006229925155639649, 'test_accuracy': 0.9380025342702453, 'test_recall': 0.3697045825672882, 'test_f1_score': 0.3650316863411459}
