In [104]:
#CS5228 project

# data manipulation
import pandas as pd
import numpy as np
import math
import random
import pickle
import torch.optim as optim
from scipy import stats


# visualiation
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision Tree
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE # for feature selection of LR
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE


# ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
def variance_threshold_selector(data, threshold=0.5):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]


In [3]:
#load into dataframe
f_data = pd.read_csv('financial_data.csv', na_values=['?']) 
revealed = pd.read_csv('revealed_businesses.csv')
t_data = pd.read_csv('testing_data.csv', na_values=['?'])

display(f_data.head())
display(t_data.head())
display(revealed.head())

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var56,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65
0,18399,0.023954,0.15012,0.39567,3.6357,54.043,0.028822,0.031029,4.56831,1.0112,...,3871.001,0.011041,0.034914,0.98896,0.0,9.5214,5.8248,34.713,10.515,3.4752
1,15092,0.049699,0.065808,0.7268,12.944,233.11,0.0,0.063192,14.19601,0.89618,...,8751.901,0.059565,0.053189,0.93169,0.0,5.0492,11.152,24.784,14.727,4.2204
2,19821,-0.35631,0.39288,0.15884,1.4043,-2.619,-0.085597,-0.35632,1.54531,0.92963,...,44.859,-0.17277,-0.58691,1.3833,0.0,5.6112,15.779,154.26,2.3662,2.0738
3,14171,0.001417,0.70811,-0.052312,0.88978,-31.198,0.26952,0.001407,0.41222,1.9654,...,-331.879,-0.000535,0.00482,0.9993,0.74548,17.1011,7.9482,88.147,4.1408,3.4021
4,12900,0.020041,0.34652,0.33593,2.7613,39.05,0.0,0.020031,1.88591,1.2975,...,38170.001,0.21241,0.030652,0.80158,0.000862,9.767,6.757,53.651,6.8032,2.7412


Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var56,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65
0,15260,0.10801,0.13924,0.8302,6.9622,473.71,0.0,0.108,6.18171,0.79295,...,617.791,0.28321,0.12547,0.73116,0.0,9.7199,3.4925,64.095,5.6947,25.95
1,14798,0.23663,0.86496,0.070858,1.0849,-18.866,-0.90779,0.27633,0.15613,2.1441,...,156.161,0.22827,1.7523,0.7946,0.0,22.7391,3.2655,142.16,2.5676,22.794
2,16288,0.010606,0.19772,0.42363,3.145,58.018,0.0,0.010596,4.05761,1.5065,...,3758.001,0.54662,0.013208,0.47022,0.0,3.9728,6.4937,47.851,7.6279,3.975
3,14843,0.00314,0.60277,-0.19351,0.24701,-1058.7,-0.00317,0.00313,0.659,0.11005,...,-22384.999,0.20705,0.00788,0.79303,0.87049,2.1823,8.841,852.31,0.42825,0.11752
4,16200,0.032548,0.35735,0.40921,2.1451,18.331,0.0,0.032538,1.79841,2.2632,...,3866.801,0.004981,0.050632,0.98583,0.0,7.622,6.3231,57.632,6.3332,9.6952


Unnamed: 0,Var1,Var66
0,18399,0
1,19821,0
2,17769,0
3,19309,0
4,20728,0


In [14]:
#Data Cleaning
#display(f_data.isna().sum().sort_values(ascending=False))
#f_data_2 = f_data.drop(columns=['Var38', 'Var22', 'Var61', 'Var28', 'Var61']) #Use in case of column reduction
#columns_no_output = list(f_data_2)  #Use in case of column reduction
#t_data_reduced = t_data[columns_no_output]
f_data_merged = f_data.merge(revealed)
f_data_revealed = f_data_merged[f_data_merged.Var66 != np.nan]

f_data_positive = f_data_revealed[f_data_revealed.Var66 == 0]
f_data_negative = f_data_revealed[f_data_revealed.Var66 == 1]

f_data_positive_clean = f_data_positive.fillna(method='ffill')
f_data_positive_clean = f_data_positive_clean.dropna()
f_data_negative_clean = f_data_negative.fillna(method='ffill')
f_data_negative_clean = f_data_negative_clean.dropna()

f_data_merged_clean = f_data_positive_clean.append(f_data_negative_clean).sort_index() 
f_data_no_index = f_data_merged_clean.drop(columns=['Var1', 'Var66']) #X Training
training_y = f_data_merged_clean.Var66 #Y training
columns_no_index = list(f_data_no_index)

f_data_positive_no_output = f_data_positive_clean.drop(columns=['Var66'])
f_data_negative_no_output = f_data_negative_clean.drop(columns=['Var66'])

t_data_clean = t_data.fillna(method='bfill').drop(columns=['Var1'])
cID = t_data['Var1'].tolist()
t_data_columns = list(t_data_clean)






#f_data_limited = variance_threshold_selector(f_data_no_index_no_outliers, 0.8) #x for training
#f_data_limited_output = f_data_limited.copy(deep='true')
#f_data_limited_output['Var66'] = training_y
#columns_no_index_limited = list(f_data_limited_output)

In [26]:
#display(f_data.isna().sum())
#display(f_data_merged_clean.isnull().sum())
#display(f_data_revealed.isnull().sum())
#display(f_data_merged_clean.Var66)
count = 0
for value in y_training:
    if value == 1:
        count+=1
count

126

Data Normalizing:

In [20]:
scaler = StandardScaler()

f_data_norm = scaler.fit_transform(f_data_no_index)
f_data_normalized = pd.DataFrame(f_data_norm, columns=columns_no_index)
f_data_normalized['Var66'] = training_y.values
#f_data_normalized = f_data_normalized.sample(frac=1)

t_data_norm = scaler.fit_transform(t_data_clean)
t_data_normalized = pd.DataFrame(t_data_norm, columns=t_data_columns)

display(f_data_normalized.head())
display(t_data_normalized.head())

Unnamed: 0,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,Var11,...,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65,Var66
0,-0.031578,-0.217947,-0.023833,-0.065522,0.016436,0.043052,-0.028107,-0.024453,-0.442741,0.158892,...,0.014975,-0.010964,-0.015799,-0.038823,-0.016877,-0.103804,-0.027241,-0.21316,-0.088586,0
1,-0.114623,0.474405,-0.028456,-0.069338,0.016689,0.017261,-0.12848,-0.027264,-0.020177,-0.428242,...,0.014054,-0.036933,-0.015734,-0.044719,-0.016877,-0.134625,-0.016933,-0.303888,0.043696,0
2,-0.310244,0.519723,-0.142454,-0.065566,0.016455,0.017261,-0.323589,-0.027363,-0.643001,-0.471554,...,0.011261,-0.154133,-0.012374,0.028404,-0.016767,-0.122845,-0.022144,-0.273064,-0.096589,0
3,-0.147664,0.124218,-0.853632,-0.081879,0.015869,0.017261,-0.164845,-0.026206,-0.364337,-0.093558,...,0.014151,-0.034607,-0.015302,-0.044719,-0.016792,-0.088503,-0.016169,-0.307144,-0.094299,0
4,-0.187001,0.803238,-0.805485,-0.077174,0.022185,0.017261,-0.200669,-0.027868,0.742657,-0.742599,...,0.014853,0.613947,-0.015536,-0.194292,0.001046,-0.110785,-0.024033,-0.256083,-0.06059,0


Unnamed: 0,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,Var11,...,Var56,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65
0,0.111465,-0.059198,0.082161,0.070248,0.214767,0.025733,0.090308,-0.023223,-0.466343,0.060784,...,-0.081084,0.495542,0.030066,-0.026149,-0.058331,-0.082314,-0.123844,-0.033871,-0.078187,0.00734
1,0.31212,0.003717,0.016274,-0.129639,0.017576,-0.046425,0.351322,-0.031098,0.313527,-0.002135,...,-0.086097,0.383066,0.265854,-0.026013,-0.058331,-0.072837,-0.127529,-0.025458,-0.166636,-0.003845
2,-0.040492,-0.054128,0.046884,-0.059575,0.048355,0.025733,-0.060728,-0.025999,-0.054489,0.055714,...,-0.046986,1.034808,0.013795,-0.026707,-0.058331,-0.086497,-0.075128,-0.035621,-0.023507,-0.07054
3,-0.052139,-0.019013,-0.006665,-0.158136,-0.398698,0.025481,-0.072305,-0.030441,-0.860506,0.020596,...,-0.330861,0.339624,0.013022,-0.026017,0.017325,-0.087801,-0.037025,0.051068,-0.227147,-0.084211
4,-0.006261,-0.040289,0.045632,-0.093582,0.032467,0.025733,-0.026704,-0.028952,0.38227,0.041874,...,-0.045804,-0.074061,0.019219,-0.025604,-0.058331,-0.083841,-0.077897,-0.034567,-0.060127,-0.050267


In [21]:
#Set Splitting
index_to_round = round(len(f_data_normalized.index)*0.75)
display("Dataset will be split on index: {}".format(index_to_round))


for_training = f_data_normalized.iloc[:index_to_round, :]
for_testing = f_data_normalized.iloc[index_to_round:, :]

x_training = for_training.drop(columns=['Var66'])
y_training = for_training.Var66


x_testing = for_testing.drop(columns=['Var66'])
y_testing = for_testing.Var66

'Dataset will be split on index: 3656'

In [97]:
clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=200, subsample=0.5,
                                 criterion='friedman_mse', min_samples_split=90, min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                 min_impurity_split=None, init=None, random_state=None, max_features='auto', verbose=0,
                                 max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1,
                                 n_iter_no_change=None, tol=0.0001)
clf.fit(x_training, y_training)
train_score = clf.score(x_training, y_training)
test_score = clf.score(x_testing, y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))

Train Score:0.9860503282275711 Test Score:0.9630541871921182
Test F1:0.23728813559322032


In [56]:
clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30,
                           p=2, metric='minkowski', metric_params=None, n_jobs=-1)
clf.fit(x_training, y_training)
train_score = clf.score(x_training, y_training)
test_score = clf.score(x_testing, y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))

Train Score:0.9669037199124726 Test Score:0.9655172413793104
Test F1:0.04545454545454545


In [47]:
clf = AdaBoostClassifier(base_estimator=None, n_estimators=65, learning_rate=1, algorithm='SAMME.R', random_state=None)
clf.fit(x_training, y_training)
train_score = clf.score(x_training, y_training)
test_score = clf.score(x_testing, y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))

Train Score:0.9729212253829321 Test Score:0.9655172413793104
Test F1:0.22222222222222218


In [99]:
# write results to predictions.csv
f = open('predictions_basic.csv', 'w')
f.write('Business_ID,Is_Bankrupted\n')
for a,b in zip(cID, clf.predict(t_data_normalized)):
    f.write(str(a))
    f.write(',')
    f.write(str(round(b)))
    f.write('\n')
f.close()

In [100]:
#Re-sampling:
x_full = f_data_normalized.copy()
x_maj = x_full[x_full.Var66==0]
x_min = x_full[x_full.Var66==1]
x_maj_rus = resample(x_maj,replace=False,n_samples=len(x_min),random_state=44)
x_rus = pd.concat([x_maj_rus, x_min])
x_train_rus = x_rus.drop(columns=['Var66'])
y_train_rus = x_rus.Var66


In [105]:
#SMOTE Re-Sampling:
sm = SMOTE(random_state=42)
x_train_sm, y_train_sm = sm.fit_sample(x_training, y_training)

In [118]:
clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.3, n_estimators=200, subsample=0.5,
                                 criterion='friedman_mse', min_samples_split=90, min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                 min_impurity_split=None, init=None, random_state=None, max_features='auto', verbose=0,
                                 max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1,
                                 n_iter_no_change=None, tol=0.0001)
clf.fit(x_train_sm, y_train_sm)
train_score = clf.score(x_train_sm, y_train_sm)
test_score = clf.score(x_testing, y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))

Train Score:1.0 Test Score:0.9589490968801314
Test F1:0.40476190476190477


In [119]:
# write results to predictions.csv
f = open('predictions_upsampled.csv', 'w')
f.write('Business_ID,Is_Bankrupted\n')
for a,b in zip(cID, clf.predict(t_data_normalized)):
    f.write(str(a))
    f.write(',')
    f.write(str(round(b)))
    f.write('\n')
f.close()

In [None]:
#Clustering:
