In [1]:
#CS5228 project

# data manipulation
import pandas as pd
import numpy as np
import math
import random
import pickle
import torch.optim as optim
from scipy import stats


# visualiation
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision Tree
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import adjusted_rand_score
from sklearn.feature_selection import RFE # for feature selection of LR
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#Clustering
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.cluster import SpectralClustering

# ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [73]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        #self.conv1 = nn.Conv2d(1, 6, 5)
        #self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(15, 100)
        self.do1 = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(100, 150)
        self.fc3 = nn.Linear(150, 200)
        self.fc4 = nn.Linear(200, 100)
        self.fc5 = nn.Linear(100, 50)
        self.fc6 = nn.Linear(50, 1)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        #x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square you can only specify a single number
        #x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        #x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = self.do1(x)
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.do1(x)
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = self.fc6(x)
        return x


In [3]:
#load into dataframe
f_data = pd.read_csv('financial_data.csv', na_values=['?']) 
revealed = pd.read_csv('revealed_businesses.csv')
t_data = pd.read_csv('testing_data.csv', na_values=['?'])

#display(f_data.head())
#display(t_data.head())
#display(revealed.head())

In [4]:
f_data = f_data.replace(0, np.nan) # assume 0 values also means missing
t_data = t_data.replace(0, np.nan)

f_data_merged = f_data.merge(revealed, how='outer')
f_data_first_clean = f_data_merged.copy(deep=True)

f_data_positive = f_data_merged[f_data_merged.Var66 == 0]
f_data_negative = f_data_merged[f_data_merged.Var66 == 1]

f_data_positive_clean = f_data_positive.fillna(method='ffill')
f_data_negative_clean = f_data_negative.fillna(method='ffill')
dfp = f_data_positive_clean.drop(columns=['Var1','Var66'])
dfn = f_data_negative_clean.drop(columns=['Var1','Var66'])
f_data_positive_clean_train = dfp[(np.abs(stats.zscore(dfp)) < 3).all(axis=1)]
f_data_negative_clean_train = dfn[(np.abs(stats.zscore(dfn)) < 3).all(axis=1)]

f_data_first_clean.update(f_data_positive_clean)
f_data_first_clean.update(f_data_negative_clean)
f_data_clean = f_data_first_clean.drop(columns=['Var66']).dropna()
f_data_training = f_data_clean.merge(revealed)
x_values = f_data_training.drop(columns=['Var66', 'Var1'])
y_values = f_data_training.Var66
x_columns = list(x_values)


In [5]:

t_data_clean = t_data.fillna(method='bfill').drop(columns=['Var1'])
cID = t_data['Var1'].tolist()
t_data_columns = list(t_data_clean)






#f_data_limited = variance_threshold_selector(f_data_no_index_no_outliers, 0.8) #x for training
#f_data_limited_output = f_data_limited.copy(deep='true')
#f_data_limited_output['Var66'] = training_y
#columns_no_index_limited = list(f_data_limited_output)

Data Normalizing:

In [6]:
scaler = StandardScaler()

f_data_norm = scaler.fit_transform(x_values)
f_data_normalized = pd.DataFrame(f_data_norm, columns=x_columns)

t_data_norm = scaler.fit_transform(t_data_clean)
t_data_normalized = pd.DataFrame(t_data_norm, columns=x_columns)

display(f_data_normalized.head())
display(t_data_normalized.head())

Unnamed: 0,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,Var11,...,Var56,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65
0,-0.031578,-0.219367,-0.023833,-0.065522,0.016436,0.055855,-0.028101,-0.024453,-0.445821,0.158892,...,0.135385,0.014975,-0.011116,-0.015799,-0.057772,-0.016877,-0.103971,-0.027261,-0.213907,-0.088628
1,-0.114623,0.473085,-0.028456,-0.069338,0.016689,0.055855,-0.128474,-0.027264,-0.02279,-0.428242,...,-0.114428,0.014054,-0.037085,-0.015734,-0.057772,-0.016877,-0.134793,-0.016953,-0.304649,0.043654
2,-0.310244,0.51841,-0.142454,-0.065566,0.016455,0.055855,-0.323582,-0.027363,-0.646302,-0.471554,...,-0.104362,0.011261,-0.15428,-0.012374,0.00289,-0.016767,-0.123012,-0.022165,-0.273821,-0.096631
3,-0.147664,0.122847,-0.853632,-0.081879,0.015869,0.055855,-0.164838,-0.026206,-0.36733,-0.093558,...,-0.156144,0.014151,-0.034759,-0.015302,0.00289,-0.016792,-0.08867,-0.01619,-0.307905,-0.094341
4,-0.187001,0.801967,-0.805485,-0.077174,0.022185,0.055855,-0.200662,-0.027868,0.740887,-0.742599,...,-0.147965,0.014853,0.61377,-0.015536,-0.198063,0.001046,-0.110952,-0.024053,-0.256837,-0.060632


Unnamed: 0,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,Var11,...,Var56,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65
0,0.111465,-0.059264,0.08215,0.070248,0.214773,-0.046135,0.089683,-0.023223,-0.472214,0.060784,...,-0.081084,0.495542,0.029833,-0.026152,-0.036325,-0.082314,-0.12545,-0.033877,-0.079251,0.006974
1,0.31212,0.003651,0.016262,-0.129639,0.017582,-0.046135,0.350676,-0.031098,0.308982,-0.002135,...,-0.086097,0.383066,0.265619,-0.026016,-0.036325,-0.072837,-0.129134,-0.025464,-0.167712,-0.004211
2,-0.040492,-0.054194,0.046872,-0.059575,0.04836,0.025709,-0.06134,-0.025999,-0.05966,0.055714,...,-0.046986,1.034808,0.013562,-0.02671,-0.036325,-0.086497,-0.076747,-0.035627,-0.024564,-0.070907
3,-0.052139,-0.019079,-0.006677,-0.158136,-0.398692,0.025709,-0.072916,-0.030441,-0.867046,0.020596,...,-0.330861,0.339624,0.01279,-0.026019,-0.036325,-0.087801,-0.038655,0.051062,-0.228231,-0.084578
4,-0.006261,-0.040355,0.045621,-0.093582,0.032473,0.041712,-0.027319,-0.028952,0.377842,0.041874,...,-0.045804,-0.074061,0.018986,-0.025607,-0.015155,-0.083841,-0.079515,-0.034573,-0.061189,-0.050634


In [7]:
#Set Splitting
index_to_round = round(len(f_data_normalized.index)*0.8)
display("Dataset will be split on index: {}".format(index_to_round))

x_training = f_data_normalized.iloc[:index_to_round, :]
y_training = y_values.iloc[:index_to_round]


x_testing = f_data_normalized.iloc[index_to_round:, :]
y_testing = y_values.iloc[index_to_round:]

print(x_training.shape)
print(y_training.shape)
print(x_testing.shape)
print(y_testing.shape)

'Dataset will be split on index: 3899'

(3899, 64)
(3899,)
(975, 64)
(975,)


Feature selection:

Best 10 features

In [8]:
clf = GradientBoostingClassifier()
#clf.fit(x_training, y_training)

rfe = RFE(clf, 15)
rfe = rfe.fit(x_training, y_training)
# summarize the selection of the attributes
#print(rfe.support_)
#print(rfe.ranking_)

In [9]:
useful_features = x_columns.copy()
GBCFeatures = []
for i, feature in enumerate(useful_features):
    if rfe.support_[i]:
        GBCFeatures.append(feature)
GBCFeatures


['Var6',
 'Var7',
 'Var17',
 'Var22',
 'Var23',
 'Var25',
 'Var28',
 'Var34',
 'Var35',
 'Var38',
 'Var40',
 'Var45',
 'Var47',
 'Var58',
 'Var60']

In [82]:
clf = AdaBoostClassifier(base_estimator=None, n_estimators=65, learning_rate=1, algorithm='SAMME.R', random_state=None)
rfe = RFE(clf, 10)
rfe = rfe.fit(x_training, y_training)
# summarize the selection of the attributes

In [83]:
useful_features = x_columns.copy()
ABFeatures = []
for i, feature in enumerate(useful_features):
    if rfe.support_[i]:
        ABFeatures.append(feature)
ABFeatures

['Var6',
 'Var7',
 'Var13',
 'Var25',
 'Var28',
 'Var31',
 'Var35',
 'Var38',
 'Var46',
 'Var60']

In [93]:
clf = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, 
                         class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', 
                         verbose=0, warm_start=False, n_jobs=-1)
rfe = RFE(clf, 20)
rfe = rfe.fit(x_training, y_training)
# summarize the selection of the attributes

In [94]:
useful_features = x_columns.copy()
LRFeatures = []
for i, feature in enumerate(useful_features):
    if rfe.support_[i]:
        LRFeatures.append(feature)
LRFeatures

['Var5',
 'Var10',
 'Var11',
 'Var12',
 'Var13',
 'Var17',
 'Var23',
 'Var25',
 'Var27',
 'Var29',
 'Var35',
 'Var36',
 'Var37',
 'Var39',
 'Var47',
 'Var49',
 'Var51',
 'Var55',
 'Var62',
 'Var64']

In [10]:
clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=500, subsample=0.5,
                                 criterion='friedman_mse', min_samples_split=90, min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                 min_impurity_split=None, init=None, random_state=None, max_features='auto', verbose=0,
                                 max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1,
                                 n_iter_no_change=None, tol=0.0001)
clf.fit(x_training[GBCFeatures], y_training)
train_score = clf.score(x_training[GBCFeatures], y_training)
test_score = clf.score(x_testing[GBCFeatures], y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing[GBCFeatures]))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))

Train Score:0.9997435239805078 Test Score:0.9702564102564103
Test F1:0.21621621621621623


In [90]:
clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30,
                           p=1, metric='minkowski', metric_params=None, n_jobs=-1)
clf.fit(x_training[GBCFeatures], y_training)
train_score = clf.score(x_training[GBCFeatures], y_training)
test_score = clf.score(x_testing[GBCFeatures], y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing[GBCFeatures]))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))

Train Score:0.9648627853295717 Test Score:0.9692307692307692
Test F1:0.0


In [15]:
clf = AdaBoostClassifier(base_estimator=None, n_estimators=85, learning_rate=1,
                         algorithm='SAMME.R', random_state=None)
clf.fit(x_training[ABFeatures], y_training)
train_score = clf.score(x_training[ABFeatures], y_training)
test_score = clf.score(x_testing[ABFeatures], y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing[ABFeatures]))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))

Train Score:0.9745623632385121 Test Score:0.964696223316913
Test F1:0.18867924528301888


In [16]:
clf = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, 
                         class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', 
                         verbose=0, warm_start=False, n_jobs=-1)
clf.fit(x_training[LRFeatures], y_training)
train_score = clf.score(x_training[LRFeatures], y_training)
test_score = clf.score(x_testing[LRFeatures], y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing[LRFeatures]))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))

Train Score:0.9655361050328227 Test Score:0.9638752052545156
Test F1:0.0


In [17]:
# write results to predictions.csv
f = open('predictions_10features.csv', 'w')
f.write('Business_ID,Is_Bankrupted\n')
for a,b in zip(cID, clf.predict(t_data_normalized[GBCFeatures])):
    f.write(str(a))
    f.write(',')
    f.write(str(round(b)))
    f.write('\n')
f.close()

In [28]:
#SMOTE Re-Sampling:
sm = SMOTE(random_state=12)
x_train_sm, y_train_sm = sm.fit_sample(x_training[GBCFeatures], y_training)
x_train_sm_df = pd.DataFrame(data = x_train_sm, columns = GBCFeatures)
smote_df = x_train_sm_df.copy()
smote_df['Var66'] = y_train_sm

In [12]:
clf = GradientBoostingClassifier(loss='exponential', learning_rate=0.4, n_estimators=200, subsample=0.5,
                                 criterion='friedman_mse', min_samples_split=90, min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                 min_impurity_split=None, init=None, random_state=None, max_features='auto', verbose=0,
                                 max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1,
                                 n_iter_no_change=None, tol=0.0001)
clf.fit(x_train_sm, y_train_sm)
train_score = clf.score(x_train_sm, y_train_sm)
test_score = clf.score(x_testing[GBCFeatures], y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing[GBCFeatures]))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))

Train Score:0.9974734042553192 Test Score:0.9456410256410256
Test F1:0.34567901234567905


In [74]:
net = Net()
print(net)

Net(
  (fc1): Linear(in_features=15, out_features=100, bias=True)
  (do1): Dropout(p=0.2)
  (fc2): Linear(in_features=100, out_features=150, bias=True)
  (fc3): Linear(in_features=150, out_features=200, bias=True)
  (fc4): Linear(in_features=200, out_features=100, bias=True)
  (fc5): Linear(in_features=100, out_features=50, bias=True)
  (fc6): Linear(in_features=50, out_features=1, bias=True)
)


In [75]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)


cpu


In [83]:
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0
    for i in range(0, 1500):
        randn = np.random.randint(0, len(x_train_sm))
        x_train = torch.Tensor(x_train_sm_df.iloc[randn].values).float()
        y_train = torch.tensor([y_train_sm[randn]]).float()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(x_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i %500 == 499:    # print every 200 mini-batches
            print(outputs.data[0], y_train)
            print('[%d, %5d] loss: %.5f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')


tensor(1.6642) tensor([0.])
[1,   500] loss: 0.10247
tensor(1.1332) tensor([1.])
[1,  1000] loss: 0.10557
tensor(1.6996) tensor([1.])
[1,  1500] loss: 0.10207
tensor(-1.3121) tensor([0.])
[2,   500] loss: 0.10449
tensor(0.9810) tensor([1.])
[2,  1000] loss: 0.10539
tensor(-3.8386) tensor([0.])
[2,  1500] loss: 0.10018
tensor(0.0064) tensor([0.])
[3,   500] loss: 0.10995
tensor(-8.6286) tensor([0.])
[3,  1000] loss: 0.10587
tensor(-5.9347) tensor([0.])
[3,  1500] loss: 0.10964
tensor(0.9686) tensor([1.])
[4,   500] loss: 0.09732
tensor(-1.7024) tensor([0.])
[4,  1000] loss: 0.09521
tensor(0.1214) tensor([1.])
[4,  1500] loss: 0.11120
tensor(0.5361) tensor([1.])
[5,   500] loss: 0.10536
tensor(1.9390) tensor([1.])
[5,  1000] loss: 0.09997
tensor(-9.1979) tensor([0.])
[5,  1500] loss: 0.10768
tensor(-6.4498) tensor([0.])
[6,   500] loss: 0.10053
tensor(1.3738) tensor([1.])
[6,  1000] loss: 0.10650
tensor(-13.1765) tensor([0.])
[6,  1500] loss: 0.07822
tensor(2.7947) tensor([1.])
[7,   500

In [81]:
correct = 0
wrong = 0
correct_bankruptcy = 0
incorrect_bankruptcy = 0
for i in range(0, len(x_testing[GBCFeatures].index)):
    ref_x_test1 = torch.tensor(x_testing[GBCFeatures].iloc[i].values).float()
    ref_y_test1 = torch.tensor(y_testing.iloc[i]).float()
    outputs = net(ref_x_test1)
    if (round(ref_y_test1.item()) == round(outputs.item())):
        correct += 1
    else:
        #print(outputs.data[0], ref_y_test1.item())
        wrong += 1
    if (round(ref_y_test1.item()) == 1 and round(outputs.item()) == 1):
        correct_bankruptcy +=1
    if (round(ref_y_test1.item()) == 1 and round(outputs.item()) == 0):
        incorrect_bankruptcy +=1
    #print('Bankruptcy guessed correctly!')
    #print('GroundTruth: {}'.format(ref_y_test1.item()))
    #print('Predicted: {}'.format(outputs.item()))
display('Number of correct: {}'.format(correct))
display('Number of wrong: {}'.format(wrong))
display('Number of correct bankruptcies: {}'.format(correct_bankruptcy))
display('Number of wrong bankruptcies: {}'.format(incorrect_bankruptcy))

'Number of correct: 123'

'Number of wrong: 852'

'Number of correct bankruptcies: 13'

'Number of wrong bankruptcies: 6'

In [82]:
f = open('predictions_nn.csv', 'w')
f.write('Business_ID,Is_Bankrupted\n')
for i in range(0, len(t_data_normalized.index)):
    x_test = torch.Tensor(t_data_normalized[GBCFeatures].iloc[i].values).float()
    outputs = net(x_test)
    f.write(str(cID[i]))
    f.write(',')
    f.write(str(round(outputs.item())))
    f.write('\n')
f.close()

In [None]:
# write results to predictions.csv
f = open('predictions_upsampled.csv', 'w')
f.write('Business_ID,Is_Bankrupted\n')
for a,b in zip(cID, clf.predict(t_data_normalized[GBCFeatures])):
    f.write(str(a))
    f.write(',')
    f.write(str(round(b)))
    f.write('\n')
f.close()