In [134]:
#CS5228 project

# data manipulation
import pandas as pd
import numpy as np
import math
import random
import pickle
import torch.optim as optim


# visualiation
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision Tree
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE # for feature selection of LR
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier


# ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [235]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        #self.conv1 = nn.Conv2d(1, 6, 5)
        #self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(64, 100)
        self.fc2 = nn.Linear(100, 150)
        self.fc3 = nn.Linear(150, 200)
        self.fc4 = nn.Linear(200, 100)
        self.fc5 = nn.Linear(100, 50)
        self.fc6 = nn.Linear(50, 1)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        #x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square you can only specify a single number
        #x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        #x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.sigmoid(self.fc6(x))
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [81]:
#load into dataframe
f_data = pd.read_csv('financial_data.csv', na_values=['?']) 
revealed = pd.read_csv('revealed_businesses.csv')
t_data = pd.read_csv('testing_data.csv', na_values=['?'])

display(f_data.head())
display(revealed.head())

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var56,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65
0,18399,0.023954,0.15012,0.39567,3.6357,54.043,0.028822,0.031029,4.56831,1.0112,...,3871.001,0.011041,0.034914,0.98896,0.0,9.5214,5.8248,34.713,10.515,3.4752
1,15092,0.049699,0.065808,0.7268,12.944,233.11,0.0,0.063192,14.19601,0.89618,...,8751.901,0.059565,0.053189,0.93169,0.0,5.0492,11.152,24.784,14.727,4.2204
2,19821,-0.35631,0.39288,0.15884,1.4043,-2.619,-0.085597,-0.35632,1.54531,0.92963,...,44.859,-0.17277,-0.58691,1.3833,0.0,5.6112,15.779,154.26,2.3662,2.0738
3,14171,0.001417,0.70811,-0.052312,0.88978,-31.198,0.26952,0.001407,0.41222,1.9654,...,-331.879,-0.000535,0.00482,0.9993,0.74548,17.1011,7.9482,88.147,4.1408,3.4021
4,12900,0.020041,0.34652,0.33593,2.7613,39.05,0.0,0.020031,1.88591,1.2975,...,38170.001,0.21241,0.030652,0.80158,0.000862,9.767,6.757,53.651,6.8032,2.7412


Unnamed: 0,Var1,Var66
0,18399,0
1,19821,0
2,17769,0
3,19309,0
4,20728,0


In [92]:
def fill_empty_std(dataframe):
    ones = revealed[revealed.Var66==1].Var1
    ones_df = f_data_zeros[f_data_zeros.Var1.isin(ones)]
    #display(ones_df)
    zeroes = revealed[revealed.Var66==0].Var1
    zeroes_df = f_data_zeros[f_data_zeros.Var1.isin(zeroes)]
    #display(zeroes_df)
    for index, row in dataframe.iterrows():
        for column in dataframe:
            #display((revealed[revealed.Var1==row.Var1].Var66==1).bool())
            if ((revealed[revealed.Var1==row.Var1].Var66==1).bool):
                col = ones_df[column]
            else:
                col = zeroes_df[column]
            mean = col.mean()
            standard_deviation = col.std()
            if math.isnan(row[column]):
                replacement_value = np.random.normal(mean, standard_deviation)
                dataframe.at[index, column] = replacement_value
                #print("Replaced empty value with {}".format(replacement_value))

In [98]:
#clean data
clean_and_save = False
if clean_and_save:
    f_data_clean = f_data.replace(0, np.nan) # assume 0 values also means missing
    fill_empty_std(f_data_clean)
    f_data_clean.to_pickle("./f_data_clean.pkl")

t_data_clean = t_data.replace(0, np.nan)
fill_empty_std(t_data_clean)


Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var56,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65
0,15260,0.10801,0.13924,0.8302,6.9622,473.71,0.614014,0.108,6.18171,0.79295,...,617.791,0.28321,0.12547,0.73116,-23.076565,9.7199,3.4925,64.095,5.6947,25.95
1,14798,0.23663,0.86496,0.070858,1.0849,-18.866,-0.90779,0.27633,0.15613,2.1441,...,156.161,0.22827,1.7523,0.7946,-12.717664,22.7391,3.2655,142.16,2.5676,22.794
2,16288,0.010606,0.19772,0.42363,3.145,58.018,0.018703,0.010596,4.05761,1.5065,...,3758.001,0.54662,0.013208,0.47022,40.073896,3.9728,6.4937,47.851,7.6279,3.975
3,14843,0.00314,0.60277,-0.19351,0.24701,-1058.7,-0.00317,0.00313,0.659,0.11005,...,-22384.999,0.20705,0.00788,0.79303,0.87049,2.1823,8.841,852.31,0.42825,0.11752
4,16200,0.032548,0.35735,0.40921,2.1451,18.331,-0.284547,0.032538,1.79841,2.2632,...,3866.801,0.004981,0.050632,0.98583,-9.315263,7.622,6.3231,57.632,6.3332,9.6952


In [101]:
f_data_clean.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var56,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65
0,18399,0.023954,0.15012,0.39567,3.6357,54.043,0.028822,0.031029,4.56831,1.0112,...,3871.001,0.011041,0.034914,0.98896,24.619609,9.5214,5.8248,34.713,10.515,3.4752
1,15092,0.049699,0.065808,0.7268,12.944,233.11,-0.265134,0.063192,14.19601,0.89618,...,8751.901,0.059565,0.053189,0.93169,-27.933393,5.0492,11.152,24.784,14.727,4.2204
2,19821,-0.35631,0.39288,0.15884,1.4043,-2.619,-0.085597,-0.35632,1.54531,0.92963,...,44.859,-0.17277,-0.58691,1.3833,38.349935,5.6112,15.779,154.26,2.3662,2.0738
3,14171,0.001417,0.70811,-0.052312,0.88978,-31.198,0.26952,0.001407,0.41222,1.9654,...,-331.879,-0.000535,0.00482,0.9993,0.74548,17.1011,7.9482,88.147,4.1408,3.4021
4,12900,0.020041,0.34652,0.33593,2.7613,39.05,0.525878,0.020031,1.88591,1.2975,...,38170.001,0.21241,0.030652,0.80158,0.000862,9.767,6.757,53.651,6.8032,2.7412


In [102]:
#preprocess data
fdatan = f_data_clean.copy(deep=True)
normald = StandardScaler()
fdatan_1 = normald.fit_transform(fdatan.ix[:,fdatan.columns!="Var1"]) # normalize all variables except Var1
fdatan = pd.DataFrame(np.column_stack((fdatan["Var1"].values,fdatan_1)),
                      columns = fdatan.columns).set_index(fdatan.index)
tdatan = tdata.copy(deep=True)
normald = StandardScaler()
tdatan_1 = normald.fit_transform(tdatan.ix[:,tdatan.columns!="Var1"]) # normalize all variables except Var1
tdatan = pd.DataFrame(np.column_stack((tdatan["Var1"].values,tdatan_1)),
                      columns = tdatan.columns).set_index(tdatan.index)

fdatan.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var56,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65
0,18399.0,-0.048725,-0.085746,0.051995,-0.012682,0.012603,0.036679,-0.057024,-0.019398,-0.104522,...,-0.048021,0.0105,0.004109,-0.017771,0.245694,-0.015621,-0.099369,-0.002156,0.042712,-0.07974
1,15092.0,-0.009047,-0.102278,0.117008,0.003792,0.014003,-0.016611,-0.007704,-0.005818,-0.118623,...,0.044772,0.011304,0.005021,-0.018535,-0.356855,-0.015736,-0.037961,-0.002513,0.179885,-0.078069
2,19821.0,-0.634787,-0.038144,0.005497,-0.016632,0.012159,0.015936,-0.650996,-0.023662,-0.114522,...,-0.120762,0.007453,-0.026925,-0.012515,0.40312,-0.015722,0.015376,0.002148,-0.222671,-0.082881
3,14171.0,-0.083459,0.023668,-0.03596,-0.017542,0.011936,0.080314,-0.102447,-0.025261,0.012458,...,-0.127924,0.010308,0.002607,-0.017634,-0.028036,-0.015427,-0.074892,-0.000232,-0.164877,-0.079903
4,12900.0,-0.054756,-0.047235,0.040266,-0.01423,0.012485,0.126787,-0.073888,-0.023182,-0.069423,...,0.604054,0.013837,0.003896,-0.020269,-0.036574,-0.015615,-0.088623,-0.001474,-0.078171,-0.081385


In [244]:
#merge dataframe
fdatan_merged = fdatan.merge(revealed, how="outer")
fdatan_train1 = fdatan.merge(revealed) # dataframe, whose bankruptcy status is known (0 = good standing, 1 = bankrupt)

display(revealed.head())
display(len(fdatan_train1[fdatan_train1.Var66==0].index))
len(fdatan_train1[fdatan_train1.Var66==1].index)

Unnamed: 0,Var1,Var66
0,18399,0
1,19821,0
2,17769,0
3,19309,0
4,20728,0


4709

170

In [96]:
#classifier here
x1 = fdatan_train1.drop(columns=['Var1', 'Var66']) # Var 1 is company ID, Var 66 is the status
y1 = fdatan_train1['Var66']
t1 = tdatan.drop(columns=['Var1'])
cID = tdata['Var1'].tolist() # use original values, since nothing is done in cID of tdata
clas = AdaBoostClassifier() # select classifier here
clas.fit(x1, y1)

# write results to predictions.csv
f = open('predictions.csv', 'w')
f.write('Business_ID,Is_Bankrupted\n')
for a,b in zip(cID, clas.predict(t1)):
    f.write(str(a))
    f.write(',')
    f.write(str(round(b)))
    f.write('\n')
f.close()

In [253]:
index_to_round = round(len(fdatan_train1.index)*0.75)
display("Dataset will be split on index: {}".format(index_to_round))

for_training = fdatan_train1.iloc[:index_to_round, :]
for_testing = fdatan_train1.iloc[index_to_round:, :]

f_data_positives = for_training[for_training.Var66==0]
f_data_negatives = for_training[for_training.Var66==1]

x_pos = f_data_positives.drop(columns=['Var1', 'Var66'])
y_pos = f_data_positives.Var66
x_neg = f_data_negatives.drop(columns=['Var1', 'Var66'])
y_neg = f_data_negatives.Var66

x_testing = for_testing.drop(columns=['Var1', 'Var66'])
y_testing = for_testing.Var66

display(len(x_pos.index))
display(len(x_neg.index))

'Dataset will be split on index: 3659'

3532

127

In [113]:
clf = GradientBoostingClassifier(learning_rate=0.1)
clf.fit(x_train1, y_train1)
train_score = clf.score(x_train1, y_train1)
test_score = clf.score(x_test1, y_test1)
test_f1 = f1_score(y_test1, clf.predict(x_test1))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))

Train Score:0.9898879475266467 Test Score:0.9778688524590164
Test F1:0.490566037735849


In [114]:
# write results to predictions.csv
f = open('predictions.csv', 'w')
f.write('Business_ID,Is_Bankrupted\n')
for a,b in zip(cID, clf.predict(t1)):
    f.write(str(a))
    f.write(',')
    f.write(str(round(b)))
    f.write('\n')
f.close()

In [254]:
net = Net()
print(net)

Net(
  (fc1): Linear(in_features=64, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=150, bias=True)
  (fc3): Linear(in_features=150, out_features=200, bias=True)
  (fc4): Linear(in_features=200, out_features=100, bias=True)
  (fc5): Linear(in_features=100, out_features=50, bias=True)
  (fc6): Linear(in_features=50, out_features=1, bias=True)
)


In [260]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)


cpu


In [261]:
for epoch in range(50):  # loop over the dataset multiple times

    running_loss = 0.0
    for i in range(0, len(x_neg.index)):
        x_train = torch.Tensor(x_neg.iloc[i].values).float()
        y_train = torch.tensor(1).float()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(x_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        
        x_train = torch.Tensor(x_pos.iloc[np.random.randint(0, len(x_pos.index))].values).float()
        y_train = torch.tensor(0).float()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(x_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i == len(x_neg.index)-1:    # print every last mini-batch
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')


[1,   127] loss: 0.031
[2,   127] loss: 0.030
[3,   127] loss: 0.032
[4,   127] loss: 0.033
[5,   127] loss: 0.030
[6,   127] loss: 0.037
[7,   127] loss: 0.030
[8,   127] loss: 0.043
[9,   127] loss: 0.031
[10,   127] loss: 0.029
[11,   127] loss: 0.018
[12,   127] loss: 0.028
[13,   127] loss: 0.030
[14,   127] loss: 0.026
[15,   127] loss: 0.025
[16,   127] loss: 0.028
[17,   127] loss: 0.028
[18,   127] loss: 0.024
[19,   127] loss: 0.025
[20,   127] loss: 0.026
[21,   127] loss: 0.025
[22,   127] loss: 0.024
[23,   127] loss: 0.024
[24,   127] loss: 0.023
[25,   127] loss: 0.031
[26,   127] loss: 0.027
[27,   127] loss: 0.018
[28,   127] loss: 0.025
[29,   127] loss: 0.025
[30,   127] loss: 0.023
[31,   127] loss: 0.027
[32,   127] loss: 0.026
[33,   127] loss: 0.024
[34,   127] loss: 0.025
[35,   127] loss: 0.034
[36,   127] loss: 0.022
[37,   127] loss: 0.023
[38,   127] loss: 0.023
[39,   127] loss: 0.020
[40,   127] loss: 0.023
[41,   127] loss: 0.020
[42,   127] loss: 0.020
[

In [266]:
correct = 0
wrong = 0
for i in range(0, len(x_testing.index)):
    ref_x_test1 = torch.tensor(x_testing.iloc[i].values).float()
    ref_y_test1 = torch.tensor(y_testing.iloc[i]).float()
    outputs = net(ref_x_test1)
    if (round(ref_y_test1.item()) == round(outputs.item())):
        correct += 1
    else:
        wrong += 1
    #print('GroundTruth: {}'.format(ref_y_test1.item()))
    #print('Predicted: {}'.format(outputs.item()))
display('Number of correct: {}'.format(correct))
display('Number of wrong: {}'.format(wrong))

'Number of correct: 1014'

'Number of wrong: 206'

In [263]:
f = open('predictions.csv', 'w')
f.write('Business_ID,Is_Bankrupted\n')
for i in range(0, len(t1.index)):
    x_test = torch.Tensor(t1.iloc[i].values).float()
    outputs = net(x_test)
    f.write(str(cID[i]))
    f.write(',')
    f.write(str(round(outputs.item())))
    f.write('\n')
f.close()