In [1]:
#CS5228 project

# data manipulation
import pandas as pd
import numpy as np
import math
import random
import pickle
import torch.optim as optim
from scipy import stats


# visualiation
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision Tree
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import adjusted_rand_score
from sklearn.feature_selection import RFE # for feature selection of LR
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from xgboost import XGBClassifier

#Clustering
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.cluster import SpectralClustering

# ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
#load into dataframe
f_data = pd.read_csv('financial_data.csv', na_values=['?']) 
revealed = pd.read_csv('revealed_businesses.csv')
t_data = pd.read_csv('testing_data.csv', na_values=['?'])

#display(f_data.head())
#display(t_data.head())
#display(revealed.head())

In [3]:
#clean data
#fdata = fdata.replace(0, np.nan) # assume 0 values also means missing
f_data = f_data.fillna(f_data.mean())
#tdata = tdata.replace(0, np.nan)
t_data = t_data.fillna(t_data.mean())

Data Normalizing:

In [4]:
#preprocess data
fdatan = f_data.copy(deep=True)
normald = StandardScaler()
fdatan_1 = normald.fit_transform(fdatan.ix[:,fdatan.columns!="Var1"]) # normalize all variables except Var1
fdatan = pd.DataFrame(np.column_stack((fdatan["Var1"].values,fdatan_1)),columns = fdatan.columns).set_index(fdatan.index)
tdatan = t_data.copy(deep=True)
normald = StandardScaler()
tdatan_1 = normald.fit_transform(tdatan.ix[:,tdatan.columns!="Var1"]) # normalize all variables except Var1
tdatan = pd.DataFrame(np.column_stack((tdatan["Var1"].values,tdatan_1)),columns = tdatan.columns).set_index(tdatan.index)


In [5]:
#merge dataframe
fdatan_merged = fdatan.merge(revealed, how="outer")
fdatan_train1 = fdatan.merge(revealed) # dataframe, whose bankruptcy status is known (0 = good standing, 1 = bankrupt)

In [6]:
x1 = fdatan_train1.drop(columns=['Var1', 'Var66']) # Var 1 is company ID, Var 66 is the status
y1 = fdatan_train1['Var66']
t1 = tdatan.drop(columns=['Var1'])
cID = t_data['Var1'].tolist() # use original values, since nothing is done in cID of tdata

In [7]:
#Set Splitting
index_to_round = round(len(fdatan_train1.index)*0.8)
display("Dataset will be split on index: {}".format(index_to_round))

x_training = x1.iloc[:index_to_round, :]
y_training = y1.iloc[:index_to_round]


x_testing = x1.iloc[index_to_round:, :]
y_testing = y1.iloc[index_to_round:]

columns_no_index = list(x_training)

print(x_training.shape)
print(y_training.shape)
print(x_testing.shape)
print(y_testing.shape)

'Dataset will be split on index: 3903'

(3903, 64)
(3903,)
(976, 64)
(976,)


In [8]:
clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=0.5,
                                 criterion='friedman_mse', min_samples_split=90, min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                 min_impurity_split=None, init=None, random_state=None, max_features='auto', verbose=0,
                                 max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1,
                                 n_iter_no_change=None, tol=0.0001)
clf.fit(x_training, y_training)
train_score = clf.score(x_training, y_training)
test_score = clf.score(x_testing, y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))

Train Score:0.9861644888547272 Test Score:0.9723360655737705
Test F1:0.509090909090909


In [9]:
clf = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, verbosity=1, objective='binary:logistic',
                    booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
                    max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
                    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, seed=None,
                    missing=None)
clf.fit(x_training, y_training)
train_score = clf.score(x_training, y_training)
test_score = clf.score(x_testing, y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))
confusion_matrix(clf.predict(x_testing),y_testing)

Train Score:0.988982833717653 Test Score:0.9825819672131147
Test F1:0.6046511627906976


array([[946,  17],
       [  0,  13]], dtype=int64)

Feature selection:

Best 10 features

In [10]:
clf = XGBClassifier(n_jobs=-1)
#clf.fit(x_training, y_training)

rfe = RFE(clf, 45)
rfe = rfe.fit(x_training, y_training)
# summarize the selection of the attributes
#print(rfe.support_)
#print(rfe.ranking_)

In [11]:
useful_features = columns_no_index.copy()
XGBFeatures = []
for i, feature in enumerate(useful_features):
    if rfe.support_[i]:
        XGBFeatures.append(feature)
#XGBFeatures


In [12]:
clf = XGBClassifier(max_depth=8, learning_rate=0.35, n_estimators=100, verbosity=1, objective='binary:logistic',
                    booster='gbtree', n_jobs=-1, gamma=0, min_child_weight=1,
                    max_delta_step=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=600, base_score=0.5, random_state=0,
                    missing=None)
clf.fit(x_training[XGBFeatures], y_training)
train_score = clf.score(x_training[XGBFeatures], y_training)
test_score = clf.score(x_testing[XGBFeatures], y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing[XGBFeatures]))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))

Train Score:1.0 Test Score:0.985655737704918
Test F1:0.7407407407407408


In [13]:
# write results to predictions.csv
f = open('predictions_45features.csv', 'w')
f.write('Business_ID,Is_Bankrupted\n')
for a,b in zip(cID, clf.predict(t1[XGBFeatures])):
    f.write(str(a))
    f.write(',')
    f.write(str(round(b)))
    f.write('\n')
f.close()

In [14]:
#SMOTE Re-Sampling:
sm = SMOTE(random_state=12)
x_train_sm, y_train_sm = sm.fit_sample(x_training[XGBFeatures], y_training)
x_train_sm = pd.DataFrame(x_train_sm, columns=XGBFeatures)

In [15]:
clf = XGBClassifier(max_depth=3, learning_rate=0.25, n_estimators=125, verbosity=1, objective='binary:logistic',
                    booster='gbtree', n_jobs=-1, gamma=0, min_child_weight=0,
                    max_delta_step=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0,
                    missing=None)
clf.fit(x_train_sm, y_train_sm)
train_score = clf.score(x_train_sm, y_train_sm)
test_score = clf.score(x_testing[XGBFeatures], y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing[XGBFeatures]))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))
confusion_matrix(clf.predict(x_testing[XGBFeatures]),y_testing)

Train Score:0.9997342545841085 Test Score:0.9846311475409836
Test F1:0.7272727272727272


array([[941,  10],
       [  5,  20]], dtype=int64)

In [16]:
# write results to predictions.csv
f = open('predictions_upsampled.csv', 'w')
f.write('Business_ID,Is_Bankrupted\n')
for a,b in zip(cID, clf.predict(t1[XGBFeatures])):
    f.write(str(a))
    f.write(',')
    f.write(str(round(b)))
    f.write('\n')
f.close()

In [106]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        #self.conv1 = nn.Conv2d(1, 6, 5)
        #self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(45, 100)
        self.do1 = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(100, 150)
        self.fc3 = nn.Linear(150, 200)
        self.fc4 = nn.Linear(200, 100)
        self.fc5 = nn.Linear(100, 50)
        self.fc6 = nn.Linear(50, 1)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        #x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square you can only specify a single number
        #x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        #x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = self.do1(x)
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.do1(x)
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.sigmoid(self.fc6(x))
        return x

In [146]:
net = Net()
print(net)

Net(
  (fc1): Linear(in_features=45, out_features=100, bias=True)
  (do1): Dropout(p=0.2)
  (fc2): Linear(in_features=100, out_features=150, bias=True)
  (fc3): Linear(in_features=150, out_features=200, bias=True)
  (fc4): Linear(in_features=200, out_features=100, bias=True)
  (fc5): Linear(in_features=100, out_features=50, bias=True)
  (fc6): Linear(in_features=50, out_features=1, bias=True)
)


In [147]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)
weight = torch.tensor([0.1, 0.9])
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=0.0003)


cuda:0


In [148]:
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0
    for i in range(0, 1000):
        randn = np.random.randint(0, len(x_train_sm))
        x_train = torch.tensor(x_train_sm.iloc[randn].values).float()
        y_train = torch.tensor([y_train_sm[randn]]).float()
        #weight_ = weight[y_train.data.view(-1).long()].view_as(y_train)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(x_train)
        loss = criterion(outputs, y_train)
        loss_class_weighted = loss * weight_
        loss_class_weighted = loss_class_weighted.mean()
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i %200 == 199:    # print every 200 mini-batches
            print(outputs.data[0], y_train)
            print('[%d, %5d] loss: %.5f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')


tensor(0.4786) tensor([0.])
[1,   200] loss: 0.06951
tensor(0.4914) tensor([1.])
[1,   400] loss: 0.06910
tensor(0.5169) tensor([1.])
[1,   600] loss: 0.06775
tensor(0.2694) tensor([0.])
[1,   800] loss: 0.06575
tensor(0.5084) tensor([0.])
[1,  1000] loss: 0.06171
tensor(0.7161) tensor([1.])
[2,   200] loss: 0.05820
tensor(0.5658) tensor([1.])
[2,   400] loss: 0.06317
tensor(0.3311) tensor([0.])
[2,   600] loss: 0.05965
tensor(0.5221) tensor([1.])
[2,   800] loss: 0.06109
tensor(0.2724) tensor([1.])
[2,  1000] loss: 0.05210
tensor(0.0863) tensor([0.])
[3,   200] loss: 0.05748
tensor(0.2326) tensor([0.])
[3,   400] loss: 0.05697
tensor(0.6844) tensor([0.])
[3,   600] loss: 0.05287
tensor(0.6815) tensor([1.])
[3,   800] loss: 0.06113
tensor(0.6350) tensor([1.])
[3,  1000] loss: 0.05217
tensor(0.6672) tensor([1.])
[4,   200] loss: 0.05607
tensor(0.7163) tensor([1.])
[4,   400] loss: 0.05061
tensor(0.7142) tensor([1.])
[4,   600] loss: 0.04904
tensor(0.2661) tensor([1.])
[4,   800] loss: 0

In [149]:
correct = 0
wrong = 0
correct_bankruptcy = 0
incorrect_bankruptcy = 0
test_output = []
for i in range(0, len(x_testing.index)):
    ref_x_test1 = torch.tensor(x_testing[XGBFeatures].iloc[i].values).float()
    ref_y_test1 = torch.tensor(y_testing.iloc[i]).float()
    outputs = net(ref_x_test1)
    test_output.append(round(outputs.item()))
display(confusion_matrix(test_output,y_testing))
f1_score(y_testing,test_output)

array([[651,   9],
       [295,  21]], dtype=int64)

0.1213872832369942