In [1]:
#CS5228 project

# data manipulation
import pandas as pd
import numpy as np
import math
import random
import pickle
import torch.optim as optim
from scipy import stats
from collections import Counter


# visualiation
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision Tree
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import adjusted_rand_score
from sklearn.feature_selection import RFE # for feature selection of LR
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier

#Clustering
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.cluster import SpectralClustering

# ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
#load into dataframe
f_data = pd.read_csv('financial_data.csv', na_values=['?']) 
revealed = pd.read_csv('revealed_businesses.csv')
t_data = pd.read_csv('testing_data.csv', na_values=['?'])

#display(f_data.head())
#display(t_data.head())
#display(revealed.head())

In [3]:
#clean data
#fdata = fdata.replace(0, np.nan) # assume 0 values also means missing
f_data = f_data.fillna(f_data.mean())
#tdata = tdata.replace(0, np.nan)
t_data = t_data.fillna(t_data.mean())

Data Normalizing:

In [4]:
#preprocess data
fdatan = f_data.copy(deep=True)
normald = StandardScaler()
fdatan_1 = normald.fit_transform(fdatan.ix[:,fdatan.columns!="Var1"]) # normalize all variables except Var1
fdatan = pd.DataFrame(np.column_stack((fdatan["Var1"].values,fdatan_1)),columns = fdatan.columns).set_index(fdatan.index)
tdatan = t_data.copy(deep=True)
normald = StandardScaler()
tdatan_1 = normald.fit_transform(tdatan.ix[:,tdatan.columns!="Var1"]) # normalize all variables except Var1
tdatan = pd.DataFrame(np.column_stack((tdatan["Var1"].values,tdatan_1)),columns = tdatan.columns).set_index(tdatan.index)


In [5]:
#merge dataframe
fdatan_merged = fdatan.merge(revealed, how="outer")
fdatan_train1 = fdatan.merge(revealed) # dataframe, whose bankruptcy status is known (0 = good standing, 1 = bankrupt)

In [6]:
x1 = fdatan_train1.drop(columns=['Var1', 'Var66']) # Var 1 is company ID, Var 66 is the status
y1 = fdatan_train1['Var66']
t1 = tdatan.drop(columns=['Var1'])
cID = t_data['Var1'].tolist() # use original values, since nothing is done in cID of tdata

In [7]:
#Set Splitting
index_to_round = round(len(fdatan_train1.index)*0.8)
display("Dataset will be split on index: {}".format(index_to_round))

x_training = x1.iloc[:index_to_round, :]
y_training = y1.iloc[:index_to_round]


x_testing = x1.iloc[index_to_round:, :]
y_testing = y1.iloc[index_to_round:]

columns_no_index = list(x_training)

print(x_training.shape)
print(y_training.shape)
print(x_testing.shape)
print(y_testing.shape)

'Dataset will be split on index: 3903'

(3903, 64)
(3903,)
(976, 64)
(976,)


In [8]:
clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=0.5,
                                 criterion='friedman_mse', min_samples_split=90, min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                 min_impurity_split=None, init=None, random_state=None, max_features='auto', verbose=0,
                                 max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1,
                                 n_iter_no_change=None, tol=0.0001)
clf.fit(x_training, y_training)
train_score = clf.score(x_training, y_training)
test_score = clf.score(x_testing, y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))

Train Score:0.9869331283627979 Test Score:0.9743852459016393
Test F1:0.4897959183673469


In [84]:
clf = XGBClassifier(max_depth=8, learning_rate=0.4, n_estimators=100, objective='binary:logistic',
                    booster='gbtree', n_jobs=-1, gamma=0, min_child_weight=1,
                    max_delta_step=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=0.9, base_score=0.5, random_state=0,
                    missing=None)
clf.fit(x_training[XGBFeatures], y_training)
train_score = clf.score(x_training[XGBFeatures], y_training)
test_score = clf.score(x_testing[XGBFeatures], y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing[XGBFeatures]))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))
confusion_matrix(clf.predict(x_testing[XGBFeatures]),y_testing)

Train Score:1.0 Test Score:0.9825819672131147
Test F1:0.6046511627906976


array([[946,  17],
       [  0,  13]], dtype=int64)

Feature selection:

Best 45 features

In [10]:
clf = XGBClassifier(n_jobs=-1)
#clf.fit(x_training, y_training)

rfe = RFE(clf, 45)
rfe = rfe.fit(x_training, y_training)
# summarize the selection of the attributes
#print(rfe.support_)
#print(rfe.ranking_)

In [11]:
useful_features = columns_no_index.copy()
XGBFeatures = []
for i, feature in enumerate(useful_features):
    if rfe.support_[i]:
        XGBFeatures.append(feature)
#XGBFeatures


In [59]:
model = SelectFromModel(clf, prefit=True)
x_new = model.transform(x_training)
new_params = model.get_support(True)
#XGBFeatures = useful_features[new_params]
#new_params
XGBFeatures = np.asarray(useful_features)[new_params]


In [61]:
#SMOTE Re-Sampling:
sm = SMOTE(random_state=12)
x_train_sm, y_train_sm = sm.fit_sample(x_training[XGBFeatures], y_training)
x_train_sm = pd.DataFrame(x_train_sm, columns=XGBFeatures)

In [62]:
clf = XGBClassifier(max_depth=3, learning_rate=0.25, n_estimators=125, verbosity=1, objective='binary:logistic',
                    booster='gbtree', n_jobs=-1, gamma=0, min_child_weight=0,
                    max_delta_step=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0,
                    missing=None)
clf.fit(x_train_sm, y_train_sm)
train_score = clf.score(x_train_sm, y_train_sm)
test_score = clf.score(x_testing[XGBFeatures], y_testing)
test_f1 = f1_score(y_testing, clf.predict(x_testing[XGBFeatures]))
print('Train Score:{} Test Score:{}'.format(train_score, test_score))
print('Test F1:{}'.format(test_f1))
confusion_matrix(clf.predict(x_testing[XGBFeatures]),y_testing)

Train Score:0.9976082912569758 Test Score:0.9651639344262295
Test F1:0.46875


array([[927,  15],
       [ 19,  15]], dtype=int64)

In [89]:
print('Original dataset shape %s' % Counter(y1))
cc = ClusterCentroids(random_state=42)
x_res, y_res = cc.fit_resample(x1[XGBFeatures], y1)
print('Resampled dataset shape %s' % Counter(y_res))

Original dataset shape Counter({0: 4709, 1: 170})
Resampled dataset shape Counter({0: 170, 1: 170})


In [90]:
#Clustering:
#KMeans:
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto',
                verbose=0, random_state=0, copy_x=True, n_jobs=-1,
                algorithm='auto').fit(x_res)
ars_kmeans = adjusted_rand_score(kmeans.labels_, y_res)
print('K-Means Adjusted Random Score: {}'.format(ars_kmeans))

K-Means Adjusted Random Score: 0.0


In [86]:
#DBSCAN:
dbscan = DBSCAN(eps=0.8, min_samples=4, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30,
                p=None, n_jobs=-1).fit(x1[XGBFeatures])
ars_dbscan = adjusted_rand_score(dbscan.labels_, y1)
print('DBSCAN Adjusted Rand Score: {}'.format(ars_dbscan))
labels = dbscan.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)

DBSCAN Adjusted Rand Score: 0.013951941289629469
Estimated number of clusters: 5


In [87]:
#BIRCH:
brc = Birch(threshold=0.5, branching_factor=50, n_clusters=2, compute_labels=True, copy=True)
brc.fit(x1[XGBFeatures]) 
ars_birch = adjusted_rand_score(brc.labels_, y1)
print('BIRCH Adjusted Rand Score: {}'.format(ars_birch))
#brc.labels_

BIRCH Adjusted Rand Score: -0.00039304871653265765
