In [80]:
#CS5228 project

# data manipulation
import pandas as pd
import numpy as np
import math
import random

# visualization
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision Tree
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE # for feature selection of LR
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler

# ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [81]:
# Read the three project CSV files into dataframes.
# In the financial and testing files a '?' entry is parsed as missing (NaN).
f_data = pd.read_csv('financial_data.csv', na_values=['?'])
t_data = pd.read_csv('testing_data.csv', na_values=['?'])
revealed = pd.read_csv('revealed_businesses.csv')

# Preview the first rows of the financial data and of the revealed labels.
display(f_data.head(), revealed.head())

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var56,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65
0,18399,0.023954,0.15012,0.39567,3.6357,54.043,0.028822,0.031029,4.56831,1.0112,...,3871.001,0.011041,0.034914,0.98896,0.0,9.5214,5.8248,34.713,10.515,3.4752
1,15092,0.049699,0.065808,0.7268,12.944,233.11,0.0,0.063192,14.19601,0.89618,...,8751.901,0.059565,0.053189,0.93169,0.0,5.0492,11.152,24.784,14.727,4.2204
2,19821,-0.35631,0.39288,0.15884,1.4043,-2.619,-0.085597,-0.35632,1.54531,0.92963,...,44.859,-0.17277,-0.58691,1.3833,0.0,5.6112,15.779,154.26,2.3662,2.0738
3,14171,0.001417,0.70811,-0.052312,0.88978,-31.198,0.26952,0.001407,0.41222,1.9654,...,-331.879,-0.000535,0.00482,0.9993,0.74548,17.1011,7.9482,88.147,4.1408,3.4021
4,12900,0.020041,0.34652,0.33593,2.7613,39.05,0.0,0.020031,1.88591,1.2975,...,38170.001,0.21241,0.030652,0.80158,0.000862,9.767,6.757,53.651,6.8032,2.7412


Unnamed: 0,Var1,Var66
0,18399,0
1,19821,0
2,17769,0
3,19309,0
4,20728,0


In [92]:
def fill_empty_std(dataframe, labels=None, id_col="Var1", label_col="Var66", rng=None):
    """Fill NaN cells of ``dataframe`` in place with class-conditional random draws.

    Every missing value is replaced by a sample from a normal distribution
    whose mean and standard deviation are computed from the observed
    (non-missing) values of the same column, restricted to rows that share
    the label of the row being filled.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to impute. It is modified in place. Must contain ``id_col``.
    labels : pandas.DataFrame, optional
        Frame with ``id_col`` and ``label_col`` columns. Defaults to the
        notebook-level ``revealed`` frame.
    id_col : str
        Name of the identifier column used to look up each row's label.
        This column itself is never imputed.
    label_col : str
        Name of the label column in ``labels``. Rows labelled 1 use the
        statistics of the label-1 rows; every other labelled row uses the
        statistics of the remaining labelled rows.
    rng : object with a ``normal(loc, scale, size)`` method, optional
        Random source (for example ``numpy.random.RandomState``). Defaults to
        the global ``numpy.random`` state.

    Returns
    -------
    None
        The frame is updated in place.

    Notes
    -----
    * Rows whose identifier has no entry in ``labels`` are filled from the
      statistics of the whole column.
    * When a class has no observed value in a column, the whole-column
      statistics are used instead; when fewer than two values are observed
      the standard deviation is taken as 0.
    * A column without any observed value is left untouched.
    """
    if labels is None:
        labels = revealed  # label frame loaded at notebook level
    sampler = np.random if rng is None else rng

    # Resolve the label of every row once instead of once per cell.
    label_by_id = labels.drop_duplicates(subset=id_col).set_index(id_col)[label_col]
    row_label = dataframe[id_col].map(label_by_id)
    is_unlabelled = row_label.isna()
    is_one = row_label == 1
    is_zero = ~(is_one | is_unlabelled)
    groups = ((is_one, True), (is_zero, True), (is_unlabelled, False))

    numeric_columns = dataframe.select_dtypes(include=[np.number]).columns
    for column in numeric_columns:
        if column == id_col:
            continue
        values = dataframe[column]
        missing = values.isna()
        if not missing.any():
            continue
        # Statistics only ever use cells that were observed before filling.
        observed_all = values[~missing]
        if observed_all.empty:
            continue
        for members, use_class_stats in groups:
            targets = missing & members
            n_targets = int(targets.sum())
            if n_targets == 0:
                continue
            observed = values[members & ~missing] if use_class_stats else observed_all
            if observed.empty:
                observed = observed_all
            mean = observed.mean()
            standard_deviation = observed.std()
            if np.isnan(standard_deviation):
                # A single observation has no sample standard deviation.
                standard_deviation = 0.0
            dataframe.loc[targets, column] = sampler.normal(
                mean, standard_deviation, size=n_targets)

In [93]:
#clean data
# Zero values are assumed to mark missing entries, so convert them to NaN first.
f_data_clean = f_data.replace(0, np.nan)
fill_empty_std(f_data_clean) # fills the NaN cells of f_data_clean in place

t_data_zeros = t_data.replace(0, np.nan)
# Positions of the testing columns that contain at least one missing value.
# Series.nonzero() was removed in pandas 1.0, so go through the underlying array.
display(t_data_zeros.isna().sum().values.nonzero())
# Impute the testing data with the per-column median of its observed values.
t_data_clean = t_data_zeros.fillna(t_data_zeros.median())



(array([ 2,  3,  4,  5,  6,  7,  8,  9, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 39,
        40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 56, 57,
        58, 59, 60, 61, 62, 63, 64]),)

In [94]:
#preprocess data
f_data_clean.to_pickle("./f_data_clean.pkl")

# Every column except the company ID (Var1) is a feature to be standardised.
feature_columns = [name for name in f_data_clean.columns if name != "Var1"]

# Fit the scaler on the financial data only.
normald = StandardScaler()
fdatan_1 = normald.fit_transform(f_data_clean[feature_columns])
fdatan = f_data_clean.copy(deep=True)
fdatan[feature_columns] = fdatan_1 # Var1 is left untouched

# Reuse the scaler fitted above so the testing features end up on the same
# scale as the features the classifier is trained on.
tdatan_1 = normald.transform(t_data_clean[feature_columns])
tdatan = t_data_clean.copy(deep=True)
tdatan[feature_columns] = tdatan_1 # Var1 is left untouched



In [95]:
# Join the normalised financial data with the revealed bankruptcy labels.
# Inner join: only businesses whose status is known (0 = good standing, 1 = bankrupt).
fdatan_train1 = pd.merge(fdatan, revealed)
# Outer join: keeps the rows of both frames, matched or not.
fdatan_merged = pd.merge(fdatan, revealed, how="outer")



In [96]:
#classifier here
x1 = fdatan_train1.drop(columns=['Var1', 'Var66']) # Var 1 is company ID, Var 66 is the status
y1 = fdatan_train1['Var66']
t1 = tdatan.drop(columns=['Var1'])
cID = t_data['Var1'].tolist() # IDs straight from the loaded testing file, in row order
clas = AdaBoostClassifier() # select classifier here
clas.fit(x1, y1)

# write results to predictions.csv
# The context manager closes the file even if prediction or a write fails.
with open('predictions.csv', 'w') as f:
    f.write('Business_ID,Is_Bankrupted\n')
    for a, b in zip(cID, clas.predict(t1)):
        # int(...) keeps the label written as 0/1 even when round() returns a float.
        f.write('{},{}\n'.format(a, int(round(b))))