# Bankruptcy Prediction

In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

## 1. Using full data

### Load Data 5 Years

In [8]:
from scipy.io import arff
import pandas as pd

data = arff.loadarff('data/1year.arff')
df = pd.DataFrame(data[0])

#shuffle
df = df.sample(frac = 1)

#prep Data
df = df.fillna(0)
df['class'] = df['class'].astype(float, errors = 'raise')

#split
train=df.sample(frac=0.8,random_state=200) 
test=df.drop(train.index)

x, y = train.iloc[:,:-1], train.iloc[:,-1]
x_test, y_test = test.iloc[:,:-1], test.iloc[:,-1]

### Simple Logistic Regression

In [3]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, multi_class='multinomial').fit(x, y)

res = clf.predict(x_test)
mse = mean_squared_error(y_test, res)
r2 = r2_score(y_test, res)

print("MSE: ", mse, " R2: ", r2)

MSE:  0.04341637010676157  R2:  -0.06201982651796789


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Random Forest Regressor

In [4]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=0)

rf.fit(x, y)

res = rf.predict(x_test)
mse = mean_squared_error(y_test, res)
r2 = r2_score(y_test, res)

print("MSE: ", mse, " R2: ", r2)

MSE:  0.023373950177935945  R2:  0.42824334572490697


### SVR

In [5]:
from sklearn.svm import SVR
svr = SVR(kernel="rbf")
svr.fit(x, y)

res = svr.predict(x_test)
mse = mean_squared_error(y_test, res)
r2 = r2_score(y_test, res)

print("MSE: ", mse, " R2: ", r2)

MSE:  0.044126429954149254  R2:  -0.07938879665724263


### Gradient Boosting Regressor 

In [1]:
from sklearn import ensemble

reg = ensemble.GradientBoostingRegressor(random_state=0)
reg.fit(x, y)

res = reg.predict(x_test)
mse = mean_squared_error(y_test, res)
r2 = r2_score(y_test, res)

print("MSE: ", mse, " R2: ", r2)

NameError: name 'x' is not defined

## Feature Selection

In [20]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

fs = SelectKBest(score_func=f_regression, k='all')
fs.fit(x, y)
for i in range(len(fs.scores_)):
    print('Feature %d: %f' % (i, fs.scores_[i]))

Feature 0: 1.751601
Feature 1: 29.399117
Feature 2: 29.294639
Feature 3: 0.033786
Feature 4: 0.027389
Feature 5: 18.735862
Feature 6: 1.610640
Feature 7: 0.016882
Feature 8: 0.252008
Feature 9: 2.635839
Feature 10: 0.383698
Feature 11: 1.187550
Feature 12: 0.137564
Feature 13: 1.610640
Feature 14: 0.411219
Feature 15: 1.031872
Feature 16: 0.008963
Feature 17: 1.610640
Feature 18: 0.127256
Feature 19: 0.028410
Feature 20: 0.147386
Feature 21: 0.242282
Feature 22: 0.131014
Feature 23: 1.260257
Feature 24: 3.115841
Feature 25: 0.573527
Feature 26: 0.257703
Feature 27: 1.316431
Feature 28: 9.288272
Feature 29: 0.000349
Feature 30: 0.121750
Feature 31: 21.117259
Feature 32: 0.330179
Feature 33: 2.505114
Feature 34: 0.255824
Feature 35: 0.234771
Feature 36: 0.059571
Feature 37: 2.629851
Feature 38: 0.016874
Feature 39: 0.193846
Feature 40: 0.217592
Feature 41: 0.015624
Feature 42: 0.049150
Feature 43: 0.049750
Feature 44: 0.051326
Feature 45: 0.007476
Feature 46: 0.298886
Feature 47: 0.07806