In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [27]:
# Load the bankruptcy dataset; the first CSV column becomes the row index.
# The path is a raw string because it contains backslash sequences
# (\A, \D, \C, \B) that are invalid escapes in a normal string literal and
# trigger a deprecation/syntax warning on current Python versions.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact location exists.
bank = pd.read_csv(
    r"C:\Aalesh and Mandar\Datasets\Cases\Bankruptcy\Bankruptcy.csv",
    index_col=0,
)

In [28]:
# Preview the first five rows to sanity-check the load.
bank.head(n=5)

Unnamed: 0_level_0,D,YR,R1,R2,R3,R4,R5,R6,R7,R8,...,R15,R16,R17,R18,R19,R20,R21,R22,R23,R24
NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,78,0.23,0.08,0.02,0.03,0.46,0.12,0.19,10.36,...,0.05,0.57,0.15,0.23,3.56,0.26,1.55,0.43,0.11,0.17
2,0,77,0.19,0.07,0.09,0.12,0.02,0.02,0.03,3.13,...,0.09,0.12,0.16,0.22,3.78,1.29,1.4,0.06,0.07,0.1
3,0,72,0.07,0.02,0.03,0.05,0.06,0.1,0.14,2.41,...,-0.03,0.02,0.02,0.04,13.29,1.61,1.43,0.03,0.05,0.07
4,0,80,0.07,0.03,0.04,0.04,0.04,0.06,0.06,5.55,...,-0.02,0.01,0.02,0.02,5.36,1.3,1.12,-0.06,-0.08,-0.09
5,0,81,0.09,0.02,0.03,0.04,0.06,0.08,0.11,2.85,...,0.02,0.07,0.1,0.14,7.74,1.48,1.41,0.03,0.04,0.06


In [29]:
# Feature matrix: every column except "D" and "YR".
# NOTE(review): "D" is presumably the bankruptcy label and "YR" the year -
# confirm against the dataset description.
x = bank.drop(columns=["D", "YR"])

# Target vector: the "D" column.
y = bank["D"]

In [30]:
# 70/30 train/test split, stratified on the target so both partitions keep
# the same class balance; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    stratify=y,
    random_state=2022,
)

In [31]:
# Baseline: linear-kernel SVC on the unscaled features.
# probability=True is set so that predict_proba can be called on the fitted
# model in the following cell.
# NOTE(review): this assignment rebinds the name `svm`, which the import cell
# bound to the sklearn.svm module; the module is no longer reachable through
# that name after this cell runs.
svm = SVC(kernel="linear", probability=True, random_state=2022)

# fit() returns the estimator itself, so the prediction can be chained.
y_pred = svm.fit(x_train, y_train).predict(x_test)

# Hold-out accuracy.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.825


In [32]:
# Take the second column of predict_proba as the score for ROC AUC.
# NOTE(review): assumes the second entry of svm.classes_ is the positive
# class (label 1) - confirm.
y_pred_prob = svm.predict_proba(x_test)[:, 1]

# Hold-out ROC AUC of the baseline model.
print(roc_auc_score(y_true=y_test, y_score=y_pred_prob))

0.7875


In [34]:
# The scaler is only instantiated here; the search in this cell runs on the
# bare SVC, i.e. on unscaled features.
scaler = StandardScaler()

# 5-fold stratified cross-validation, shuffled with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)

# Candidate values for C: 20 evenly spaced points from 0.001 to 10.
params = {"C": np.linspace(start=0.001, stop=10, num=20)}

# Exhaustive search over C, scored by ROC AUC.
gcv = GridSearchCV(
    estimator=svm,
    param_grid=params,
    scoring="roc_auc",
    cv=kfold,
    verbose=3,
)

# Left as the last expression so the fitted search object is displayed.
gcv.fit(x_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END ...........................C=0.001;, score=0.378 total time=   0.0s
[CV 2/5] END ...........................C=0.001;, score=0.811 total time=   0.0s
[CV 3/5] END ...........................C=0.001;, score=0.975 total time=   0.0s
[CV 4/5] END ...........................C=0.001;, score=0.778 total time=   0.0s
[CV 5/5] END ...........................C=0.001;, score=0.852 total time=   0.0s
[CV 1/5] END ..............C=0.5272631578947369;, score=0.733 total time=   0.0s
[CV 2/5] END ..............C=0.5272631578947369;, score=0.956 total time=   0.0s
[CV 3/5] END ..............C=0.5272631578947369;, score=0.938 total time=   0.0s
[CV 4/5] END ..............C=0.5272631578947369;, score=0.852 total time=   0.0s
[CV 5/5] END ..............C=0.5272631578947369;, score=0.864 total time=   0.0s
[CV 1/5] END ..............C=1.0535263157894736;, score=0.733 total time=   0.0s
[CV 2/5] END ..............C=1.0535263157894736

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2022, shuffle=True),
             estimator=SVC(kernel='linear', probability=True,
                           random_state=2022),
             param_grid={'C': array([1.00000000e-03, 5.27263158e-01, 1.05352632e+00, 1.57978947e+00,
       2.10605263e+00, 2.63231579e+00, 3.15857895e+00, 3.68484211e+00,
       4.21110526e+00, 4.73736842e+00, 5.26363158e+00, 5.78989474e+00,
       6.31615789e+00, 6.84242105e+00, 7.36868421e+00, 7.89494737e+00,
       8.42121053e+00, 8.94747368e+00, 9.47373684e+00, 1.00000000e+01])},
             scoring='roc_auc', verbose=3)

In [35]:
# Best C found by the search and its mean cross-validated ROC AUC,
# printed one per line.
print(gcv.best_params_, gcv.best_score_, sep="\n")

{'C': 6.84242105263158}
0.8871604938271606


## Grid search with feature scaling (StandardScaler + SVC pipeline)

In [40]:
# Chain standardisation and the SVC so that scaling is fitted inside each
# cross-validation fold rather than on the full training set.
# Reuses the `scaler` and `svm` objects created in earlier cells.
pipe = Pipeline(
    steps=[
        ("STD", scaler),
        ("SVM", svm),
    ]
)

# List the tunable parameter names (e.g. "SVM__C") exposed by the pipeline.
pipe.get_params()

{'memory': None,
 'steps': [('STD', StandardScaler()),
  ('SVM', SVC(kernel='linear', probability=True, random_state=2022))],
 'verbose': False,
 'STD': StandardScaler(),
 'SVM': SVC(kernel='linear', probability=True, random_state=2022),
 'STD__copy': True,
 'STD__with_mean': True,
 'STD__with_std': True,
 'SVM__C': 1.0,
 'SVM__break_ties': False,
 'SVM__cache_size': 200,
 'SVM__class_weight': None,
 'SVM__coef0': 0.0,
 'SVM__decision_function_shape': 'ovr',
 'SVM__degree': 3,
 'SVM__gamma': 'scale',
 'SVM__kernel': 'linear',
 'SVM__max_iter': -1,
 'SVM__probability': True,
 'SVM__random_state': 2022,
 'SVM__shrinking': True,
 'SVM__tol': 0.001,
 'SVM__verbose': False}

In [37]:
# Same CV scheme as the unscaled search: 5 stratified, shuffled folds with a
# fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)

# Same C grid as before, addressed through the pipeline step name "SVM".
params = {"SVM__C": np.linspace(start=0.001, stop=10, num=20)}

# Exhaustive search over C for the scaler + SVC pipeline, scored by ROC AUC.
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="roc_auc",
    cv=kfold,
    verbose=3,
)

# Left as the last expression so the fitted search object is displayed.
gcv.fit(x_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END ......................SVM__C=0.001;, score=0.722 total time=   0.0s
[CV 2/5] END ......................SVM__C=0.001;, score=0.833 total time=   0.0s
[CV 3/5] END ......................SVM__C=0.001;, score=0.951 total time=   0.0s
[CV 4/5] END ......................SVM__C=0.001;, score=0.827 total time=   0.0s
[CV 5/5] END ......................SVM__C=0.001;, score=0.988 total time=   0.0s
[CV 1/5] END .........SVM__C=0.5272631578947369;, score=0.678 total time=   0.0s
[CV 2/5] END .........SVM__C=0.5272631578947369;, score=0.933 total time=   0.0s
[CV 3/5] END .........SVM__C=0.5272631578947369;, score=0.963 total time=   0.0s
[CV 4/5] END .........SVM__C=0.5272631578947369;, score=0.901 total time=   0.0s
[CV 5/5] END .........SVM__C=0.5272631578947369;, score=0.877 total time=   0.0s
[CV 1/5] END .........SVM__C=1.0535263157894736;, score=0.711 total time=   0.0s
[CV 2/5] END .........SVM__C=1.0535263157894736

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2022, shuffle=True),
             estimator=Pipeline(steps=[('STD', StandardScaler()),
                                       ('SVM',
                                        SVC(kernel='linear', probability=True,
                                            random_state=2022))]),
             param_grid={'SVM__C': array([1.00000000e-03, 5.27263158e-01, 1.05352632e+00, 1.57978947e+00,
       2.10605263e+00, 2.63231579e+00, 3.15857895e+00, 3.68484211e+00,
       4.21110526e+00, 4.73736842e+00, 5.26363158e+00, 5.78989474e+00,
       6.31615789e+00, 6.84242105e+00, 7.36868421e+00, 7.89494737e+00,
       8.42121053e+00, 8.94747368e+00, 9.47373684e+00, 1.00000000e+01])},
             scoring='roc_auc', verbose=3)

In [38]:
# Best C for the scaled pipeline and its mean cross-validated ROC AUC,
# printed one per line.
print(gcv.best_params_, gcv.best_score_, sep="\n")

{'SVM__C': 0.5272631578947369}
0.8703703703703705
