In [2]:
import pandas as pd #import packages required
import numpy as np
import seaborn as sns # visualisation
import matplotlib.pyplot as plt # visualisation
%matplotlib inline
from sklearn.svm import SVC # ML model
from sklearn.preprocessing import StandardScaler # scaling the data
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split # splitting the data
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,plot_confusion_matrix # data validation
from sklearn.utils import resample # used for imbalance data
from sklearn.model_selection import GridSearchCV # Hyperparameter tuning
from sklearn.decomposition import PCA
import pickle

In [3]:
# Load the credit-card dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='credit_card_modified.csv')

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Default
0,20000,2,2,1,24,2,2,0,0,0,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,0,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,0,0,0,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [5]:
# Class balance check: the two 'Default' classes are imbalanced, so each class
# is resampled to a chosen size in the cells below.
df.loc[:, 'Default'].value_counts()

0    23364
1     6636
Name: Default, dtype: int64

In [7]:
# Split the frame by class so each class can be resampled independently.
target = df['Default']
df_default = df[target.eq(1)]
df_notdefault = df[target.eq(0)]

In [8]:
# Bootstrap-resample each class to a fixed size (sklearn's resample draws with
# replacement by default; stated explicitly here).
# NOTE(review): this runs before train_test_split, so copies of the same original
# row can end up in both the training and the test split, which inflates test
# scores. Consider splitting first and resampling only the training portion.
df_default_resample = resample(
    df_default, replace=True, n_samples=13000, random_state=0
)
df_notdefault_resample = resample(
    df_notdefault, replace=True, n_samples=24000, random_state=0
)

In [8]:
# Display the resampled Default == 0 rows.
df_notdefault_resample

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Default
3517,450000,2,1,2,36,1,0,0,0,0,...,0,0,0,10267,200,0,0,0,0,0
13932,50000,2,2,1,27,1,2,2,2,2,...,16134,17380,16958,0,2137,0,1503,0,800,0
12694,200000,1,1,1,38,2,0,0,0,0,...,133233,115882,117534,7332,5818,4451,4472,3850,3602,0
25288,60000,2,1,2,26,0,0,0,0,0,...,30818,31314,30852,1765,1812,1794,1290,1268,1273,0
17013,30000,2,1,2,23,0,0,0,0,0,...,22721,17650,25602,1470,1889,1400,384,1000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17422,390000,2,1,1,33,0,0,0,0,0,...,31219,24962,17002,48329,2287,1500,4962,815,10000,0
25996,200000,2,1,2,29,0,0,0,0,0,...,6577,551,2130,676,2012,6577,551,2130,1887,0
8009,20000,1,1,2,28,0,0,0,0,0,...,0,0,0,1650,0,0,0,0,0,0
24630,200000,1,3,1,43,0,0,0,0,0,...,3783,2522,1261,1453,1270,4000,0,0,1436,0


In [9]:
# Stack the two resampled classes row-wise into a single frame.
df_resample = pd.concat(
    objs=[df_default_resample, df_notdefault_resample],
    axis=0,
)

In [10]:
# (rows, columns) of the combined resampled frame.
df_resample.shape

(37000, 24)

In [10]:
# Separate the target column ('Default') from the feature columns.
y = df_resample['Default']
X = df_resample.drop(columns=['Default'])

In [12]:
# One-hot encode the categorical SEX and MARRIAGE columns; drop_first=True drops
# the first level of each so the remaining dummy columns are not redundant.
X_encoded = pd.get_dummies(
    data=X,
    columns=['SEX', 'MARRIAGE'],
    drop_first=True,
)

In [13]:
# Preview the encoded feature frame.
X_encoded.head()

Unnamed: 0,LIMIT_BAL,EDUCATION,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,SEX_2,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
12188,100000,2,30,0,0,0,0,0,0,57998,...,3000,5000,5305,3000,6600,0,0,1,0,0
11616,30000,3,52,3,2,2,2,2,2,26494,...,900,3974,0,600,0,0,1,0,1,0
7396,200000,2,37,0,2,0,0,0,0,2681,...,0,2192,430,430,6136,2923,1,1,0,0
14496,60000,2,56,0,0,0,0,0,0,26344,...,1446,1500,960,1000,1027,860,0,1,0,0
21580,30000,3,23,1,2,0,0,0,2,30380,...,1,1433,1500,2500,1400,1400,1,1,0,0


In [14]:
# Split the encoded features and the target into train and test sets
# (33% of the rows are held out for testing).
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.33,
    random_state=0,
)

In [15]:
# Standardise the features.
# The scaler is fitted on the training split only, and the same mean/std are then
# applied to the test split. Scaling each split with its own statistics (as
# scale(X_train) / scale(X_test) did) puts the two splits on different scales and
# uses test-set statistics during preprocessing.
# Keep `scaler`: any new data passed to a model fitted on X_train_scaled must go
# through scaler.transform first.
scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [None]:
# Baseline support-vector classifier with default hyperparameters,
# fitted on the scaled training split.
model = SVC(random_state=0)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Confusion matrix and per-class metrics of the baseline model on the test split.
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator is its replacement on newer versions.
class_names = ['Did not default', 'Default']
plot_confusion_matrix(
    model,
    X_test_scaled,
    y_test,
    display_labels=class_names,
    values_format='d',
)
report = classification_report(y_test, model.predict(X_test_scaled))
print('classification report:\n', report)

In [None]:
# Test-set accuracy of the baseline model, rounded to two decimals.
baseline_accuracy = accuracy_score(y_test, model.predict(X_test_scaled))
print("accuracy score:", round(baseline_accuracy, 2))

In [None]:
# Hyperparameter search space for GridSearchCV (used with cross-validation below).
param_grid = dict(
    C=[0.5, 1, 10, 100],
    gamma=['scale', 1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

In [None]:
# Exhaustive search over param_grid, scored by accuracy with 5-fold cross-validation.
optimal_paras = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

In [None]:
# Run the grid search on the scaled training split.
optimal_paras.fit(X=X_train_scaled, y=y_train)

In [None]:
# Show the estimator (with its hyperparameters) that the grid search selected.
optimal_paras.best_estimator_

In [None]:
# Fit a second SVC with fixed C and gamma on the scaled training split.
# NOTE(review): C=100 and gamma=1 are hard-coded — presumably copied from the grid
# search result; confirm they match optimal_paras.best_params_.
model_1 = SVC(C=100, gamma=1, random_state=0)
model_1.fit(X=X_train_scaled, y=y_train)

In [None]:
# Confusion matrix, per-class metrics and accuracy of model_1 on the test split.
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator is its replacement on newer versions.
class_names = ['Did not default', 'Default']
plot_confusion_matrix(
    model_1,
    X_test_scaled,
    y_test,
    display_labels=class_names,
    values_format='d',
)
report_1 = classification_report(y_test, model_1.predict(X_test_scaled))
print('classification report:\n', report_1)
accuracy_1 = accuracy_score(y_test, model_1.predict(X_test_scaled))
print("accuracy score:", round(accuracy_1, 2))

In [None]:
# Accuracy of model_1 on the data it was trained on (compare against the test
# accuracy to gauge overfitting).
accuracy_score(y_true=y_train, y_pred=model_1.predict(X_train_scaled))

In [None]:
# PCA keeping all components (n_components=None is the default).
pca = PCA(n_components=None)

In [None]:
# Fit the PCA on the scaled training features and project them onto the components.
X_train_pca=pca.fit_transform(X_train_scaled)

In [None]:
# Scree plot: percentage of variance explained by each principal component.
per_var = np.round(100 * pca.explained_variance_ratio_, decimals=1)
component_ids = range(1, len(per_var) + 1)
# String labels "1".."n" for the components (computed but not passed to the plot).
label = [str(component) for component in component_ids]
plt.bar(x=component_ids, height=per_var)
# Hide the x-axis ticks and tick labels.
plt.tick_params(
    axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False,
)
plt.xlabel('Principal Components')
plt.ylabel('Percentage of Explained Variance')
plt.title('Scree plot')

In [None]:
# Persist model_1 to disk with pickle.
# The `with` block closes the file on exit, so no explicit close() is needed.
# model_1 was fitted on X_train_scaled, so inputs given to the reloaded model
# must be scaled the same way before calling predict.
with open('SVM_model.sav','wb') as f:
    pickle.dump(model_1,f)

In [None]:
# Sample input for checking the saved model: the first 19 rows of the scaled
# training data.
x = X_train_scaled[:19]

In [4]:
# Reload the pickled model from disk.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('SVM_model.sav', mode='rb') as model_file:
    model_ = pickle.load(model_file)

In [None]:
# Predict with the reloaded model on the sample rows to confirm it works after the round trip.
model_.predict(x)

In [5]:
import json

In [21]:
# Write the lower-cased feature column names to columns.json.
# NOTE(review): the names come from X (before one-hot encoding), whereas the models
# above were fitted on X_encoded's columns — confirm which list the consumer of
# this file expects.
columns = {'data_columns': [name.lower() for name in X.columns]}
with open("columns.json", 'w') as out_file:
    json.dump(columns, out_file)