# Support Vector Classifier

### Importing Libraries

In [1]:
#importing libraries 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from sklearn.svm import SVC

### Importing the data

In [2]:
data=pd.read_csv('Loan data cleaned.csv')

In [3]:
#shape of the data
data.shape

(4368, 64)

In [4]:
#first five rows of the data
data.head()

Unnamed: 0,loannumber,loanamount,totaldue,termdays,good_bad_flag,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,...,ploan_additional charges_50%_x,ploan_additional charges_75%_x,ploan_additional charges_max_x,ploan_additional charges_mean_y,ploan_additional charges_min_y,ploan_additional charges_25%_y,ploan_additional charges_50%_y,ploan_additional charges_75%_y,ploan_additional charges_max_y,not_repaid_percentage
0,12,30000.0,34500.0,30,Good,Other,3.43201,6.433055,Diamond Bank,Permanent,...,3800.0,4500.0,4500.0,3900.0,3000.0,3400.0,3800.0,4500.0,4500.0,64.0
1,2,15000.0,17250.0,30,Good,Savings,3.885298,7.3207,GT Bank,Permanent,...,2665.558293,3037.525723,3543.851778,2730.359972,2102.661161,2364.738902,2665.558293,3037.525723,3543.851778,63.573297
2,7,20000.0,22250.0,15,Good,Other,11.13935,10.292041,EcoBank,Permanent,...,1500.0,1500.0,3000.0,1750.0,1500.0,1500.0,1500.0,1500.0,3000.0,50.0
3,3,10000.0,11500.0,15,Good,Savings,3.98577,7.491708,First Bank,Permanent,...,2250.0,2625.0,3000.0,2250.0,1500.0,1875.0,2250.0,2625.0,3000.0,0.0
4,9,40000.0,44000.0,30,Good,Other,7.457913,9.076574,GT Bank,Permanent,...,3800.0,5100.0,9000.0,4800.0,3000.0,3000.0,3800.0,5100.0,9000.0,100.0


In [5]:
#checking missing values in the data
data.isnull().sum()

loannumber                        0
loanamount                        0
totaldue                          0
termdays                          0
good_bad_flag                     0
                                 ..
ploan_additional charges_25%_y    0
ploan_additional charges_50%_y    0
ploan_additional charges_75%_y    0
ploan_additional charges_max_y    0
not_repaid_percentage             0
Length: 64, dtype: int64

In [6]:
#make dummies of categorical columns('bank_account_type','bank_name_clients','employment_status_clients')
data=pd.get_dummies(data=data,columns=['bank_account_type','bank_name_clients','employment_status_clients'],drop_first=True)

### DEFAULT MODEL

In [7]:
x = data.drop(['good_bad_flag'], axis=1)
y = data['good_bad_flag'].replace({'Good':0, 'Bad':1})

In [8]:
#importing train_test_split
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state = 101, stratify=y, test_size=0.15)

In [10]:
#Scalig the train and testing datasets
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(X_train)
X_train = pd.DataFrame(x_train_scaled, columns=X_train.columns)

x_test_scaled = scaler.transform(X_test)
X_test = pd.DataFrame(x_test_scaled, columns=X_test.columns)

In [11]:
# from sklearn.svm import SVC
svc = SVC(class_weight='balanced')
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [12]:
predict_train=svc.predict(X_train)

In [13]:
f1_train=f1_score(y_train,predict_train)

In [14]:
f1_train

0.5704793545325108

In [15]:
predict_test=svc.predict(X_test)

In [16]:
f1_test=f1_score(y_test,predict_test)

In [17]:
f1_test

0.4545454545454546

In [19]:
print('\033[1m',"Classification Report for training data",'\033[0m')
print(classification_report(predict_train, y_train))

[1m Classification Report for training data [0m
              precision    recall  f1-score   support

           0       0.76      0.91      0.83      2414
           1       0.74      0.46      0.57      1298

    accuracy                           0.76      3712
   macro avg       0.75      0.69      0.70      3712
weighted avg       0.75      0.76      0.74      3712



In [20]:
print('\033[1m',"Classification Report for test data",'\033[0m')
print(classification_report(predict_test, y_test))

[1m Classification Report for test data [0m
              precision    recall  f1-score   support

           0       0.78      0.86      0.82       469
           1       0.52      0.40      0.45       187

    accuracy                           0.73       656
   macro avg       0.65      0.63      0.64       656
weighted avg       0.71      0.73      0.71       656



#### SVM with LINEAR

In [38]:
svc_linear= SVC(kernel='linear',class_weight='balanced')

In [39]:
svc_linear.fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [41]:
predict_train=svc_linear.predict(X_train)

In [42]:
predict_test=svc_linear.predict(X_test)

In [43]:
f1_train=f1_score(y_train,predict_train)

In [44]:
f1_test=f1_score(y_test,predict_test)

In [45]:
f1_train,f1_test

(0.4645522388059702, 0.44314868804664725)

####  SVM with polynomial kernel

In [18]:
svc_poly= SVC(kernel='poly',class_weight='balanced')

In [19]:
svc_poly.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [20]:
predict_train=svc_poly.predict(X_train)

In [21]:
f1_train=f1_score(y_train,predict_train)

In [22]:
f1_train

0.5741254858411992

In [23]:
predict_test=svc_poly.predict(X_test)

In [24]:
f1_test=f1_score(y_test,predict_test)

In [25]:
f1_test

0.36363636363636365

#### SVM with 'rbf' / gaussian kernel

In [27]:
svc_rbf= SVC(kernel = 'rbf',class_weight='balanced')
svc_rbf.fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [31]:
predict_train=svc_rbf.predict(X_train)

In [32]:
f1_train=f1_score(y_train,predict_train)

In [33]:
f1_train

0.5704793545325108

In [35]:
predict_test=svc_rbf.predict(X_test)

In [36]:
f1_test=f1_score(y_test,predict_test)

In [37]:
f1_test

0.4545454545454546

### USING RBF FOR FURTHER ANALYSIS

#### Change with respect to gamma ( C = constant)

In [14]:
g = [0.001,0.01,0.1,0.5,0.9,1,10,15,100,1000]
train_f1=[]
test_f1=[]
for i in range(len(g)):
    svc = SVC(kernel = 'rbf', gamma = g[i],class_weight='balanced')
    svc.fit(X_train, y_train)
    predict_train=svc.predict(X_train)
    f1_train=f1_score(y_train,predict_train)
    predict_test=svc.predict(X_test)
    f1_test=f1_score(y_test,predict_test)
    train_f1.append(f1_train)
    test_f1.append(f1_test)

In [17]:
frame=pd.DataFrame({'Gamma':g,'train_f1':train_f1,'test_f1':test_f1})

In [18]:
frame

Unnamed: 0,Gamma,train_f1,test_f1
0,0.001,0.454685,0.448598
1,0.01,0.55455,0.456456
2,0.1,0.805277,0.344569
3,0.5,0.957561,0.079096
4,0.9,0.985348,0.025641
5,1.0,0.98959,0.013072
6,10.0,1.0,0.0
7,15.0,1.0,0.0
8,100.0,1.0,0.0
9,1000.0,1.0,0.0


**AT 0.01** F1 score is 0.4564 

#### Change with respect to C ( gamma = constant)

In [27]:
train_f1=[]
test_f1=[]
c =[1,10,100,1000,1500,2000,5000,10000,100000]
for i in range(len(c)):
    svc = SVC(kernel = 'rbf', C = c[i],gamma=0.01,class_weight='balanced')
    svc.fit(X_train, y_train)
    predict_train=svc.predict(X_train)
    f1_train=f1_score(y_train,predict_train)
    predict_test=svc.predict(X_test)
    f1_test=f1_score(y_test,predict_test)
    train_f1.append(f1_train)
    test_f1.append(f1_test)

In [28]:
frame1=pd.DataFrame({'Cost':c,'train_f1':train_f1,'test_f1':test_f1})

In [29]:
frame1

Unnamed: 0,Cost,train_f1,test_f1
0,1,0.55455,0.456456
1,10,0.667629,0.422619
2,100,0.78741,0.364198
3,1000,0.892515,0.333333
4,1500,0.908467,0.322368
5,2000,0.916619,0.315789
6,5000,0.950178,0.318644
7,10000,0.964029,0.327645
8,100000,0.990803,0.310345


# GRID SEARCH

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [12]:
svc = SVC(kernel='rbf',class_weight='balanced',random_state=101)

In [18]:
svc_para = {'gamma':[0.001,0.01,0.1,0.5,0.9,1,10,15,100,1000],'C':[1,10,100,1000,1500,2000,5000,10000,100000]}
clf = GridSearchCV(estimator=svc,param_grid=svc_para,scoring='f1',cv=StratifiedKFold(10))
clf=clf.fit(X_train, y_train)

In [19]:
clf.best_params_

{'C': 1, 'gamma': 0.001}