In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
bank_df = pd.read_csv('bank.csv') 

In [3]:
bank_df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing-loan,personal-loan,current-campaign,previous-campaign,subscribed
0,30,unemployed,married,primary,no,1787,no,no,1,0,no
1,33,services,married,secondary,no,4789,yes,yes,1,4,no
2,35,management,single,tertiary,no,1350,yes,no,1,1,no
3,30,management,married,tertiary,no,1476,yes,yes,4,0,no
4,59,blue-collar,married,secondary,no,0,yes,no,1,0,no


In [4]:
bank_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                4521 non-null   int64 
 1   job                4521 non-null   object
 2   marital            4521 non-null   object
 3   education          4521 non-null   object
 4   default            4521 non-null   object
 5   balance            4521 non-null   int64 
 6   housing-loan       4521 non-null   object
 7   personal-loan      4521 non-null   object
 8   current-campaign   4521 non-null   int64 
 9   previous-campaign  4521 non-null   int64 
 10  subscribed         4521 non-null   object
dtypes: int64(4), object(7)
memory usage: 388.6+ KB


In [5]:
bank_df['subscribed'].value_counts()

no     4000
yes     521
Name: subscribed, dtype: int64

In [6]:
## Importing resample from *sklearn.utils* package.
from sklearn.utils import resample

In [7]:
# Separate the case of yes-subscribes and no-subscribes
bank_subscribed_no = bank_df[bank_df['subscribed'] == 'no']
bank_subscribed_yes = bank_df[bank_df['subscribed'] == 'yes']

In [8]:
##Upsample the yes-subscribed cases.
df_minority_upsampled = resample(bank_subscribed_yes, replace=True, n_samples=2000, random_state=42)

In [9]:
# Combine majority class with upsampled minority class
new_bank_df = pd.concat([bank_subscribed_no, df_minority_upsampled])

In [10]:
new_bank_df['subscribed'].value_counts()

no     4000
yes    2000
Name: subscribed, dtype: int64

In [11]:
# Assigning list of all column names in the DataFrame
X_features = list(new_bank_df.columns)
# Remove the response variable from the list
X_features.remove('subscribed')
X_features

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing-loan',
 'personal-loan',
 'current-campaign',
 'previous-campaign']

In [12]:
## get_dummies() will convert all the columns with data type as 
## objects

encoded_bank_df = pd.get_dummies(new_bank_df[X_features], drop_first = True )

X = encoded_bank_df

In [13]:
# X

In [14]:
# Encoding the subscribed column and assigning to Y
Y = new_bank_df['subscribed'].map(lambda x: int(x =='yes'))

In [15]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, Y, test_size=0.3, random_state=42)

In [16]:
# Create support vector classifier
model = SVC(kernel='rbf', random_state=101, gamma=0.001, C=0.1, probability=True)

# Train the classifier
model.fit(train_X, train_y)

SVC(C=0.1, gamma=0.001, probability=True, random_state=101)

### Predictions and Performance Metrics

In [17]:
pred_y = model.predict(test_X)

In [18]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

In [19]:
print(confusion_matrix(test_y,pred_y))

[[1220    5]
 [ 558   17]]


In [20]:
print(classification_report(test_y,pred_y))

              precision    recall  f1-score   support

           0       0.69      1.00      0.81      1225
           1       0.77      0.03      0.06       575

    accuracy                           0.69      1800
   macro avg       0.73      0.51      0.43      1800
weighted avg       0.71      0.69      0.57      1800



In [21]:
metrics.roc_auc_score(test_y, model.predict_proba(test_X)[:,1])

0.7146981366459627

### Grid Search CV

In [22]:
from sklearn.model_selection import GridSearchCV

## Creating a dictionary with hyperparameters and possible values for searching

params =  {'gamma': [0.001, 0.01, 0.1, 1, 1.5, 2.0], 'C': [0.01, 0.1]}

## Configuring grid search

modelCV = GridSearchCV(estimator=SVC(),  param_grid=params, cv=10,  scoring='roc_auc')

## fit the search with training set
modelCV.fit(train_X, train_y)

GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': [0.01, 0.1],
                         'gamma': [0.001, 0.01, 0.1, 1, 1.5, 2.0]},
             scoring='roc_auc')

In [23]:
modelCV.best_params_

{'C': 0.1, 'gamma': 2.0}

In [24]:
modelCV.best_score_

0.9598611046843359

### Model with best params

In [25]:
# Create support vector classifier
model = SVC(kernel='rbf', random_state=101, gamma=2, C=0.1, probability=True)

# Train the classifier
model.fit(train_X, train_y)

SVC(C=0.1, gamma=2, probability=True, random_state=101)

In [26]:
pred_y = model.predict(test_X)

In [27]:
metrics.roc_auc_score(test_y, model.predict_proba(test_X)[:,1])

0.9763251109139306

### Support Vectors

In [28]:
#Identifying Support Vectors

In [29]:
# Viewing Support Vectors
model.support_vectors_

array([[2.500e+01, 7.200e+01, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.900e+01, 0.000e+00, 4.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [4.800e+01, 3.064e+03, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [5.900e+01, 2.013e+03, 4.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.300e+01, 3.935e+03, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [6.300e+01, 1.490e+03, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [30]:
# view the indices of the support vectors
model.support_

array([   1,    2,    3, ..., 4196, 4197, 4198])

In [31]:
# use n_support_ to find the number of support vectors belonging to each class
model.n_support_

array([2773, 1425])