In [15]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [3]:
B = pd.read_csv('bank.csv')
B.head()

Unnamed: 0,age,job,marital,education,default,balance,housing-loan,personal-loan,current-campaign,previous-campaign,subscribed
0,30,unemployed,married,primary,no,1787,no,no,1,0,no
1,33,services,married,secondary,no,4789,yes,yes,1,4,no
2,35,management,single,tertiary,no,1350,yes,no,1,1,no
3,30,management,married,tertiary,no,1476,yes,yes,4,0,no
4,59,blue-collar,married,secondary,no,0,yes,no,1,0,no


In [4]:
B.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                4521 non-null   int64 
 1   job                4521 non-null   object
 2   marital            4521 non-null   object
 3   education          4521 non-null   object
 4   default            4521 non-null   object
 5   balance            4521 non-null   int64 
 6   housing-loan       4521 non-null   object
 7   personal-loan      4521 non-null   object
 8   current-campaign   4521 non-null   int64 
 9   previous-campaign  4521 non-null   int64 
 10  subscribed         4521 non-null   object
dtypes: int64(4), object(7)
memory usage: 388.6+ KB


In [5]:
B['subscribed'].value_counts()

no     4000
yes     521
Name: subscribed, dtype: int64

In [6]:
from sklearn.utils import resample

In [7]:
bsn = B[B['subscribed']=='no']
bsy = B[B['subscribed'] =='yes']

In [8]:
BMU = resample(bsy, replace=True, n_samples=2000, random_state=42)   # bank minority unsampled

In [9]:
NB = pd.concat([bsn, BMU])         # new bank
NB['subscribed'].value_counts()


no     4000
yes    2000
Name: subscribed, dtype: int64

In [10]:
# NB = shuffle(NB)

In [11]:
Xf = list(NB.columns)
Xf.remove('subscribed')
Xf

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing-loan',
 'personal-loan',
 'current-campaign',
 'previous-campaign']

In [12]:
EB = pd.get_dummies(NB[Xf], drop_first=True)
X = EB
X

Unnamed: 0,age,balance,current-campaign,previous-campaign,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,...,job_unemployed,job_unknown,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,default_yes,housing-loan_yes,personal-loan_yes
0,30,1787,1,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
1,33,4789,1,4,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,1
2,35,1350,1,1,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,1,0
3,30,1476,4,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,1
4,59,0,1,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,35,7050,3,4,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1177,28,4579,2,0,0,0,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
3498,58,462,1,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4366,59,0,1,0,0,0,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0


In [13]:
Y = NB['subscribed'].map(lambda x: int(x=='yes'))

In [14]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y,test_size=0.3, random_state=42)

In [16]:
model = SVC(kernel='rbf', random_state=101, gamma=0.001, C=.1, probability=True)
model.fit(Xtrain, Ytrain)

SVC(C=0.1, gamma=0.001, probability=True, random_state=101)

In [None]:
perf mat

In [17]:
predy = model.predict(Xtest)

In [19]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

In [20]:
print(confusion_matrix(Ytest,predy))

[[1220    5]
 [ 558   17]]


In [24]:
print(classification_report(Ytest,predy))

              precision    recall  f1-score   support

           0       0.69      1.00      0.81      1225
           1       0.77      0.03      0.06       575

    accuracy                           0.69      1800
   macro avg       0.73      0.51      0.43      1800
weighted avg       0.71      0.69      0.57      1800



In [23]:
auc1 = metrics.roc_auc_score(Ytest,model.predict_proba(Xtest)[:,1])
auc1

0.7146981366459627

### Performance Metrics

In [25]:
from sklearn.model_selection import GridSearchCV
params = {'gamma': [ .001,.01,.1,1,1.5,2], 'C':[.01,.1,1,2,3]}
modelcv = GridSearchCV(estimator=SVC(), param_grid=params, cv=10, scoring='roc_auc')
modelcv.fit(Xtrain, Ytrain)

GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': [0.01, 0.1, 1, 2, 3],
                         'gamma': [0.001, 0.01, 0.1, 1, 1.5, 2]},
             scoring='roc_auc')

In [26]:
modelcv.best_params_

{'C': 0.1, 'gamma': 2}

In [27]:
modelcv.best_score_

0.9598611046843359

### model with best params

In [28]:
model1 = SVC(kernel='rbf', random_state=101, gamma=2, C=.1, probability=True)
model1.fit(Xtrain, Ytrain)

SVC(C=0.1, gamma=2, probability=True, random_state=101)

In [29]:
predy = model1.predict(Xtest)

In [30]:
print(confusion_matrix(Ytest,predy))

[[1225    0]
 [ 575    0]]


In [31]:
print(classification_report(Ytest,predy))

              precision    recall  f1-score   support

           0       0.68      1.00      0.81      1225
           1       0.00      0.00      0.00       575

    accuracy                           0.68      1800
   macro avg       0.34      0.50      0.40      1800
weighted avg       0.46      0.68      0.55      1800



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
auc = metrics.roc_auc_score(Ytest,model1.predict_proba(Xtest)[:,1])
auc

0.9763251109139306

In [None]:
Support Vectors

In [36]:
model1.support_vectors_        #support vectors

array([[2.500e+01, 7.200e+01, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.900e+01, 0.000e+00, 4.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [4.800e+01, 3.064e+03, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [5.900e+01, 2.013e+03, 4.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.300e+01, 3.935e+03, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [6.300e+01, 1.490e+03, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [35]:
model1.support_   # indices

array([   1,    2,    3, ..., 4196, 4197, 4198])

In [37]:
model1.n_support_

array([2773, 1425])