With a suitable kernel, an SVM can stay accurate even as the data becomes more complex. 

SVMs can achieve high accuracy because kernels let them separate classes in a higher-dimensional feature space.

We need to choose the best Kernel according to our need.

The linear kernel is mostly preferred for text classification problems as it performs well for large datasets.
Gaussian kernels tend to give good results when no prior knowledge about the data is available.
The RBF kernel is a kind of Gaussian kernel: it implicitly projects the data into a higher-dimensional space and then searches for a linear separation there.
Polynomial kernels give good results for problems where all the training data is normalized.

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the engineered feature table from CSV and preview its first five rows.
# NOTE(review): absolute path (looks like a Colab working directory) — consider
# a configurable data directory so the notebook runs elsewhere.
df_engineered = pd.read_csv(
    filepath_or_buffer='/content/engineered_df_without_category_interactions.csv',
)
df_engineered.head(n=5)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,BMI,MAP,age height,age weight,age ap_hi,...,gluc_1,gluc_2,gluc_3,smoke_0,smoke_1,alco_0,alco_1,active_0,active_1,cardio
0,-0.433788,0.48811,-0.888124,-0.992845,-0.124219,-1.112179,-0.569948,-0.211736,0.385257,0.430684,...,1,0,0,1,0,1,0,0,1,0
1,0.309455,-1.100511,0.969301,0.825095,0.941111,1.707131,0.952717,-0.340559,0.299955,0.25533,...,1,0,0,1,0,1,0,0,1,1
2,-0.245851,0.090955,-0.726609,0.219115,-1.189548,-0.776977,-0.569948,-0.022361,0.178637,-0.05387,...,1,0,0,1,0,1,0,1,0,1
3,-0.745666,0.620495,0.727028,1.431075,2.00644,0.354575,1.866298,-0.462682,-0.54212,-1.067104,...,1,0,0,1,0,1,0,0,1,1
4,-0.806016,-1.100511,-1.37267,-1.598825,-2.254877,-0.885085,-2.092613,0.88703,1.106394,1.288679,...,1,0,0,1,0,1,0,1,0,0


In [None]:
# Read the cleaned (non-engineered) table from CSV and preview its first five rows.
# NOTE(review): absolute path (looks like a Colab working directory) — consider
# a configurable data directory so the notebook runs elsewhere.
df = pd.read_csv(
    filepath_or_buffer='/content/cleaned_df.csv',
)
df.head(n=5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI,MAP
0,18393,2,168,62.0,110,80,1,1,0,0,1,0,21.967,90.0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1,34.928,106.667
2,18857,1,165,64.0,130,70,3,1,0,0,0,1,23.508,90.0
3,17623,2,169,82.0,150,100,1,1,0,0,1,1,28.71,116.667
4,17474,1,156,56.0,100,60,1,1,0,0,0,0,23.011,73.333


In [None]:
# A column is treated as continuous when it holds more than 25 distinct values.
continuous_features = [
    column
    for column in df.columns
    if len(pd.unique(df[column])) > 25
]
print('Continuous Values are : {}'.format(continuous_features))

Continuous Values are : ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'BMI', 'MAP']


In [None]:
# Every column that was not classified as continuous is handled as categorical.
categorical_features = [
    column for column in df.columns if column not in continuous_features
]

# Cast those columns to the pandas `category` dtype, in place on `df`.
df[categorical_features] = df[categorical_features].astype(
    {column: "category" for column in categorical_features}
)

In [None]:
# Verify the cast: the categorical features should now report the `category` dtype.
df.dtypes

age               int64
gender         category
height            int64
weight          float64
ap_hi             int64
ap_lo             int64
cholesterol    category
gluc           category
smoke          category
alco           category
active         category
cardio         category
BMI             float64
MAP             float64
dtype: object

In [None]:
# Summary statistics, with the quartiles spelled out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,height,weight,ap_hi,ap_lo,BMI,MAP
count,66489.0,66489.0,66489.0,66489.0,66489.0,66489.0,66489.0
mean,19463.983275,164.312954,72.997406,126.384124,81.166012,27.079937,96.238615
std,2468.911974,7.55372,12.382733,16.502193,9.38677,4.597225,10.945937
min,10798.0,143.0,40.0,60.0,30.0,13.521,46.667
25%,17657.0,159.0,65.0,120.0,80.0,23.828,93.333
50%,19702.0,165.0,71.0,120.0,80.0,26.219,93.333
75%,21323.0,170.0,81.0,140.0,90.0,29.758,103.333
max,23713.0,186.0,107.0,240.0,182.0,50.892,186.667


In [None]:
# List the column names of `df`.
df.columns

Index(['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol',
       'gluc', 'smoke', 'alco', 'active', 'cardio', 'BMI', 'MAP'],
      dtype='object')

In [None]:
# Separate the target column (`cardio`) from the predictors of the cleaned frame.
y = df['cardio']
X = df.drop(columns=['cardio'])

In [None]:
# Separate the target column (`cardio`) from the predictors of the engineered frame.
y_engineered = df_engineered['cardio']
X_engineered = df_engineered.drop(columns=['cardio'])

In [None]:
# 80/20 shuffled split of the cleaned (non-engineered) features.
# The results get their own `_raw` names: the split of the engineered features
# that follows assigns X_train / X_test / y_train / y_test, which previously
# overwrote this split immediately and left it unusable.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
    random_state=0,  # fixed seed so the split is reproducible
)

In [None]:
# 80/20 shuffled split of the engineered features, with a fixed seed for
# reproducibility. These are the X_train / X_test / y_train / y_test used by
# the model cells.
X_train, X_test, y_train, y_test = train_test_split(
    X_engineered,
    y_engineered,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
    shuffle=True,
)

In [None]:
# Count missing values per training feature (`isna` is the same check as `isnull`).
X_train.isna().sum()

age              0
height           0
weight           0
ap_hi            0
ap_lo            0
BMI              0
MAP              0
age height       0
age weight       0
age ap_hi        0
age ap_lo        0
age BMI          0
age MAP          0
height weight    0
height ap_hi     0
height ap_lo     0
height MAP       0
weight ap_hi     0
weight ap_lo     0
weight MAP       0
ap_hi ap_lo      0
ap_hi BMI        0
ap_lo BMI        0
BMI MAP          0
gender_1         0
gender_2         0
cholesterol_1    0
cholesterol_2    0
cholesterol_3    0
gluc_1           0
gluc_2           0
gluc_3           0
smoke_0          0
smoke_1          0
alco_0           0
alco_1           0
active_0         0
active_1         0
dtype: int64

In [None]:
# Count missing values in the training target.
y_train.isna().sum()

0

In [None]:
# Distinct class labels present in the training target.
np.unique(y_train.to_numpy())

array([0, 1])

In [None]:
# Fit an RBF-kernel SVC with hand-set C and gamma on the training split.
# NOTE(review): gamma is 0.001 here, while the grid-search summary in this
# notebook reports gamma=0.0001 as the best value — confirm which is intended.
model = SVC(kernel='rbf', C=100, gamma=0.001, random_state=0)
model.fit(X_train, y_train)

# Score the held-out test split and print the per-class report.
predictions = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.70      0.79      0.75      6787
           1       0.75      0.66      0.70      6511

    accuracy                           0.72     13298
   macro avg       0.73      0.72      0.72     13298
weighted avg       0.73      0.72      0.72     13298



In [None]:
# Baseline: fit an SVC that keeps scikit-learn's defaults (only the seed is set).
model = SVC(random_state=0)
model.fit(X_train, y_train)

# Score the held-out test split and print the per-class report.
predictions = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.60      0.68      0.64      6787
           1       0.62      0.54      0.57      6511

    accuracy                           0.61     13298
   macro avg       0.61      0.61      0.61     13298
weighted avg       0.61      0.61      0.61     13298



## Before hyperparameter tuning
72% test accuracy with the hand-set `C=100, gamma=0.001`; 61% with scikit-learn's default SVC parameters.

In [None]:
# Show the hyper-parameters of the most recently fitted `model`.
# No kernel was passed to that SVC, so scikit-learn's default ('rbf') applies.
model.get_params(deep=True)

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

When training an SVM with the Radial Basis Function (RBF) kernel, two parameters must be considered: C and gamma. The parameter C, common to all SVM kernels, trades off misclassification of training examples against simplicity of the decision surface. A low C makes the decision surface smooth, while a high C aims at classifying all training examples correctly. gamma defines how much influence a single training example has. The larger gamma is, the closer other examples must be to be affected.

In [None]:
# Accuracy of the tuned model on the held-out test split.
# `grid` is not created anywhere in this notebook (the search is referenced in
# SVM_project_hyperparam_tuning.ipynb), so on a fresh kernel this cell raised
# NameError. Rebuild the best estimator that search reported —
# SVC(C=100, gamma=0.0001, random_state=0) — and fit it here instead.
grid = SVC(C=100, gamma=0.0001, random_state=0)
grid.fit(X_train, y_train)

# Kept under the original name because the classification-report cell reads it.
grid_svc = grid.predict(X_test)
accuracy_score(y_test, grid_svc)

In [None]:
# Per-class precision / recall / F1 of the tuned model on the test split.
print(classification_report(y_true=y_test, y_pred=grid_svc))

## Best parameters using GridSearchCV

Best Parameters of GridSearchCV for SVM Model: {'C': 100, 'gamma': 0.0001}

Best Estimator of GridSearchCV for SVM Model: SVC(C=100, gamma=0.0001, random_state=0)

In [None]:
# See SVM_project_hyperparam_tuning.ipynb (presumably where the GridSearchCV run that produced the parameters above lives — confirm).