In [55]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
import os
import pandas_profiling as pp
# Global Seaborn styling applied to every plot drawn in this notebook.
sb.set() # set the default Seaborn style for graphics
sb.set_style("darkgrid")  # switch to the "darkgrid" style
sb.set_context("poster", font_scale = .5, rc={"grid.linewidth": 0.6})  # "poster" context, font scale 0.5, grid line width 0.6

# For Support Vector Machine
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler

# Data Preparation

Features:

- Age | Objective Feature | age | int (days)
- Height | Objective Feature | height | int (cm)
- Weight | Objective Feature | weight | float (kg)
- Gender | Objective Feature | gender | categorical code
- Systolic blood pressure | Examination Feature | ap_hi | int
- Diastolic blood pressure | Examination Feature | ap_lo | int
- Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal
- Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal
- Smoking | Subjective Feature | smoke | binary
- Alcohol intake | Subjective Feature | alco | binary
- Physical activity | Subjective Feature | active | binary
- Presence or absence of cardiovascular disease | Target Variable | cardio | binary

In [56]:
# Load the raw dataset; the file is semicolon-separated.
cardio = pd.read_csv('cardio_train.csv', sep=';')
cardio.head(10)  # preview the first ten rows

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,8,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,9,22113,1,157,93.0,130,80,3,1,0,0,1,0
7,12,22584,2,178,95.0,130,90,3,3,0,0,1,1
8,13,17668,1,158,71.0,110,70,1,1,0,0,1,0
9,14,19834,1,164,68.0,110,60,1,1,0,0,0,0


In [57]:
# Sanity-check the loaded object: its Python type and its (rows, columns) shape.
print("Data type : ", type(cardio))
print("Data dims : ", cardio.shape)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (70000, 13)


In [58]:
# Work on a copy so the raw frame `cardio` is left unmodified by the cleaning steps.
cardio_data = cardio.copy()

In [59]:
# NOTE(review): drop_duplicates() runs while the `id` column is still present;
# the describe() output further down still reports 70000 rows, so no rows were
# removed here. If the intent is to drop rows with identical feature values,
# this call probably needs to come after `id` is dropped -- confirm.
cardio_data = cardio_data.drop_duplicates()
# Drop the id column as it does not help the model
cardio_data = cardio_data.drop(columns='id')

# Convert age from days into whole years (floor division by 365)
cardio_data['age'] = cardio_data['age']//365

# Combine height & weight into BMI = weight [kg] / (height [m])^2;
# height is stored in cm, hence the division by 100
cardio_data['bmi'] = cardio_data["weight"] / (cardio_data["height"]/100)**2
cardio_data = cardio_data.drop(columns=['weight', 'height'])

In [60]:
# Columns that hold categorical codes (this includes the target, `cardio`)
cat_attribs = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

# Columns that hold numerical measurements
num_attribs = ['age', 'bmi', 'ap_hi', 'ap_lo']

# Cast every categorical column to the pandas "category" dtype in a single call
cardio_data = cardio_data.astype({col: 'category' for col in cat_attribs})

# Summary statistics, one row per column (only the non-categorical columns are summarised)
cardio_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,70000.0,52.840671,6.766774,29.0,48.0,53.0,58.0,64.0
ap_hi,70000.0,128.817286,154.011419,-150.0,120.0,120.0,140.0,16020.0
ap_lo,70000.0,96.630414,188.47253,-70.0,80.0,80.0,90.0,11000.0
bmi,70000.0,27.556513,6.091511,3.471784,23.875115,26.374068,30.222222,298.666667


In [61]:
# Convert gender from its original codes {1, 2} into a single 0/1 indicator column.
# (The original comment labels the codes as Female:1, Male:2.)
dummygen = pd.get_dummies(cardio_data["gender"])
# Drop the indicator for code 2 and keep the indicator for code 1.
# NOTE(review): the kept column is 1 where the original code was 1 and 0 where
# it was 2, i.e. Female:1 / Male:0 under the labelling above -- the opposite of
# the "Female:0 Male:1" mapping this cell originally claimed. Confirm which
# encoding is intended.
dummygen = dummygen.drop([2], axis=1)
cardio_data = pd.concat((dummygen, cardio_data), axis=1)  # indicator becomes the first column
cardio_data = cardio_data.drop(["gender"], axis=1)  # remove the original categorical column
cardio_data = cardio_data.rename(columns={1:"gender"})  # the indicator (named 1) takes over the name "gender"
cardio_data.sample(5)

Unnamed: 0,gender,age,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
51827,1,55,120,80,3,3,0,0,1,0,27.239224
6164,1,59,120,79,1,1,0,0,1,0,31.487889
26003,1,54,120,80,1,1,0,0,1,0,24.977043
23828,0,54,130,90,3,1,0,0,1,1,29.053288
25495,1,41,140,80,1,1,0,0,1,1,24.973985


In [62]:
# Filter out implausible blood-pressure readings.
# Rows are removed when systolic (ap_hi) is above 220 or below 90, or when
# diastolic (ap_lo) is above 120 or below 60.
filter_max = ((cardio_data["ap_hi"] > 220) | (cardio_data["ap_lo"] > 120))
filter_min = ((cardio_data["ap_hi"] < 90) | (cardio_data["ap_lo"] < 60))

# Both masks are built on the same frame, so apply them in one step. Applying
# them one after the other indexes the already-filtered frame with a mask that
# still carries the unfiltered index, which makes pandas reindex the mask and
# emit a warning. The selected rows are the same either way.
cardio_data = cardio_data[~(filter_max | filter_min)]
len(cardio_data)

  cardio_data = cardio_data[~filter_min]


68452

In [63]:
# Treat BMI values below 10 or above 80 as outliers and remove those rows
bmi = cardio_data['bmi']
bmi_outlier = (bmi < 10) | (bmi > 80)
cardio_data = cardio_data[~bmi_outlier]

In [64]:
# Create dummy (one-hot) variables for the non-binary categorical features.
# Both columns were cast to the "category" dtype earlier; the result has one
# indicator column per level, named <column>_<level>.
dummies = pd.get_dummies(cardio_data[['cholesterol','gluc']])
dummies.head(10)

Unnamed: 0,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3
0,1,0,0,1,0,0
1,0,0,1,1,0,0
2,0,0,1,1,0,0
3,1,0,0,1,0,0
4,1,0,0,1,0,0
5,0,1,0,0,1,0
6,0,0,1,1,0,0
7,0,0,1,0,0,1
8,1,0,0,1,0,0
9,1,0,0,1,0,0


In [65]:
# Readable names for the one-hot columns:
# n = normal, an = above normal, wan = well above normal
dummy_names = {
    'cholesterol_1': 'cholesterol_n',
    'cholesterol_2': 'cholesterol_an',
    'cholesterol_3': 'cholesterol_wan',
    'gluc_1': 'gluc_n',
    'gluc_2': 'gluc_an',
    'gluc_3': 'gluc_wan',
}

# Append the dummies, drop the columns they replace, then rename
cardio_data = (
    pd.concat([cardio_data, dummies], axis=1)
    .drop(['cholesterol', 'gluc'], axis=1)
    .rename(columns=dummy_names)
)


In [67]:
# Spot-check ten random rows (no random_state is passed, so the rows differ on every run).
cardio_data.sample(10)

Unnamed: 0,gender,age,ap_hi,ap_lo,smoke,alco,active,cardio,bmi,cholesterol_n,cholesterol_an,cholesterol_wan,gluc_n,gluc_an,gluc_wan
56763,0,54,130,80,0,0,1,0,26.573129,1,0,0,1,0,0
18704,0,64,130,90,0,0,1,0,28.905076,1,0,0,0,1,0
63285,1,60,110,70,0,0,1,1,23.91883,0,0,1,0,0,1
30690,1,45,110,70,0,0,1,0,26.775934,1,0,0,1,0,0
2545,1,59,140,90,0,0,1,1,26.709402,1,0,0,1,0,0
47234,0,57,140,80,0,0,1,1,23.120624,1,0,0,1,0,0
2215,0,56,120,80,1,0,1,0,26.42357,1,0,0,1,0,0
36417,0,49,130,90,0,0,1,0,18.314459,1,0,0,1,0,0
21945,1,51,140,110,0,0,1,1,27.815882,1,0,0,1,0,0
33291,0,42,120,80,0,0,0,1,26.218821,1,0,0,1,0,0


# Conduct feature scaling

In [68]:
# Standardise the numerical features with StandardScaler.
# NOTE(review): the scaler is fitted on every row before the train/test split
# further down, so statistics of the future test rows influence the scaling
# (data leakage). Fitting on the training rows only would avoid this.
to_be_scaled_feat = ['age', 'ap_hi', 'ap_lo','bmi']
scaler = StandardScaler()
cardio_data[to_be_scaled_feat] = scaler.fit_transform(cardio_data[to_be_scaled_feat])

In [69]:
# Spot-check ten random rows after scaling (no random_state is passed, so the rows differ on every run).
cardio_data.sample(10)

Unnamed: 0,gender,age,ap_hi,ap_lo,smoke,alco,active,cardio,bmi,cholesterol_n,cholesterol_an,cholesterol_wan,gluc_n,gluc_an,gluc_wan
56853,1,0.320342,-0.405622,-0.144666,0,0,1,0,-0.768534,1,0,0,1,0,0
66052,1,0.468106,-0.405622,-0.144666,0,0,1,1,-0.484022,1,0,0,1,0,0
21386,1,-0.566243,-0.405622,-0.144666,0,0,1,0,-0.654236,1,0,0,1,0,0
44754,1,-0.12295,0.201038,-0.144666,0,0,1,1,-0.152221,1,0,0,1,0,0
13348,0,0.763634,-1.618942,-1.226534,1,0,1,1,-0.171658,0,1,0,0,1,0
44717,1,0.024814,-0.405622,-0.144666,0,0,0,0,-0.752252,1,0,0,1,0,0
42996,1,0.911399,-0.405622,-0.144666,0,0,1,0,-1.139498,1,0,0,1,0,0
5156,0,0.61587,-0.405622,-0.144666,0,0,1,1,-0.570507,0,0,1,1,0,0
29656,1,1.206927,1.414358,-0.144666,0,0,1,0,1.338917,0,0,1,0,0,1
3239,0,-0.566243,-0.405622,-0.252853,0,0,1,1,-0.843003,1,0,0,1,0,0


## Split into training and test sets

In [70]:
# Separate the label column from the feature columns
target_name = 'cardio'
data_target = cardio_data[target_name]           # labels: the cardio column
data = cardio_data.drop(columns=[target_name])   # features: every other column

Split 30% test, remaining 70% train

In [71]:
# train_test_split returns its outputs in the order
# (X_train, X_test, y_train, y_test); unpack them under matching names.
X_train, X_test, y_train, y_test = train_test_split(
    data, data_target, test_size=0.3, random_state=0
)

# Legacy names used by the cells below. They keep the meaning they have always
# had in this notebook, which does not match what the names suggest:
#   Xtrain -> training features      Ytrain -> TEST features
#   Xtest  -> TRAINING labels        Ytest  -> test labels
Xtrain, Ytrain, Xtest, Ytest = X_train, X_test, y_train, y_test

In [72]:
# Training features (first value returned by train_test_split).
Xtrain.head()

Unnamed: 0,gender,age,ap_hi,ap_lo,smoke,alco,active,bmi,cholesterol_n,cholesterol_an,cholesterol_wan,gluc_n,gluc_an,gluc_wan
30434,1,-0.418479,-0.405622,-0.144666,0,0,1,-0.408886,1,0,0,1,0,0
16936,0,1.354691,-0.405622,-0.144666,1,0,1,-0.198398,1,0,0,1,0,0
35045,1,-0.861771,0.807698,0.937202,0,0,1,1.25007,0,1,0,1,0,0
36807,1,0.172578,-0.405622,-0.144666,0,0,0,-0.682481,1,0,0,1,0,0
20368,0,0.024814,0.201038,-0.144666,1,1,0,1.096583,1,0,0,0,1,0


In [73]:
# NOTE: despite its name, `Xtest` holds the TRAINING LABELS -- it receives the
# third value returned by train_test_split, hence the `cardio` Series displayed.
Xtest.head()

30434    0
16936    1
35045    1
36807    0
20368    0
Name: cardio, dtype: category
Categories (2, int64): [0, 1]

In [74]:
# NOTE: despite its name, `Ytrain` holds the TEST FEATURES -- it receives the
# second value returned by train_test_split, hence the feature table displayed.
Ytrain.head()

Unnamed: 0,gender,age,ap_hi,ap_lo,smoke,alco,active,bmi,cholesterol_n,cholesterol_an,cholesterol_wan,gluc_n,gluc_an,gluc_wan
7239,1,0.024814,0.807698,0.937202,0,0,1,0.71839,1,0,0,1,0,0
30316,1,1.059163,0.807698,2.019069,0,0,0,2.363471,1,0,0,1,0,0
15137,0,-1.009535,-0.405622,-1.226534,0,0,1,-0.573797,0,0,1,0,0,1
58119,0,-2.043884,0.807698,-0.144666,0,0,0,0.205194,1,0,0,1,0,0
8889,1,0.911399,0.201038,-0.144666,0,0,1,1.36377,0,0,1,1,0,0


In [75]:
# Test labels (fourth value returned by train_test_split).
Ytest.head()

7239     1
30316    1
15137    0
58119    1
8889     1
Name: cardio, dtype: category
Categories (2, int64): [0, 1]

# SVC 

In [76]:
# In the split cell `Xtest` receives the training labels and `Ytrain` the test
# features; re-bind everything to conventional names before use.
X_train, y_train = Xtrain, Xtest
X_test, y_test = Ytrain, Ytest

# Linear kernel, C=100.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so no
# accuracy was produced for this configuration.
svc = SVC(kernel='linear', C=100, cache_size=2000)
svc.fit(X_train, y_train)

pred = svc.predict(X_test)
accuracy = accuracy_score(y_test, pred)

print('Model accuracy score with kernel=linear c=100 hyperparameters: {0:0.4f}'.
      format(accuracy))

KeyboardInterrupt: 

In [None]:
# In the split cell `Xtest` receives the training labels and `Ytrain` the test
# features; re-bind everything to conventional names before use.
X_train, y_train = Xtrain, Xtest
X_test, y_test = Ytrain, Ytest

# Linear kernel, C=0.1
svc = SVC(kernel='linear', C=0.1, cache_size=2000)
svc.fit(X_train, y_train)

pred = svc.predict(X_test)
accuracy = accuracy_score(y_test, pred)

print('Model accuracy score with kernel=linear c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy))

In [None]:
# In the split cell `Xtest` receives the training labels and `Ytrain` the test
# features; re-bind everything to conventional names before use.
X_train, y_train = Xtrain, Xtest
X_test, y_test = Ytrain, Ytest

# RBF kernel, C=100
svc = SVC(kernel='rbf', C=100, cache_size=2000)
svc.fit(X_train, y_train)

pred = svc.predict(X_test)
accuracy = accuracy_score(y_test, pred)

print('Model accuracy score with kernel=rbf c=100 hyperparameters: {0:0.4f}'.
      format(accuracy))

In [None]:
# In the split cell `Xtest` receives the training labels and `Ytrain` the test
# features; re-bind everything to conventional names before use.
X_train, y_train = Xtrain, Xtest
X_test, y_test = Ytrain, Ytest

# RBF kernel, C=0.1
svc = SVC(kernel='rbf', C=0.1, cache_size=2000)
svc.fit(X_train, y_train)

pred = svc.predict(X_test)
accuracy = accuracy_score(y_test, pred)

print('Model accuracy score with kernel=rbf c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy))

In [None]:
# In the split cell `Xtest` receives the training labels and `Ytrain` the test
# features; re-bind everything to conventional names before use.
X_train, y_train = Xtrain, Xtest
X_test, y_test = Ytrain, Ytest

# Polynomial kernel, C=100
svc = SVC(kernel='poly', C=100, cache_size=2000)
svc.fit(X_train, y_train)

pred = svc.predict(X_test)
accuracy = accuracy_score(y_test, pred)

print('Model accuracy score with kernel=polynomial c=100 hyperparameters: {0:0.4f}'.
      format(accuracy))

In [None]:
# In the split cell `Xtest` receives the training labels and `Ytrain` the test
# features; re-bind everything to conventional names before use.
X_train, y_train = Xtrain, Xtest
X_test, y_test = Ytrain, Ytest

# Polynomial kernel, C=0.1
svc = SVC(kernel='poly', C=0.1, cache_size=2000)
svc.fit(X_train, y_train)

pred = svc.predict(X_test)
accuracy = accuracy_score(y_test, pred)

print('Model accuracy score with kernel=polynomial c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy))

In [None]:
# In the split cell `Xtest` receives the training labels and `Ytrain` the test
# features; re-bind everything to conventional names before use.
X_train, y_train = Xtrain, Xtest
X_test, y_test = Ytrain, Ytest

# Sigmoid kernel, C=100
svc = SVC(kernel='sigmoid', C=100, cache_size=2000)
svc.fit(X_train, y_train)

pred = svc.predict(X_test)
accuracy = accuracy_score(y_test, pred)

print('Model accuracy score with kernel=sigmoid c=100 hyperparameters: {0:0.4f}'.
      format(accuracy))

In [None]:
# In the split cell `Xtest` receives the training labels and `Ytrain` the test
# features; re-bind everything to conventional names before use.
X_train, y_train = Xtrain, Xtest
X_test, y_test = Ytrain, Ytest

# Sigmoid kernel, C=0.1
svc = SVC(kernel='sigmoid', C=0.1, cache_size=2000)
svc.fit(X_train, y_train)

pred = svc.predict(X_test)
accuracy = accuracy_score(y_test, pred)

print('Model accuracy score with kernel=sigmoid c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy))

## Improve accuracy by hyperparameter tuning
### Optimal hyperparameters via grid search

In [None]:
# In the split cell `Xtest` receives the training labels; re-bind the training
# data to conventional names before use.
X_train, y_train = Xtrain, Xtest

# Base classifier for the search (SVC defaults: kernel='rbf', C=1.0; the default
# gamma depends on the installed scikit-learn version).
# cache_size is the kernel cache size in MB -- a fixed runtime setting, not a
# hyperparameter -- so it belongs on the estimator. Writing `cache_size=4000`
# inside a dict literal is a SyntaxError; param_grid entries must be
# 'name': [values] pairs.
svc = SVC(cache_size=4000)

# Hyperparameter grid: one sub-grid per kernel
parameters = [
    {'C': [1, 10, 50], 'kernel': ['rbf']},
    {'C': [1, 10, 50], 'kernel': ['poly'], 'degree': [2, 4, 8, 16, 32]},
]

# 5-fold cross-validated search scored on accuracy, using all available cores
grid_search = GridSearchCV(estimator=svc,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

grid_search.fit(X_train, y_train)

In [None]:
# Examine the outcome of the grid search
best_score = grid_search.best_score_       # best score achieved during the search
best_params = grid_search.best_params_     # parameter combination that achieved it
best_model = grid_search.best_estimator_   # estimator chosen by the search

print('GridSearch CV best score : {:.4f}\n\n'.format(best_score))
print('Parameters that give the best results :', '\n\n', best_params)
print('\n\nEstimator that was chosen by the search :', '\n\n', best_model)

In [None]:
# Score the tuned model on the held-out test set.
# In the split cell `Ytrain` receives the test features and `Ytest` the test
# labels; re-bind them to conventional names before use.
X_test, y_test = Ytrain, Ytest

test_score = grid_search.score(X_test, y_test)
print('GridSearch CV score on test set: {0:0.4f}'.format(test_score))