In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
import os
import pandas_profiling as pp
sb.set() # set the default Seaborn style for graphics
sb.set_style("darkgrid")
sb.set_context("poster", font_scale = .5, rc={"grid.linewidth": 0.6})

# For Support Vector Machine
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler

# Data Preparation

Features:

- Age | Objective Feature | age | int (days) 
- Height | Objective Feature | height | int (cm) | 
- Weight | Objective Feature | weight | float (kg) | 
- Gender | Objective Feature | gender | categorical code | 
- Systolic blood pressure | Examination Feature | ap_hi | int | 
- Diastolic blood pressure | Examination Feature | ap_lo | int | 
- Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal | 
- Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal | 
- Smoking | Subjective Feature | smoke | binary | 
- Alcohol intake | Subjective Feature | alco | binary | 
- Physical activity | Subjective Feature | active | binary | 
- Presence or absence of cardiovascular disease | Target Variable | cardio | binary | 

In [2]:
cardio = pd.read_csv('cardio_train.csv', sep=';')
cardio.head(10)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,8,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,9,22113,1,157,93.0,130,80,3,1,0,0,1,0
7,12,22584,2,178,95.0,130,90,3,3,0,0,1,1
8,13,17668,1,158,71.0,110,70,1,1,0,0,1,0
9,14,19834,1,164,68.0,110,60,1,1,0,0,0,0


In [3]:
print("Data type : ", type(cardio))
print("Data dims : ", cardio.shape)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (70000, 13)


In [4]:
cardio_data = cardio.copy()

In [5]:
cardio_data = cardio_data.drop_duplicates()
# Drop id column as it doesnt help 
cardio_data = cardio_data.drop(columns='id')

# Convert age into years
cardio_data['age'] = cardio_data['age']//365

# Combine age & weight into BMI
cardio_data['bmi'] = cardio_data["weight"] / (cardio_data["height"]/100)**2
cardio_data = cardio_data.drop(columns=['weight', 'height'])

In [6]:
# Convert categorical variables into “category” data type
cat_attribs = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

# Create a list that contains numerical attributes
num_attribs = ['age', 'bmi', 'ap_hi', 'ap_lo']

# Covert the data type of categorical attributes into "category"
for cat_attrib in cat_attribs:                                # iterate over each categorical attribute
  cardio_data[cat_attrib] = cardio_data[cat_attrib].astype('category')  # convert data type

cardio_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,70000.0,52.840671,6.766774,29.0,48.0,53.0,58.0,64.0
ap_hi,70000.0,128.817286,154.011419,-150.0,120.0,120.0,140.0,16020.0
ap_lo,70000.0,96.630414,188.47253,-70.0,80.0,80.0,90.0,11000.0
bmi,70000.0,27.556513,6.091511,3.471784,23.875115,26.374068,30.222222,298.666667


In [7]:
#Convert gender to binary From Female:1 Male:2 -> Female:0 Male:1
dummygen = pd.get_dummies(cardio_data["gender"])
#Take first column (Female:0) 
dummygen = dummygen.drop([2], axis=1)
cardio_data = pd.concat((dummygen, cardio_data), axis=1)
cardio_data = cardio_data.drop(["gender"], axis=1)
cardio_data = cardio_data.rename(columns={1:"gender"})
cardio_data.sample(5)

Unnamed: 0,gender,age,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
63623,0,41,120,70,1,1,0,0,1,0,22.857143
47969,1,61,120,80,2,1,0,0,0,1,30.836531
53674,1,61,170,100,3,2,0,0,0,1,30.427198
42737,1,64,140,80,1,1,0,0,1,1,31.297374
41975,1,48,110,70,1,1,0,0,1,0,24.447279


In [8]:
#filter out bad bp data
filter_max = ((cardio_data["ap_hi"]>220) | (cardio_data["ap_lo"]>120))
filter_min = ((cardio_data["ap_hi"]<90) | (cardio_data["ap_lo"]<60))
cardio_data = cardio_data[~filter_max]
cardio_data = cardio_data[~filter_min]
len(cardio_data)

  cardio_data = cardio_data[~filter_min]


68452

In [9]:
# Extract feature bmi from the dataset
bmi = cardio_data['bmi']

# Drop outliers
cardio_data = cardio_data.drop(cardio_data.loc[(bmi < 10) | (bmi > 80)].index)

In [10]:
# Create dummy variables for non-binary categorical data
dummies = pd.get_dummies(cardio_data[['cholesterol','gluc']])
dummies.head(10)

Unnamed: 0,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3
0,1,0,0,1,0,0
1,0,0,1,1,0,0
2,0,0,1,1,0,0
3,1,0,0,1,0,0
4,1,0,0,1,0,0
5,0,1,0,0,1,0
6,0,0,1,1,0,0
7,0,0,1,0,0,1
8,1,0,0,1,0,0
9,1,0,0,1,0,0


In [11]:
cardio_data = pd.concat([cardio_data,dummies],axis=1)
cardio_data.drop(['cholesterol','gluc'],axis=1,inplace=True)
cardio_data.rename(columns = {'cholesterol_1':'cholesterol_n','cholesterol_2':'cholesterol_an','cholesterol_3':'cholesterol_wan'}, inplace = True)
cardio_data.rename(columns = {'gluc_1':'gluc_n','gluc_2':'gluc_an','gluc_3':'gluc_wan'}, inplace = True)


In [12]:
cardio_data.sample(10)

Unnamed: 0,gender,age,ap_hi,ap_lo,smoke,alco,active,cardio,bmi,cholesterol_n,cholesterol_an,cholesterol_wan,gluc_n,gluc_an,gluc_wan
21674,1,63,140,90,0,0,1,1,27.767098,1,0,0,1,0,0
6740,0,58,118,85,0,1,1,1,29.025311,0,0,1,0,1,0
15702,1,59,140,90,0,0,1,1,24.141519,0,0,1,0,0,1
32538,1,43,140,100,0,0,1,1,31.955923,0,1,0,1,0,0
45889,1,50,120,80,0,0,1,1,27.1809,1,0,0,1,0,0
20747,1,53,120,80,0,0,1,0,24.6755,0,1,0,1,0,0
30901,0,60,120,80,0,0,1,0,37.653587,1,0,0,1,0,0
1358,0,58,120,80,0,0,0,0,25.605536,1,0,0,1,0,0
50854,0,58,170,80,0,0,1,1,40.272498,0,1,0,1,0,0
30111,1,48,120,80,0,0,1,0,24.977043,1,0,0,1,0,0


In [13]:
cardio_data.shape

(68416, 15)

# Conduct feature scaling

In [14]:
to_be_scaled_feat = ['age', 'ap_hi', 'ap_lo','bmi']
scaler=StandardScaler()
scaler.fit(cardio_data[to_be_scaled_feat])
cardio_data[to_be_scaled_feat] = scaler.transform(cardio_data[to_be_scaled_feat])

In [15]:
cardio_data.sample(10)

Unnamed: 0,gender,age,ap_hi,ap_lo,smoke,alco,active,cardio,bmi,cholesterol_n,cholesterol_an,cholesterol_wan,gluc_n,gluc_an,gluc_wan
52078,1,-0.861771,2.021018,2.019069,0,0,1,1,0.273191,1,0,0,1,0,0
21403,1,1.354691,0.201038,-0.144666,0,0,1,1,0.503025,0,0,1,0,0,1
1654,1,0.763634,-0.405622,0.179894,0,0,0,1,1.905587,1,0,0,1,0,0
10728,1,0.911399,0.201038,-0.144666,0,0,1,1,0.562097,0,0,1,0,0,1
18106,1,0.763634,0.807698,-0.144666,0,0,1,0,-0.348916,1,0,0,0,0,1
32074,1,0.61587,-1.012282,-1.226534,0,0,1,0,-0.543007,0,1,0,0,1,0
25430,1,-1.89612,-0.405622,-1.226534,0,0,1,0,0.14443,1,0,0,1,0,0
47065,1,-1.009535,-0.405622,-0.144666,0,0,1,1,-1.285444,1,0,0,1,0,0
68588,1,0.320342,-1.012282,-2.308401,0,0,1,0,0.050592,1,0,0,1,0,0
52494,1,0.763634,-0.405622,-1.226534,0,0,1,0,-0.775702,0,0,1,0,0,1


## Split train_set test_set

In [16]:
target_name = 'cardio'
data_target = cardio_data[target_name] #Extract cardio column as target variable
data = cardio_data.drop([target_name], axis=1) #Drop cardio column from rest of data

Split 30% test, remaining 70% train

In [17]:
Xtrain, Ytrain, Xtest, Ytest = train_test_split(data, data_target, test_size=0.3, random_state=0)

Extract 30% from training set as validation set

In [18]:
Ztrain, Zval, Ztest, Zvaltest = train_test_split(Xtrain, Xtest, test_size=0.3, random_state=0)

In [19]:
print(Ztrain.shape)
print(Zval.shape)

(33523, 14)
(14368, 14)


# SVC 

In [20]:
svc = SVC(kernel='linear',C=100, cache_size=1999)
svc.fit(Xtrain, Xtest)

pred = svc.predict(Ytrain)

print('Model accuracy score with kernel=linear c=100 hyperparameters: {0:0.4f}'.
      format(accuracy_score(Ytest, pred)))

Model accuracy score with kernel=linear c=100 hyperparameters: 0.7289


In [22]:
svc = SVC(kernel='linear', C=0.1, cache_size=1999)
svc.fit(Xtrain, Xtest)

pred = svc.predict(Ytrain)

print('Model accuracy score with kernel=linear c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy_score(Ytest, pred)))

Model accuracy score with kernel=linear c=0.1 hyperparameters: 0.7290


In [23]:
svc = SVC(kernel='rbf',C=100, cache_size=1999)
svc.fit(Xtrain, Xtest)

pred = svc.predict(Ytrain)

print('Model accuracy score with kernel=rbf c=100 hyperparameters: {0:0.4f}'.
      format(accuracy_score(Ytest, pred)))

Model accuracy score with kernel=rbf c=100 hyperparameters: 0.7301


In [24]:
svc = SVC(kernel='rbf',C=0.1, cache_size=1999)
svc.fit(Xtrain, Xtest)

pred = svc.predict(Ytrain)

print('Model accuracy score with kernel=rbf c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy_score(Ytest, pred)))

Model accuracy score with kernel=rbf c=0.1 hyperparameters: 0.7313


In [25]:
svc = SVC(kernel='poly',C=100, cache_size=1999)
svc.fit(Xtrain, Xtest)

pred = svc.predict(Ytrain)

print('Model accuracy score with kernel=polynomial c=100 hyperparameters: {0:0.4f}'.
      format(accuracy_score(Ytest, pred)))

Model accuracy score with kernel=polynomial c=100 hyperparameters: 0.7316


In [26]:
svc = SVC(kernel='poly',C=0.1, cache_size=1999)
svc.fit(Xtrain, Xtest)

pred = svc.predict(Ytrain)

print('Model accuracy score with kernel=polynomial c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy_score(Ytest, pred)))

Model accuracy score with kernel=polynomial c=0.1 hyperparameters: 0.7320


In [27]:
svc = SVC(kernel='sigmoid',C=100, cache_size=1999)
svc.fit(Xtrain, Xtest)

pred = svc.predict(Ytrain)

print('Model accuracy score with kernel=sigmoid c=100 hyperparameters: {0:0.4f}'.
      format(accuracy_score(Ytest, pred)))

Model accuracy score with kernel=sigmoid c=100 hyperparameters: 0.6330


In [28]:
svc = SVC(kernel='sigmoid',C=0.1, cache_size=1999)
svc.fit(Xtrain, Xtest)

pred = svc.predict(Ytrain)

print('Model accuracy score with kernel=sigmoid c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy_score(Ytest, pred)))

Model accuracy score with kernel=sigmoid c=0.1 hyperparameters: 0.6353


## Improve accuracy by hyperparameter tuning
### Optimal Hyperparameter via gridsearch

Determined that rbf and polynomial kernel, and lower C values works best.
Use gridsearch to finetune hyperparameter on validation set to find optimal.

In [21]:
# instantiate classifier with default hyperparameters with kernel=rbf, C=1.0 and gamma=auto
svc = SVC()
# declare parameters for hyperparameter tuning
parameters = [ {'C':[0.1, 0.5, 1, 10, 50], 'kernel':['rbf'], 'cache_size':[1000]},
               {'C':[0.1, 0.5, 1, 10, 50], 'kernel':['poly'], 'degree': [2,4,8,16], 'cache_size':[1000]} 
             ]

grid_search = GridSearchCV(estimator = svc,  
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 4,
                           verbose=3,
                           n_jobs=-3)

grid_search.fit(Zval, Zvaltest)

Fitting 4 folds for each of 25 candidates, totalling 100 fits


GridSearchCV(cv=4, estimator=SVC(), n_jobs=-3,
             param_grid=[{'C': [0.1, 0.5, 1, 10, 50], 'cache_size': [1000],
                          'kernel': ['rbf']},
                         {'C': [0.1, 0.5, 1, 10, 50], 'cache_size': [1000],
                          'degree': [2, 4, 8, 16], 'kernel': ['poly']}],
             scoring='accuracy', verbose=3)

In [22]:
# examine the best model


# best score achieved during the GridSearchCV
print('GridSearch CV best score : {:.4f}\n\n'.format(grid_search.best_score_))


# print parameters that give the best results
print('Parameters that give the best results :','\n\n', (grid_search.best_params_))


# print estimator that was chosen by the GridSearch
print('\n\nEstimator that was chosen by the search :','\n\n', (grid_search.best_estimator_))

GridSearch CV best score : 0.7340


Parameters that give the best results : 

 {'C': 1, 'cache_size': 1000, 'kernel': 'rbf'}


Estimator that was chosen by the search : 

 SVC(C=1, cache_size=1000)


In [23]:
# calculate GridSearch CV score on test set

print('GridSearch CV score on test set: {0:0.4f}'.format(grid_search.score(Ytrain, Ytest)))

GridSearch CV score on test set: 0.7316
