In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
import os
import pandas_profiling as pp
sb.set() # set the default Seaborn style for graphics
sb.set_style("darkgrid")
sb.set_context("poster", font_scale = .5, rc={"grid.linewidth": 0.6})

# For Support Vector Machine
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler

# Data Preparation

Features:

- Age | Objective Feature | age | int (days) 
- Height | Objective Feature | height | int (cm) | 
- Weight | Objective Feature | weight | float (kg) | 
- Gender | Objective Feature | gender | categorical code | 
- Systolic blood pressure | Examination Feature | ap_hi | int | 
- Diastolic blood pressure | Examination Feature | ap_lo | int | 
- Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal | 
- Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal | 
- Smoking | Subjective Feature | smoke | binary | 
- Alcohol intake | Subjective Feature | alco | binary | 
- Physical activity | Subjective Feature | active | binary | 
- Presence or absence of cardiovascular disease | Target Variable | cardio | binary | 

In [2]:
cardio = pd.read_csv('cardio_train.csv', sep=';')
cardio.head(10)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,8,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,9,22113,1,157,93.0,130,80,3,1,0,0,1,0
7,12,22584,2,178,95.0,130,90,3,3,0,0,1,1
8,13,17668,1,158,71.0,110,70,1,1,0,0,1,0
9,14,19834,1,164,68.0,110,60,1,1,0,0,0,0


In [3]:
print("Data type : ", type(cardio))
print("Data dims : ", cardio.shape)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (70000, 13)


In [4]:
cardio_data = cardio.copy()

In [5]:
cardio_data = cardio_data.drop_duplicates()
# Drop id column as it doesnt help 
cardio_data = cardio_data.drop(columns='id')

# Convert age into years
cardio_data['age'] = cardio_data['age']//365

# Combine age & weight into BMI
cardio_data['bmi'] = cardio_data["weight"] / (cardio_data["height"]/100)**2
cardio_data = cardio_data.drop(columns=['weight', 'height'])

In [6]:
# Convert categorical variables into “category” data type
cat_attribs = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

# Create a list that contains numerical attributes
num_attribs = ['age', 'bmi', 'ap_hi', 'ap_lo']

# Covert the data type of categorical attributes into "category"
for cat_attrib in cat_attribs:                                # iterate over each categorical attribute
  cardio_data[cat_attrib] = cardio_data[cat_attrib].astype('category')  # convert data type

cardio_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,70000.0,52.840671,6.766774,29.0,48.0,53.0,58.0,64.0
ap_hi,70000.0,128.817286,154.011419,-150.0,120.0,120.0,140.0,16020.0
ap_lo,70000.0,96.630414,188.47253,-70.0,80.0,80.0,90.0,11000.0
bmi,70000.0,27.556513,6.091511,3.471784,23.875115,26.374068,30.222222,298.666667


In [7]:
#filter out bad bp data
filter_max = ((cardio_data["ap_hi"]>220) | (cardio_data["ap_lo"]>120))
filter_min = ((cardio_data["ap_hi"]<90) | (cardio_data["ap_lo"]<60))
cardio_data = cardio_data[~filter_max]
cardio_data = cardio_data[~filter_min]
len(cardio_data)

  cardio_data = cardio_data[~filter_min]


68452

In [8]:
# Extract feature bmi from the dataset
bmi = cardio_data['bmi']

# Drop outliers
cardio_data = cardio_data.drop(cardio_data.loc[(bmi < 10) | (bmi > 80)].index)

In [9]:
# Create dummy variables for non-binary categorical data
dummies = pd.get_dummies(cardio_data[['cholesterol','gluc']])
dummies.head(10)

Unnamed: 0,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3
0,1,0,0,1,0,0
1,0,0,1,1,0,0
2,0,0,1,1,0,0
3,1,0,0,1,0,0
4,1,0,0,1,0,0
5,0,1,0,0,1,0
6,0,0,1,1,0,0
7,0,0,1,0,0,1
8,1,0,0,1,0,0
9,1,0,0,1,0,0


In [10]:
cardio_data = pd.concat([cardio_data,dummies],axis=1)
cardio_data.drop(['cholesterol','gluc'],axis=1,inplace=True)
cardio_data.rename(columns = {'cholesterol_1':'cholesterol_n','cholesterol_2':'cholesterol_an','cholesterol_3':'cholesterol_wan'}, inplace = True)
cardio_data.rename(columns = {'gluc_1':'gluc_n','gluc_2':'gluc_an','gluc_3':'gluc_wan'}, inplace = True)


In [11]:
cardio_data.sample(10)

Unnamed: 0,age,gender,ap_hi,ap_lo,smoke,alco,active,cardio,bmi,cholesterol_n,cholesterol_an,cholesterol_wan,gluc_n,gluc_an,gluc_wan
8428,56,2,100,80,1,0,1,0,27.776706,1,0,0,0,0,1
19219,51,1,120,80,0,0,1,0,23.507805,1,0,0,1,0,0
65789,51,2,120,80,1,0,1,0,28.731747,1,0,0,1,0,0
23254,56,2,120,60,0,0,1,1,24.489796,1,0,0,1,0,0
19828,63,2,130,80,0,0,0,0,38.390606,1,0,0,1,0,0
29032,45,2,120,80,1,0,1,0,23.450918,1,0,0,1,0,0
37761,57,1,120,70,0,0,1,1,38.21499,0,0,1,0,0,1
39508,50,1,120,80,0,0,1,1,22.491349,1,0,0,1,0,0
23212,39,1,100,70,0,0,1,1,22.213678,1,0,0,1,0,0
55031,60,2,130,80,0,0,1,0,21.306818,0,1,0,1,0,0


# Conduct feature scaling

In [20]:
to_be_scaled_feat = ['age', 'ap_hi', 'ap_lo','bmi']
scaler=StandardScaler()
scaler.fit(cardio_data[to_be_scaled_feat])
cardio_data[to_be_scaled_feat] = scaler.transform(cardio_data[to_be_scaled_feat])

In [21]:
cardio_data.sample(10)

Unnamed: 0,age,gender,ap_hi,ap_lo,smoke,alco,active,cardio,bmi,cholesterol_n,cholesterol_an,cholesterol_wan,gluc_n,gluc_an,gluc_wan
47132,1.059163,2,2.021018,0.937202,1,0,1,1,-0.593452,0,0,1,1,0,0
57887,1.206927,1,1.414358,2.019069,0,0,1,1,-0.534714,1,0,0,1,0,0
25182,-0.270714,1,-0.405622,-0.144666,0,0,1,0,0.079239,1,0,0,1,0,0
13754,0.024814,1,-0.405622,-0.144666,0,0,1,0,0.562097,1,0,0,1,0,0
2805,0.911399,1,3.234338,2.019069,0,0,1,1,0.571741,1,0,0,1,0,0
41306,0.61587,2,1.414358,0.937202,0,0,1,1,-0.313687,1,0,0,1,0,0
56207,0.61587,1,-0.405622,-0.144666,0,0,0,0,-0.838879,1,0,0,1,0,0
10245,-1.305063,1,-0.405622,0.937202,0,0,1,0,0.492747,0,1,0,0,1,0
34915,1.502455,1,-0.405622,-0.144666,0,0,0,0,0.503025,1,0,0,1,0,0
22327,-1.009535,1,-0.405622,-0.144666,0,0,1,1,1.355122,1,0,0,1,0,0


## Split train_set test_set

In [12]:
target_name = 'cardio'
data_target = cardio_data[target_name] #Extract cardio column as target variable
data = cardio_data.drop([target_name], axis=1) #Drop cardio column from rest of data

Split 30% test, remaining 70% train

In [31]:
Xtrain, Ytrain, Xtest, Ytest = train_test_split(data, data_target, test_size=0.3, random_state=0)

In [32]:
Xtrain.head()

Unnamed: 0,age,gender,ap_hi,ap_lo,smoke,alco,active,bmi,cholesterol_n,cholesterol_an,cholesterol_wan,gluc_n,gluc_an,gluc_wan
30434,50,1,120,80,0,0,1,25.315454,1,0,0,1,0,0
16936,62,2,120,80,1,0,1,26.42357,1,0,0,1,0,0
35045,47,1,140,90,0,0,1,34.049031,0,1,0,1,0,0
36807,54,1,120,80,0,0,0,23.875115,1,0,0,1,0,0
20368,53,2,130,80,1,1,0,33.240997,1,0,0,0,1,0


In [34]:
Xtest.head()

30434    0
16936    1
35045    1
36807    0
20368    0
Name: cardio, dtype: category
Categories (2, int64): [0, 1]

In [33]:
Ytrain.head()

Unnamed: 0,age,gender,ap_hi,ap_lo,smoke,alco,active,bmi,cholesterol_n,cholesterol_an,cholesterol_wan,gluc_n,gluc_an,gluc_wan
7239,53,1,140,90,0,0,1,31.25,1,0,0,1,0,0
30316,60,1,140,100,0,0,0,39.91053,1,0,0,1,0,0
15137,46,2,120,70,0,0,1,24.447279,0,0,1,0,0,1
58119,39,2,140,80,0,0,0,28.548281,1,0,0,1,0,0
8889,59,1,130,80,0,0,1,34.647603,0,0,1,1,0,0


In [35]:
Ytest.head()

7239     1
30316    1
15137    0
58119    1
8889     1
Name: cardio, dtype: category
Categories (2, int64): [0, 1]

# SVC 

svc = SVC(kernel='linear',C=100, cache_size=2000)
svc.fit(Xtrain, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=linear c=100 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

svc = SVC(kernel='linear', C=0.1)
svc.fit(Xtrain, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=linear c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

svc = SVC(kernel='rbf',C=100)
svc.fit(Xtrain, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=rbf c=100 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

svc = SVC(kernel='rbf',C=0.1)
svc.fit(Xtrain, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=rbf c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

svc = SVC(kernel='poly',C=100)
svc.fit(Xtrain, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=polynomial c=100 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

svc = SVC(kernel='poly',C=0.1)
svc.fit(Xtrain, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=polynomial c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

svc = SVC(kernel='sigmoid',C=100)
svc.fit(Xtrain, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=sigmoid c=100 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

svc = SVC(kernel='sigmoid',C=0.1)
svc.fit(Xtrain, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=sigmoid c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))