In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
import os
import pandas_profiling as pp
# Apply the default Seaborn theme as the baseline for all graphics
sb.set()
# Select the "darkgrid" style
sb.set_style("darkgrid")
# Use the "poster" context with fonts scaled by 0.5 and the grid line width overridden to 0.6
sb.set_context("poster", font_scale = .5, rc={"grid.linewidth": 0.6})

# For Support Vector Machine
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC, LinearSVC

# Data Preparation

In [2]:
# Load the raw dataset; the file uses ';' as its field separator
cardio = pd.read_csv(
    'cardio_train.csv',
    sep=';',
)

In [3]:
# Report the container type and the (rows, columns) shape of the loaded data
for label, value in (("Data type : ", type(cardio)), ("Data dims : ", cardio.shape)):
    print(label, value)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (70000, 13)


In [4]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched
cardio_data = cardio.copy(deep=True)

In [5]:
# Feature engineering in one chain:
#   * drop the id column, which does not help
#   * convert age into whole years by floor-dividing by 365.25
#   * combine height & weight into BMI = weight / (height / 100) ** 2
#   * drop weight and height once BMI has been derived from them
cardio_data = (
    cardio_data
    .drop(columns='id')
    .assign(
        age=lambda frame: frame['age'] // 365.25,
        bmi=lambda frame: frame["weight"] / (frame["height"] / 100) ** 2,
    )
    .drop(columns=['weight', 'height'])
)

In [6]:
# Names of the categorical attributes
cat_attribs = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

# Names of the numerical attributes
num_attribs = ['age', 'bmi', 'ap_hi', 'ap_lo']

# Convert every categorical attribute to the "category" data type in a single call
cardio_data = cardio_data.astype(dict.fromkeys(cat_attribs, 'category'))

# Summary statistics, shown with one row per described column
cardio_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,70000.0,52.803257,6.762462,29.0,48.0,53.0,58.0,64.0
ap_hi,70000.0,128.817286,154.011419,-150.0,120.0,120.0,140.0,16020.0
ap_lo,70000.0,96.630414,188.47253,-70.0,80.0,80.0,90.0,11000.0
bmi,70000.0,27.556513,6.091511,3.471784,23.875115,26.374068,30.222222,298.666667


In [7]:
# Filter out bad blood-pressure data: rows whose ap_hi is above 220 or below 90,
# or whose ap_lo is above 120 or below 60.
filter_max = (cardio_data["ap_hi"] > 220) | (cardio_data["ap_lo"] > 120)
filter_min = (cardio_data["ap_hi"] < 90) | (cardio_data["ap_lo"] < 60)

# Apply both masks in ONE indexing step. Both masks are indexed like the unfiltered
# frame; filtering twice in a row would hand the second mask to an already-shortened
# frame and make pandas emit "Boolean Series key will be reindexed to match
# DataFrame index". The selected rows are the same either way.
cardio_data = cardio_data[~(filter_max | filter_min)]
len(cardio_data)

  cardio_data = cardio_data[~filter_min]


68452

In [8]:
# Pull out the BMI column and flag values outside the accepted 10-80 range
bmi = cardio_data['bmi']
bmi_out_of_range = (bmi < 10) | (bmi > 80)

# Remove the flagged rows by their index labels
cardio_data = cardio_data.drop(index=bmi[bmi_out_of_range].index)

In [9]:
# Preview ten random rows; the fixed seed makes the preview identical on every
# Restart & Run All (same seed value as the splits in this notebook)
cardio_data.sample(10, random_state=0)

Unnamed: 0,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
26720,61.0,2,110,70,1,1,0,0,0,0,24.801587
37893,59.0,1,120,80,1,1,0,0,1,0,32.882414
30530,61.0,1,120,79,1,1,0,0,1,0,33.694938
62753,58.0,2,120,80,1,1,0,0,0,0,29.639889
63335,50.0,2,120,80,1,1,0,0,1,1,25.951557
29728,47.0,2,120,80,1,1,0,0,1,0,23.836735
40040,55.0,1,120,80,3,3,0,0,1,1,31.0056
9797,45.0,2,130,90,1,1,0,0,1,1,29.377583
65426,55.0,1,130,80,3,1,0,0,1,1,28.25699
4618,43.0,1,110,70,2,1,0,0,1,0,23.422091


## Split into train and test sets

In [10]:
# Separate the label column from the feature columns
target_name = 'cardio'
data = cardio_data.drop(columns=[target_name])
data_target = cardio_data[target_name]

Split 20% test, remaining 80% train

In [11]:
# Hold out a test partition; the fixed seed keeps the split reproducible
train, test, target, target_test = train_test_split(
    data,
    data_target,
    test_size=0.2,
    random_state=0,
)

In [12]:
# Preview the first rows of both partitions. A notebook cell only renders its LAST
# bare expression, so a plain `train.head(3)` followed by `test.head(3)` silently
# discards the training preview; display() renders both.
display(train.head(3), test.head(3))

Unnamed: 0,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,bmi
7239,53.0,1,140,90,1,1,0,0,1,31.25
30316,60.0,1,140,100,1,1,0,0,0,39.91053
15137,46.0,2,120,70,3,3,0,0,1,24.447279


In [13]:
# Carve a validation partition out of the training partition (same ratio and seed
# as the train/test split)
Xtrain, Xval, Ztrain, Zval = train_test_split(
    train,
    target,
    random_state=0,
    test_size=0.2,
)

# SVC 

In [14]:
# Fit a support-vector classifier with default settings on the training partition
svc = SVC().fit(train, target)

# Training accuracy expressed as a percentage, rounded to two decimals
train_accuracy = svc.score(train, target)
acc_svc = round(100 * train_accuracy, 2)
acc_svc

72.29

In [16]:
# Accuracy on the held-out test partition as a percentage, rounded to two decimals
test_accuracy = svc.score(test, target_test)
acc_test_svc = round(100 * test_accuracy, 2)
acc_test_svc

72.5