In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
import os
import pandas_profiling as pp
sb.set() # set the default Seaborn style for graphics
sb.set_style("darkgrid")
sb.set_context("poster", font_scale = .5, rc={"grid.linewidth": 0.6})

# For Support Vector Machine
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC, LinearSVC

# Data Preparation

In [2]:
cardio = pd.read_csv('cardio_train.csv', sep=';')
#cardio.head(10)

In [3]:
print("Data type : ", type(cardio))
print("Data dims : ", cardio.shape)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (70000, 13)


In [4]:
cardio_data = cardio.copy()

In [5]:
# Drop id column as it doesnt help 
cardio_data = cardio_data.drop(columns='id')
cardio_data = cardio_data.drop_duplicates(inplace=True)

# Convert age into years
cardio_data['age'] = cardio_data['age']//365.25

# Combine age & weight into BMI
cardio_data['bmi'] = cardio_data["weight"] / (cardio_data["height"]/100)**2
cardio_data = cardio_data.drop(columns=['weight', 'height'])

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Convert categorical variables into “category” data type
cat_attribs = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

# Create a list that contains numerical attributes
num_attribs = ['age', 'bmi', 'ap_hi', 'ap_lo']

# Covert the data type of categorical attributes into "category"
for cat_attrib in cat_attribs:                                # iterate over each categorical attribute
  cardio_data[cat_attrib] = cardio_data[cat_attrib].astype('category')  # convert data type

cardio_data.describe().transpose()

In [None]:
#filter out bad bp data
filter_max = ((cardio_data["ap_hi"]>220) | (cardio_data["ap_lo"]>120))
filter_min = ((cardio_data["ap_hi"]<90) | (cardio_data["ap_lo"]<60))
cardio_data = cardio_data[~filter_max]
cardio_data = cardio_data[~filter_min]
len(cardio_data)

In [8]:
# Extract feature bmi from the dataset
bmi = cardio_data['bmi']

# Drop outliers
cardio_data = cardio_data.drop(cardio_data.loc[(bmi < 10) | (bmi > 80)].index)

In [9]:
cardio_data.sample(10)

Unnamed: 0,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
45561,53.0,1,120,80,1,1,0,0,1,1,29.242109
6099,52.0,2,120,80,3,3,0,0,1,0,24.221453
64514,63.0,1,170,100,3,1,0,0,1,1,30.844416
53855,41.0,1,110,70,1,1,0,0,1,1,22.481329
28628,50.0,2,120,80,1,1,0,0,0,0,27.143037
16320,56.0,1,120,80,3,1,0,0,1,1,30.863036
48445,55.0,2,130,80,1,3,0,0,1,0,28.731921
31592,52.0,1,140,90,3,3,0,0,1,1,38.103948
6442,59.0,1,140,90,3,1,0,0,1,1,24.835763
59774,55.0,1,120,80,1,1,0,0,1,0,27.639801


## Split train_set test_set

In [10]:
target_name = 'cardio'
data_target = cardio_data[target_name] #Extract cardio column as target variable
data = cardio_data.drop([target_name], axis=1) #Drop cardio column from rest of data

Split 20% test, remaining 80% train

In [11]:
train, test, target, target_test = train_test_split(data, data_target, test_size=0.3, random_state=0)

# print(train.head(3))
print(test.head(3))

From the training set, split 20% of it to serve as validation set for hyperparameter tuning and estimate model skill.

In [12]:
#%% split training set to validation set
#Xtrain, Xval, Ztrain, Zval = train_test_split(train, target, test_size=0.2, random_state=0)

In [13]:
data_target.value_counts()
#No class imbalance issue

0    34555
1    33861
Name: cardio, dtype: int64

# Conduct Feature Scaling

In [14]:
cols = train.columns

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(train)

X_test = scaler.transform(test)

In [16]:
X_train = pd.DataFrame(X_train, columns=[cols])

In [17]:
X_test = pd.DataFrame(X_test, columns=[cols])

In [18]:
X_train.describe()

Unnamed: 0,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,bmi
count,54732.0,54732.0,54732.0,54732.0,54732.0,54732.0,54732.0,54732.0,54732.0,54732.0
mean,5.240739e-16,9.41586e-16,2.028472e-16,-6.521434e-16,1.725223e-15,2.003136e-16,1.702143e-15,-4.693995e-16,-3.977488e-16,6.90511e-16
std,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009
min,-3.521114,-0.7317909,-2.228766,-2.304415,-0.5375761,-0.3940818,-0.3084894,-0.2365375,-2.022357,-3.176289
25%,-0.7109743,-0.7317909,-0.4058914,-0.1450579,-0.5375761,-0.3940818,-0.3084894,-0.2365375,0.4944725,-0.6812843
50%,0.02853625,-0.7317909,-0.4058914,-0.1450579,-0.5375761,-0.3940818,-0.3084894,-0.2365375,0.4944725,-0.2123247
75%,0.7680468,1.366511,0.8093581,0.9346207,0.9363551,-0.3940818,-0.3084894,-0.2365375,0.4944725,0.5036031
max,1.655459,1.366511,5.670356,4.173656,2.410286,3.104237,3.241603,4.227659,0.4944725,8.902364


# SVC 

In [None]:
svc = SVC(kernel='linear',C=100)
svc.fit(train, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=linear c=100 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

In [None]:
svc = SVC(kernel='linear', C=0.1)
svc.fit(train, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=linear c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

In [None]:
svc = SVC(kernel='rbf',C=100)
svc.fit(train, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=rbf c=100 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

In [None]:
svc = SVC(kernel='rbf',C=0.1)
svc.fit(train, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=rbf c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

In [None]:
svc = SVC(kernel='poly',C=100)
svc.fit(train, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=polynomial c=100 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

In [None]:
svc = SVC(kernel='poly',C=0.1)
svc.fit(train, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=polynomial c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

In [None]:
svc = SVC(kernel='sigmoid',C=100)
svc.fit(train, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=sigmoid c=100 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))

In [None]:
svc = SVC(kernel='sigmoid',C=0.1)
svc.fit(train, target)

pred = svc.predict(test)

print('Model accuracy score with kernel=sigmoid c=0.1 hyperparameters: {0:0.4f}'.
      format(accuracy_score(target_test, pred)))