In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('datasetforproject.csv')
dataset.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
# Split the data into features (X) and target variable (y)
X = dataset.drop('cardio', axis=1)
print("X Dataset:")
X.head()

X Dataset:


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,0,18393,2,168,62.0,110,80,1,1,0,0,1
1,1,20228,1,156,85.0,140,90,3,1,0,0,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0


The given dataset has 12 features meaning and types can be listed as follows :


| Column | Description |
| ------ | ----------- |
| age | Age (int)|
| height | Height (int) |
| weight | Weight (float) |
| gender | Gender (categorical code )|
| ap_hi | Systolic blood pressure (int) |
| ap_lo | Diastolic blood pressure (int) |
| cholesterol | Cholesterol (1:normal, 2: above normal, 3: well above normal) |
| gluc | Glucose (1:normal, 2: above normal, 3: well above normal) |
| smoke | Smoking (binary) |
| alco | Alcohol intake (binary) |
| active | Physical activity (binary) |
| cardio | Presence or absence of cardiovascular disease (binary) |


---


Our data has three types of data: **Quantitative data**, **Categorical data** and **Binary data**

In [5]:
y = dataset['cardio']
print("Y Dataset:")
y.head()

Y Dataset:


0    0
1    1
2    1
3    1
4    0
Name: cardio, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression

In [7]:
# Model training and accuracy using Logistic Regression
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

logreg_train_acc = accuracy_score(y_train, y_pred_train)
logreg_test_acc = accuracy_score(y_test, y_pred_test)

print('Train Accuracy:', logreg_train_acc)
print('Test Accuracy:', logreg_test_acc)

Train Accuracy: 0.69775
Test Accuracy: 0.6997857142857142


# RandomForestClassifier

In [8]:
# RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

rd = RandomForestClassifier()
rd.fit(X_train, y_train)

y_pred_train = rd.predict(X_train)
y_pred_test = rd.predict(X_test)

randforest_train_acc = accuracy_score(y_train, y_pred_train)
randforest_test_acc = accuracy_score(y_test, y_pred_test)

print('Train Accuracy:', randforest_train_acc)
print('Test Accuracy:', randforest_test_acc)


Train Accuracy: 0.9999821428571428
Test Accuracy: 0.7253571428571428


# Decision Tree Classifier

In [9]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred_train = dt.predict(X_train)
y_pred_test = dt.predict(X_test)

dt_train_acc = accuracy_score(y_train, y_pred_train)
dt_test_acc = accuracy_score(y_test, y_pred_test)

print('Train Accuracy:', dt_train_acc)
print('Test Accuracy:', dt_test_acc)

Train Accuracy: 1.0
Test Accuracy: 0.6328571428571429


# KNearest Neighbors Classifier 

In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred_train = knn.predict(X_train)
y_pred_test = knn.predict(X_test)

knn_train_acc = accuracy_score(y_train, y_pred_train)
knn_test_acc = accuracy_score(y_test, y_pred_test)

print('Train Accuracy:', knn_train_acc)
print('Test Accuracy:', knn_test_acc)

Train Accuracy: 0.7155357142857143
Test Accuracy: 0.5541428571428572


# Support Vector Machine

In [11]:
from sklearn.svm import SVC

In [12]:
#Performing feature scaling for better SVM performance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [14]:
#Implement SVM algorithm
svm = SVC()
svm.fit(X_train, y_train)

In [16]:
print('Train Accuracy: ', svm.score(X_train,y_train))
print('Test Accuracy: ', svm.score(X_test,y_test))

Train Accuracy:  0.7328035714285714
Test Accuracy:  0.7277142857142858
