In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the processed cardio CSV and, in the same step, discard the
# 'id' and 'bp_category_encoded' columns.
df = pd.read_csv('Data/cardio_data_processed.csv').drop(columns=['id', 'bp_category_encoded'])

In [3]:
# Basic pre-processing: keep only rows that pass every range check below.
keep_rows = (
    df['bmi'].between(15, 70)          # 15 <= bmi <= 70 (both ends inclusive)
    & df['height'].between(150, 220)   # 150 <= height <= 220 (both ends inclusive)
    & (df['weight'] > 50)              # weight strictly greater than 50
)
df = df[keep_rows]

In [4]:
# Expand 'bp_category' into one indicator column per category value
df = pd.get_dummies(df, prefix='bp_category', columns=['bp_category'])
# Pull the 'cardio' target out as a NumPy array, then strip it from the features
labels = df['cardio'].to_numpy()
df = df.drop('cardio', axis=1)
# Display the resulting feature frame
df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,age_years,bmi,bp_category_Elevated,bp_category_Hypertension Stage 1,bp_category_Hypertension Stage 2,bp_category_Normal
0,18393,2,168,62.0,110,80,1,1,0,0,1,50,21.967120,0,1,0,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,55,34.927679,0,0,1,0
2,18857,1,165,64.0,130,70,3,1,0,0,0,51,23.507805,0,1,0,0
3,17623,2,169,82.0,150,100,1,1,0,0,1,48,28.710479,0,0,1,0
4,17474,1,156,56.0,100,60,1,1,0,0,0,47,23.011177,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68200,19240,2,168,76.0,120,80,1,1,1,0,1,52,26.927438,0,1,0,0
68201,22601,1,158,126.0,140,90,2,2,0,0,1,61,50.472681,0,0,1,0
68202,19066,2,183,105.0,180,90,3,1,0,1,0,52,31.353579,0,0,1,0
68203,22431,1,163,72.0,135,80,1,2,0,0,0,61,27.099251,0,1,0,0


In [5]:
# Convert the feature frame to one homogeneous float matrix.
# The dtype is requested explicitly because a bare to_numpy() picks the common
# dtype of all columns: if the one-hot columns are boolean (the get_dummies
# default in pandas >= 2.0) alongside int/float columns, that common dtype is
# `object`. Asking for float gives the same float64 array on every version.
data = df.to_numpy(dtype=float)
# Train-test split: hold out 15% of the rows; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
trainx, testx, trainy, testy = train_test_split(data, labels, test_size=0.15, random_state=42)

In [6]:
# Normalization: learn the scaling statistics from the training split only,
# so nothing about the test split influences the fitted scaler.
scaler = StandardScaler().fit(trainx)
# Apply the same fitted transformation to both splits
trainx = scaler.transform(trainx)
testx = scaler.transform(testx)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
# Build a logistic regression with default settings and fit it on the
# training split (fit() returns the estimator itself).
model = LogisticRegression().fit(trainx, trainy)

# Predict labels for the held-out test split
testy_pred = model.predict(testx)

# Score the predictions against the true test labels
accuracy, report = (
    accuracy_score(testy, testy_pred),
    classification_report(testy, testy_pred),
)

# Report the metrics
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.7246362063702045
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.80      0.74      4957
           1       0.76      0.65      0.70      4870

    accuracy                           0.72      9827
   macro avg       0.73      0.72      0.72      9827
weighted avg       0.73      0.72      0.72      9827

