# Data

In [1]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  

df = pd.read_csv("./data/CARDIO_TRAIN.csv")
df = df.drop(columns="Unnamed: 0")

In [2]:
df.columns

Index(['ID', 'AGE', 'GENDER', 'HEIGHT', 'WEIGHT', 'AP_HI', 'AP_LO',
       'CHOLESTEROL', 'GLUC', 'SMOKE', 'ALCO', 'ACTIVE', 'CARDIO'],
      dtype='object')

In [3]:
df.corr()["CARDIO"].sort_values(ascending=False)

CARDIO         1.000000
AGE            0.237985
CHOLESTEROL    0.221147
WEIGHT         0.181659
GLUC           0.089307
AP_LO          0.065719
AP_HI          0.054475
GENDER         0.008109
ID             0.003799
ALCO          -0.007330
HEIGHT        -0.010821
SMOKE         -0.015486
ACTIVE        -0.035653
Name: CARDIO, dtype: float64

In [4]:
useful_feature = ['CHOLESTEROL', 'GLUC', 'AGE', 'WEIGHT','AP_HI', 'AP_LO', 'HEIGHT', 'SMOKE', 'ACTIVE']
df[useful_feature]


Unnamed: 0,CHOLESTEROL,GLUC,AGE,WEIGHT,AP_HI,AP_LO,HEIGHT,SMOKE,ACTIVE
0,1,1,50,62.0,110,80,168,0,1
1,3,1,55,85.0,140,90,156,0,1
2,3,1,51,64.0,130,70,165,0,0
3,1,1,48,82.0,150,100,169,0,1
4,1,1,47,56.0,100,60,156,0,0
...,...,...,...,...,...,...,...,...,...
69995,1,1,52,76.0,120,80,168,1,1
69996,2,2,61,126.0,140,90,158,0,1
69997,3,1,52,105.0,180,90,183,0,0
69998,1,2,61,72.0,135,80,163,0,0


In [5]:
sc = ['AGE', 'WEIGHT','AP_HI', 'AP_LO', 'CHOLESTEROL', 'GLUC']

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data = scaler.fit_transform(df[['AGE', 'WEIGHT', 'AP_HI', 'AP_LO', 'HEIGHT']])
data = np.concatenate([data,df[['CHOLESTEROL', 'GLUC','CARDIO', 'SMOKE', 'ACTIVE']]], axis=1)
X = data[:,:-1] 
y = data[:,-1]

In [6]:
X.shape, y.shape

((70000, 9), (70000,))

# Model

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

lr.fit(X_train, y_train)
y_hat = lr.predict(X_test)

accuracy_score(y_test, y_hat)

0.8044761904761905

In [9]:
from sklearn.ensemble import AdaBoostClassifier

lr = LogisticRegression()
adac = AdaBoostClassifier(n_estimators=64, learning_rate=1, random_state=0, estimator=lr)
adac.fit(X_train, y_train) 
#Evaluation
y_hat = adac.predict(X_test)
accuracy_score(y_test, y_hat) # 0.6494285714285715

0.8044761904761905

In [10]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_hat = bnb.predict(X_test)

accuracy_score(y_test, y_hat)

0.8044761904761905

In [11]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
baggc = BaggingClassifier(KNeighborsClassifier(), max_samples=0.7, max_features=0.5, n_estimators=200) 
baggc.fit(X_train,y_train)
y_hat = baggc.predict(X_test)
accuracy_score(y_test, y_hat) # 0.7332857142857143

0.8044761904761905

## Neural Network

In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Generate some example data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
X_train = torch.FloatTensor(X_train)
y_train = torch.FloatTensor(y_train)
X_test = torch.FloatTensor(X_test)
y_test = torch.FloatTensor(y_test)

# Define the SimpleNN model
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.input_layer = nn.Linear(input_size, hidden_size)
        self.hidden_layer = nn.Linear(hidden_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.input_layer(x)
        x = self.tanh(x)
        x = self.hidden_layer(x)
        x = self.tanh(x)
        x = self.hidden_layer(x)
        x = self.tanh(x)
        x = self.output_layer(x)
        x = self.sigmoid(x)
        return x

# Instantiate the model
input_size = 9  # Number of input features
hidden_size = 64  # Number of hidden units
output_size = 1  # Output size for binary classification
model = SimpleNN(input_size, hidden_size, output_size)

# Define loss and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train.view(-1, 1))
    loss.backward()
    optimizer.step()

    # Compute training accuracy and test accuracy
    train_preds = (outputs >= 0.5).float()
    train_accuracy = accuracy_score(y_train, train_preds)
    with torch.no_grad():
        test_outputs = model(X_test)
        test_preds = (test_outputs >= 0.5).float()
        test_accuracy = accuracy_score(y_test, test_preds)

    print(f"Epoch [{epoch + 1}/{num_epochs}], "
          f"Train Loss: {loss.item():.4f}, Train Accuracy: {train_accuracy * 100:.2f}%, "
          f"Test Loss: {criterion(test_outputs, y_test.view(-1, 1)).item():.4f}, Test Accuracy: {test_accuracy * 100:.2f}%")


Epoch [1/100], Train Loss: 0.6942, Train Accuracy: 48.93%, Test Loss: 0.6908, Test Accuracy: 54.96%
Epoch [2/100], Train Loss: 0.6909, Train Accuracy: 54.63%, Test Loss: 0.6877, Test Accuracy: 60.68%
Epoch [3/100], Train Loss: 0.6878, Train Accuracy: 60.38%, Test Loss: 0.6846, Test Accuracy: 66.06%
Epoch [4/100], Train Loss: 0.6847, Train Accuracy: 65.78%, Test Loss: 0.6815, Test Accuracy: 70.18%
Epoch [5/100], Train Loss: 0.6816, Train Accuracy: 70.13%, Test Loss: 0.6785, Test Accuracy: 73.69%
Epoch [6/100], Train Loss: 0.6787, Train Accuracy: 73.74%, Test Loss: 0.6756, Test Accuracy: 76.31%
Epoch [7/100], Train Loss: 0.6757, Train Accuracy: 76.04%, Test Loss: 0.6727, Test Accuracy: 77.92%
Epoch [8/100], Train Loss: 0.6728, Train Accuracy: 77.69%, Test Loss: 0.6699, Test Accuracy: 78.65%
Epoch [9/100], Train Loss: 0.6700, Train Accuracy: 78.58%, Test Loss: 0.6671, Test Accuracy: 79.10%
Epoch [10/100], Train Loss: 0.6672, Train Accuracy: 79.05%, Test Loss: 0.6643, Test Accuracy: 79.31%