In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import optuna

In [2]:
df = pd.read_csv("/kaggle/input/heart-attack-in-youth-vs-adult-in-russia/heart_attack_russia_youth_vs_adult.csv")
df.head()

Unnamed: 0,ID,Age,Gender,Region,Blood_Pressure,Cholesterol,BMI,Heart_Rate,Exercise_Level,Smoking,...,Income_Level,Physical_Activity,Education_Level,Marital_Status,Urban_Rural,Medication,Health_Awareness,Daily_Water_Intake,Mental_Health,Obesity
0,1,50,Male,Rural,110.0,196.5,15.9,76,High,False,...,Low,Low,Primary,Married,Rural,False,5,2.3,5,False
1,2,40,Female,Urban,138.8,157.5,27.1,82,Moderate,False,...,Low,Moderate,Higher,Married,Urban,False,1,5.0,4,False
2,3,26,Male,Rural,116.0,210.1,27.2,71,Moderate,False,...,Middle,High,Primary,Married,Urban,False,4,2.4,8,False
3,4,54,Female,Rural,133.5,170.5,26.0,74,Moderate,True,...,Middle,Moderate,Higher,Married,Urban,False,2,2.7,6,True
4,5,19,Female,Urban,108.0,224.5,27.5,67,Low,False,...,Middle,Low,Higher,Widowed,Urban,False,4,3.5,4,True


In [3]:
print('Number of Columns:', df.shape[1])
print('Number of Rowa:', df.shape[0])

Number of Columns: 30
Number of Rowa: 50000


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     50000 non-null  int64  
 1   Age                    50000 non-null  int64  
 2   Gender                 50000 non-null  object 
 3   Region                 50000 non-null  object 
 4   Blood_Pressure         50000 non-null  float64
 5   Cholesterol            50000 non-null  float64
 6   BMI                    50000 non-null  float64
 7   Heart_Rate             50000 non-null  int64  
 8   Exercise_Level         50000 non-null  object 
 9   Smoking                50000 non-null  bool   
 10  Alcohol_Consumption    24976 non-null  object 
 11  Diabetes               50000 non-null  bool   
 12  Family_History         50000 non-null  bool   
 13  Stress_Level           50000 non-null  int64  
 14  Heart_Attack           50000 non-null  bool   
 15  An

In [5]:
df.isna().sum()

ID                           0
Age                          0
Gender                       0
Region                       0
Blood_Pressure               0
Cholesterol                  0
BMI                          0
Heart_Rate                   0
Exercise_Level               0
Smoking                      0
Alcohol_Consumption      25024
Diabetes                     0
Family_History               0
Stress_Level                 0
Heart_Attack                 0
Angina                       0
Heart_Disease_History        0
Diet                         0
Sleep_Hours                  0
Occupation                   0
Income_Level                 0
Physical_Activity            0
Education_Level              0
Marital_Status               0
Urban_Rural                  0
Medication                   0
Health_Awareness             0
Daily_Water_Intake           0
Mental_Health                0
Obesity                      0
dtype: int64

In [6]:
print("Number of duplicates : ",df.duplicated().sum())

Number of duplicates :  0


In [7]:
for i in df.select_dtypes(include = 'object').columns:
    print(df[i].value_counts())
    print('****'*20)

Gender
Female    24155
Male      23944
Other      1901
Name: count, dtype: int64
********************************************************************************
Region
Urban       25034
Rural       14946
Suburban    10020
Name: count, dtype: int64
********************************************************************************
Exercise_Level
Moderate    24946
Low         15100
High         9954
Name: count, dtype: int64
********************************************************************************
Alcohol_Consumption
Moderate    20022
Heavy        4954
Name: count, dtype: int64
********************************************************************************
Diet
Healthy      19789
Mixed        15185
Unhealthy    15026
Name: count, dtype: int64
********************************************************************************
Occupation
Employed      25166
Student       10068
Unemployed     9843
Retired        4923
Name: count, dtype: int64
*********************************************

In [8]:
df.drop(columns = ['ID', 'Alcohol_Consumption', 'Urban_Rural'], inplace=True)

In [9]:
df.head()

Unnamed: 0,Age,Gender,Region,Blood_Pressure,Cholesterol,BMI,Heart_Rate,Exercise_Level,Smoking,Diabetes,...,Occupation,Income_Level,Physical_Activity,Education_Level,Marital_Status,Medication,Health_Awareness,Daily_Water_Intake,Mental_Health,Obesity
0,50,Male,Rural,110.0,196.5,15.9,76,High,False,False,...,Unemployed,Low,Low,Primary,Married,False,5,2.3,5,False
1,40,Female,Urban,138.8,157.5,27.1,82,Moderate,False,False,...,Employed,Low,Moderate,Higher,Married,False,1,5.0,4,False
2,26,Male,Rural,116.0,210.1,27.2,71,Moderate,False,True,...,Employed,Middle,High,Primary,Married,False,4,2.4,8,False
3,54,Female,Rural,133.5,170.5,26.0,74,Moderate,True,False,...,Student,Middle,Moderate,Higher,Married,False,2,2.7,6,True
4,19,Female,Urban,108.0,224.5,27.5,67,Low,False,False,...,Employed,Middle,Low,Higher,Widowed,False,4,3.5,4,True


In [10]:
summary = df.describe().transpose()
print(summary)

                      count        mean        std   min    25%     50%  \
Age                 50000.0   35.991820  14.110139  12.0   24.0   36.00   
Blood_Pressure      50000.0  120.058636  14.975835  60.0  109.9  120.05   
Cholesterol         50000.0  199.852762  49.998331 -18.7  166.1  199.90   
BMI                 50000.0   24.983912   5.003784   2.9   21.6   25.00   
Heart_Rate          50000.0   79.988980  11.804567  60.0   70.0   80.00   
Stress_Level        50000.0    5.503700   2.870741   1.0    3.0    6.00   
Sleep_Hours         50000.0    7.011464   1.740651   4.0    5.5    7.00   
Health_Awareness    50000.0    3.007100   1.410351   1.0    2.0    3.00   
Daily_Water_Intake  50000.0    2.997782   1.149608   1.0    2.0    3.00   
Mental_Health       50000.0    5.486740   2.860760   1.0    3.0    5.00   

                      75%    max  
Age                  48.0   60.0  
Blood_Pressure      130.2  188.4  
Cholesterol         233.5  398.8  
BMI                  28.4   46.1  

In [11]:
df = df[df['Gender'].isin(['Female', 'Male'])]

In [12]:
df

Unnamed: 0,Age,Gender,Region,Blood_Pressure,Cholesterol,BMI,Heart_Rate,Exercise_Level,Smoking,Diabetes,...,Occupation,Income_Level,Physical_Activity,Education_Level,Marital_Status,Medication,Health_Awareness,Daily_Water_Intake,Mental_Health,Obesity
0,50,Male,Rural,110.0,196.5,15.9,76,High,False,False,...,Unemployed,Low,Low,Primary,Married,False,5,2.3,5,False
1,40,Female,Urban,138.8,157.5,27.1,82,Moderate,False,False,...,Employed,Low,Moderate,Higher,Married,False,1,5.0,4,False
2,26,Male,Rural,116.0,210.1,27.2,71,Moderate,False,True,...,Employed,Middle,High,Primary,Married,False,4,2.4,8,False
3,54,Female,Rural,133.5,170.5,26.0,74,Moderate,True,False,...,Student,Middle,Moderate,Higher,Married,False,2,2.7,6,True
4,19,Female,Urban,108.0,224.5,27.5,67,Low,False,False,...,Employed,Middle,Low,Higher,Widowed,False,4,3.5,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49994,37,Male,Rural,137.2,163.2,22.2,91,Low,True,False,...,Employed,Low,Moderate,Higher,Single,False,2,3.6,4,False
49996,16,Male,Rural,125.2,102.9,25.1,77,Moderate,True,False,...,Unemployed,Middle,High,Secondary,Single,False,3,4.4,4,True
49997,46,Female,Suburban,116.2,160.6,27.7,63,Low,True,False,...,Employed,Middle,Low,Secondary,Single,False,2,2.4,7,False
49998,24,Female,Rural,104.6,180.4,29.2,60,Moderate,True,False,...,Unemployed,Low,Moderate,Primary,Widowed,False,3,3.1,5,False


In [13]:
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
df['Region'] = df['Region'].map({'Urban': 0, 'Rural': 1, 'Suburban': 0.5})
df['Exercise_Level'] = df['Exercise_Level'].map({'Low': 0, 'High': 1, 'Moderate': 0.5})
df['Diet'] = df['Diet'].map({'Unhealthy': 0, 'Healthy': 1, 'Mixed': 0.5})
df['Occupation'] = df['Occupation'].map({'Unemployed': 0, 'Employed': 2, 'Student': 1, 'Retired': 3})
df['Income_Level'] = df['Income_Level'].map({'Low': 0, 'High': 1, 'Middle': 0.5})
df['Physical_Activity'] = df['Physical_Activity'].map({'Low': 0, 'High': 1, 'Moderate': 0.5})
df['Education_Level'] = df['Education_Level'].map({'Primary': 0, 'Higher': 2, 'Secondary': 1})
df['Marital_Status'] = df['Marital_Status'].map({'Single': 0, 'Married': 1, 'Divorced': 2, 'Widowed': 3})

for i in df.select_dtypes(include="bool").columns:
    df[i] = df[i].map({True: 1, False: 0})

In [14]:
df.head()

Unnamed: 0,Age,Gender,Region,Blood_Pressure,Cholesterol,BMI,Heart_Rate,Exercise_Level,Smoking,Diabetes,...,Occupation,Income_Level,Physical_Activity,Education_Level,Marital_Status,Medication,Health_Awareness,Daily_Water_Intake,Mental_Health,Obesity
0,50,1,1.0,110.0,196.5,15.9,76,1.0,0,0,...,0,0.0,0.0,0,1,0,5,2.3,5,0
1,40,0,0.0,138.8,157.5,27.1,82,0.5,0,0,...,2,0.0,0.5,2,1,0,1,5.0,4,0
2,26,1,1.0,116.0,210.1,27.2,71,0.5,0,1,...,2,0.5,1.0,0,1,0,4,2.4,8,0
3,54,0,1.0,133.5,170.5,26.0,74,0.5,1,0,...,1,0.5,0.5,2,1,0,2,2.7,6,1
4,19,0,0.0,108.0,224.5,27.5,67,0.0,0,0,...,2,0.5,0.0,2,3,0,4,3.5,4,1


In [15]:
X = df.drop("Heart_Attack", axis=1)
y = df["Heart_Attack"]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((38479, 26), (9620, 26), (38479,), (9620,))

In [18]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
X_train_tensor = torch.from_numpy(X_train)
X_test_tensor = torch.from_numpy(X_test)
y_train_tensor = torch.from_numpy(y_train.to_numpy())
y_test_tensor = torch.from_numpy(y_test.to_numpy())

In [20]:
X_train_tensor = X_train_tensor.float()
y_train_tensor = y_train_tensor.float()
X_test_tensor = X_test_tensor.float()
y_test_tensor = y_test_tensor.float()

In [21]:
from torch.utils.data import Dataset,DataLoader

class CustomDataset(Dataset):
    def __init__(self,features,labels):
        self.features = features
        self.labels = labels
    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [22]:
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
test_dataset = CustomDataset(X_test_tensor, y_test_tensor)

In [23]:
class NeuralNetwork(nn.Module):
    def __init__(self,input_size,hidden_layers,hidden_units):
        super(NeuralNetwork,self).__init__()
        layers = []
        insize = input_size

        for i in range(hidden_layers):
            layers.append(nn.Linear(insize,hidden_units))
            layers.append(nn.ReLU())
            insize = hidden_units
        layers.append(nn.Linear(insize,1))
        self.model = nn.Sequential(*layers)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self,X):
        return self.sigmoid(self.model(X))
        

In [24]:
def objective(trial):
    # Suggest hyperparameters
    hyperparams = {
        'lr': trial.suggest_float('lr', 1e-5, 1e-2, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [64, 128, 256]),
        'hidden_layers': trial.suggest_int('hidden_layers', 1, 3),
        'hidden_units': trial.suggest_categorical('hidden_units', [32, 64, 128]),
        # 'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5),
        'optimizer': trial.suggest_categorical('optimizer', ['Adam', 'SGD', 'RMSprop'])
    }
    train_loader = DataLoader(train_dataset, batch_size=hyperparams['batch_size'], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=hyperparams['batch_size'], shuffle=False)

    # Initialize model
    model = NeuralNetwork(
        input_size=X_train.shape[1],
        hidden_layers=hyperparams['hidden_layers'],
        hidden_units=hyperparams['hidden_units']
        # dropout_rate=hyperparams['dropout_rate']
    )

    if hyperparams['optimizer'] == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=hyperparams['lr'])
    elif hyperparams['optimizer'] == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), lr=hyperparams['lr'])
    else:
        optimizer = torch.optim.RMSprop(model.parameters(), lr=hyperparams['lr'])

    criterion = nn.BCELoss()
    
    # Training loop
    for epoch in range(50):  # Reduced epochs for faster tuning
        model.train()
        for features, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels.view(-1, 1))
            loss.backward()
            optimizer.step()

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in test_loader:
            outputs = model(features)
            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted.view(-1) == labels).sum().item()

    accuracy = correct / total
    return accuracy

In [25]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

[I 2025-01-30 06:28:45,858] A new study created in memory with name: no-name-7c2b2114-8fba-478b-9c44-4f3b5c5b421e
[I 2025-01-30 06:29:12,497] Trial 0 finished with value: 0.8837837837837837 and parameters: {'lr': 0.0016253106965116886, 'batch_size': 256, 'hidden_layers': 3, 'hidden_units': 32, 'optimizer': 'SGD'}. Best is trial 0 with value: 0.8837837837837837.
[I 2025-01-30 06:29:35,397] Trial 1 finished with value: 0.8837837837837837 and parameters: {'lr': 5.681424126083986e-05, 'batch_size': 256, 'hidden_layers': 1, 'hidden_units': 64, 'optimizer': 'Adam'}. Best is trial 0 with value: 0.8837837837837837.
[I 2025-01-30 06:30:33,950] Trial 2 finished with value: 0.8837837837837837 and parameters: {'lr': 1.891392640906652e-05, 'batch_size': 64, 'hidden_layers': 2, 'hidden_units': 128, 'optimizer': 'Adam'}. Best is trial 0 with value: 0.8837837837837837.
[I 2025-01-30 06:31:03,568] Trial 3 finished with value: 0.8766112266112266 and parameters: {'lr': 0.002938477203646729, 'batch_size':

In [26]:
# Show best parameters
print("Best trial:")
trial = study.best_trial
print(f"  Value (Accuracy): {trial.value:.4f}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

Best trial:
  Value (Accuracy): 0.8838
  Params: 
    lr: 0.0016253106965116886
    batch_size: 256
    hidden_layers: 3
    hidden_units: 32
    optimizer: SGD


In [27]:
# Train final model with best parameters
best_params = trial.params
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=best_params['batch_size'], shuffle=False)

In [28]:
final_model = NeuralNetwork(
    input_size=X_train.shape[1],
    hidden_layers=best_params['hidden_layers'],
    hidden_units=best_params['hidden_units']
    # dropout_rate=best_params['dropout_rate']
)

if best_params['optimizer'] == 'Adam':
    optimizer = torch.optim.Adam(final_model.parameters(), lr=best_params['lr'])
elif best_params['optimizer'] == 'SGD':
    optimizer = torch.optim.SGD(final_model.parameters(), lr=best_params['lr'])
else:
    optimizer = torch.optim.RMSprop(final_model.parameters(), lr=best_params['lr'])

criterion = nn.BCELoss()

In [29]:
# Full training with best parameters
for epoch in range(100):  # Train for more epochs
    final_model.train()
    for features, labels in train_loader:
        optimizer.zero_grad()
        outputs = final_model(features)
        loss = criterion(outputs, labels.view(-1, 1))
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 0.5898
Epoch [2/100], Loss: 0.5314
Epoch [3/100], Loss: 0.5203
Epoch [4/100], Loss: 0.5754
Epoch [5/100], Loss: 0.4962
Epoch [6/100], Loss: 0.4663
Epoch [7/100], Loss: 0.4645
Epoch [8/100], Loss: 0.3998
Epoch [9/100], Loss: 0.3890
Epoch [10/100], Loss: 0.3110
Epoch [11/100], Loss: 0.3797
Epoch [12/100], Loss: 0.3454
Epoch [13/100], Loss: 0.3166
Epoch [14/100], Loss: 0.3996
Epoch [15/100], Loss: 0.4331
Epoch [16/100], Loss: 0.3661
Epoch [17/100], Loss: 0.3390
Epoch [18/100], Loss: 0.3179
Epoch [19/100], Loss: 0.4640
Epoch [20/100], Loss: 0.3784
Epoch [21/100], Loss: 0.3175
Epoch [22/100], Loss: 0.4361
Epoch [23/100], Loss: 0.2909
Epoch [24/100], Loss: 0.5299
Epoch [25/100], Loss: 0.4131
Epoch [26/100], Loss: 0.4991
Epoch [27/100], Loss: 0.3144
Epoch [28/100], Loss: 0.3702
Epoch [29/100], Loss: 0.4424
Epoch [30/100], Loss: 0.2902
Epoch [31/100], Loss: 0.3295
Epoch [32/100], Loss: 0.2544
Epoch [33/100], Loss: 0.3066
Epoch [34/100], Loss: 0.4474
Epoch [35/100], Loss: 0

In [30]:
# Final evaluation
final_model.eval()
correct = 0
total = 0
with torch.no_grad():
    for features, labels in test_loader:
        outputs = final_model(features)
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted.view(-1) == labels).sum().item()

print(f'Final Test Accuracy: {100 * correct / total:.2f}%')

Final Test Accuracy: 88.38%


In [31]:
import pickle

with open('model.pkl', 'wb') as file:
    pickle.dump(final_model, file)