# Library

In [79]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Data Set And Check For NaN 


In [80]:
dataset = pd.read_csv('Heart_Attack.csv')

In [81]:
missing_values = dataset.isnull().sum() # count of missing values in each column
print(missing_values)

Patient ID                         0
Age                                0
Sex                                0
Cholesterol                        0
Blood Pressure                     0
Heart Rate                         0
Diabetes                           0
Family History                     0
Smoking                            0
Obesity                            0
Alcohol Consumption                0
Exercise Hours Per Week            0
Diet                               0
Previous Heart Problems            0
Medication Use                     0
Stress Level                       0
Sedentary Hours Per Day            0
Income                             0
BMI                                0
Triglycerides                      0
Physical Activity Days Per Week    0
Sleep Hours Per Day                0
Country                            0
Continent                          0
Hemisphere                         0
Heart Attack Risk                  0
dtype: int64


## Explore Data Set

In [82]:
for i, col in enumerate(dataset.columns):
    print(f"{i}: {col}")


0: Patient ID
1: Age
2: Sex
3: Cholesterol
4: Blood Pressure
5: Heart Rate
6: Diabetes
7: Family History
8: Smoking
9: Obesity
10: Alcohol Consumption
11: Exercise Hours Per Week
12: Diet
13: Previous Heart Problems
14: Medication Use
15: Stress Level
16: Sedentary Hours Per Day
17: Income
18: BMI
19: Triglycerides
20: Physical Activity Days Per Week
21: Sleep Hours Per Day
22: Country
23: Continent
24: Hemisphere
25: Heart Attack Risk


In [83]:
print(dataset.dtypes)


Patient ID                          object
Age                                  int64
Sex                                 object
Cholesterol                          int64
Blood Pressure                      object
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                object
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Country                             object
Continent  

In [84]:
dataset_Cleaned = dataset.drop(columns=['Patient ID', 'Country'])

In [85]:
bp_split = dataset_Cleaned['Blood Pressure'].str.split('/', expand=True)
dataset_Cleaned['Systolic BP'] = pd.to_numeric(bp_split[0], errors='coerce')
dataset_Cleaned['Diastolic BP'] = pd.to_numeric(bp_split[1], errors='coerce')

dataset_Cleaned = dataset_Cleaned.drop(columns=['Blood Pressure'])

In [86]:
dataset_Cleaned.head()

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Continent,Hemisphere,Heart Attack Risk,Systolic BP,Diastolic BP
0,67,Male,208,72,0,0,1,0,0,4.168189,...,261404,31.251233,286,0,6,South America,Southern Hemisphere,0,158,88
1,21,Male,389,98,1,1,1,1,1,1.813242,...,285768,27.194973,235,1,7,North America,Northern Hemisphere,0,165,93
2,21,Female,324,72,1,0,0,0,0,2.078353,...,235282,28.176571,587,4,4,Europe,Northern Hemisphere,0,174,99
3,84,Male,383,73,1,1,1,0,1,9.82813,...,125640,36.464704,378,3,4,North America,Northern Hemisphere,0,163,100
4,66,Male,318,93,1,1,1,1,0,5.804299,...,160555,21.809144,231,1,5,Asia,Northern Hemisphere,0,91,88


## Split Data Set


In [87]:
from sklearn.model_selection import train_test_split

X = dataset_Cleaned.drop(columns='Heart Attack Risk')
y = dataset_Cleaned['Heart Attack Risk']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


In [88]:
print(X_train)

      Age     Sex  Cholesterol  Heart Rate  Diabetes  Family History  Smoking  \
5286   54    Male          262         105         1               0        1   
1119   69  Female          293         100         1               0        1   
5315   53    Male          297          74         1               1        1   
3160   33    Male          346          80         1               1        1   
6791   83    Male          386          91         1               1        1   
...   ...     ...          ...         ...       ...             ...      ...   
4373   36    Male          228          92         0               0        1   
7891   19    Male          266          51         0               0        1   
4859   67    Male          254         107         1               0        1   
3264   51    Male          399          97         1               0        1   
2732   37    Male          272          44         0               0        1   

      Obesity  Alcohol Cons

In [89]:
print(y_train)

5286    0
1119    1
5315    0
3160    0
6791    0
       ..
4373    1
7891    0
4859    0
3264    1
2732    0
Name: Heart Attack Risk, Length: 7010, dtype: int64


## Feature Scaling & One Hot-Encoding


In [90]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Identify features
numeric_features = dataset_Cleaned.select_dtypes(include=['int64', 'float64']).columns.drop('Heart Attack Risk')
categorical_features = dataset_Cleaned.select_dtypes(include=['object']).columns

# Define transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)

# Combine transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Apply transformations
X_transformed = preprocessor.fit_transform(dataset_Cleaned.drop(columns='Heart Attack Risk'))

# Get the column names for the transformed data
encoded_cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
transformed_columns = numeric_features.tolist() + encoded_cat_features.tolist()

# Create transformed DataFrame
df_transformed = pd.DataFrame(X_transformed, columns=transformed_columns) # type: ignore

# Add target back
df_transformed['Heart Attack Risk'] = dataset_Cleaned['Heart Attack Risk'].values



In [91]:

df_transformed.head()

Unnamed: 0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,...,Sex_Male,Diet_Healthy,Diet_Unhealthy,Continent_Asia,Continent_Australia,Continent_Europe,Continent_North America,Continent_South America,Hemisphere_Southern Hemisphere,Heart Attack Risk
0,0.625557,-0.641579,-0.147042,-1.369651,-0.986061,0.339157,-1.002857,-1.219867,-1.010838,-0.991704,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
1,-1.539322,1.596895,1.118179,0.730113,1.014136,0.339157,0.997151,0.819762,-1.418027,1.008365,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,-1.539322,0.793023,-0.147042,0.730113,-0.986061,-2.948488,-1.002857,-1.219867,-1.372188,1.008365,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3,1.425621,1.522691,-0.09838,0.730113,1.014136,0.339157,-1.002857,0.819762,-0.032188,1.008365,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4,0.578495,0.71882,0.874867,0.730113,1.014136,0.339157,0.997151,-1.219867,-0.727941,1.008365,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0


## Logistic Regression