# Library

In [170]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Data Set And Check For NaN 


In [171]:
dataset = pd.read_csv('Heart_Attack.csv')

X = dataset.iloc[:, :-1].values  # Features
y = dataset.iloc[:, -1].values   # Heart Atk Risk  

In [172]:
print(X)

[['BMW7812' 67 'Male' ... 'Argentina' 'South America'
  'Southern Hemisphere']
 ['CZE1114' 21 'Male' ... 'Canada' 'North America' 'Northern Hemisphere']
 ['BNI9906' 21 'Female' ... 'France' 'Europe' 'Northern Hemisphere']
 ...
 ['XKA5925' 47 'Male' ... 'Brazil' 'South America' 'Southern Hemisphere']
 ['EPE6801' 36 'Male' ... 'Brazil' 'South America' 'Southern Hemisphere']
 ['ZWN9666' 25 'Female' ... 'United Kingdom' 'Europe'
  'Northern Hemisphere']]


In [173]:
print(y)

[0 0 0 ... 1 0 1]


In [174]:
missing_values = dataset.isnull().sum() # count of missing values in each column
print(missing_values)

Patient ID                         0
Age                                0
Sex                                0
Cholesterol                        0
Blood Pressure                     0
Heart Rate                         0
Diabetes                           0
Family History                     0
Smoking                            0
Obesity                            0
Alcohol Consumption                0
Exercise Hours Per Week            0
Diet                               0
Previous Heart Problems            0
Medication Use                     0
Stress Level                       0
Sedentary Hours Per Day            0
Income                             0
BMI                                0
Triglycerides                      0
Physical Activity Days Per Week    0
Sleep Hours Per Day                0
Country                            0
Continent                          0
Hemisphere                         0
Heart Attack Risk                  0
dtype: int64


## Explore Data Set

In [175]:
for i, col in enumerate(dataset.columns):
    print(f"{i}: {col}")


0: Patient ID
1: Age
2: Sex
3: Cholesterol
4: Blood Pressure
5: Heart Rate
6: Diabetes
7: Family History
8: Smoking
9: Obesity
10: Alcohol Consumption
11: Exercise Hours Per Week
12: Diet
13: Previous Heart Problems
14: Medication Use
15: Stress Level
16: Sedentary Hours Per Day
17: Income
18: BMI
19: Triglycerides
20: Physical Activity Days Per Week
21: Sleep Hours Per Day
22: Country
23: Continent
24: Hemisphere
25: Heart Attack Risk


In [176]:
print(dataset.dtypes)


Patient ID                          object
Age                                  int64
Sex                                 object
Cholesterol                          int64
Blood Pressure                      object
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                object
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Country                             object
Continent  

## Split Data Set


In [179]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)


In [None]:
X_train = np.delete(X_train, [0, 22], axis=1) # Remove Patient ID and Country
X_test = np.delete(X_test, [0, 22], axis=1)

In [181]:
print(X_train)

[[41 'Male' 374 ... 6 'Asia' 'Northern Hemisphere']
 [27 'Female' 129 ... 8 'Asia' 'Northern Hemisphere']
 [65 'Male' 292 ... 4 'Europe' 'Northern Hemisphere']
 ...
 [67 'Male' 254 ... 10 'North America' 'Northern Hemisphere']
 [51 'Male' 399 ... 5 'Asia' 'Northern Hemisphere']
 [37 'Male' 272 ... 4 'Asia' 'Northern Hemisphere']]


## Feature Scaling & One Hot-Encoding


In [184]:
for i, col in enumerate(dataset.columns[:-1]):  # Exclude the target column if necessary
    print(f"{i}: {col}")

0: Age
1: Sex
2: Cholesterol
3: Blood Pressure
4: Heart Rate
5: Diabetes
6: Family History
7: Smoking
8: Obesity
9: Alcohol Consumption
10: Exercise Hours Per Week
11: Diet
12: Previous Heart Problems
13: Medication Use
14: Stress Level
15: Sedentary Hours Per Day
16: Income
17: BMI
18: Triglycerides
19: Physical Activity Days Per Week
20: Sleep Hours Per Day
21: Continent
22: Hemisphere


In [185]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = [1, 21, 22]  # Column indices for categorical features in X

# Apply one-hot encoding
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), categorical_features)], remainder='passthrough')
X_train = ct.fit_transform(X_train)
X_train = ct.transform(X_train)
X_test = ct.transform(X_test)

ValueError: X has 36 features, but ColumnTransformer is expecting 24 features as input.

In [None]:
print(X_train)

[[0.0 1.0 'OQD2740' ... 'India' 'Asia' 'Northern Hemisphere']
 [1.0 0.0 'PMA6847' ... 'Japan' 'Asia' 'Northern Hemisphere']
 [0.0 1.0 'FBC9961' ... 'Germany' 'Europe' 'Northern Hemisphere']
 ...
 [0.0 1.0 'OMJ2303' ... 'United States' 'North America'
  'Northern Hemisphere']
 [0.0 1.0 'YAO7502' ... 'Thailand' 'Asia' 'Northern Hemisphere']
 [0.0 1.0 'ZMU0934' ... 'Thailand' 'Asia' 'Northern Hemisphere']]


In [None]:
# from sklearn.preprocessing import StandardScaler


# # Ensure all features are numeric by selecting only numeric columns
# X_train_numeric = np.array(X_train, dtype=float)
# X_test_numeric = np.array(X_test, dtype=float)

# sc = StandardScaler()
# X_train = sc.fit_transform(X_train_numeric)
# X_test = sc.transform(X_test_numeric)