<a href="https://colab.research.google.com/github/Jessy-Jones/Heart-attack-ML-prediction-/blob/main/Heart_Attack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DATA EXPLORATION

In [41]:
#Importing necessary dependencies
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [42]:
#Loading the dataset and examining its structure
data = pd.read_csv("/content/heart_attack_prediction_dataset.csv (1) (1).zip")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Patient ID                       8763 non-null   object 
 1   Age                              8763 non-null   int64  
 2   Sex                              8763 non-null   object 
 3   Cholesterol                      8763 non-null   int64  
 4   Blood Pressure                   8763 non-null   object 
 5   Heart Rate                       8763 non-null   int64  
 6   Diabetes                         8763 non-null   int64  
 7   Family History                   8763 non-null   int64  
 8   Smoking                          8763 non-null   int64  
 9   Obesity                          8763 non-null   int64  
 10  Alcohol Consumption              8763 non-null   int64  
 11  Exercise Hours Per Week          8763 non-null   float64
 12  Diet                

# DATA PREPROCESSING

In [43]:
#Dropping unnecessary columns
columns_to_drop = ["Patient ID", "Country", "Continent", "Hemisphere"]
cleaned_data =data.drop(columns=columns_to_drop)
cleaned_data.head()

Unnamed: 0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Previous Heart Problems,Medication Use,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk
0,67,Male,208,158/88,72,0,0,1,0,0,...,0,0,9,6.615001,261404,31.251233,286,0,6,0
1,21,Male,389,165/93,98,1,1,1,1,1,...,1,0,1,4.963459,285768,27.194973,235,1,7,0
2,21,Female,324,174/99,72,1,0,0,0,0,...,1,1,9,9.463426,235282,28.176571,587,4,4,0
3,84,Male,383,163/100,73,1,1,1,0,1,...,1,0,9,7.648981,125640,36.464704,378,3,4,0
4,66,Male,318,91/88,93,1,1,1,1,0,...,1,0,6,1.514821,160555,21.809144,231,1,5,0


In [44]:
#Splitting blood pressure column into systolic and diastolic columns to convert from string to float
cleaned_data[['systolic', 'diastolic']] = cleaned_data['Blood Pressure'].str.split('/', expand=True).astype(float)

#Dropping the original Blood Pressure column
cleaned_data = cleaned_data.drop(columns=["Blood Pressure"])
cleaned_data.tail()

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk,systolic,diastolic
8758,60,Male,121,61,1,1,1,0,1,7.917342,...,8,10.806373,235420,19.655895,67,7,7,0,94.0,76.0
8759,28,Female,120,73,1,0,0,1,0,16.558426,...,8,3.833038,217881,23.993866,617,4,9,0,157.0,102.0
8760,47,Male,250,105,0,1,1,1,1,3.148438,...,5,2.375214,36998,35.406146,527,4,4,1,161.0,75.0
8761,36,Male,178,60,1,0,1,0,0,3.78995,...,5,0.029104,209943,27.29402,114,2,8,0,119.0,67.0
8762,25,Female,356,75,1,1,0,0,1,18.081748,...,8,9.005234,247338,32.914151,180,7,4,1,138.0,67.0


In [45]:
#Encoding the cleaned data (one-hot encoding)
columns_to_encode = ["Sex", "Diet"]
encoded_data = pd.get_dummies(cleaned_data, columns=columns_to_encode, drop_first=True)
encoded_data.head()


Unnamed: 0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,...,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk,systolic,diastolic,Sex_Male,Diet_Healthy,Diet_Unhealthy
0,67,208,72,0,0,1,0,0,4.168189,0,...,31.251233,286,0,6,0,158.0,88.0,1,0,0
1,21,389,98,1,1,1,1,1,1.813242,1,...,27.194973,235,1,7,0,165.0,93.0,1,0,1
2,21,324,72,1,0,0,0,0,2.078353,1,...,28.176571,587,4,4,0,174.0,99.0,0,1,0
3,84,383,73,1,1,1,0,1,9.82813,1,...,36.464704,378,3,4,0,163.0,100.0,1,0,0
4,66,318,93,1,1,1,1,0,5.804299,1,...,21.809144,231,1,5,0,91.0,88.0,1,0,1


In [46]:
#Splitting the data into features and target values
y = encoded_data["Heart Attack Risk"]
X = encoded_data.drop(columns = ["Heart Attack Risk"])

In [47]:
#Splitting the cleaned, encoded dataset into training and testing data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
print(X_train.shape)
print(X_test.shape)

(6134, 23)
(2629, 23)


In [48]:
#Normalizing the features of the data
scaler = MinMaxScaler()

X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)


In [49]:
#Making an instance of the Random Forest Classifier model
rf_model = RandomForestClassifier()

In [50]:
#Training model on the training set
rf_model.fit(X_train_normalized, y_train)

In [53]:
#Making predictions on the test set
y_predict = rf_model.predict(X_test_normalized)

[0 0 0 ... 0 0 0]


In [58]:
#Evaluating the model
accuracy = accuracy_score(y_test, y_predict)
report = classification_report(y_test, y_predict)
print(f'Accuracy: {accuracy}')
print(f'Classification report: {report}')

Accuracy: 0.6348421453023964
Classification report:               precision    recall  f1-score   support

           0       0.65      0.97      0.77      1698
           1       0.33      0.03      0.06       931

    accuracy                           0.63      2629
   macro avg       0.49      0.50      0.41      2629
weighted avg       0.53      0.63      0.52      2629

