In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [42]:
df = pd.read_csv('/Users/jacksonsorenson/Documents/Pyhton Projects/Japan Heart Attack/japan_heart_attack_dataset.csv')
df.head()

Unnamed: 0,Age,Gender,Region,Smoking_History,Diabetes_History,Hypertension_History,Cholesterol_Level,Physical_Activity,Diet_Quality,Alcohol_Consumption,...,Extra_Column_6,Extra_Column_7,Extra_Column_8,Extra_Column_9,Extra_Column_10,Extra_Column_11,Extra_Column_12,Extra_Column_13,Extra_Column_14,Extra_Column_15
0,56,Male,Urban,Yes,No,No,186.400209,Moderate,Poor,Low,...,0.007901,0.794583,0.290779,0.497193,0.521995,0.799657,0.722398,0.148739,0.83401,0.061632
1,69,Male,Urban,No,No,No,185.136747,Low,Good,Low,...,0.083933,0.688951,0.830164,0.63449,0.302043,0.043683,0.451668,0.878671,0.535602,0.617825
2,46,Male,Rural,Yes,No,No,210.696611,Low,Average,Moderate,...,0.227205,0.496344,0.752107,0.181501,0.62918,0.018276,0.063227,0.146512,0.997296,0.974455
3,32,Female,Urban,No,No,No,211.165478,Moderate,Good,High,...,0.403182,0.741409,0.223968,0.329314,0.143191,0.907781,0.542322,0.922461,0.626217,0.228606
4,60,Female,Rural,No,No,No,223.814253,High,Good,High,...,0.689787,0.904574,0.757098,0.337761,0.362375,0.728552,0.176699,0.484749,0.312091,0.452809


In [43]:
df.dtypes

Age                          int64
Gender                      object
Region                      object
Smoking_History             object
Diabetes_History            object
Hypertension_History        object
Cholesterol_Level          float64
Physical_Activity           object
Diet_Quality                object
Alcohol_Consumption         object
Stress_Levels              float64
BMI                        float64
Heart_Rate                 float64
Systolic_BP                float64
Diastolic_BP               float64
Family_History              object
Heart_Attack_Occurrence     object
Extra_Column_1             float64
Extra_Column_2             float64
Extra_Column_3             float64
Extra_Column_4             float64
Extra_Column_5             float64
Extra_Column_6             float64
Extra_Column_7             float64
Extra_Column_8             float64
Extra_Column_9             float64
Extra_Column_10            float64
Extra_Column_11            float64
Extra_Column_12     

In [44]:

# Drop extra columns
extra_columns = [f'Extra_Column_{i}' for i in range(1, 16)]
df_cleaned = df.drop(columns=extra_columns, errors='ignore')

# Identify categorical and numerical columns
categorical_cols = ['Gender', 'Smoking_History', 'Diabetes_History', 'Hypertension_History',
                    'Physical_Activity', 'Diet_Quality', 'Alcohol_Consumption', 'Family_History']
numerical_cols = ['Age', 'Cholesterol_Level', 'Stress_Levels', 'BMI', 'Heart_Rate', 'Systolic_BP', 'Diastolic_BP']

# Handle missing values
df_cleaned['Alcohol_Consumption'].fillna(df_cleaned['Alcohol_Consumption'].mode()[0], inplace=True)
for col in numerical_cols:
    df_cleaned[col].fillna(df_cleaned[col].median(), inplace=True)

# One-Hot Encoding for categorical features
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_categorical = encoder.fit_transform(df_cleaned[categorical_cols])
encoded_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))

# Combine categorical & numerical data
df_final = pd.concat([df_cleaned[numerical_cols], encoded_df], axis=1)

# Encode target variable (Yes=1, No=0)
df_final['Heart_Attack_Occurrence'] = df_cleaned['Heart_Attack_Occurrence'].map({'Yes': 1, 'No': 0})

# Drop low-importance features (selected based on your analysis)
drop_cols = ['Cholesterol_Level',  'Diastolic_BP', 'Alcohol_Consumption_Low']
df_final.drop(columns=drop_cols, inplace=True)

# Split dataset
X = df_final.drop(columns=['Heart_Attack_Occurrence'])
y = df_final['Heart_Attack_Occurrence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
num_cols_to_scale = ['Age', 'Stress_Levels', 'BMI', 'Heart_Rate', 'Systolic_BP']  # Only important ones
X_train[num_cols_to_scale] = scaler.fit_transform(X_train[num_cols_to_scale])
X_test[num_cols_to_scale] = scaler.transform(X_test[num_cols_to_scale])

# Add Feature Interaction Terms (Polynomial Features)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Train Logistic Regression
log_reg = LogisticRegression(class_weight='balanced', max_iter=500, random_state=42)
log_reg.fit(X_train_poly, y_train)

# Predict and Evaluate Model
y_pred = log_reg.predict(X_test_poly)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print Results
print(f"Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_rep)


Model Accuracy: 0.5377

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.54      0.68      5392
           1       0.11      0.48      0.17       608

    accuracy                           0.54      6000
   macro avg       0.50      0.51      0.43      6000
weighted avg       0.82      0.54      0.63      6000



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Alcohol_Consumption'].fillna(df_cleaned['Alcohol_Consumption'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned[col].fillna(df_cleaned[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will neve

In [45]:
from sklearn.ensemble import RandomForestClassifier

# Add Feature Interaction Terms (Polynomial Features)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42)
rf_model.fit(X_train_poly, y_train)

# Predict and Evaluate Model
y_pred_rf = rf_model.predict(X_test_poly)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)

# Print Results
print(f"Random Forest Model Accuracy: {accuracy_rf:.4f}")
print("\nClassification Report:\n", classification_rep_rf)


Random Forest Model Accuracy: 0.8987

Classification Report:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95      5392
           1       0.00      0.00      0.00       608

    accuracy                           0.90      6000
   macro avg       0.45      0.50      0.47      6000
weighted avg       0.81      0.90      0.85      6000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [48]:
import numpy as np

# Get feature importance from the trained Random Forest model
feature_importance = rf_model.feature_importances_
feature_names = X_train.columns

# Sort features by importance
important_features = sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True)

# Print the top important features
print("Top Features Influencing Heart Attacks:")
for feature, importance in important_features:  # Show top 10
    print(f"{feature}: {importance:.4f}")


Top Features Influencing Heart Attacks:
Systolic_BP: 0.0246
Stress_Levels: 0.0244
BMI: 0.0241
Heart_Rate: 0.0240
Age: 0.0189
Alcohol_Consumption_Moderate: 0.0018
Gender_Male: 0.0017
Family_History_Yes: 0.0017
Diet_Quality_Good: 0.0017
Smoking_History_Yes: 0.0016
Hypertension_History_Yes: 0.0016
Physical_Activity_Moderate: 0.0016
Diabetes_History_Yes: 0.0015
Physical_Activity_Low: 0.0014
Diet_Quality_Poor: 0.0014


In [47]:
from xgboost import XGBClassifier

# Train XGBoost Classifier
xgb_model = XGBClassifier(scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]), random_state=42)
xgb_model.fit(X_train_poly, y_train)

# Predict and Evaluate Model
y_pred_xgb = xgb_model.predict(X_test_poly)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
classification_rep_xgb = classification_report(y_test, y_pred_xgb)

# Print Results
print(f"XGBoost Model Accuracy: {accuracy_xgb:.4f}")
print("\nClassification Report:\n", classification_rep_xgb)


XGBoost Model Accuracy: 0.7773

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.85      0.87      5392
           1       0.09      0.13      0.11       608

    accuracy                           0.78      6000
   macro avg       0.49      0.49      0.49      6000
weighted avg       0.81      0.78      0.80      6000



In [50]:


# Drop low-importance categorical features
drop_cols = ['Diet_Quality', 'Physical_Activity', 'Diabetes_History', 'Hypertension_History', 'Smoking_History']

# Drop these features from the dataset
df_final.drop(columns=[col for col in drop_cols if col in df_final.columns], inplace=True)

# Split dataset
X = df_final.drop(columns=['Heart_Attack_Occurrence'])
y = df_final['Heart_Attack_Occurrence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure numerical_cols only contains existing columns
numerical_cols = ['Age', 'Stress_Levels', 'BMI', 'Heart_Rate', 'Systolic_BP']

# Standardize numerical features
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Add Feature Interaction Terms (Polynomial Features)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42)
rf_model.fit(X_train_poly, y_train)

# Predict and Evaluate Random Forest
y_pred_rf = rf_model.predict(X_test_poly)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)

print(f"Updated Random Forest Model Accuracy: {accuracy_rf:.4f}")
print("\nUpdated Random Forest Classification Report:\n", classification_rep_rf)

# Train XGBoost Classifier
xgb_model = XGBClassifier(scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]), random_state=42)
xgb_model.fit(X_train_poly, y_train)

# Predict and Evaluate XGBoost
y_pred_xgb = xgb_model.predict(X_test_poly)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
classification_rep_xgb = classification_report(y_test, y_pred_xgb)

print(f"Updated XGBoost Model Accuracy: {accuracy_xgb:.4f}")
print("\nUpdated XGBoost Classification Report:\n", classification_rep_xgb)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Updated Random Forest Model Accuracy: 0.8987

Updated Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95      5392
           1       0.00      0.00      0.00       608

    accuracy                           0.90      6000
   macro avg       0.45      0.50      0.47      6000
weighted avg       0.81      0.90      0.85      6000

Updated XGBoost Model Accuracy: 0.7773

Updated XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.85      0.87      5392
           1       0.09      0.13      0.11       608

    accuracy                           0.78      6000
   macro avg       0.49      0.49      0.49      6000
weighted avg       0.81      0.78      0.80      6000

