# Heart Attack Risk Prediction Model

This notebook trains a machine learning model on a synthetic dataset to predict heart attack risk from 8 key health parameters:
- Age
- Sex (0: Female, 1: Male)
- Heart Rate
- Diabetes (0: No, 1: Yes)
- Smoking (0: No, 1: Yes)
- Alcohol Consumption (0: No, 1: Yes)
- Previous Heart Problems (0: No, 1: Yes)
- BMI (Body Mass Index)

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Build a reproducible synthetic dataset for heart attack prediction.
# NOTE: the order of the np.random draws below determines the generated values.
np.random.seed(42)
n_samples = 2000

# Age and heart rate: draw from a normal distribution, truncate to whole numbers,
# then clip into a plausible range
age = np.clip(np.random.normal(55, 15, n_samples).astype(int), 20, 90)
sex = np.random.binomial(1, 0.6, n_samples)  # 1 = male, drawn with probability 0.6
heart_rate = np.clip(np.random.normal(75, 12, n_samples).astype(int), 50, 120)

# Binary risk flags, each drawn with its own probability of being 1
diabetes = np.random.binomial(1, 0.15, n_samples)
smoking = np.random.binomial(1, 0.25, n_samples)
alcohol = np.random.binomial(1, 0.4, n_samples)
previous_heart_problems = np.random.binomial(1, 0.1, n_samples)

# BMI stays continuous; only its range is clipped
bmi = np.clip(np.random.normal(26, 4, n_samples), 15, 45)

# Weighted contribution of every feature to the latent risk score,
# followed by a Gaussian noise term
risk_terms = [
    (age - 20) * 0.02,
    sex * 0.3,
    (heart_rate - 60) * 0.01,
    diabetes * 0.4,
    smoking * 0.5,
    alcohol * 0.2,
    previous_heart_problems * 0.8,
    (bmi - 18.5) * 0.03,
    np.random.normal(0, 0.1, n_samples),
]
# The built-in sum adds the terms left to right, exactly like a chained "+" expression
risk_score = sum(risk_terms)

# Rows strictly above the 70th percentile of the score are labelled high risk (1),
# all others low risk (0)
risk_threshold = np.percentile(risk_score, 70)
heart_attack_risk = (risk_score > risk_threshold).astype(int)

# Assemble the features and the target into a single DataFrame
dataset_columns = {
    'Age': age,
    'Sex': sex,
    'Heart_Rate': heart_rate,
    'Diabetes': diabetes,
    'Smoking': smoking,
    'Alcohol_Consumption': alcohol,
    'Previous_Heart_Problems': previous_heart_problems,
    'BMI': bmi,
    'Heart_Attack_Risk': heart_attack_risk,
}
data = pd.DataFrame(dataset_columns)

print("Dataset created successfully!")
print(f"Dataset shape: {data.shape}")
print("\nFirst 5 rows:")
print(data.head())

In [None]:
# Exploratory Data Analysis: structure, summary statistics and class balance
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
data.info()
print("\nDataset Description:")
print(data.describe())
print("\nTarget Distribution:")
print(data['Heart_Attack_Risk'].value_counts())
# The mean of the 0/1 target is the share of rows labelled high risk
print(f"\nHeart Attack Risk Rate: {data['Heart_Attack_Risk'].mean():.2%}")

In [None]:
# Visualizations: five panels laid out on a 2x3 grid
plt.figure(figsize=(15, 10))

# Panels 1-3: distribution of each numeric feature, split by risk class
boxplot_panels = [
    ('Age', 'Age Distribution by Heart Attack Risk'),
    ('BMI', 'BMI Distribution by Heart Attack Risk'),
    ('Heart_Rate', 'Heart Rate Distribution by Heart Attack Risk'),
]
for panel_number, (column, panel_title) in enumerate(boxplot_panels, start=1):
    ax = plt.subplot(2, 3, panel_number)
    sns.boxplot(data=data, x='Heart_Attack_Risk', y=column, ax=ax)
    ax.set_title(panel_title)

# Panel 4: pairwise correlations between all columns, target included
ax = plt.subplot(2, 3, 4)
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')

# Panel 5: share of high-risk rows among the rows where each binary factor equals 1
ax = plt.subplot(2, 3, 5)
risk_factors = ['Sex', 'Diabetes', 'Smoking', 'Alcohol_Consumption', 'Previous_Heart_Problems']
risk_rates = [
    data.loc[data[factor] == 1, 'Heart_Attack_Risk'].mean()
    for factor in risk_factors
]
bar_positions = list(range(len(risk_factors)))
ax.bar(bar_positions, risk_rates)
ax.set_xticks(bar_positions)
ax.set_xticklabels(risk_factors, rotation=45)
ax.set_ylabel('Heart Attack Risk Rate')
ax.set_title('Risk Rate by Factor')

plt.tight_layout()
plt.show()

In [None]:
# Separate the predictors (X) from the label (y)
target_column = 'Heart_Attack_Risk'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; stratify=y asks the splitter to keep
# the class proportions of y in both parts, and the fixed seed makes it repeatable
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the split sizes and the share of positive labels in each part
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")
print(f"Training set heart attack rate: {y_train.mean():.2%}")
print(f"Testing set heart attack rate: {y_test.mean():.2%}")

In [None]:
# Standardize the features for Logistic Regression; the scaler is fitted on the
# training split only and then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42)
}

# Feature matrices per model: Logistic Regression gets the scaled features,
# every other model gets the raw ones
raw_inputs = (X_train, X_test)
inputs_by_model = {'Logistic Regression': (X_train_scaled, X_test_scaled)}

results = {}

for name, model in models.items():
    train_features, test_features = inputs_by_model.get(name, raw_inputs)

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)
    y_pred_proba = model.predict_proba(test_features)[:, 1]  # column 1 of predict_proba

    accuracy = accuracy_score(y_test, y_pred)
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }

    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix as an annotated heatmap
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'{name} - Confusion Matrix')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()

In [None]:
# Compare every trained model on the held-out test set so the choice below is
# visible instead of assumed
ranked_results = sorted(results.items(), key=lambda item: item[1]['accuracy'], reverse=True)
print("Test accuracy by model:")
for model_name, model_result in ranked_results:
    print(f"- {model_name}: {model_result['accuracy']:.4f}")

# Random Forest stays the selected model: the feature-importance table below needs
# feature_importances_, and the prediction helper later in the notebook feeds
# best_model unscaled inputs, which matches how the Random Forest was fitted.
selected_model_name = 'Random Forest'
best_model = results[selected_model_name]['model']
print(f"\nSelected model: {selected_model_name}")
print(f"Selected model accuracy: {results[selected_model_name]['accuracy']:.4f}")

# Be explicit when the selected model is not the top scorer
top_model_name = ranked_results[0][0]
if top_model_name != selected_model_name:
    print(f"Note: {top_model_name} scored higher on the test set "
          f"({ranked_results[0][1]['accuracy']:.4f}).")

# Feature importance, highest first
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

# Plot feature importance
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance - Random Forest Model')
plt.xlabel('Importance')
plt.show()

In [None]:
# Persist the selected model and the fitted scaler with joblib.
# NOTE: in this notebook the Random Forest was fitted on unscaled features; the
# scaler was only applied to the Logistic Regression inputs.
artifacts = {
    'heart_attack_model.pkl': best_model,
    'scaler.pkl': scaler,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("Model and scaler saved successfully!")
print("Files created:")
for artifact_path in artifacts:
    print(f"- {artifact_path}")

In [None]:
# Test model with sample predictions
def predict_heart_attack_risk(age, sex, heart_rate, diabetes, smoking, alcohol, previous_heart, bmi):
    """
    Predict heart attack risk for a single person with the global ``best_model``.

    Parameters
    ----------
    age, heart_rate, bmi : numeric
        Age, heart rate and body mass index.
    sex : int
        0 for female, 1 for male.
    diabetes, smoking, alcohol, previous_heart : int
        Binary flags (0: No, 1: Yes).

    Returns
    -------
    tuple
        ``(prediction, probability)``: the label returned by
        ``best_model.predict`` and the value in column 1 of
        ``best_model.predict_proba`` for this input.
    """
    # One row, with the values in the same order as the training columns
    feature_values = [[age, sex, heart_rate, diabetes, smoking, alcohol, previous_heart, bmi]]

    # A model fitted on a DataFrame records its column names in feature_names_in_.
    # Passing a DataFrame with those names keeps scikit-learn's feature-name check
    # meaningful instead of sending an anonymous array.
    feature_names = getattr(best_model, 'feature_names_in_', None)
    if feature_names is not None:
        input_data = pd.DataFrame(feature_values, columns=list(feature_names))
    else:
        # The model was fitted without column names: fall back to a plain array
        input_data = np.array(feature_values)

    # Make prediction
    prediction = best_model.predict(input_data)[0]
    probability = best_model.predict_proba(input_data)[0][1]

    return prediction, probability

# Three hand-written profiles, one dict of keyword arguments per profile
case_fields = ("age", "sex", "heart_rate", "diabetes", "smoking", "alcohol", "previous_heart", "bmi")
case_values = [
    (45, 1, 70, 0, 0, 0, 0, 22.5),
    (65, 1, 85, 1, 1, 1, 1, 32.0),
    (35, 0, 75, 0, 0, 0, 0, 21.0),
]
test_cases = [dict(zip(case_fields, values)) for values in case_values]

print("Sample Predictions:")
for case_number, case in enumerate(test_cases, start=1):
    # The dict keys match the function's parameter names, so the dict unpacks directly
    pred, prob = predict_heart_attack_risk(**case)
    if pred == 1:
        risk_level = "High"
    else:
        risk_level = "Low"
    print(f"\nTest Case {case_number}:")
    print(f"Input: {case}")
    print(f"Prediction: {risk_level} Risk")
    print(f"Probability: {prob:.2%}")