In [None]:
! pip install pandas
! pip install matplotlib
! pip install seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read heart_2020_cleaned.csv (resolved relative to the working directory) into a DataFrame
df_2020 = pd.read_csv(filepath_or_buffer='heart_2020_cleaned.csv')

In [None]:
# Data Preprocessing for heart_2020_cleaned.csv
# Per-column encodings for the categorical features that are not plain Yes/No.
category_encodings = {
    'Sex': {'Female': 0, 'Male': 1},
    # Age buckets become an ordinal code, youngest (0) to oldest (12).
    'AgeCategory': {
        '18-24': 0, '25-29': 1, '30-34': 2, '35-39': 3,
        '40-44': 4, '45-49': 5, '50-54': 6, '55-59': 7,
        '60-64': 8, '65-69': 9, '70-74': 10, '75-79': 11,
        '80 or older': 12
    },
    # Diabetic is collapsed to binary: borderline counts as 0, pregnancy-only as 1.
    'Diabetic': {
        'No': 0, 'No, borderline diabetes': 0,
        'Yes': 1, 'Yes (during pregnancy)': 1
    },
    # Self-rated health becomes an ordinal code, worst (0) to best (4).
    'GenHealth': {
        'Poor': 0, 'Fair': 1, 'Good': 2,
        'Very good': 3, 'Excellent': 4
    },
}

# errors='ignore' keeps this cell re-runnable: on a second execution 'Race' is
# already gone and a plain drop() would raise KeyError.
df_2020 = df_2020.drop(columns='Race', errors='ignore')

for column, encoding in category_encodings.items():
    df_2020[column] = df_2020[column].replace(encoding)

# Any remaining 'Yes'/'No' value, in whichever column it appears, becomes 1/0.
df_2020 = df_2020.replace({'No': 0, 'Yes': 1})

# replace() is not guaranteed to downcast object columns to numeric dtypes on
# every pandas version; infer_objects() makes the numeric dtypes explicit.
df_2020 = df_2020.infer_objects()

In [None]:
# Separate the HeartDisease label from the predictor columns
y_2020 = df_2020['HeartDisease']
X_2020 = df_2020.drop(columns=['HeartDisease'])

In [None]:
# Define models
# Estimators with internal randomness get a fixed seed so the model comparison
# gives the same numbers after a kernel restart. 42 matches the seed already
# used for the train/test split in this notebook.
model_seed = 42
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "NB": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=model_seed),
    "Random Forest": RandomForestClassifier(random_state=model_seed),
    "GradientBoosting": GradientBoostingClassifier(random_state=model_seed),
    "AdaBoost": AdaBoostClassifier(random_state=model_seed)
}

In [None]:
# Function to evaluate models
def evaluate_models(X, y, models, X_test=None, y_test=None):
    """Fit each model, print its hold-out metrics, and report the most accurate one.

    The data passed in is the data that is used: the function no longer reads
    train/test variables from the notebook's global namespace.

    Parameters
    ----------
    X, y : features and labels the models are trained on.
    models : dict mapping a display name to an estimator exposing ``fit`` and
        ``predict``. The estimators are fitted in place.
    X_test, y_test : optional evaluation set; must be given together. When both
        are omitted, 10% of ``(X, y)`` is held out (fixed seed) for evaluation
        and the models are fitted on the remaining 90%.

    Returns
    -------
    tuple ``(results, best_model_name, best_accuracy)`` where ``results`` maps
    each model name to its accuracy. With an empty ``models`` dict the result
    is ``({}, None, 0.0)``.

    Raises
    ------
    ValueError
        If only one of ``X_test`` / ``y_test`` is supplied.
    """
    if (X_test is None) != (y_test is None):
        raise ValueError("X_test and y_test must be provided together")

    if X_test is None:
        # No explicit evaluation set: carve a validation split out of the data
        # that was passed in, so scoring never touches rows used for fitting.
        X_fit, X_eval, y_fit, y_eval = train_test_split(
            X, y, test_size=0.1, random_state=42
        )
    else:
        X_fit, y_fit, X_eval, y_eval = X, y, X_test, y_test

    results = {}
    best_model_name = None
    best_accuracy = 0.0

    for name, model in models.items():
        model.fit(X_fit, y_fit)
        y_pred = model.predict(X_eval)
        accuracy = accuracy_score(y_eval, y_pred)
        results[name] = accuracy
        print(f"Model: {name}")
        print(f"Accuracy: {accuracy}")
        print(f"Confusion Matrix:\n{confusion_matrix(y_eval, y_pred)}")
        print(f"Classification Report:\n{classification_report(y_eval, y_pred)}\n")
        # The `is None` guard still names a winner when every accuracy is 0.0.
        if best_model_name is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model_name = name

    return results, best_model_name, best_accuracy

# Function for hyperparameter tuning
# def hyperparameter_tuning(X, y, model, param_grid):
#     grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
#     grid_search.fit(X_train, y_train)
#     best_params = grid_search.best_params_
#     best_model = grid_search.best_estimator_
#     y_pred = best_model.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)
#     return best_params, accuracy

# Hold out 10% of the 2020 rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_2020,
    y_2020,
    test_size=0.1,
    random_state=42,
)



In [None]:
# Score every candidate model on the 2020 data and report the most accurate one
(
    results_2020,
    best_model_name_2020,
    best_accuracy_2020,
) = evaluate_models(X_train, y_train, models)
print(f"Best Model for 2020 dataset: {best_model_name_2020} with accuracy: {best_accuracy_2020}")



In [None]:
def preprocess_input_data(input_data):
    """Encode one raw record into the single-row feature frame the model expects.

    Parameters
    ----------
    input_data : mapping of feature name to value. Categorical fields may be
        given as labels ('Male', '30-34', 'Yes', ...) or as already-encoded
        numbers; numbers pass through unchanged. Keys that are not model
        features are ignored.

    Returns
    -------
    pandas.DataFrame with one row, numeric dtypes, and the columns in the
    order listed in ``ordered_columns``.

    Raises
    ------
    KeyError
        If a required feature is missing from ``input_data``.
    ValueError
        If a value is still a non-numeric string after encoding (an
        unrecognised category label).
    """
    binary_codes = {'No': 0, 'Yes': 1}
    column_codes = {
        'Sex': {'Female': 0, 'Male': 1},
        'AgeCategory': {
            '18-24': 0, '25-29': 1, '30-34': 2, '35-39': 3,
            '40-44': 4, '45-49': 5, '50-54': 6, '55-59': 7,
            '60-64': 8, '65-69': 9, '70-74': 10, '75-79': 11,
            # '80 or older' is the label used when encoding the training data;
            # '80+' is kept so existing callers continue to work.
            '80 or older': 12, '80+': 12
        },
        'Diabetic': {
            'No': 0, 'No, borderline diabetes': 0,
            'Yes': 1, 'Yes (during pregnancy)': 1
        },
        'GenHealth': {
            'Poor': 0, 'Fair': 1, 'Good': 2,
            'Very good': 3, 'Excellent': 4
        },
    }

    # Ensure the columns are in the same order as the training data
    ordered_columns = [
        'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke', 'PhysicalHealth', 'MentalHealth',
        'DiffWalking', 'Sex', 'AgeCategory', 'Diabetic', 'PhysicalActivity',
        'GenHealth', 'SleepTime', 'Asthma', 'KidneyDisease', 'SkinCancer'
    ]

    missing = [column for column in ordered_columns if column not in input_data]
    if missing:
        raise KeyError(f"input_data is missing required fields: {missing}")

    # Encode from the caller's own values only. In particular DiffWalking is
    # read from input_data and never from the training frame, so the
    # prediction reflects what the caller actually supplied.
    encoded = {}
    for column in ordered_columns:
        value = input_data[column]
        value = column_codes.get(column, {}).get(value, value)
        encoded[column] = binary_codes.get(value, value)

    input_df = pd.DataFrame([encoded], columns=ordered_columns)

    # Force numeric dtypes; an unrecognised label fails here with a clear
    # ValueError instead of reaching the model as a string.
    return input_df.apply(pd.to_numeric)


In [None]:
# Train the seeded gradient-boosting classifier used for the risk predictions.
# NOTE(review): the estimator is chosen by hand here, not taken from best_model_name_2020.
best_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

def predict_heart_disease_risk(input_data):
    """Return the model's heart-disease probability for one record, as a percentage.

    ``input_data`` is encoded with ``preprocess_input_data``; the second column
    of ``best_model.predict_proba`` for that single row is scaled to 0-100 and
    rounded to two decimal places.
    """
    features = preprocess_input_data(input_data)
    heart_disease_probability = best_model.predict_proba(features)[0, 1]
    return round(heart_disease_probability * 100, 2)


# Rank features by model importance
def get_feature_importances(model, X):
    """Return a two-column table of feature names and importances, largest first.

    ``model`` must expose ``feature_importances_`` with one entry per column of
    ``X``. The returned frame has columns 'Feature' and 'Importance' and a
    fresh 0..n-1 index.
    """
    ranking = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_,
    })
    return ranking.sort_values(by='Importance', ascending=False).reset_index(drop=True)

# Rank the fitted model's features by importance and print the table
feature_importances = get_feature_importances(model=best_model, X=X_train)
print(feature_importances)


In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df_2020
fig, ax = plt.subplots(figsize=(12, 10))
correlation_matrix = df_2020.corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Row counts per PhysicalActivity value, split by HeartDisease
fig, ax = plt.subplots()
sns.countplot(data=df_2020, x='PhysicalActivity', hue='HeartDisease', ax=ax)
ax.set_title('Physical Activity vs Heart Disease')
plt.show()


Physical Activity vs Heart Disease:

The count plot shows that, among individuals who engage in physical activity, those without heart disease outnumber those with heart disease.
In other words, the proportion of heart disease cases among physically active individuals is low.

In [None]:
# Row counts per AlcoholDrinking value, split by HeartDisease
fig, ax = plt.subplots()
sns.countplot(data=df_2020, x='AlcoholDrinking', hue='HeartDisease', ax=ax)
ax.set_title('AlcoholDrinking vs Heart Disease')
plt.show()


In [None]:
# One box plot per outcome column: its distribution by AlcoholDrinking, split by HeartDisease
for y_column in ('AgeCategory', 'GenHealth'):
    sns.catplot(data=df_2020, x='AlcoholDrinking', y=y_column, hue='HeartDisease', kind='box')
    plt.title(f'Alcohol Drinking and {y_column} vs Heart Disease')
    plt.show()



Alcohol Drinking and AgeCategory vs Heart Disease:

- Non-drinkers: the median age category of non-drinkers with heart disease is higher than that of non-drinkers without heart disease.
- Drinkers: the median age category of drinkers with heart disease is also higher, but the difference between drinkers with and without heart disease is less pronounced than it is among non-drinkers.
- In both non-drinkers and drinkers, age categories are more widely spread among those without heart disease.

Alcohol Drinking and GenHealth vs Heart Disease:

- For both drinkers and non-drinkers, individuals with heart disease tend to have lower general health ratings.
- The median general health rating is similar between drinkers and non-drinkers.
- General health ratings span a broader range among individuals without heart disease than among those with heart disease.

Insights:
Age and Heart Disease:
Age appears to be a significant factor in heart disease risk, regardless of alcohol consumption.
Non-drinkers with heart disease tend to be in higher age categories compared to non-drinkers without heart disease, suggesting that age is a critical factor.

General Health and Heart Disease:
General health ratings are lower for individuals with heart disease, indicating poor general health is a risk factor for heart disease.
Alcohol consumption does not seem to drastically change the general health ratings between those with and without heart disease.

Conclusion:
These observations indicate that while alcohol consumption itself might not be a strong direct predictor of heart disease in this dataset, factors like age and general health are significant predictors. Therefore, the low feature importance of "AlcoholDrinking" in the model might be due to these stronger predictors overshadowing its impact.

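Based on the articles we reviewed, physical activity decreases the risk of heart disease by 15%. (TODO: cite the source articles.) The prediction function below applies this as a 15% reduction (a factor of 0.85) to the predicted risk of physically active individuals.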

In [None]:
def predict_heart_disease_risk(input_data, activity_risk_multiplier=0.85):
    """Return the predicted heart-disease risk (percent) with a physical-activity adjustment.

    This redefines the unadjusted ``predict_heart_disease_risk`` declared
    earlier in the notebook; once this cell has run, this version is the one
    that gets called.

    Parameters
    ----------
    input_data : mapping of feature name to raw or already-encoded value, as
        accepted by ``preprocess_input_data``.
    activity_risk_multiplier : factor applied to the model probability when
        the person is physically active. The default 0.85 is the 15% reduction
        the notebook's narrative takes from the literature.

    Returns
    -------
    Risk as a percentage, rounded to two decimal places.
    """
    # Preprocess input data
    input_df = preprocess_input_data(input_data)

    # Predict risk using the model
    risk_score = best_model.predict_proba(input_df)[:, 1][0]  # Probability of heart disease

    # Read the flag from the encoded frame rather than the raw dict, so a
    # caller passing 'Yes' is treated the same as one passing 1.
    # NOTE(review): PhysicalActivity is already one of the model's input
    # features, so this multiplier counts its effect a second time — confirm
    # that this is intended.
    if input_df['PhysicalActivity'].iloc[0] == 1:
        risk_score *= activity_risk_multiplier

    return round(risk_score * 100, 2)

In [None]:
# Example: healthy profile (age 18-24, physically active, no reported conditions)
input_data = {
    'BMI': 25.0, 'Smoking': 0, 'AlcoholDrinking': 0, 'Stroke': 0,
    'PhysicalHealth': 3, 'MentalHealth': 3, 'DiffWalking': 0,
    'Sex': 'Male', 'AgeCategory': '18-24', 'Diabetic': 'No',
    'PhysicalActivity': 1, 'GenHealth': 'Very good', 'SleepTime': 9,
    'Asthma': 0, 'KidneyDisease': 0, 'SkinCancer': 0,
}

risk_percentage = predict_heart_disease_risk(input_data)
print(f"Predicted risk of heart disease: {risk_percentage:.2f}%")

In [None]:
# Example: intermediate profile (age 30-34, inactive smoker with a prior stroke and asthma)
input_data = {
    'BMI': 35.0, 'Smoking': 1, 'AlcoholDrinking': 1, 'Stroke': 1,
    'PhysicalHealth': 13, 'MentalHealth': 3, 'DiffWalking': 0,
    'Sex': 'Male', 'AgeCategory': '30-34', 'Diabetic': 'No',
    'PhysicalActivity': 0, 'GenHealth': 'Fair', 'SleepTime': 6,
    'Asthma': 1, 'KidneyDisease': 0, 'SkinCancer': 0,
}

risk_percentage = predict_heart_disease_risk(input_data)
print(f"Predicted risk of heart disease: {risk_percentage:.2f}%")


In [None]:
# Example: person in poor health (oldest age bucket, several reported conditions)
input_data = {
    'BMI': 45.06, 'Smoking': 1, 'AlcoholDrinking': 1, 'Stroke': 1,
    'PhysicalHealth': 0, 'MentalHealth': 15, 'DiffWalking': 1,
    'Sex': 'Male',
    'AgeCategory': 12,  # already-encoded ordinal code for the oldest bucket, not a label
    'Diabetic': 'Yes',
    'PhysicalActivity': 0, 'GenHealth': 'Poor', 'SleepTime': 7,
    'Asthma': 1, 'KidneyDisease': 1, 'SkinCancer': 1,
}

risk_percentage = predict_heart_disease_risk(input_data)
print(f"Predicted risk of heart disease: {risk_percentage:.2f}%")