# Heart Disease Prediction

## Objective
Build a model to predict whether a person is at risk of heart disease based on their health data.

## Dataset
Heart Disease UCI Dataset (Cleveland)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report

# Configure seaborn's global plotting theme with the "whitegrid" style.
# This is done once here, right after the imports, before any figure is drawn.
sns.set(style="whitegrid")

## 1. Data Loading

In [None]:
# Column names are supplied explicitly because the raw file might not have a
# header row; passing them via `names=` labels the columns in this order.
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]

# Load the dataset; '?' entries are parsed as NaN.
# Only the read itself sits inside the try block. A missing file is re-raised
# (with the same hint as before) instead of being printed and ignored: the
# rest of the notebook needs `df`, so continuing without it would only surface
# later as a confusing NameError.
try:
    df = pd.read_csv('heart.csv', names=columns, na_values='?')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: heart.csv not found. Please ensure the dataset is in the same directory."
    ) from err

print("Dataset loaded successfully.")
display(df.head())

## 2. Data Cleaning and Preprocessing

In [None]:
# Report how many values are missing in each column before cleaning.
print("Missing values:\n", df.isnull().sum())

# Drop rows with missing values (small number in this dataset).
df.dropna(inplace=True)

# Convert target to binary (0 = no disease, 1-4 = disease).
# A vectorised comparison replaces the row-by-row apply/lambda; the result is
# the same 0/1 integer column.
df['target'] = (df['target'] > 0).astype(int)

print("\nData Info after cleaning:")
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Class balance: number of rows for each value of the target column.
plt.figure(figsize=(6, 4))
target_ax = sns.countplot(data=df, x='target')
target_ax.set_title('Distribution of Target Variable')
plt.show()

# Pairwise correlations between all columns, annotated to two decimals.
plt.figure(figsize=(12, 10))
correlations = df.corr()
corr_ax = sns.heatmap(correlations, fmt='.2f', cmap='coolwarm', annot=True)
corr_ax.set_title('Correlation Matrix')
plt.show()

## 4. Modeling

In [None]:
# Separate the label column from the predictor columns.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Logistic Regression: fit() returns the estimator, so build and train in one step.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
# Keep only the second predict_proba column (the score for the second class).
y_prob_lr = lr_model.predict_proba(X_test)[:, 1]

# Decision Tree, seeded so repeated runs build the same tree.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
y_prob_dt = dt_model.predict_proba(X_test)[:, 1]

## 5. Evaluation

In [None]:
def evaluate_model(name, y_test, y_pred, y_prob):
    """Print classification metrics for one model and plot its ROC curve.

    The accuracy, confusion matrix and classification report are printed to
    standard output. The ROC curve is drawn with ``plt.plot`` and labelled
    with the model name and its AUC; this function neither creates nor shows
    a figure, so the caller handles that.

    Args:
        name: Label used in the printed header and in the ROC legend entry.
        y_test: True labels.
        y_pred: Predicted labels, compared against ``y_test``.
        y_prob: Scores passed to ``roc_curve`` and ``roc_auc_score``.
    """
    print(f"--- {name} ---")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    print("\nConfusion Matrix:")
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    print("\nClassification Report:")
    report = classification_report(y_test, y_pred)
    print(report)

    # ROC curve: thresholds are returned by roc_curve but not needed here.
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, y_prob)
    area_under_curve = roc_auc_score(y_test, y_prob)
    curve_label = f'{name} (AUC = {area_under_curve:.2f})'
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)

# One shared figure: each evaluate_model call prints its metrics and adds a
# ROC curve to it.
plt.figure(figsize=(8, 6))
for model_name, predictions, probabilities in (
    ("Logistic Regression", y_pred_lr, y_prob_lr),
    ("Decision Tree", y_pred_dt, y_prob_dt),
):
    evaluate_model(model_name, y_test, predictions, probabilities)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

## 6. Feature Importance

In [None]:
# Feature importance from Logistic Regression (coefficients).
# The model was fit on the raw, unscaled columns, so each coefficient is
# expressed per unit of its own feature and the raw magnitudes are not
# comparable across features measured on different scales. Multiplying each
# coefficient by that feature's training-set standard deviation puts it on a
# per-standard-deviation basis, so the ranking reflects effect size rather
# than measurement units.
feature_scale = X_train.std().to_numpy()
standardized_coef = lr_model.coef_[0] * feature_scale

feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': np.abs(standardized_coef),
})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance (Logistic Regression)')
plt.show()