# Heart Disease Risk Prediction: Logistic Regression Homework

## Introductory Context

Heart disease is the world's leading cause of death, claiming approximately 18 million lives each year, as reported by the World Health Organization. Predictive models like logistic regression can enable early identification of at-risk patients by analyzing clinical features such as age, cholesterol, and blood pressure. This not only improves treatment outcomes but also optimizes resource allocation in healthcare settings.

In [None]:
# Install required libraries if not already installed.
# Run this cell before the import cell: the imports that follow include sklearn,
# which raises ModuleNotFoundError when scikit-learn is not installed.
import sys
# `!` hands the line to the system shell; IPython expands {sys.executable}, so pip
# runs under the same Python interpreter that is executing this notebook.
!{sys.executable} -m pip install scikit-learn --quiet
# Printed unconditionally; it does not verify that the install succeeded.
print("Libraries installation check completed")

In [4]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Notebook-wide plotting defaults, applied once after the imports.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require
# matplotlib >= 3.6 — confirm against the installed version.
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's "husl" palette the default colour cycle.
sns.set_palette("husl")
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

print("Libraries imported successfully")

ModuleNotFoundError: No module named 'sklearn'

## Step 1: Load and Prepare the Dataset

In [None]:
# Read the heart-disease CSV; the relative path is resolved against the
# notebook's working directory.
dataset_path = 'Heart_Disease_Prediction.csv'
df = pd.read_csv(dataset_path)

# Report (rows, columns) and preview the first records.
n_rows_cols = df.shape
print("Dataset dimensions:", n_rows_cols)
print("\nFirst few rows:")
df.head()

In [None]:
# General dataset information
print("Dataset information:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\n" + "="*50)
print("\nDescriptive statistics:")
# Last expression of the cell: the notebook renders the summary table.
df.describe()

In [None]:
# Data-quality checks: null count per column and number of fully duplicated rows.
missing_per_column = df.isnull().sum()
duplicate_rows = df.duplicated().sum()

print("Missing values per column:")
print(missing_per_column)
print("\n" + "=" * 50)
print("\nCheck for duplicates:", duplicate_rows)

In [None]:
# Binarize the target column
# Map "Presence" -> 1 and "Absence" -> 0
target_col = 'Heart Disease'
label_to_int = {'Presence': 1, 'Absence': 0}

# Encode only when the column is not already 0/1. Series.map() converts every
# value that is absent from the mapping into NaN, so re-running this cell on an
# already-encoded column would otherwise wipe out the whole target.
if not df[target_col].isin(list(label_to_int.values())).all():
    encoded = df[target_col].map(label_to_int)
    # Fail loudly on labels outside the mapping instead of silently keeping NaN.
    unknown_labels = df.loc[encoded.isna(), target_col].unique()
    if len(unknown_labels) > 0:
        raise ValueError(
            f"Unexpected values in '{target_col}': {list(unknown_labels)}"
        )
    df[target_col] = encoded

# Check class distribution (computed once and reused for printing and plotting)
class_counts = df[target_col].value_counts()
print("Target variable distribution:")
print(class_counts)
print("\nProportion:")
print(df[target_col].value_counts(normalize=True))

# Visualize class distribution.
# Sort by class label so the bars always appear as 0 then 1 (matching the x-axis
# label) and each class keeps the same colour regardless of which is more frequent.
plot_counts = class_counts.sort_index()
plt.figure(figsize=(8, 5))
plot_counts.plot(kind='bar', color=['skyblue', 'salmon'])
plt.title('Class Distribution - Heart Disease', fontsize=14, fontweight='bold')
plt.xlabel('Heart Disease (0=Absence, 1=Presence)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.xticks(rotation=0)
plt.grid(axis='y', alpha=0.3)
# Annotate each bar with its count, slightly above the bar top.
for pos, count in enumerate(plot_counts.values):
    plt.text(pos, count + 2, str(count), ha='center', fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Exploratory Data Analysis (EDA)
# One boxplot per selected numerical feature, split by the target class.
features_to_plot = [
    'Age',
    'BP',
    'Cholesterol',
    'Max HR',
    'ST depression',
    'Number of vessels fluro',
]

# 2 x 3 grid, flattened so each feature pairs with exactly one axis.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.ravel()

for ax, feature in zip(axes, features_to_plot):
    df.boxplot(column=feature, by='Heart Disease', ax=ax)
    ax.set_title(f'{feature} by Heart Disease', fontweight='bold')
    ax.set_xlabel('Heart Disease')
    ax.set_ylabel(feature)

# Set the figure-level title explicitly after the per-feature plots are drawn.
plt.suptitle('Feature Distributions by Class', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numerical features and the target.
numeric_features = ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression', 'Number of vessels fluro']
corr_columns = numeric_features + ['Heart Disease']
corr_matrix = df[corr_columns].corr()

# Annotated heatmap with the colour scale centred on zero correlation.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Select features (≥6 features as requested)
selected_features = [
    'Age',
    'Cholesterol',
    'BP',
    'Max HR',
    'ST depression',
    'Number of vessels fluro',
]

# Extract the feature matrix and the target vector as NumPy arrays.
X = df[selected_features].to_numpy()
y = df['Heart Disease'].to_numpy()

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")
print(f"\nSelected features: {selected_features}")