In [None]:
# Imports: data handling (pandas, numpy), plotting (matplotlib, seaborn), warning control
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress all warnings for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation/future warnings from pandas.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn 'whitegrid' style and a (10, 6) default figure size.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

print("Starting EDA...")

In [None]:
# Load the dataset. Column names are supplied explicitly, so the first row of
# the file is read as data; '?' entries are parsed as NaN.
column_names = (
    'age sex cp trestbps chol fbs restecg '
    'thalach exang oldpeak slope ca thal target'
).split()

df = pd.read_csv(
    '../data/raw/processed.cleveland.data',
    names=column_names,
    na_values='?',
)

# Binarize the target: values above 0 become 1, everything else becomes 0.
df['target'] = df['target'].gt(0).astype('int64')

print(f"Data loaded: {df.shape}")
df.head()

In [None]:
# Descriptive statistics of the frame; the last expression is rendered as the
# cell's rich output.
print("Statistical Summary:")
summary_stats = df.describe()
summary_stats

In [None]:
# Per-column dtypes, followed by the column names split into numeric vs. object.
numeric_columns = df.select_dtypes(include='number').columns.tolist()
object_columns = df.select_dtypes(include='object').columns.tolist()

print("Data Types:")
print(df.dtypes)
print(f"\nNumeric columns: {numeric_columns}")
print(f"Object columns: {object_columns}")

In [None]:
# NaN count per column; only columns with at least one missing entry are shown.
print("Missing Values:")
missing = df.isnull().sum()
print(missing.loc[missing > 0])

# The same counts expressed as a percentage of the number of rows.
missing_pct = missing / len(df) * 100
print("\nMissing Percentage:")
print(missing_pct.loc[missing_pct > 0])

In [None]:
# Class balance of the binarized target (0 is reported as healthy, 1 as disease).
target_col = df['target']

print("Target Distribution:")
print(target_col.value_counts())
print(f"\nDisease rate: {target_col.mean()*100:.2f}%")
print(f"Healthy: {target_col.eq(0).sum()}")
print(f"Disease: {target_col.eq(1).sum()}")

In [None]:
# Age: full describe() table, then range, mean and median on separate lines.
age_col = df['age']

print("Age Statistics:")
print(age_col.describe())

print(f"\nAge range: {age_col.min()} to {age_col.max()} years")
print(f"Average age: {age_col.mean():.1f} years")
print(f"Median age: {age_col.median():.1f} years")

In [None]:
# Sex: raw value counts, then count and share for each code.
sex_col = df['sex']
is_male = sex_col.eq(1)
is_female = sex_col.eq(0)

print("Gender Distribution:")
print(sex_col.value_counts())
print("\n(1 = Male, 0 = Female)")

print(f"\nMales: {is_male.sum()} ({is_male.mean()*100:.1f}%)")
print(f"Females: {is_female.sum()} ({is_female.mean()*100:.1f}%)")

In [None]:
# Chest pain: counts per cp code in ascending code order, then a legend.
print("Chest Pain Types:")
cp_counts = df['cp'].value_counts().sort_index()
print(cp_counts)

cp_legend = (
    "\n1: typical angina",
    "2: atypical angina",
    "3: non-anginal pain",
    "4: asymptomatic",
)
# One legend entry per line, exactly as separate print() calls would emit them.
print(*cp_legend, sep="\n")

In [None]:
# Resting blood pressure and cholesterol: describe() tables, then the number of
# rows above the thresholds printed in the labels (140 and 200).
trestbps_col = df['trestbps']
chol_col = df['chol']

print("Resting Blood Pressure (trestbps):")
print(trestbps_col.describe())

print("\nCholesterol (chol):")
print(chol_col.describe())

print(f"\nHigh BP (>140): {trestbps_col.gt(140).sum()} patients")
print(f"High Cholesterol (>200): {chol_col.gt(200).sum()} patients")

In [None]:
# Survival by sex
print("Disease Rate by Gender:")
gender_disease = df.groupby('sex')['target'].agg(['sum', 'count', 'mean'])
gender_disease.columns = ['Disease_Count', 'Total', 'Disease_Rate']
print(gender_disease)

print("\nMales have higher disease rate!" if df[df['sex']==1]['target'].mean() > df[df['sex']==0]['target'].mean() else "\nFemales have higher disease rate!")

In [None]:
# Age groups. right=False makes every bin left-closed, i.e.
# [0, 40), [40, 50), [50, 60), [60, 100), so the bin edges agree with the
# labels: age 40 lands in '40-50' (not '<40') and age 60 lands in '60+'.
df['age_group'] = pd.cut(df['age'], bins=[0, 40, 50, 60, 100],
                         labels=['<40', '40-50', '50-60', '60+'],
                         right=False)

print("Disease Rate by Age Group:")
# observed=False keeps all four bands in the table even if one is empty, and
# makes the result independent of the pandas version's default for
# categorical group keys.
age_disease = df.groupby('age_group', observed=False)['target'].agg(['sum', 'count', 'mean'])
age_disease.columns = ['Disease_Count', 'Total', 'Disease_Rate']
print(age_disease)

In [None]:
# Disease count, group size and disease rate for each chest pain code.
print("Disease Rate by Chest Pain Type:")
cp_disease = (
    df.groupby('cp')['target']
    .agg(['sum', 'count', 'mean'])
    .rename(columns={'sum': 'Disease_Count', 'count': 'Total', 'mean': 'Disease_Rate'})
)
print(cp_disease)

# Rate for cp == 4, the code labelled asymptomatic in the message below.
asymptomatic_rate = df.loc[df['cp'] == 4, 'target'].mean()
print(f"\nAsymptomatic (cp=4) has {asymptomatic_rate*100:.1f}% disease rate")

In [None]:
# Correlation of every numeric feature with the target.
print("Correlation with Target (Disease):")
# Restrict to numeric columns first: the frame may carry non-numeric columns
# (e.g. the categorical 'age_group'), which DataFrame.corr() refuses to
# convert on pandas >= 2.0.
numeric_df = df.select_dtypes(include=[np.number])
correlations = numeric_df.corr()['target'].sort_values(ascending=False)
print(correlations)

# Drop the target's self-correlation by label rather than by position.
feature_corr = correlations.drop('target')

print("\nTop 5 positive correlations:")
print(feature_corr[feature_corr > 0].head(5))

# Keep only genuinely negative coefficients, most negative first, so that
# positive or NaN values never appear under this heading.
print("\nTop 5 negative correlations:")
print(feature_corr[feature_corr < 0].sort_values().head(5))

In [None]:
# Outlier detection - Age, using the 1.5 * IQR rule: rows strictly below
# Q1 - 1.5*IQR or strictly above Q3 + 1.5*IQR are flagged.
Q1, Q3 = df['age'].quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

below_range = df['age'].lt(lower_bound)
above_range = df['age'].gt(upper_bound)
outliers_age = df[below_range | above_range]

print(f"Age outliers: {len(outliers_age)}")
print(f"Range: [{lower_bound:.1f}, {upper_bound:.1f}]")

In [None]:
# Outlier detection - Cholesterol, using the 1.5 * IQR rule: rows strictly
# outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are flagged.
Q1_chol, Q3_chol = df['chol'].quantile([0.25, 0.75])
IQR_chol = Q3_chol - Q1_chol

lower_chol, upper_chol = Q1_chol - 1.5 * IQR_chol, Q3_chol + 1.5 * IQR_chol

chol_too_low = df['chol'].lt(lower_chol)
chol_too_high = df['chol'].gt(upper_chol)
outliers_chol = df[chol_too_low | chol_too_high]

print(f"Cholesterol outliers: {len(outliers_chol)}")
print(f"Range: [{lower_chol:.1f}, {upper_chol:.1f}]")
print(f"Max cholesterol: {df['chol'].max()}")

In [None]:
# EDA Summary: headline numbers computed from df, followed by narrative findings.
# NOTE(review): the "Key findings" bullets are fixed strings, not derived from
# the data in this cell - confirm they still match the results above.
banner = "="*70
is_male_patient = df['sex'] == 1

summary_lines = [
    banner,
    "EDA SUMMARY",
    banner,
    f"Total patients: {len(df)}",
    f"Disease cases: {df['target'].sum()} ({df['target'].mean()*100:.1f}%)",
    f"Missing values: {df.isnull().sum().sum()}",
    f"Age range: {df['age'].min()}-{df['age'].max()} years",
    f"Male patients: {is_male_patient.sum()} ({is_male_patient.mean()*100:.1f}%)",
    f"High cholesterol: {(df['chol']>200).sum()} patients",
    f"\nKey findings:",
    "- Males have higher disease rate",
    "- Disease increases with age",
    "- Asymptomatic chest pain shows high disease rate",
    "- Strong correlation between cp, thalach, oldpeak and target",
]
# One entry per output line; identical to printing each entry separately.
print("\n".join(summary_lines))

In [None]:
!git add notebooks/02_eda.ipynb
!git commit -m "Complete EDA with correlation and outlier analysis"
!git push