In [1]:
# Import stuff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

print("Starting EDA...")

Starting EDA...


In [2]:
# Load the dataset
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

df = pd.read_csv('../data/raw/processed.cleveland.data', 
                 names=column_names, na_values='?')

# Convert target to binary
df['target'] = df['target'].apply(lambda x: 1 if x > 0 else 0)

print(f"Data loaded: {df.shape}")
df.head()

Data loaded: (303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [3]:
# Check basic statistics
print("Statistical Summary:")
df.describe()

Statistical Summary:


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,4.734219,0.458746
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,1.939706,0.49912
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,1.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,1.0


In [4]:
# Data types
print("Data Types:")
print(df.dtypes)
print(f"\nNumeric columns: {df.select_dtypes(include=[np.number]).columns.tolist()}")
print(f"Object columns: {df.select_dtypes(include=['object']).columns.tolist()}")

Data Types:
age         float64
sex         float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
ca          float64
thal        float64
target        int64
dtype: object

Numeric columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
Object columns: []


In [5]:
# Missing values
print("Missing Values:")
missing = df.isnull().sum()
print(missing[missing > 0])

# Percentage
missing_pct = (df.isnull().sum() / len(df)) * 100
print("\nMissing Percentage:")
print(missing_pct[missing_pct > 0])

Missing Values:
ca      4
thal    2
dtype: int64

Missing Percentage:
ca      1.320132
thal    0.660066
dtype: float64


In [6]:
# Target variable
print("Target Distribution:")
print(df['target'].value_counts())
print(f"\nDisease rate: {df['target'].mean()*100:.2f}%")
print(f"Healthy: {(df['target']==0).sum()}")
print(f"Disease: {(df['target']==1).sum()}")

Target Distribution:
target
0    164
1    139
Name: count, dtype: int64

Disease rate: 45.87%
Healthy: 164
Disease: 139


In [7]:
# Age distribution
print("Age Statistics:")
print(df['age'].describe())

print(f"\nAge range: {df['age'].min()} to {df['age'].max()} years")
print(f"Average age: {df['age'].mean():.1f} years")
print(f"Median age: {df['age'].median():.1f} years")

Age Statistics:
count    303.000000
mean      54.438944
std        9.038662
min       29.000000
25%       48.000000
50%       56.000000
75%       61.000000
max       77.000000
Name: age, dtype: float64

Age range: 29.0 to 77.0 years
Average age: 54.4 years
Median age: 56.0 years


In [8]:
# Sex distribution
print("Gender Distribution:")
print(df['sex'].value_counts())
print("\n(1 = Male, 0 = Female)")

print(f"\nMales: {(df['sex']==1).sum()} ({(df['sex']==1).mean()*100:.1f}%)")
print(f"Females: {(df['sex']==0).sum()} ({(df['sex']==0).mean()*100:.1f}%)")

Gender Distribution:
sex
1.0    206
0.0     97
Name: count, dtype: int64

(1 = Male, 0 = Female)

Males: 206 (68.0%)
Females: 97 (32.0%)


In [9]:
# Chest pain analysis
print("Chest Pain Types:")
print(df['cp'].value_counts().sort_index())
print("\n1: typical angina")
print("2: atypical angina")
print("3: non-anginal pain")
print("4: asymptomatic")

Chest Pain Types:
cp
1.0     23
2.0     50
3.0     86
4.0    144
Name: count, dtype: int64

1: typical angina
2: atypical angina
3: non-anginal pain
4: asymptomatic


In [10]:
# Cardiovascular metrics
print("Resting Blood Pressure (trestbps):")
print(df['trestbps'].describe())

print("\nCholesterol (chol):")
print(df['chol'].describe())

print(f"\nHigh BP (>140): {(df['trestbps']>140).sum()} patients")
print(f"High Cholesterol (>200): {(df['chol']>200).sum()} patients")

Resting Blood Pressure (trestbps):
count    303.000000
mean     131.689769
std       17.599748
min       94.000000
25%      120.000000
50%      130.000000
75%      140.000000
max      200.000000
Name: trestbps, dtype: float64

Cholesterol (chol):
count    303.000000
mean     246.693069
std       51.776918
min      126.000000
25%      211.000000
50%      241.000000
75%      275.000000
max      564.000000
Name: chol, dtype: float64

High BP (>140): 66 patients
High Cholesterol (>200): 253 patients


In [11]:
# Survival by sex
print("Disease Rate by Gender:")
gender_disease = df.groupby('sex')['target'].agg(['sum', 'count', 'mean'])
gender_disease.columns = ['Disease_Count', 'Total', 'Disease_Rate']
print(gender_disease)

print("\nMales have higher disease rate!" if df[df['sex']==1]['target'].mean() > df[df['sex']==0]['target'].mean() else "\nFemales have higher disease rate!")

Disease Rate by Gender:
     Disease_Count  Total  Disease_Rate
sex                                    
0.0             25     97      0.257732
1.0            114    206      0.553398

Males have higher disease rate!


In [12]:
# Age groups
df['age_group'] = pd.cut(df['age'], bins=[0, 40, 50, 60, 100], 
                         labels=['<40', '40-50', '50-60', '60+'])

print("Disease Rate by Age Group:")
age_disease = df.groupby('age_group')['target'].agg(['sum', 'count', 'mean'])
age_disease.columns = ['Disease_Count', 'Total', 'Disease_Rate']
print(age_disease)

Disease Rate by Age Group:
           Disease_Count  Total  Disease_Rate
age_group                                    
<40                    6     18      0.333333
40-50                 23     76      0.302632
50-60                 66    130      0.507692
60+                   44     79      0.556962


In [13]:
# Chest pain vs disease
print("Disease Rate by Chest Pain Type:")
cp_disease = df.groupby('cp')['target'].agg(['sum', 'count', 'mean'])
cp_disease.columns = ['Disease_Count', 'Total', 'Disease_Rate']
print(cp_disease)

print(f"\nAsymptomatic (cp=4) has {df[df['cp']==4]['target'].mean()*100:.1f}% disease rate")

Disease Rate by Chest Pain Type:
     Disease_Count  Total  Disease_Rate
cp                                     
1.0              7     23      0.304348
2.0              9     50      0.180000
3.0             18     86      0.209302
4.0            105    144      0.729167

Asymptomatic (cp=4) has 72.9% disease rate


In [15]:
print("Correlation with Target (Disease):")
numeric_df = df.select_dtypes(include=[np.number])
correlations = numeric_df.corr()['target'].sort_values(ascending=False)
print(correlations)

print("\nTop 5 positive correlations:")
print(correlations.head(6)[1:])

print("\nTop 5 negative correlations:")
print(correlations.tail(5))

Correlation with Target (Disease):
target      1.000000
thal        0.525689
ca          0.460442
exang       0.431894
oldpeak     0.424510
cp          0.414446
slope       0.339213
sex         0.276816
age         0.223120
restecg     0.169202
trestbps    0.150825
chol        0.085164
fbs         0.025264
thalach    -0.417167
Name: target, dtype: float64

Top 5 positive correlations:
thal       0.525689
ca         0.460442
exang      0.431894
oldpeak    0.424510
cp         0.414446
Name: target, dtype: float64

Top 5 negative correlations:
restecg     0.169202
trestbps    0.150825
chol        0.085164
fbs         0.025264
thalach    -0.417167
Name: target, dtype: float64


In [16]:
# Outlier detection - Age
Q1 = df['age'].quantile(0.25)
Q3 = df['age'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers_age = df[(df['age'] < lower_bound) | (df['age'] > upper_bound)]
print(f"Age outliers: {len(outliers_age)}")
print(f"Range: [{lower_bound:.1f}, {upper_bound:.1f}]")

Age outliers: 0
Range: [28.5, 80.5]


In [17]:
# Outlier detection - Cholesterol
Q1_chol = df['chol'].quantile(0.25)
Q3_chol = df['chol'].quantile(0.75)
IQR_chol = Q3_chol - Q1_chol

lower_chol = Q1_chol - 1.5 * IQR_chol
upper_chol = Q3_chol + 1.5 * IQR_chol

outliers_chol = df[(df['chol'] < lower_chol) | (df['chol'] > upper_chol)]
print(f"Cholesterol outliers: {len(outliers_chol)}")
print(f"Range: [{lower_chol:.1f}, {upper_chol:.1f}]")
print(f"Max cholesterol: {df['chol'].max()}")

Cholesterol outliers: 5
Range: [115.0, 371.0]
Max cholesterol: 564.0


In [18]:
# EDA Summary
print("="*70)
print("EDA SUMMARY")
print("="*70)
print(f"Total patients: {len(df)}")
print(f"Disease cases: {df['target'].sum()} ({df['target'].mean()*100:.1f}%)")
print(f"Missing values: {df.isnull().sum().sum()}")
print(f"Age range: {df['age'].min()}-{df['age'].max()} years")
print(f"Male patients: {(df['sex']==1).sum()} ({(df['sex']==1).mean()*100:.1f}%)")
print(f"High cholesterol: {(df['chol']>200).sum()} patients")
print(f"\nKey findings:")
print("- Males have higher disease rate")
print("- Disease increases with age")
print("- Asymptomatic chest pain shows high disease rate")
print("- Strong correlation between cp, thalach, oldpeak and target")

EDA SUMMARY
Total patients: 303
Disease cases: 139 (45.9%)
Missing values: 6
Age range: 29.0-77.0 years
Male patients: 206 (68.0%)
High cholesterol: 253 patients

Key findings:
- Males have higher disease rate
- Disease increases with age
- Asymptomatic chest pain shows high disease rate
- Strong correlation between cp, thalach, oldpeak and target
