In [None]:
# Install the third-party packages used below into the kernel's environment.
# NOTE(review): matplotlib is imported later but not installed here — presumably
# it arrives as a seaborn dependency; confirm. Versions are unpinned.
%pip install pandas
%pip install seaborn

In [None]:
import pandas as pd

# Read the CSV into a DataFrame
file_path = "data/heart.csv"
data = pd.read_csv(file_path)

# Preview the top rows
print(data.head())

# Count missing entries per column
missing_counts = data.isna().sum()
print("\nMissing values:\n", missing_counts)

# Split the frame: `y` is the label column, `X` is every other column
target_column = "target"
y = data[target_column]
X = data.drop(columns=[target_column])

# Preview both pieces
print("\nFeatures:\n", X.head())
print("\nTarget:\n", y.head())


In [None]:
# Exploratory checks on the loaded frame

# Descriptive statistics from describe()
print(data.describe())

# Per-column count of missing entries
missing_per_column = data.isna().sum()
print("\nMissing Values:\n", missing_per_column)

# How many rows repeat an earlier row exactly?
duplicates = data.duplicated().sum()
print(f"\nNumber of Duplicate Rows: {duplicates}")

# Keep only the first occurrence of each row (the original `data` is untouched)
data_cleaned = data.drop_duplicates(keep="first")

# Sanity check: the de-duplicated frame should report zero repeats
duplicates_after = data_cleaned.duplicated().sum()
print(f"Number of duplicate rows after removal: {duplicates_after}")

# Write the de-duplicated frame to disk without the index column (optional step)
cleaned_path = "data/heart_cleaned.csv"
data_cleaned.to_csv(cleaned_path, index=False)
print("Cleaned dataset saved as 'data/heart_cleaned.csv'")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw one boxplot per int64/float64 column to eyeball potential outliers
for column in data.select_dtypes(include=['float64', 'int64']).columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    # Pass the series by keyword: the first positional argument of sns.boxplot
    # is `x` in older seaborn releases but `data` in newer ones, so a positional
    # call changes orientation between versions. `x=` pins a horizontal box.
    sns.boxplot(x=data[column], ax=ax)
    ax.set_title(f"Boxplot of {column}")
    plt.show()
    # Release the figure so a loop over many columns does not accumulate open figures
    plt.close(fig)


In [None]:
# Dealing with outliers

def detect_outliers_iqr(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : str
        Name of the column to test.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        original fixed 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The subset of rows strictly below the lower fence or strictly above
        the upper fence, with their original index labels. Rows whose value
        is NaN are never selected, because both comparisons are False for NaN.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    return df[(values < lower_bound) | (values > upper_bound)]

# Run the IQR check on every int64/float64 column and print the flagged rows
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
for column in numeric_columns:
    outliers = detect_outliers_iqr(data, column)
    # Header line, then the flagged rows on the following lines
    print(f"Outliers in {column}:", outliers, sep="\n")


In [None]:
# Plausibility limits per column. Every entry is an upper limit except
# 'thalach', whose entry is a lower limit (see the loop below).
caps = {
    'chol': 400,        # Max plausible cholesterol level
    'trestbps': 180,    # Max plausible resting blood pressure
    'thalach': 80,      # Min plausible max heart rate
    'oldpeak': 4.0,     # Max ST depression value
    'ca': 3             # Max number of major vessels
}

# NOTE(review): capping is applied to `data`, which still contains any duplicate
# rows; the de-duplicated `data_cleaned` built earlier is not used here — confirm
# that this is intended.

# Apply capping with vectorised clip instead of a per-element lambda
for column, cap in caps.items():
    if column == 'thalach':  # For thalach, raise values below the cap up to it
        data[column] = data[column].clip(lower=cap)
    else:  # For other columns, lower values above the cap down to it
        data[column] = data[column].clip(upper=cap)

# Investigate 'thal' separately: treat the value 0 as missing.
# mask() writes NaN explicitly where the condition holds, rather than relying on
# how apply() infers a dtype from a mix of numbers and None.
data['thal'] = data['thal'].mask(data['thal'] == 0)

# Verify changes
print(data.describe())


In [None]:
# Write the current `data` frame to disk without the index column
capped_path = "data/heart_capped.csv"
data.to_csv(capped_path, index=False)
print("Cleaned dataset saved as 'data/heart_capped.csv'")
