In [12]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = '/Users/ernestgaisie/Downloads/archive (29)/2022/heart_2022_no_nans.csv'  # Replace with your actual file path
df = pd.read_csv(file_path)

# Remove leading and trailing whitespace from all string columns
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Drop the 'State' column
if 'State' in df.columns:
    df.drop(columns=['State'], inplace=True)
    print("Dropped 'State' column.")

# Rename 'HadHeartAttack' to 'HeartDisease'
if 'HadHeartAttack' in df.columns:
    df.rename(columns={'HadHeartAttack': 'HeartDisease'}, inplace=True)
    print("Renamed 'HadHeartAttack' to 'HeartDisease'.")

# Define reverse mappings (text to numerical codes)
SEX_REVERSE = {"Male": 0, "Female": 1}
GEN_HEALTH_REVERSE = {"Excellent": 0, "Very good": 1, "Good": 2, "Fair": 3, "Poor": 4}
LAST_CHECKUP_REVERSE = {
    "Within past year (anytime less than 12 months ago)": 0,
    "Within past 2 years (1 year but less than 2 years ago)": 1,
    "Within past 5 years (2 years but less than 5 years ago)": 2,
    "5 or more years ago": 3
}
TEETH_REMOVED_REVERSE = {
    "1 to 5": 0,
    "6 or more, but not all": 1,
    "All": 2,
    "None of them": 3
}
SMOKER_STATUS_REVERSE = {
    "Current smoker - now smokes every day": 0,
    "Current smoker - now smokes some days": 1,
    "Former smoker": 2,
    "Never smoked": 3
}
ECIGARETTES_REVERSE = {
    "Never used e-cigarettes in my entire life": 0,
    "Use them every day": 1,
    "Use them some days": 2,
    "Not at all (right now)": 3
}
RACE_REVERSE = {
    "White only, Non-Hispanic": 0,
    "Black only, Non-Hispanic": 1,
    "Other race only, Non-Hispanic": 2,
    "Multiracial, Non-Hispanic": 3,
    "Hispanic": 5
}
AGE_CATEGORY_REVERSE = {
    "Age 18 to 24": 1,
    "Age 25 to 29": 2,
    "Age 30 to 34": 3,
    "Age 35 to 39": 4,
    "Age 40 to 44": 5,
    "Age 45 to 49": 6,
    "Age 50 to 54": 7,
    "Age 55 to 59": 8,
    "Age 60 to 64": 9,
    "Age 65 to 69": 10,
    "Age 70 to 74": 11,
    "Age 75 to 79": 12,
    "Age 80 or older": 13
}
TETANUS_LAST_10_TDAP_REVERSE = {
    "Yes, received Tdap": 1,
    "Yes, received tetanus shot but not sure what type": 2,
    "No, did not receive any tetanus shot in the past 10 years": 0
}

# Define binary columns (Yes/No)
YES_NO_COLUMNS = [
    "PhysicalActivities", "HeartDisease", "HadAngina", "HadStroke",
    "HadAsthma", "HadSkinCancer", "HadCOPD", "HadDepressiveDisorder",
    "HadKidneyDisease", "HadArthritis", "HadDiabetes", "DeafOrHardOfHearing",
    "BlindOrVisionDifficulty", "DifficultyConcentrating", "DifficultyWalking",
    "DifficultyDressingBathing", "DifficultyErrands", "ChestScan",
    "AlcoholDrinkers", "HIVTesting", "FluVaxLast12", "PneumoVaxEver",
    "HighRiskLastYear", "CovidPos"
]

# Map "yes/no" columns to binary
for column in YES_NO_COLUMNS:
    df[column] = df[column].map({"Yes": 1, "No": 0}).fillna(0)

# Apply reverse mappings for categorical columns
df['Sex'] = df['Sex'].map(SEX_REVERSE).fillna(0)
df['GeneralHealth'] = df['GeneralHealth'].map(GEN_HEALTH_REVERSE).fillna(0)
df['LastCheckupTime'] = df['LastCheckupTime'].map(LAST_CHECKUP_REVERSE).fillna(0)
df['RemovedTeeth'] = df['RemovedTeeth'].map(TEETH_REMOVED_REVERSE).fillna(0)
df['SmokerStatus'] = df['SmokerStatus'].map(SMOKER_STATUS_REVERSE).fillna(0)
df['ECigaretteUsage'] = df['ECigaretteUsage'].map(ECIGARETTES_REVERSE).fillna(0)
df['RaceEthnicityCategory'] = df['RaceEthnicityCategory'].map(RACE_REVERSE).fillna(0)
df['AgeCategory'] = df['AgeCategory'].map(AGE_CATEGORY_REVERSE).fillna(0)
df['TetanusLast10Tdap'] = df['TetanusLast10Tdap'].map(TETANUS_LAST_10_TDAP_REVERSE).fillna(0)

# Handle numerical outliers (e.g., BMI, SleepHours)
df = df[(df['SleepHours'] >= 3) & (df['SleepHours'] <= 16)]  # Sleep hours
df = df[(df['BMI'] >= 10) & (df['BMI'] <= 50)]               # BMI

# Handle any remaining missing values
df.fillna(0, inplace=True)  # Fill any remaining missing values with 0

# Verify the results
print("Columns with missing values after cleaning:")
print(df.isnull().sum())

print("\nSample of cleaned data:")
print(df.head())

# Save the cleaned dataset
output_path = 'cleaned_data_numeric.csv'  # Replace with your desired output path
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to: {output_path}")

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Dropped 'State' column.
Renamed 'HadHeartAttack' to 'HeartDisease'.
Columns with missing values after cleaning:
Sex                          0
GeneralHealth                0
PhysicalHealthDays           0
MentalHealthDays             0
LastCheckupTime              0
PhysicalActivities           0
SleepHours                   0
RemovedTeeth                 0
HeartDisease                 0
HadAngina                    0
HadStroke                    0
HadAsthma                    0
HadSkinCancer                0
HadCOPD                      0
HadDepressiveDisorder        0
HadKidneyDisease             0
HadArthritis                 0
HadDiabetes                  0
DeafOrHardOfHearing          0
BlindOrVisionDifficulty      0
DifficultyConcentrating      0
DifficultyWalking            0
DifficultyDressingBathing    0
DifficultyErrands            0
SmokerStatus                 0
ECigaretteUsage              0
ChestScan                    0
RaceEthnicityCategory        0
AgeCategory         