# **Cleaning Kaggle Dataset**

In [None]:
import pycountry

def to_iso3(name):
    """Return the ISO 3166-1 alpha-3 code for a country name, or None.

    Args:
        name: Country identifier handed to ``pycountry.countries.lookup``.

    Returns:
        The ``alpha_3`` attribute of the matching pycountry record, or
        ``None`` when ``name`` is not a string or no record matches.
    """
    # Non-string cells (e.g. NaN for a missing name) can never match a
    # country, so short-circuit instead of relying on an exception.
    if not isinstance(name, str):
        return None
    try:
        return pycountry.countries.lookup(name).alpha_3
    except LookupError:
        # pycountry reports "no such country" with LookupError.  Anything
        # else (a missing import, Ctrl-C, ...) is a real problem and is
        # deliberately allowed to propagate.
        return None

# Add an ISO 3166-1 alpha-3 code per row, derived from the "country" column
# via to_iso3 (which returns None for names it cannot resolve).
country_dietary["iso3"] = country_dietary["country"].apply(to_iso3)

In [None]:
# Dietary-intake columns whose gaps are imputed below.
diet_cols = ['Fruit', 'Vegetables', 'Legumes', 'Nuts',
             'Whole grains', 'Fish', 'Dairy', 'Red meat']

# Fill each missing value with the median of that column within the row's
# subregion.  The per-subregion medians are broadcast back to row shape and
# used ONLY as fill values: groupby drops rows whose "subregion" is null, so
# assigning the transform result directly would overwrite the observed values
# of those rows with NaN.  With fillna they keep whatever they already had.
# A subregion with no observed values at all still has a NaN median, so its
# gaps stay NaN.
for col in diet_cols:
    country_dietary[col] = country_dietary[col].fillna(
        country_dietary.groupby("subregion")[col].transform("median")
    )

# **Cleaning WHO Datasets**

In [None]:
import pycountry
import re
import pandas as pd

def clean_who(df):
    """
    Clean a WHO GHO API dataset into a compact, consistently named table.

    The input is copied first, so the caller's DataFrame is not modified.
    Every derived column is built from a single source column; when that
    source column is absent the derived column is still created and filled
    with ``None``, which keeps the function best-effort on partial inputs.
    Unexpected errors (for example a missing ``pycountry`` import) are no
    longer swallowed and will surface to the caller.

    Derived columns:
        Country      <- SpatialDim    ISO3 code looked up in pycountry
        Sex          <- Dim1          SEX_MLE / SEX_FMLE / SEX_BTSX -> label
        AgeGroup     <- Dim2          "AGEGROUP" / "AGEGROUP_YEARS" and all
                                      underscores removed
        Value_clean  <- NumericValue  copied unchanged
        Low_clean    <- Low           copied unchanged
        High_clean   <- High          copied unchanged

    ``TimeDim`` is renamed to ``Year`` and ``ParentLocation`` to ``Region``.

    Args:
        df: pandas DataFrame to clean.

    Returns:
        A new DataFrame containing whichever of the following columns exist,
        in this order: Country, Region, Year, Sex, AgeGroup, Value_clean,
        Low_clean, High_clean, IndicatorCode, Id, SpatialDim.
    """

    df = df.copy()

    # ==============================================
    # 1. ISO3 -> Country Name
    # ==============================================
    def iso3_to_country(code):
        # Only strings can be valid codes; missing cells map to None.
        if not isinstance(code, str):
            return None
        try:
            result = pycountry.countries.get(alpha_3=code)
        except LookupError:
            # NOTE(review): depending on the pycountry version an unknown
            # code may raise (KeyError is a LookupError) instead of
            # returning None -- both are treated as "no match".
            return None
        return result.name if result else None

    if 'SpatialDim' in df.columns:
        df['Country'] = df['SpatialDim'].apply(iso3_to_country)
    else:
        df['Country'] = None

    # ==============================================
    # 2. Standardize Sex Labels
    # ==============================================
    sex_map = {
        "SEX_MLE": "Male",
        "SEX_FMLE": "Female",
        "SEX_BTSX": "Both"
    }

    if 'Dim1' in df.columns:
        # Codes not present in sex_map become NaN.
        df['Sex'] = df['Dim1'].map(sex_map)
    else:
        df['Sex'] = None

    # ==============================================
    # 3. Clean Age Group
    # ==============================================
    def clean_age(group):
        # Missing or non-text cells have no age group to extract.
        if not isinstance(group, str):
            return None
        cleaned = re.sub(r'AGEGROUP(_YEARS)?', '', group)
        cleaned = cleaned.replace("_", "").strip()
        # An input that was nothing but the marker text collapses to "".
        return cleaned or None

    if 'Dim2' in df.columns:
        df['AgeGroup'] = df['Dim2'].apply(clean_age)
    else:
        df['AgeGroup'] = None

    # ==============================================
    # 4. Numeric Extract (Value, Low, High)
    # ==============================================
    numeric_sources = (
        ('Value_clean', 'NumericValue'),
        ('Low_clean', 'Low'),
        ('High_clean', 'High'),
    )
    for target, source in numeric_sources:
        df[target] = df[source] if source in df.columns else None

    # ==============================================
    # 5. Rename Columns
    # ==============================================
    # rename() ignores keys that are not present, so no guard is needed.
    df = df.rename(columns={
        "TimeDim": "Year",
        "ParentLocation": "Region"
    })

    # ==============================================
    # 6. Drop Metadata Columns
    # ==============================================
    columns_to_drop = [
        'SpatialDimType', 'ParentLocationCode',
        'Dim1Type', 'Dim2Type', 'Dim3Type', 'Dim3',
        'Comments', 'Date', 'TimeDimensionValue',
        'TimeDimensionBegin', 'TimeDimensionEnd',
        'Value'
    ]

    # errors='ignore' already tolerates columns that are not present.
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # ==============================================
    # 7. Final Ordered Columns
    # ==============================================
    final_cols = [
        'Country', 'Region', 'Year', 'Sex', 'AgeGroup',
        'Value_clean', 'Low_clean', 'High_clean',
        'IndicatorCode', 'Id', 'SpatialDim'
    ]

    # Keep only the known columns that actually exist, in the order above.
    return df[[c for c in final_cols if c in df.columns]]


In [None]:
# Run each raw WHO indicator table through clean_who, keeping one
# "<name>_cleaned" variable per input table (same order on both sides).
(
    adult_obesity_age_standardized_cleaned,
    child_adolescent_obesity_crude_cleaned,
    under5_overweight_prevalence_cleaned,
    under5_wasting_prevalence_cleaned,
    under5_stunting_prevalence_cleaned,
    sugar_availability_per_capita_cleaned,
) = (
    clean_who(raw_table)
    for raw_table in (
        adult_obesity_age_standardized,
        child_adolescent_obesity_crude,
        under5_overweight_prevalence,
        under5_wasting_prevalence,
        under5_stunting_prevalence,
        sugar_availability_per_capita,
    )
)

In [None]:
# Sanity check: (rows, columns) of every cleaned table, printed on one line.
print(
    adult_obesity_age_standardized_cleaned.shape,
    child_adolescent_obesity_crude_cleaned.shape,
    under5_overweight_prevalence_cleaned.shape,
    under5_wasting_prevalence_cleaned.shape,
    under5_stunting_prevalence_cleaned.shape,
    sugar_availability_per_capita_cleaned.shape,
)

In [None]:
# Descriptive label -> cleaned WHO DataFrame, in the order the tables were cleaned.
datasets = dict(zip(
    (
        "Adult Obesity (Age Standardized)",
        "Child & Adolescent Obesity (Crude)",
        "Under 5 Overweight Prevalence",
        "Under 5 Wasting Prevalence",
        "Under 5 Stunting Prevalence",
        "Sugar Availability Per Capita",
    ),
    (
        adult_obesity_age_standardized_cleaned,
        child_adolescent_obesity_crude_cleaned,
        under5_overweight_prevalence_cleaned,
        under5_wasting_prevalence_cleaned,
        under5_stunting_prevalence_cleaned,
        sugar_availability_per_capita_cleaned,
    ),
))