# **Analysing Kaggle Datasets**

In [None]:
def overview(df):
    """Print a structural summary of ``df`` and show diet-column statistics.

    Prints the shape, per-column dtypes, non-null counts and missing-value
    counts, then displays the transposed ``describe()`` table restricted to
    the columns named in the module-level ``diet_cols``.
    """
    # Each section is computed lazily, right before it is printed.
    sections = (
        ("Shape:", lambda: df.shape),
        ("\nColumns and dtypes:\n", lambda: df.dtypes),
        ("\nNon-null counts:\n", lambda: df.count()),
        ("\nMissing values per column:\n", lambda: df.isna().sum()),
    )
    for heading, compute in sections:
        print(heading, compute())

    print("\nBasic descriptive statistics for diet columns:\n")
    diet_summary = df[diet_cols].describe()
    display(diet_summary.transpose())

# Print the structural overview for the `country_dietary` table.
overview(country_dietary)

In [None]:
def plot_distributions(df):
    """Plot a histogram for every diet column, then a scatter matrix of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns named in the module-level ``diet_cols``.

    Notes
    -----
    One 6x4 figure is opened per diet column; missing values are dropped
    before binning. The scatter matrix is drawn on its own 12x12 figure and
    is skipped when fewer than two diet columns are configured.
    """
    # Local import keeps this function self-contained.
    from pandas.plotting import scatter_matrix

    for col in diet_cols:
        plt.figure(figsize=(6, 4))
        data = df[col].dropna()
        plt.hist(data, bins=30)
        plt.title(f"Distribution: {col}")
        plt.xlabel(col)
        plt.ylabel("Count")
        plt.tight_layout()

    # Scatter matrix of diet columns as per the intake.
    # Previously this step only opened an empty figure and drew nothing.
    if len(diet_cols) >= 2:
        scatter_matrix(df[diet_cols], figsize=(12, 12))
        plt.suptitle("Scatter matrix (diet variables)")

    plt.show()

# Draw the per-column distribution plots for `country_dietary`.
plot_distributions(country_dietary)

In [None]:
def correlation_analysis(df):
    """Compute, save and plot the correlation matrix of the diet columns.

    The matrix of ``df[diet_cols]`` is written to
    ``diet_columns_correlation.csv`` in the working directory, rendered as an
    annotated heatmap on a new 8x6 matplotlib figure, and returned.
    """
    corr = df[diet_cols].corr()
    output_path = os.path.join("diet_columns_correlation.csv")
    corr.to_csv(output_path)

    # Heatmap using matplotlib, colour scale pinned to the full [-1, 1] range.
    plt.figure(figsize=(8, 6))
    image = plt.imshow(corr, vmin=-1, vmax=1)
    plt.colorbar(image, fraction=0.046, pad=0.04)

    tick_positions = range(len(diet_cols))
    plt.xticks(tick_positions, diet_cols, rotation=45, ha='right')
    plt.yticks(tick_positions, diet_cols)
    plt.title("Correlation matrix (diet variables)")

    # Annotate every cell; strong correlations get white text, the rest black.
    for row_idx, row in enumerate(corr.values):
        for col_idx, val in enumerate(row):
            text_color = 'black'
            if abs(val) > 0.5:
                text_color = 'white'
            plt.text(
                col_idx,
                row_idx,
                f"{val:.2f}",
                ha='center',
                va='center',
                fontsize=8,
                color=text_color,
            )
    return corr

# Run the correlation analysis on `country_dietary`; as the last expression
# of the cell, the returned matrix is also what the notebook renders.
correlation_analysis(country_dietary)

# **Analysing WHO Datasets**

In [None]:
def _print_section(title, compute, failure_message):
    """Run one best-effort analysis step and print its result or a warning."""
    try:
        result = compute()
    except Exception as exc:
        # Best-effort report: a missing column or unusable dtype in one
        # section must not stop the remaining sections. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate and
        # reports why the section was skipped.
        print(f"{failure_message} ({type(exc).__name__}: {exc})\n")
        return
    print(title)
    print(result, "\n")


def _missing_years(years):
    """Return the sorted years absent between the first and last observed year."""
    # Years are coerced to int so float-typed columns (e.g. 1990.0) still work.
    observed = set(years.dropna().astype(int))
    if not observed:
        return []
    return sorted(set(range(min(observed), max(observed) + 1)) - observed)


def analyze_single_dataset(df, dataset_name="Dataset"):
    """Print a multi-section exploratory report for one dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Table expected to contain ``Value_clean`` plus any of ``Year``,
        ``Country``, ``Region``, ``Sex`` and ``AgeGroup``. Sections whose
        columns are absent are reported as warnings rather than raising.
    dataset_name : str, optional
        Label printed in the report header.

    Returns
    -------
    None
        The report is written to standard output.
    """
    print("\n==============================")
    print(f" ANALYSIS FOR: {dataset_name}")
    print("==============================\n")

    # 1. Year-wise trend
    _print_section(
        "✔ YEARLY TREND (global mean):",
        lambda: df.groupby("Year")["Value_clean"].mean().reset_index().head(),
        "⚠ Could not compute yearly trend",
    )

    # 2. Country ranking
    _print_section(
        "✔ TOP 10 COUNTRIES:",
        lambda: (
            df.groupby("Country")["Value_clean"]
            .mean()
            .sort_values(ascending=False)
            .head(10)
        ),
        "⚠ Could not compute country ranking",
    )

    # 3. Region-wise averages
    _print_section(
        "✔ REGION-WISE AVERAGE:",
        lambda: (
            df.groupby("Region")["Value_clean"].mean().sort_values(ascending=False)
        ),
        "⚠ Could not compute region trends",
    )

    # 4. Missing years per country
    _print_section(
        "✔ MISSING YEARS PER COUNTRY:",
        lambda: df.groupby("Country")["Year"].apply(_missing_years).head(),
        "⚠ Could not compute missing years",
    )

    # 5. Sex-wise trend (if exists)
    _print_section(
        "✔ SEX-WISE TREND:",
        lambda: (
            df.groupby(["Year", "Sex"])["Value_clean"].mean().reset_index().head()
        ),
        "⚠ Sex data may be missing",
    )

    # 6. Age-group trend (if exists)
    _print_section(
        "✔ AGE-GROUP TREND:",
        lambda: (
            df.groupby(["Year", "AgeGroup"])["Value_clean"]
            .mean()
            .reset_index()
            .head()
        ),
        "⚠ Age group data may be missing",
    )

    print("------------------------------------------------------\n")



def analyze_all_datasets(datasets_dict):
    """Run ``analyze_single_dataset`` on every entry of ``datasets_dict``.

    ``datasets_dict`` maps a dataset label to its DataFrame; entries are
    processed in the mapping's iteration order and the label is used as the
    report title.
    """
    for label in datasets_dict:
        analyze_single_dataset(datasets_dict[label], label)


In [None]:
# Print the exploratory report for every entry of the `datasets` mapping.
analyze_all_datasets(datasets)

# **Analysis of merged Datasets**

In [None]:
def rename_value(df, new_name):
    """Return a copy of ``df`` whose ``Value_clean`` column is called ``new_name``.

    The input frame is left untouched; every other column keeps its name.
    """
    column_map = {"Value_clean": new_name}
    return df.copy().rename(columns=column_map)

In [None]:
def minimize(df, colname):
    """Reduce ``df`` to one ``(Country, Year, colname)`` row per country-year.

    Only the ``Country``, ``Year`` and ``Value_clean`` columns are kept, with
    ``Value_clean`` renamed to ``colname``. When several rows share the same
    country and year, the first one encountered is retained. The input frame
    is not modified.
    """
    key_columns = ["Country", "Year"]
    slim = df[key_columns + ["Value_clean"]].copy()
    renamed = slim.rename(columns={"Value_clean": colname})
    return renamed.drop_duplicates(subset=key_columns)

In [None]:
# Reduce each cleaned source table to (Country, Year, <indicator>) with one
# row per country-year, naming the value column after the indicator.
(
    adult_df,
    child_df,
    over_u5_df,
    wasting_u5_df,
    stunting_u5_df,
    sugar_df,
) = (
    minimize(source_frame, indicator_name)
    for source_frame, indicator_name in (
        (adult_obesity_age_standardized_cleaned, "AdultObesity"),
        (child_adolescent_obesity_crude_cleaned, "ChildObesity"),
        (under5_overweight_prevalence_cleaned, "Overweight_U5"),
        (under5_wasting_prevalence_cleaned, "Wasting_U5"),
        (under5_stunting_prevalence_cleaned, "Stunting_U5"),
        (sugar_availability_per_capita_cleaned, "Sugar"),
    )
)


In [None]:
# Row/column counts of the six minimized indicator tables, on one line.
print(*(
    frame.shape
    for frame in (adult_df, child_df, over_u5_df, wasting_u5_df, stunting_u5_df, sugar_df)
))

In [None]:
# First and last year covered by each minimized indicator table, one line per
# table. Series.min()/max() are vectorised and skip NaN, whereas the builtin
# min()/max() iterate element by element and give order-dependent results
# when a NaN is present.
for _indicator_df in (adult_df, child_df, over_u5_df, wasting_u5_df, stunting_u5_df, sugar_df):
    print(_indicator_df['Year'].min(), _indicator_df['Year'].max())


In [None]:
# Same shape summary as printed earlier, repeated ahead of the merge step.
indicator_shapes = [
    frame.shape
    for frame in (adult_df, child_df, over_u5_df, wasting_u5_df, stunting_u5_df, sugar_df)
]
print(*indicator_shapes)

In [None]:
# Keep only the (Country, Year) pairs present in both the adult and the child
# table, so every row carries both indicators.
adult_child_df = adult_df.merge(child_df, how='inner', on=['Country', 'Year'])
adult_child_df

In [None]:
def compute_country_corr(df):
    """Correlate adult and child obesity within each country.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Country``, ``AdultObesity`` and ``ChildObesity``.

    Returns
    -------
    pandas.DataFrame
        One row per country with columns ``Country`` and ``Correlation``.
        ``Correlation`` is always a float column; countries with fewer than
        two rows get NaN, because a correlation needs at least two points.
    """
    undefined = float("nan")
    rows = [
        (
            country,
            group["AdultObesity"].corr(group["ChildObesity"])
            if len(group) >= 2
            else undefined,
        )
        for country, group in df.groupby("Country")
    ]

    corr_df = pd.DataFrame(rows, columns=["Country", "Correlation"])
    # Force a numeric dtype even when every value is missing or the input is
    # empty, so downstream numeric code never sees an object column.
    corr_df["Correlation"] = corr_df["Correlation"].astype(float)
    return corr_df

In [None]:
corr_df = compute_country_corr(adult_child_df)

# Pick up to 20 random countries. DataFrame.sample raises ValueError when
# asked for more rows than exist, so the request is capped at the row count.
sample_size = min(20, len(corr_df))
corr_sample = corr_df.sample(sample_size, random_state=42)
# The heatmap needs numeric data; the cast turns any None placeholder into NaN.
corr_matrix = corr_sample.set_index("Country")[["Correlation"]].astype(float)

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title(f"Correlation Between Adult and Child Obesity (Sample of {sample_size} Countries)")
plt.tight_layout()
plt.show()


# **Sex-wise Obesity Trends**

In [None]:
# Mean value per (Year, Sex) across all rows of each table, limited to the
# 1990-2022 window (both endpoints included).
global_adult_obesity_sex_trend = (
    adult_obesity_age_standardized_cleaned
    .groupby(['Year', 'Sex'])['Value_clean']
    .mean()
    .reset_index()
    .loc[lambda trend: trend['Year'].between(1990, 2022)]
)

global_child_obesity_sex_trend = (
    child_adolescent_obesity_crude_cleaned
    .groupby(['Year', 'Sex'])['Value_clean']
    .mean()
    .reset_index()
    .loc[lambda trend: trend['Year'].between(1990, 2022)]
)

print("Global Adult Obesity Trend by Sex (1990-2022):")
display(global_adult_obesity_sex_trend.head())
print("\nGlobal Child Obesity Trend by Sex (1990-2022):")
display(global_child_obesity_sex_trend.head())

## Compare Child vs. Adult Obesity Levels Across Countries

### Subtask:
Analyze the differences in obesity levels between children and adults across various countries.

In [None]:
# Average both indicators over all available years, one row per country.
country_obesity_means = (
    adult_child_df
    .groupby('Country', as_index=False)[['AdultObesity', 'ChildObesity']]
    .mean()
)

print("Mean Adult and Child Obesity per Country:")
display(country_obesity_means.head())

In [None]:
# Disparity = adult mean minus child mean; positive values mean the adult
# figure is the larger one.
country_obesity_means['ObesityDisparity'] = country_obesity_means['AdultObesity'].sub(
    country_obesity_means['ChildObesity']
)

# Largest adult-over-child gap first.
sorted_disparity = country_obesity_means.sort_values('ObesityDisparity', ascending=False)

print("Countries sorted by Obesity Disparity (Adult - Child):")
display(sorted_disparity.head())

## Analyze Age-Group Specific Obesity Trends

### Subtask:
Examine if different age groups show varying obesity risk levels.

In [None]:
# Mean value per (Year, AgeGroup) for each table, keeping only the years
# from 1990 through 2022 inclusive.
global_child_adolescent_obesity_age_trend = (
    child_adolescent_obesity_crude_cleaned
    .groupby(['Year', 'AgeGroup'])['Value_clean']
    .mean()
    .reset_index()
    .loc[lambda trend: trend['Year'].ge(1990) & trend['Year'].le(2022)]
)

global_under5_overweight_age_trend = (
    under5_overweight_prevalence_cleaned
    .groupby(['Year', 'AgeGroup'])['Value_clean']
    .mean()
    .reset_index()
    .loc[lambda trend: trend['Year'].ge(1990) & trend['Year'].le(2022)]
)

print("Global Child & Adolescent Obesity Trend by Age Group (1990-2022):")
display(global_child_adolescent_obesity_age_trend.head())
print("\nGlobal Under 5 Overweight Prevalence Trend by Age Group (1990-2022):")
display(global_under5_overweight_age_trend.head())

## Analyze Global Trends of Under-5 Indicators

### Subtask:
Calculate the global mean yearly trends for stunting prevalence, wasting prevalence, and overweight prevalence in children under 5.

# Yearly mean of each under-5 indicator across all rows, restricted to the
# 1990-2022 window (both endpoints included).
global_stunting_trend = under5_stunting_prevalence_cleaned.groupby('Year')['Value_clean'].mean().reset_index()
global_stunting_trend = global_stunting_trend[global_stunting_trend['Year'].between(1990, 2022)]

global_wasting_trend = under5_wasting_prevalence_cleaned.groupby('Year')['Value_clean'].mean().reset_index()
global_wasting_trend = global_wasting_trend[global_wasting_trend['Year'].between(1990, 2022)]

global_overweight_trend = under5_overweight_prevalence_cleaned.groupby('Year')['Value_clean'].mean().reset_index()
global_overweight_trend = global_overweight_trend[global_overweight_trend['Year'].between(1990, 2022)]

# Show the first rows of each trend table under its own heading.
print("Global Stunting Trend (1990-2022):")
display(global_stunting_trend.head())
print("\nGlobal Wasting Trend (1990-2022):")
display(global_wasting_trend.head())
print("\nGlobal Overweight Trend (1990-2022):")
display(global_overweight_trend.head())