# **Analysing Kaggle Datasets**

In [None]:
def overview(df):
    """Print a structural summary of ``df`` and display diet-column statistics.

    Printed in order: the shape, per-column dtypes, per-column non-null
    counts, per-column missing-value counts, and finally the transposed
    ``describe()`` table for the columns listed in the module-level
    ``diet_cols`` (rendered through ``display``).

    Args:
        df: Frame to summarise; it must contain every column in ``diet_cols``.
    """
    print("Shape:", df.shape)

    # Each entry is evaluated only when its turn comes, so output appears
    # section by section.
    sections = (
        ("\nColumns and dtypes:\n", lambda: df.dtypes),
        ("\nNon-null counts:\n", lambda: df.count()),
        ("\nMissing values per column:\n", lambda: df.isna().sum()),
    )
    for heading, compute in sections:
        print(heading, compute())

    print("\nBasic descriptive statistics for diet columns:\n")
    diet_summary = df[diet_cols].describe().T
    display(diet_summary)

# Print the summary report for `country_dietary` (defined outside this section).
overview(country_dietary)

In [None]:
def plot_distributions(df):
    """Plot a histogram for every diet column, then a scatter matrix of them.

    One 6x4 figure is created per column listed in the module-level
    ``diet_cols``; a final 12x12 figure holds the pairwise scatter matrix.

    Args:
        df: Frame containing every column named in ``diet_cols``.
    """
    # Imported locally so this function does not depend on how (or whether)
    # the rest of the file imports pandas' plotting helpers.
    from pandas.plotting import scatter_matrix

    for col in diet_cols:
        plt.figure(figsize=(6, 4))
        # Missing values are dropped before binning.
        data = df[col].dropna()
        plt.hist(data, bins=30)
        plt.title(f"Distribution: {col}")
        plt.xlabel(col)
        plt.ylabel("Count")

    # Scatter matrix of the diet columns. scatter_matrix creates its own
    # figure, so the size is passed to it directly instead of opening an
    # empty figure beforehand (which previously left a blank canvas).
    scatter_matrix(df[diet_cols], figsize=(12, 12))

# Draw the per-column histograms for `country_dietary` (defined outside this section).
plot_distributions(country_dietary)

In [None]:
def correlation_analysis(df):
    """Compute, save and plot the pairwise correlation of the diet columns.

    The matrix is written to ``diet_columns_correlation.csv`` (a relative
    path) and drawn as an annotated heatmap on a new 8x6 figure.

    Args:
        df: Frame containing every column named in the module-level
            ``diet_cols``.

    Returns:
        The correlation matrix produced by ``DataFrame.corr``.
    """
    corr = df[diet_cols].corr()
    corr.to_csv("diet_columns_correlation.csv")

    # Label the axes from the matrix itself so the ticks always match its
    # shape, even if corr() did not keep every requested column.
    labels = list(corr.columns)
    positions = range(len(labels))

    # Heatmap using matplotlib
    plt.figure(figsize=(8, 6))
    # A diverging colormap is dark at both -1 and +1 and light around 0,
    # which is what the white/black annotation rule below assumes.
    im = plt.imshow(corr, vmin=-1, vmax=1, cmap="coolwarm")
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.xticks(positions, labels, rotation=45, ha='right')
    plt.yticks(positions, labels)
    plt.title("Correlation matrix (diet variables)")

    # imshow puts row i on the y axis and column j on the x axis.
    for (i, j), val in np.ndenumerate(corr.values):
        text_color = 'white' if abs(val) > 0.5 else 'black'
        plt.text(j, i, f"{val:.2f}", ha='center', va='center',
                 fontsize=8, color=text_color)
    return corr

# Compute, save and plot the diet-column correlations for `country_dietary`
# (defined outside this section); the returned matrix is not assigned.
correlation_analysis(country_dietary)

# **Analysing WHO Datasets**

In [None]:
def _print_section(success_header, failure_message, compute):
    """Run one analysis step and print its result, or a warning if it fails.

    Args:
        success_header: Line printed above the result when ``compute`` succeeds.
        failure_message: Warning printed (with the error appended) on failure.
        compute: Zero-argument callable returning the object to print.
    """
    try:
        result = compute()
    except Exception as exc:
        # Best-effort report: one failing section must not stop the others.
        # Only Exception is caught so Ctrl-C / SystemExit still propagate,
        # and the reason is shown instead of being silently discarded.
        print(f"{failure_message} ({type(exc).__name__}: {exc})\n")
        return
    print(success_header)
    print(result, "\n")


def analyze_single_dataset(df, dataset_name="Dataset"):
    """Print a best-effort, multi-section summary of one dataset.

    Sections: yearly mean, top-10 countries by mean, region means, missing
    years per country, and yearly means split by sex and by age group. Each
    section reads ``Value_clean`` plus its grouping column(s) (``Year``,
    ``Country``, ``Region``, ``Sex``, ``AgeGroup``); a section whose
    computation fails prints a warning with the reason and is skipped.

    Args:
        df: Frame to analyse.
        dataset_name: Label shown in the banner.

    Returns:
        None. All output goes to stdout.
    """
    print("\n==============================")
    print(f" ANALYSIS FOR: {dataset_name}")
    print("==============================\n")

    def mean_by(keys):
        # Mean of Value_clean within each group of `keys`.
        return df.groupby(keys)["Value_clean"].mean()

    def missing_years():
        # Years absent between each country's first and last observed year.
        def gaps(years):
            return sorted(set(range(min(years), max(years) + 1)) - set(years))

        return df.groupby("Country")["Year"].apply(gaps).head()

    # 1. Year-wise trend
    _print_section(
        "✔ YEARLY TREND (global mean):",
        "⚠ Could not compute yearly trend",
        lambda: mean_by("Year").reset_index().head(),
    )

    # 2. Country ranking
    _print_section(
        "✔ TOP 10 COUNTRIES:",
        "⚠ Could not compute country ranking",
        lambda: mean_by("Country").sort_values(ascending=False).head(10),
    )

    # 3. Region-wise averages
    _print_section(
        "✔ REGION-WISE AVERAGE:",
        "⚠ Could not compute region trends",
        lambda: mean_by("Region").sort_values(ascending=False),
    )

    # 4. Missing years per country
    _print_section(
        "✔ MISSING YEARS PER COUNTRY:",
        "⚠ Could not compute missing years",
        missing_years,
    )

    # 5. Sex-wise trend (if exists)
    _print_section(
        "✔ SEX-WISE TREND:",
        "⚠ Sex data may be missing",
        lambda: mean_by(["Year", "Sex"]).reset_index().head(),
    )

    # 6. Age-group trend (if exists)
    _print_section(
        "✔ AGE-GROUP TREND:",
        "⚠ Age group data may be missing",
        lambda: mean_by(["Year", "AgeGroup"]).reset_index().head(),
    )

    print("------------------------------------------------------\n")



def analyze_all_datasets(datasets_dict):
    """Run ``analyze_single_dataset`` on every entry of ``datasets_dict``.

    Args:
        datasets_dict: Mapping whose keys are passed as the dataset label and
            whose values are passed as the data to analyse, in iteration order.
    """
    for entry in datasets_dict.items():
        label, frame = entry
        analyze_single_dataset(frame, label)


In [None]:
# Run the per-dataset analysis for every entry in `datasets` (defined outside this section).
analyze_all_datasets(datasets)