In [None]:
# =========================
# 0) Imports & paths
# =========================
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Display settings: allow up to 200 columns and 160 characters of width.
pd.options.display.max_columns = 200
pd.options.display.width = 160

# Locations of the two raw input files (relative paths).
YIELD_PATH = "../raw_data/barley_yield_from_1982.csv"
CLIMATE_PATH = "../raw_data/climate_data_from_1982.parquet"

In [None]:
# =========================
# 1) Load the raw inputs
# =========================
# The yield CSV is ';'-delimited, so the separator has to be passed explicitly.
df_yield = pd.read_csv(YIELD_PATH, sep=";")
df_climate = pd.read_parquet(CLIMATE_PATH)

loaded_frames = {"df_yield": df_yield, "df_climate": df_climate}

# Print both shapes first, then show a preview of each frame.
for label, frame in loaded_frames.items():
    print(f"{label}:", frame.shape)

for frame in loaded_frames.values():
    display(frame.head())

In [None]:
# =========================
# 2) Utility functions (EDA helpers)
# =========================
def clean_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalized, ASCII-only snake_case column names.

    Each column name is converted to ``str``, stripped of surrounding
    whitespace, lower-cased and Unicode-decomposed (NFKD) so that accented
    letters keep their base letter ("é" -> "e") instead of being deleted.
    Runs of whitespace then become a single underscore, and every character
    outside ``[a-z0-9_]`` is removed.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` carrying the cleaned column names.
    """
    cleaned = df.copy()
    cleaned.columns = (
        cleaned.columns.astype(str)
        .str.strip()
        .str.lower()
        # NFKD splits "é" into "e" plus a combining accent. The accent is then
        # removed by the final regex, so "Département" becomes "departement"
        # rather than "dpartement".
        .str.normalize("NFKD")
        .str.replace(r"\s+", "_", regex=True)
        .str.replace(r"[^a-z0-9_]", "", regex=True)
    )
    return cleaned


def overview(df: pd.DataFrame, name: str):
    """Print and display a quick structural summary of ``df``.

    Shows, in order: a banner with ``name``, the shape, the first five rows,
    the column dtypes (as strings, sorted), the 25 columns with the highest
    percentage of missing values, and the number of fully duplicated rows.
    Returns ``None``.
    """
    banner = "=" * 90
    print(f"\n{banner}\n{name}\n{banner}")
    print("shape:", df.shape)
    display(df.head(5))

    print("\n--- dtypes ---")
    # dtype objects are cast to str so they can be sorted.
    dtype_names = df.dtypes.astype(str)
    display(dtype_names.sort_values().to_frame("dtype"))

    print("\n--- missing (% top 25) ---")
    missing_share = df.isna().mean().sort_values(ascending=False)
    display((missing_share.head(25) * 100).to_frame("% missing"))

    print("\n--- duplicates ---")
    n_duplicates = df.duplicated().sum()
    print("duplicate rows:", n_duplicates)


def numeric_stats(df: pd.DataFrame, name: str):
    """List the numeric columns of ``df`` and display their describe() table.

    The column list is always printed, prefixed with ``name``. The transposed
    ``describe()`` summary is only displayed when at least one numeric column
    exists. Returns ``None``.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    print(f"\n{name} - numeric columns:", list(numeric_part.columns))
    if len(numeric_part.columns) > 0:
        display(numeric_part.describe().T)


def categorical_stats(df: pd.DataFrame, name: str, topk=15, max_cols=8):
    """Summarize the categorical-like columns of ``df``.

    Columns of dtype object, string, category or bool are selected. Their
    names are printed (prefixed with ``name``), then their distinct-value
    counts (NaN excluded) are displayed in descending order. For the
    ``max_cols`` columns with the most distinct values, the ``topk`` most
    frequent values (NaN included) are displayed. Returns ``None``.
    """
    categorical_like = ["object", "string", "category", "bool"]
    cat_cols = list(df.select_dtypes(include=categorical_like).columns)
    print(f"\n{name} - categorical columns:", cat_cols)
    if len(cat_cols) == 0:
        return

    # Cardinality per column, highest first.
    card = df[cat_cols].nunique(dropna=True).sort_values(ascending=False)
    display(card.to_frame("n_unique"))

    for column in card.head(max_cols).index:
        print(f"\nTop values for '{column}'")
        frequencies = df[column].value_counts(dropna=False)
        display(frequencies.head(topk).to_frame("count"))

In [None]:
# =========================
# 3) Normalize column names, then print the basic overviews
# =========================
# clean_cols returns a renamed copy, so rebinding the names is safe to re-run.
df_yield = clean_cols(df=df_yield)
df_climate = clean_cols(df=df_climate)

# Structural overview of both frames, yield first.
for label, frame in (("df_yield", df_yield), ("df_climate", df_climate)):
    overview(frame, label)

# The yield table gets the full numeric and categorical summaries.
numeric_stats(df=df_yield, name="df_yield")
categorical_stats(df=df_yield, name="df_yield")

# df_climate is huge, so only a reduced categorical summary is run here; its
# other summaries are handled further down.
categorical_stats(df_climate, "df_climate", max_cols=6, topk=10)

In [None]:
# =========================
# 4) df_yield checks
# =========================
# Columns expected here (per the original note): department, year, yield, area,
# production
print(df_yield.columns)

# Convert the numeric columns. errors="coerce" turns every unparseable entry
# into NaN, so count the values lost that way and report them rather than
# dropping information silently (decimal commas, for instance, would all
# become NaN).
for c in ["year", "yield", "area", "production"]:
    if c in df_yield.columns:
        converted = pd.to_numeric(df_yield[c], errors="coerce")
        n_coerced = int((converted.isna() & df_yield[c].notna()).sum())
        if n_coerced:
            print(
                f"WARNING: {n_coerced} value(s) in '{c}' could not be parsed "
                "as numbers and became NaN"
            )
        df_yield[c] = converted

# Basic sanity check on the covered period
if "year" in df_yield.columns:
    print("Year range:", df_yield["year"].min(), "->", df_yield["year"].max())

# Quick look at the yield distribution
if "yield" in df_yield.columns:
    fig, ax = plt.subplots()
    ax.hist(df_yield["yield"].dropna(), bins=40)
    ax.set_title("Yield distribution")
    ax.set_xlabel("yield")
    ax.set_ylabel("count")
    plt.show()

# Use the first department-like column that exists (None if there is none),
# then rank departments by their average yield.
dep_col = next(
    (
        candidate
        for candidate in ["department", "nom_dep", "dep", "departement"]
        if candidate in df_yield.columns
    ),
    None,
)

if dep_col is not None and "yield" in df_yield.columns:
    dep_summary = (
        df_yield.groupby(dep_col)["yield"]
        .agg(count="count", mean="mean", median="median")
        .sort_values("mean", ascending=False)
    )
    display(dep_summary.head(20))

In [None]:
# =========================
# 5) df_climate structure checks
# =========================
print(df_climate.columns)

# Parse "time" unless it is already a datetime column.
# pd.api.types.is_datetime64_any_dtype is used instead of np.issubdtype because
# np.issubdtype raises TypeError on pandas extension dtypes (timezone-aware
# datetimes, categoricals, nullable strings).
if "time" in df_climate.columns and not pd.api.types.is_datetime64_any_dtype(
    df_climate["time"]
):
    # errors="coerce": unparseable timestamps become NaT instead of raising.
    df_climate["time"] = pd.to_datetime(df_climate["time"], errors="coerce")

# Key distributions (fast-ish even on large data)
for c in ["scenario", "year", "metric"]:
    if c in df_climate.columns:
        print(f"\nValue counts for {c} (top 20):")
        display(df_climate[c].value_counts(dropna=False).head(20).to_frame("count"))

# Date range covered by the climate data
if "time" in df_climate.columns:
    print("\nTime range:", df_climate["time"].min(), "->", df_climate["time"].max())

In [None]:
# =========================
# 6) Missingness & numeric stats
# =========================
# Percentage of missing values per column over the full frame, worst first.
missing_share = df_climate.isna().mean()
miss_cl = missing_share.sort_values(ascending=False) * 100
display(miss_cl.head(20).to_frame("% missing"))

# describe() on the full data.
numeric_stats(df=df_climate, name="df_climate")

# Plots use a fixed-seed random sample, capped at the frame's row count.
SAMPLE_N = 300_000  # adjust if too heavy/too light
df_climate_s = df_climate.sample(
    n=min(SAMPLE_N, df_climate.shape[0]),
    random_state=42,
)

fig, ax = plt.subplots()
ax.hist(df_climate_s["value"].dropna(), bins=60)
ax.set(
    title="Climate 'value' distribution (sample)",
    xlabel="value",
    ylabel="count",
)
plt.show()

In [None]:
# =========================
# 7) Yearly aggregation (key step)
# =========================
# Group on whichever of the expected key columns are actually present.
group_cols = [
    c
    for c in ["scenario", "code_dep", "nom_dep", "year", "metric"]
    if c in df_climate.columns
]
print("Grouping on:", group_cols)

value_by_group = df_climate.groupby(group_cols)["value"]

# Built-in reductions (no Python-level callbacks).
basic_stats = value_by_group.agg(
    n="count",
    mean="mean",
    std="std",
    min="min",
    max="max",
)

# All three percentiles in one vectorized GroupBy.quantile call. This avoids
# calling a Python lambda once per group for each percentile, which is slow on
# a frame this large. The quantile level is pivoted into columns and renamed.
quantiles = (
    value_by_group.quantile([0.05, 0.50, 0.95])
    .unstack(level=-1)
    .rename(columns={0.05: "p05", 0.50: "p50", 0.95: "p95"})
)

# Both frames come from the same groupby, so they share the same index. The
# explicit column list restores the original column order, and reset_index()
# turns the group keys back into regular columns.
climate_yearly = basic_stats.join(quantiles)[
    ["n", "mean", "std", "min", "p05", "p50", "p95", "max"]
].reset_index()

print("climate_yearly:", climate_yearly.shape)
display(climate_yearly.head())

In [None]:
# =========================
# 8) Plot: yearly mean by scenario for a chosen metric
# =========================
# Pick the most frequent metric automatically. metric_choice stays None when
# there is no "metric" column or the column has no non-null values (indexing
# an empty value_counts() result would raise IndexError).
metric_choice = None
if "metric" in df_climate.columns:
    metric_counts = df_climate["metric"].value_counts()
    if not metric_counts.empty:
        metric_choice = metric_counts.index[0]
print("Chosen metric:", metric_choice)

# Compare against None explicitly so a falsy metric label (0, "") still plots.
if metric_choice is not None:
    tmp = climate_yearly[climate_yearly["metric"] == metric_choice].copy()

    # Plot at most 3 scenarios; a single unlabelled curve if there is no
    # scenario column.
    if "scenario" in tmp.columns:
        scenarios = tmp["scenario"].dropna().unique()[:3]
    else:
        scenarios = [None]

    for sc in scenarios:
        plot_df = tmp if sc is None else tmp[tmp["scenario"] == sc]

        # Unweighted mean of the per-group yearly means: a national-ish curve.
        national = plot_df.groupby("year", as_index=False)["mean"].mean()

        title = f"Mean yearly {metric_choice} (avg across deps)"
        if sc is not None:
            # \u2014 is an em dash. It is written as an escape so the title
            # cannot be garbled again by a file re-encoding.
            title += f" \u2014 {sc}"

        fig, ax = plt.subplots()
        ax.plot(national["year"], national["mean"])
        ax.set_title(title)
        ax.set_xlabel("year")
        ax.set_ylabel("mean(value)")
        plt.show()