# Data Cleaning (via pipeline)

This notebook calls the `clean_bronze_data` function from `src.pipelines.data.bronze_to_silver`.

The same function is used by the DVC pipeline — this notebook is just for interactive inspection.

In [None]:
import os
import sys

sys.path.insert(0, os.path.abspath(".."))

import pandas as pd

from src.pipelines.data.bronze_to_silver import clean_bronze_data

# Relative locations of the raw inputs (resolved against the current
# working directory).
CLIMATE_PATH = "../raw_data/climate_data_from_1982.parquet"
YIELD_PATH = "../raw_data/barley_yield_from_1982.csv"

# The yield file is a semicolon-separated CSV; the climate file is Parquet.
df_yield_raw = pd.read_csv(YIELD_PATH, sep=";")
df_climate_raw = pd.read_parquet(CLIMATE_PATH)

# clean_bronze_data returns a (yield, climate) pair of cleaned tables.
cleaned_frames = clean_bronze_data(df_yield_raw, df_climate_raw)
yield_clean, climate_clean = cleaned_frames

In [None]:
# Overview of the cleaned yield table: dimensions, first rows, and the
# number of missing values per column.
print("Yield clean:", yield_clean.shape)
display(yield_clean.head())

yield_missing_counts = yield_clean.isna().sum()
print("\nMissing:", yield_missing_counts, sep="\n")

In [None]:
# Overview of the cleaned climate table: dimensions, the distinct values of
# the "scenario" column, first rows, and missing values per column.
print("Climate clean:", climate_clean.shape)

scenario_values = climate_clean["scenario"].unique().tolist()
print("Scenarios:", scenario_values)

display(climate_clean.head(10))

climate_missing_counts = climate_clean.isna().sum()
print("\nMissing:", climate_missing_counts, sep="\n")

In [None]:
# Sanity check: every (dept, year) pair in the yield table should also be
# present in the climate rows whose scenario is "historical".
hist = climate_clean[climate_clean["scenario"] == "historical"]

# strict=True makes zip raise if the two columns ever differ in length.
yield_keys = set(zip(yield_clean["nom_dep"], yield_clean["year"], strict=True))
climate_keys = set(zip(hist["nom_dep"], hist["year"], strict=True))

# Pairs that appear in the yield data but have no historical climate rows.
no_match = yield_keys - climate_keys
print(f"Yield (dept, year) with no historical climate: {len(no_match)}")
if not no_match:
    print("All yield rows have matching climate data.")
else:
    # A bare count is not actionable, so list a few of the offending pairs.
    # key=str keeps the ordering deterministic and avoids a TypeError if a
    # key component has mixed types (e.g. a missing department name).
    preview_limit = 10
    for dept, year in sorted(no_match, key=str)[:preview_limit]:
        print(f"  - {dept}, {year}")
    if len(no_match) > preview_limit:
        print(f"  ... and {len(no_match) - preview_limit} more")

In [None]:
# Write both cleaned tables as Parquet files under outputs/ for quick access
import os

os.makedirs("outputs", exist_ok=True)

# Each table is written without its index, one file per table.
for frame, target in (
    (yield_clean, "outputs/yield_clean.parquet"),
    (climate_clean, "outputs/climate_clean.parquet"),
):
    frame.to_parquet(target, index=False)

print("Saved to outputs/")