In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

# Read the raw ("bronze") open_meteo CSV into a DataFrame and report its dimensions.
bronze_in = Path("../Data_assignment_1/bronze/open_meteo.csv")
meteo_df = pd.read_csv(filepath_or_buffer=bronze_in)
print("Loaded from bronze:", meteo_df.shape)


Loaded from bronze: (8760, 6)


In [2]:
# Quick structural overview of the freshly loaded frame: column dtypes,
# missing-value counts per column, and the first three rows.
overview = {
    "Dtypes:\n": meteo_df.dtypes,
    "Nulls per column:\n": meteo_df.isna().sum(),
}
for heading, summary in overview.items():
    print(heading, summary, "\n")

print("Sample rows:")
display(meteo_df.head(3))

Dtypes:
 time                       object
temperature_2m (°C)       float64
precipitation (mm)        float64
wind_speed_10m (m/s)      float64
wind_gusts_10m (m/s)      float64
wind_direction_10m (°)      int64
dtype: object 

Nulls per column:
 time                      0
temperature_2m (°C)       0
precipitation (mm)        0
wind_speed_10m (m/s)      0
wind_gusts_10m (m/s)      0
wind_direction_10m (°)    0
dtype: int64 

Sample rows:


Unnamed: 0,time,temperature_2m (°C),precipitation (mm),wind_speed_10m (m/s),wind_gusts_10m (m/s),wind_direction_10m (°)
0,2020-01-01T00:00,-2.2,0.1,9.6,21.3,284
1,2020-01-01T01:00,-2.2,0.0,10.6,23.0,282
2,2020-01-01T02:00,-2.3,0.0,11.0,23.5,284


In [3]:
# Normalise every header the same way: trim surrounding whitespace,
# lowercase it, then turn each space into an underscore.
meteo_df.columns = [
    header.strip().lower().replace(" ", "_")
    for header in meteo_df.columns
]

print("Standardized column names:\n", list(meteo_df.columns))


Standardized column names:
 ['time', 'temperature_2m_(°c)', 'precipitation_(mm)', 'wind_speed_10m_(m/s)', 'wind_gusts_10m_(m/s)', 'wind_direction_10m_(°)']


In [4]:
# Row counts on either side of the de-duplication give the number removed.
# keep="first" retains the first occurrence of each fully identical row.
before = meteo_df.shape[0]
meteo_df = meteo_df.drop_duplicates(keep="first")
after = meteo_df.shape[0]
print(f"Removed {before - after} exact duplicate rows")


Removed 0 exact duplicate rows


In [5]:
# Show before
print("Nulls before fill:\n", meteo_df.isna().sum())

num_cols = meteo_df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in meteo_df.columns if c not in num_cols]

# The timestamp identifies each observation, so it must never be imputed:
# copying a neighbouring row's value into a missing `time` would silently
# create duplicate timestamps. Rows without a timestamp are dropped instead,
# and `time` is left out of the non-numeric fill below.
if "time" in meteo_df.columns and meteo_df["time"].isna().any():
    before = len(meteo_df)
    # .copy() so the column assignments below act on an independent frame
    meteo_df = meteo_df.dropna(subset=["time"]).copy()
    print(f"Dropped {before - len(meteo_df)} rows with a missing timestamp")
fill_cat_cols = [c for c in cat_cols if c != "time"]

# For time series, ffill/bfill is typically the least invasive.
# Each fill is guarded so an empty column selection is never assigned.
if num_cols:
    meteo_df[num_cols] = meteo_df[num_cols].ffill().bfill()
if fill_cat_cols:
    meteo_df[fill_cat_cols] = meteo_df[fill_cat_cols].ffill().bfill()

# As a final guardrail, drop rows that are still all NA (should be none)
before = len(meteo_df)
meteo_df = meteo_df.dropna(how="all")
after = len(meteo_df)
print(f"Dropped {before - after} fully-empty rows (after fill)")

# Show after
print("Nulls after fill:\n", meteo_df.isna().sum())


Nulls before fill:
 time                      0
temperature_2m_(°c)       0
precipitation_(mm)        0
wind_speed_10m_(m/s)      0
wind_gusts_10m_(m/s)      0
wind_direction_10m_(°)    0
dtype: int64
Dropped 0 fully-empty rows (after fill)
Nulls after fill:
 time                      0
temperature_2m_(°c)       0
precipitation_(mm)        0
wind_speed_10m_(m/s)      0
wind_gusts_10m_(m/s)      0
wind_direction_10m_(°)    0
dtype: int64


In [6]:
# Closing sanity check: dimensions, dtypes, and whether the `time` column
# (compared as stored in the frame) never decreases from one row to the next.
print("Final shape:", meteo_df.shape)
print("Dtypes:\n", meteo_df.dtypes)

has_time = "time" in meteo_df.columns
if has_time:
    is_monotonic = meteo_df["time"].is_monotonic_increasing
    print("Time monotonic increasing:", is_monotonic)


Final shape: (8760, 6)
Dtypes:
 time                       object
temperature_2m_(°c)       float64
precipitation_(mm)        float64
wind_speed_10m_(m/s)      float64
wind_gusts_10m_(m/s)      float64
wind_direction_10m_(°)      int64
dtype: object
Time monotonic increasing: True


In [7]:
# Persist the cleaned dataset to two places:
#   1. the silver layer of the assignment data folder, and
#   2. the Streamlit app's data folder, so the app can read it directly.
# Previously the second path overwrote `silver_dir` before anything was written,
# so the silver-layer copy was never produced on a fresh run.
silver_dir = Path("../Data_assignment_1/silver")
streamlit_dir = Path("../streamlit/Data")

for out_dir in (silver_dir, streamlit_dir):
    # Create the target folder (and any missing parents) if it does not exist yet.
    out_dir.mkdir(parents=True, exist_ok=True)

    silver_out = out_dir / "open_meteo_clean.csv"
    meteo_df.to_csv(silver_out, index=False)
    print(f"Saved silver dataset to {silver_out} with shape {meteo_df.shape}")


Saved silver dataset to ..\streamlit\Data\open_meteo_clean.csv with shape (8760, 6)
