Imports

In [10]:
import pandas as pd
from pathlib import Path
import os

In [9]:
# Choose the parquet backend used when writing output files: pyarrow is
# preferred, fastparquet is the fallback, and PARQUET_ENGINE stays None
# (after printing an install hint) when neither package can be imported.
PARQUET_ENGINE = None
try:
    import pyarrow
except ImportError:
    try:
        import fastparquet
    except ImportError:
        print("Warning: No parquet engine found. Installing pyarrow recommended.")
        print("Run: pip install pyarrow")
    else:
        PARQUET_ENGINE = 'fastparquet'
else:
    PARQUET_ENGINE = 'pyarrow'

Constants

In [3]:
# Variables to process, in processing order. Each name doubles as the prefix
# of that variable's monthly CSV filenames and as the stem of the parquet
# file written for it.
# NOTE(review): the names look like land-surface model output variables
# (e.g. GLDAS) -- confirm the data source and record it here.
PARAMS = [
    "AvgSurfT_inst", "CanopInt_inst", "LWdown_f_tavg",
    "Psurf_f_inst", "Qair_f_inst", "SnowDepth_inst",
    "SWdown_f_tavg", "Tair_f_inst", "TVeg_tavg",
    "Wind_f_inst", "Rainf_tavg",
]

# How each variable's sub-daily values are reduced to a single daily value.
# Variables missing from this mapping fall back to 'mean' at the point of use.
# NOTE(review): SnowDepth_inst and CanopInt_inst carry the same `_inst`
# suffix as the averaged variables. If they are instantaneous readings,
# summing a day's samples scales the level by the number of samples instead
# of accumulating it -- confirm that 'sum' is really intended for them.
AGGREGATION = {
    # Totalled over the day.
    **dict.fromkeys(
        (
            'Rainf_tavg',       # rain
            'SnowDepth_inst',   # snow
            'CanopInt_inst',    # canopy water
        ),
        'sum',
    ),
    # Averaged over the day.
    **dict.fromkeys(
        (
            'Tair_f_inst',      # air temperature
            'AvgSurfT_inst',    # surface temperature
            'Psurf_f_inst',     # pressure
            'Qair_f_inst',      # humidity
            'Wind_f_inst',      # wind speed
            'LWdown_f_tavg',    # longwave radiation
            'SWdown_f_tavg',    # shortwave radiation
            'TVeg_tavg',        # transpiration
        ),
        'mean',
    ),
}

# Month labels, in chronological order, used verbatim to build the raw CSV
# filenames. The month spellings are not uniform ('March', 'Aug', 'Sept'),
# so each label is spelled out rather than derived from a calendar.
DATES = [
    f'{year}_{month}'
    for year, months in (
        ('2024', ('March', 'April', 'May', 'June', 'July',
                  'Aug', 'Sept', 'Oct', 'Nov', 'Dec')),
        ('2025', ('Jan', 'Feb')),
    )
    for month in months
]

Read in the monthly CSVs, aggregate the 3-hourly records up to daily granularity, and restructure the data into column-wise lags (one column per lag)

In [6]:
def read_raw_csv(filename: str, raw_dir: "str | Path" = 'data/raw') -> pd.DataFrame:
    """Read one raw CSV file into a pandas DataFrame.

    Args:
        filename: Name of the CSV file, relative to ``raw_dir``.
        raw_dir: Directory that holds the raw CSVs, as a string or ``Path``.
            Defaults to ``data/raw`` (resolved against the current working
            directory), the location that used to be hard-coded here, so
            existing single-argument calls behave exactly as before.

    Returns:
        The file's contents as parsed by ``pd.read_csv`` with default options.

    Raises:
        Whatever ``pd.read_csv`` raises, e.g. ``FileNotFoundError`` when the
        file does not exist.
    """
    file_path = Path(raw_dir) / filename
    return pd.read_csv(file_path)

In [13]:
# Build one parquet file per parameter containing daily values plus lag columns.
#
# For every parameter: load each monthly CSV, reduce the sub-daily rows to one
# value per (longitude, latitude, year, month, day), stack all months, sort
# chronologically within each grid cell, add lag1..lag13 columns, and write
# the result to data/interim/<parameter>.parquet.

LOCATION_KEYS = ['longitude', 'latitude']
DAY_KEYS = LOCATION_KEYS + ['year', 'month', 'day']
MAX_LAG = 13  # lag columns run from <parameter>_lag1 to <parameter>_lag13
INTERIM_DIR = os.path.join('data', 'interim')

# Fail before any heavy processing: with engine=None pandas would only
# reject the write after a whole parameter had already been loaded and
# aggregated, and with a far less helpful message.
if PARQUET_ENGINE is None:
    raise ImportError(
        "No parquet engine available; install pyarrow (pip install pyarrow) "
        "before running this cell."
    )

# to_parquet does not create missing parent directories.
os.makedirs(INTERIM_DIR, exist_ok=True)

for parameter in PARAMS:

    print("-" * 50)
    print(f'Parameter: {parameter}')
    print("-" * 50)

    monthly_daily_dfs = []
    skipped_files = []

    for date in DATES:

        print(f'date: {date}')
        file = parameter + '_data_' + date + '.csv'
        print("file: " + file)

        try:
            df = read_raw_csv(file)
        except Exception as e:
            # Skip this month. Falling through would either hit an undefined
            # `df` (first file) or silently re-aggregate the previous month's
            # frame and append it a second time.
            print(f"Error: {e}")
            skipped_files.append(file)
            print("-" * 50)
            continue

        print(f"Successfully loaded {file}")
        print(f"Shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")

        # The value column is taken to be the last one; the lag step further
        # down selects it by parameter name, so the two are expected to match.
        final_col = df.columns[-1]
        missing_count = df[final_col].isna().sum()
        print(f"Missing values in '{final_col}': {missing_count}")

        # Print ranges of longitude and latitude
        lon_range = (df['longitude'].min(), df['longitude'].max())
        lat_range = (df['latitude'].min(), df['latitude'].max())
        print(f"Longitude range: {lon_range}")
        print(f"Latitude range: {lat_range}")

        # Aggregate 3-hourly data to daily; parameters without an explicit
        # entry in AGGREGATION are averaged.
        daily_df = df.groupby(DAY_KEYS).agg({
            final_col: AGGREGATION.get(parameter, 'mean')
        }).reset_index()

        print(f"Original shape: {df.shape}")
        print(f"Daily aggregated shape: {daily_df.shape}")

        monthly_daily_dfs.append(daily_df)

        print("-" * 50)

    if skipped_files:
        print(f"Warning: {len(skipped_files)} file(s) could not be read for {parameter}: {skipped_files}")
        print("Lag columns will span the missing dates.")

    if monthly_daily_dfs:

        consolidated_df = pd.concat(monthly_daily_dfs, ignore_index=True)
        print(f"Shape: {consolidated_df.shape}")
        print(f"Columns: {list(consolidated_df.columns)}")

        df_sorted = consolidated_df.sort_values(DAY_KEYS).reset_index(drop=True)

        # shift() moves values by rows within each (longitude, latitude)
        # group, so lagN means "N days earlier" only when every grid cell has
        # exactly one row for each consecutive day.
        for lag in range(1, MAX_LAG + 1):
            df_sorted[f'{parameter}_lag{lag}'] = df_sorted.groupby(LOCATION_KEYS)[parameter].shift(lag)

        output_path = os.path.join(INTERIM_DIR, f"{parameter}.parquet")
        df_sorted.to_parquet(output_path, index=False, engine=PARQUET_ENGINE)
    else:
        print(f"No data loaded for {parameter}; nothing written.")

--------------------------------------------------
Parameter: AvgSurfT_inst
--------------------------------------------------
date: 2024_March
file: AvgSurfT_inst_data_2024_March.csv
Successfully loaded AvgSurfT_inst_data_2024_March.csv
Shape: (3759432, 7)
Columns: ['year', 'month', 'day', 'hour', 'longitude', 'latitude', 'AvgSurfT_inst']
Missing values in 'AvgSurfT_inst': 0
Longitude range: (np.float64(-179.5), np.float64(179.5))
Latitude range: (np.float64(-54.5), np.float64(83.5))
Original shape: (3759432, 7)
Daily aggregated shape: (469929, 6)
--------------------------------------------------
date: 2024_April
file: AvgSurfT_inst_data_2024_April.csv
Successfully loaded AvgSurfT_inst_data_2024_April.csv
Shape: (3638160, 7)
Columns: ['year', 'month', 'day', 'hour', 'longitude', 'latitude', 'AvgSurfT_inst']
Missing values in 'AvgSurfT_inst': 0
Longitude range: (np.float64(-179.5), np.float64(179.5))
Latitude range: (np.float64(-54.5), np.float64(83.5))
Original shape: (3638160, 7)
Da