In [46]:
import pandas as pd
import datetime

In [57]:
# Read the participants' test CSV (located one directory above the working
# directory) into the notebook-level frame `test`.
test = pd.read_csv('../test_for_participants.csv')

In [47]:
# Read the training CSV (located one directory above the working directory).
data = pd.read_csv('../train.csv')
# Keep the raw `delivery_start` column untouched and store its parsed
# datetime version in a separate `date_start` column.
data["date_start"] = pd.to_datetime(data["delivery_start"])
# Last expression of the cell: display the column labels now present in `data`.
data.columns


Index(['id', 'target', 'market', 'global_horizontal_irradiance',
       'diffuse_horizontal_irradiance', 'direct_normal_irradiance',
       'cloud_cover_total', 'cloud_cover_low', 'cloud_cover_mid',
       'cloud_cover_high', 'precipitation_amount', 'visibility',
       'air_temperature_2m', 'apparent_temperature_2m',
       'dew_point_temperature_2m', 'wet_bulb_temperature_2m',
       'surface_pressure', 'freezing_level_height', 'relative_humidity_2m',
       'convective_available_potential_energy', 'lifted_index',
       'convective_inhibition', 'wind_speed_80m', 'wind_direction_80m',
       'wind_gust_speed_10m', 'wind_speed_10m', 'solar_forecast',
       'wind_forecast', 'load_forecast', 'delivery_start', 'delivery_end',
       'date_start'],
      dtype='object')

In [None]:
def add_time_dummies(df: pd.DataFrame) -> pd.DataFrame:
    """
    Append one 0/1 indicator column per month-hour combination of ``delivery_start``.

    Column labels follow the UPPERCASEMONTH_Hh pattern, e.g. ``JANUARY_1h``.
    All 12 * 24 combinations are declared up front, so every frame passed
    through this function receives the same set of columns regardless of which
    combinations actually occur in it. The first combination (``JANUARY_0h``)
    is dropped to avoid the dummy variable trap, leaving 287 indicator columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a ``delivery_start`` column that ``pd.to_datetime``
        can parse. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` (same rows, same index) with the indicator columns
        appended. Rows whose ``delivery_start`` is missing get 0 everywhere.

    Raises
    ------
    KeyError
        If ``df`` has no ``delivery_start`` column.
    """
    df = df.copy()

    # Ensure delivery_start is datetime type
    ds = pd.to_datetime(df["delivery_start"])

    # Full, fixed category list so train/test frames end up with identical columns
    all_months = ['JANUARY', 'FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE',
                  'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER', 'NOVEMBER', 'DECEMBER']
    all_categories = [f"{m}_{h}h" for m in all_months for h in range(24)]

    # Compute each row's position in `all_categories` from the numeric month and
    # hour instead of formatting strings: strftime('%B') depends on the process
    # locale, and a single missing timestamp turns the hour column into floats
    # ("1.0h"), either of which would make no label match at all.
    codes = (ds.dt.month - 1) * 24 + ds.dt.hour
    # -1 is the Categorical code for "missing"; such rows get all-zero dummies.
    codes = codes.fillna(-1).astype("int64")
    month_hour_cat = pd.Categorical.from_codes(codes.to_numpy(), categories=all_categories)

    # Create dummies, drop_first=True to avoid dummy trap
    dummies = pd.get_dummies(month_hour_cat, drop_first=True, dtype=int)
    # Use plain labels rather than the CategoricalIndex get_dummies produces.
    dummies.columns = list(dummies.columns)
    # get_dummies on a bare Categorical numbers its rows 0..n-1. concat(axis=1)
    # aligns on index labels, so reuse the caller's index to keep every dummy
    # row attached to the row it was computed from.
    dummies.index = df.index

    # Concatenate to the original dataframe
    return pd.concat([df, dummies], axis=1)

In [50]:
def add_variable_lags(df: pd.DataFrame, lag_configs: dict[str, list[int]]) -> pd.DataFrame:
    """
    Create lagged features for specific variables across lists of hours.

    For every ``variable`` / ``lag`` pair a column ``{variable}_lag_{lag}h`` is
    added, holding the value the same market had exactly ``lag`` hours before
    the row's ``delivery_start``. Values are looked up by timestamp rather than
    by row offset, so a missing or repeated hour inside a market never shifts
    the value of a different hour into the column; when the earlier hour is not
    present the lag is NaN. Lookups never cross markets.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``market`` and ``delivery_start`` columns plus every
        variable named in ``lag_configs``. The frame is not modified.
    lag_configs : dict[str, list[int]]
        Mapping of column name to the lags (in hours) to build, e.g.
        ``{"air_temperature_2m": [1, 24], "wind_speed_10m": [2, 6]}``.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` sorted by ``market`` then ``delivery_start`` (original
        index labels kept) with the lag columns appended.

    Raises
    ------
    KeyError
        If ``market``, ``delivery_start`` or a configured variable is missing.
    """
    df = df.copy().sort_values(["market", "delivery_start"])

    markets = df["market"]
    timestamps = pd.to_datetime(df["delivery_start"])

    # Rows lacking a market or a timestamp cannot act as the source of a lag.
    usable = (markets.notna() & timestamps.notna()).to_numpy()
    source_index = pd.MultiIndex.from_arrays([markets[usable], timestamps[usable]])
    # reindex needs a unique source; if a market repeats a timestamp, the last
    # such row in sorted order supplies the value.
    unique_rows = ~source_index.duplicated(keep="last")

    for variable, lags in lag_configs.items():
        lookup = df.loc[usable, variable]
        lookup.index = source_index
        lookup = lookup[unique_rows]
        for lag in lags:
            # Key of the observation each row wants: same market, `lag` hours earlier.
            wanted = pd.MultiIndex.from_arrays(
                [markets, timestamps - pd.Timedelta(hours=lag)]
            )
            lagged = lookup.reindex(wanted)
            # Rows come back in df's order; restore df's own index labels so the
            # assignment is a straight positional match.
            lagged.index = df.index
            df[f"{variable}_lag_{lag}h"] = lagged
    return df

# Lag configuration passed to add_variable_lags: column name -> list of lags.
lag = {
    "wind_speed_80m": [1,24]
}


# Returns a sorted copy with the lag columns appended; `data` itself is unchanged.
data_lagged = add_variable_lags(data, lag)


# dropna() with no arguments removes every row that has a NaN in ANY column,
# not only in the newly created lag columns.
df_cleaned = data_lagged.dropna()
df_cleaned


Unnamed: 0,id,target,market,global_horizontal_irradiance,diffuse_horizontal_irradiance,direct_normal_irradiance,cloud_cover_total,cloud_cover_low,cloud_cover_mid,cloud_cover_high,...,wind_gust_speed_10m,wind_speed_10m,solar_forecast,wind_forecast,load_forecast,delivery_start,delivery_end,date_start,wind_speed_80m_lag_1h,wind_speed_80m_lag_24h
120,120,-3.675,Market A,0.0,0.0,0.0,14.0,0.0,0.0,14.0,...,36.000000,23.400000,0.0,26847.8,37042.4075,2023-01-02 00:00:00,2023-01-02 01:00:00,2023-01-02 00:00:00,44.529541,31.253719
125,125,-14.183,Market A,0.0,0.0,0.0,12.0,0.0,0.0,12.0,...,36.719997,22.870626,0.0,26958.7,35521.2382,2023-01-02 01:00:00,2023-01-02 02:00:00,2023-01-02 01:00:00,44.412971,30.918108
130,130,-26.723,Market A,0.0,0.0,0.0,34.0,0.0,0.0,34.0,...,40.320000,26.208395,0.0,26667.8,34602.4094,2023-01-02 02:00:00,2023-01-02 03:00:00,2023-01-02 02:00:00,46.102493,26.983196
135,135,-29.322,Market A,0.0,0.0,0.0,13.0,0.0,0.0,13.0,...,35.279999,21.959999,0.0,25938.2,33971.1780,2023-01-02 03:00:00,2023-01-02 04:00:00,2023-01-02 03:00:00,47.762112,22.218153
140,140,-28.491,Market A,0.0,0.0,0.0,15.0,0.0,0.0,15.0,...,28.080000,16.981165,0.0,25008.5,34247.9417,2023-01-02 04:00:00,2023-01-02 05:00:00,2023-01-02 04:00:00,41.766209,27.210381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132581,133591,21.588,Market F,43.0,80.0,0.0,79.0,0.0,0.0,79.0,...,19.080000,13.363711,1743.7,6070.3,65458.1898,2025-08-31 19:00:00,2025-08-31 20:00:00,2025-08-31 19:00:00,11.983188,24.066206
132587,133599,10.068,Market F,0.0,0.0,0.0,100.0,1.0,0.0,100.0,...,38.160000,22.354811,11.9,6029.3,64513.0090,2025-08-31 20:00:00,2025-08-31 21:00:00,2025-08-31 20:00:00,18.391735,28.227304
132593,133607,11.095,Market F,0.0,0.0,0.0,25.0,3.0,0.0,25.0,...,38.160000,20.056877,0.0,5911.8,62863.3300,2025-08-31 21:00:00,2025-08-31 22:00:00,2025-08-31 21:00:00,34.516850,35.870163
132599,133615,19.942,Market F,0.0,0.0,0.0,7.0,7.0,0.0,0.0,...,38.519997,18.844202,0.0,5437.9,60269.8502,2025-08-31 22:00:00,2025-08-31 23:00:00,2025-08-31 22:00:00,32.900139,35.309376


In [None]:


def create_lagged_variable(data, hours: list, column: str) -> None:
    """
    Unimplemented placeholder for building lagged copies of a single column.

    The arguments are accepted but not used: the function performs no work and
    always returns None. Lag columns in this notebook are produced by
    `add_variable_lags` instead.
    """
    return None
