In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up on the next cell execution without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import data_settings as settings
from preprocessing import filter_column_based_on_quantile, normalize_based_on_other_df, avoid_structural_imbalance, add_date_time_features

In [3]:
# Build ./data/test_dataset.csv from the validation split: run the project's
# preprocessing helpers on it (using the training split as the reference
# frame), add lag features of the target `y`, and write the result to disk.

# Quantile passed to filter_column_based_on_quantile for both splits.
CLAMP_QUANTILE = 0.001
# Number of rows spanning 24 hours; presumes one row every 5 minutes -- TODO confirm.
ROWS_PER_DAY = 24 * 60 // 5

df = pd.read_csv("./data/no1_validation.csv")
df_train = pd.read_csv("./data/no1_train.csv")

# The training split is only used here as the second argument of
# normalize_based_on_other_df; it receives the same quantile filtering first.
df_train = filter_column_based_on_quantile(
    df_train, CLAMP_QUANTILE, settings.COLUMNS_TO_CLAMP
)
df = filter_column_based_on_quantile(df, CLAMP_QUANTILE, settings.COLUMNS_TO_CLAMP)
df = normalize_based_on_other_df(df, df_train, settings.COLUMNS_TO_NORMALIZE)
if settings.AVOID_STRUCTURAL_IMBALANCE:
    df = avoid_structural_imbalance(df)
df = add_date_time_features(df)

# Lag features: y one row back, and y ROWS_PER_DAY rows back (24 hours ago,
# provided the rows are evenly spaced with no gaps).
df = df.assign(
    y_prev=df["y"].shift(1),
    y_prev_24h=df["y"].shift(ROWS_PER_DAY),
)

# Daily mean of y, shifted one day forward so that each calendar day is
# labelled with the mean of the day before it.
# NOTE(review): resample(on="start_time") needs a datetime-like column; this
# presumably relies on an earlier helper having parsed it -- confirm.
y_daily_mean_lagged = df.resample("D", on="start_time")["y"].mean().shift(1)

df = (
    # Backward as-of join: every row picks up the entry for its own day,
    # i.e. yesterday's mean.
    pd.merge_asof(df, y_daily_mean_lagged, left_on="start_time", right_index=True)
    # Both sides carry a column named "y", so pandas suffixes them _x / _y.
    .rename(columns={"y_x": "y", "y_y": "y_yesterday"})
    # The shifts above leave NaNs in the leading rows; rows with any NaN go.
    .dropna()
    .drop(columns=settings.COLUMNS_TO_DROP)
)

# Move y_prev to the last column position, keeping the other columns in order.
column_order = [name for name in df.columns if name != "y_prev"]
column_order.append("y_prev")
df = df[column_order]

df.to_csv("./data/test_dataset.csv", index=False)