In [None]:
import pandas as pd

In [None]:
def process_raw_data(df, target, date, output_path="example.csv"):
    """Regularise a 15-minute series, impute gaps and export daily totals.

    The pipeline is:

    1. Parse ``date`` to datetimes and sort by it.
    2. Build a 15-minute grid from the first to the last timestamp
       (both inclusive) and outer-merge the data onto it, so missing
       timestamps become rows with a missing ``target``.
    3. Convert ``target`` to float. Non-numeric columns are treated as
       strings that use a decimal comma (``"1,5"`` -> ``1.5``).
    4. Fill missing ``target`` values with the most frequent observed value
       for the same (month, day of week, hour, minute) combination.
       Values with no matching combination stay missing.
    5. Sum per calendar day (missing values are skipped by the sum), round
       to 4 decimals and rename the columns to ``datetime`` and ``y``.
    6. Write the result to ``output_path`` as CSV without the index.

    Args:
        df: Input frame containing at least the ``date`` and ``target``
            columns. It is not modified.
        target: Name of the value column.
        date: Name of the timestamp column (anything ``pd.to_datetime``
            accepts).
        output_path: Destination of the CSV file. Defaults to
            ``"example.csv"``, the location the function always wrote to
            before this parameter existed.

    Returns:
        The daily DataFrame with columns ``datetime`` and ``y`` (the same
        data that is written to ``output_path``).
    """
    print("Preprocessing data: it will take around 15s")

    # Copy before parsing: assigning into `df` directly would change the
    # dtype of the caller's column as a hidden side effect.
    df = df.copy()
    df[date] = pd.to_datetime(df[date])
    df = df.sort_values(date)

    # Fill out missing dates
    grid = pd.DataFrame(
        {
            date: pd.date_range(
                start=df[date].iloc[0],
                end=df[date].iloc[-1],
                freq="15min",
                inclusive="both",
            )
        }
    )
    merged = pd.merge(grid, df, on=date, how="outer")

    # Format decimal numbers. The `.str` accessor only exists for string
    # data, so an already-numeric column must skip the comma replacement.
    values = merged[target]
    if not pd.api.types.is_numeric_dtype(values):
        values = values.str.replace(",", ".", regex=False)
    merged[target] = values.astype(float)

    # Prepare columns for data filling
    merged["month"] = merged[date].dt.month
    merged["dow"] = merged[date].dt.dayofweek
    merged["hour"] = merged[date].dt.hour
    merged["minute"] = merged[date].dt.minute
    keys = ["month", "dow", "hour", "minute"]

    # Most frequent observed value per (month, day of week, hour, minute)
    modes = (
        merged.dropna(subset=[target])
        .groupby(keys, observed=True)[target]
        .agg(lambda s: s.value_counts().idxmax())
        .rename("value_mode")
        .reset_index()
    )
    filled = merged.merge(modes, on=keys, how="left")

    # Fill out missing values with that most frequent value, then discard
    # the helper columns.
    filled[target] = filled[target].fillna(filled["value_mode"])
    filled = filled.drop(columns=keys + ["value_mode"])

    # Aggregate to daily level
    daily = filled.resample("D", on=date)[target].sum().reset_index()

    # Round and rename columns
    daily[target] = daily[target].round(4)
    daily = daily.rename(columns={date: "datetime", target: "y"})

    daily.to_csv(output_path, index=False)
    return daily