# Inputs

In [24]:
import pandas as pd
import numpy as np 
from pathlib import Path

In [25]:
# Source (silver) and destination (gold) data directories, relative to the
# notebook's working directory. Both are plain strings ending in a slash.
input_path, output_path = "../data/silver/", "../data/gold/"

# Create the gold directory (and any missing parents); a pre-existing
# directory is not an error.
Path(output_path).mkdir(exist_ok=True, parents=True)

In [26]:
# Load the three silver-layer tables with the pyarrow engine. The tuple order
# below matches the order of the names being assigned.
df_train, df_weather, df_building = (
    pd.read_parquet(input_path + file_name, engine='pyarrow')
    for file_name in (
        "train.parquet",
        "weather_train.parquet",
        "building_metadata.parquet",
    )
)

# Processing

In [None]:
# Gold-layer composite keys: the building id concatenated, with underscores,
# to calendar components of the reading timestamp (month, hour, and
# day-of-week + hour; pandas numbers day-of-week from 0 = Monday).
_building = df_train['building_id'].astype(str)
_stamp = df_train['timestamp'].dt
_hour = _stamp.hour.astype(str)

df_train['building_month'] = _building + '_' + _stamp.month.astype(str)
df_train['building_hour'] = _building + '_' + _hour
df_train['building_week_day_hour'] = _building + '_' + _stamp.dayofweek.astype(str) + '_' + _hour

# Drop the temporaries so they do not linger in the notebook namespace.
del _building, _stamp, _hour

In [None]:
# Gold-layer weather feature: per-site rolling mean of air temperature over the
# current observation and up to three preceding ones (min_periods=1, so the
# first rows of each site average over fewer values).
#
# The window is positional, so rows are sorted chronologically within each site
# before rolling; otherwise the mean would depend on whatever order the parquet
# file happened to store. Dropping the site_id index level leaves the original
# row labels, so the assignment aligns each value back to its source row
# regardless of the temporary sort (this relies on df_weather's index being
# unique, as the unsorted version did).
#
# NOTE(review): the window is 4 rows, not 4 hours. It equals a 4-hour average
# only if each site has hourly readings without gaps — confirm against the data.
df_weather['air_temperature_4h_avg'] = (
    df_weather
    .sort_values(['site_id', 'timestamp'], kind='stable')
    .groupby('site_id')['air_temperature']
    .rolling(window=4, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

In [40]:
# Build the final de-normalized table: every df_train row enriched with its
# building's metadata, then with the weather row matching (site_id, timestamp).
# Both joins are left joins, so df_train rows without a match are kept and get
# NaN in the joined columns.

# Step 1: attach building metadata by building_id.
df_all = df_train.set_index(['building_id']).join(
    df_building.set_index(['building_id']),
    how='left'
).reset_index()

# A left join multiplies left rows when the right side has duplicate keys;
# fail loudly instead of silently writing duplicated readings to the gold layer.
assert len(df_all) == len(df_train), (
    "building join changed the row count: building_id is not unique in df_building"
)

# Step 2: attach weather by (site_id, timestamp). site_id must be present
# after step 1 for this join to work.
df_all = df_all.set_index(['site_id', 'timestamp']).join(
    df_weather.set_index(['site_id', 'timestamp']),
    how='left'
).reset_index()

assert len(df_all) == len(df_train), (
    "weather join changed the row count: (site_id, timestamp) is not unique in df_weather"
)

# Output

In [42]:
# Persist the de-normalized feature table to the gold layer, without writing
# the DataFrame index to the file.
features_file = output_path + "anomaly_features.parquet"
df_all.to_parquet(features_file, index=False, engine='pyarrow')