# Inputs

In [22]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from pathlib import Path

In [23]:
# Directory the raw parquet files are read from, and directory the cleaned
# tables are written to (both relative to the notebook's working directory).
input_path, output_path = "../data/bronze/", "../data/silver/"

# Make sure the destination exists before anything is written; missing parents
# are created and an already-existing directory is not an error.
Path(output_path).mkdir(exist_ok=True, parents=True)

In [24]:
# Read the three input tables in one pass; the tuple order matches the
# unpacking order on the left-hand side.
df_train, df_weather, df_building = (
    pd.read_parquet(input_path + parquet_name, engine='pyarrow')
    for parquet_name in (
        "train.parquet",
        "weather_train.parquet",
        "building_metadata.parquet",
    )
)

# Processing

In [25]:
def fill_missing_values(df, strategy_map=None, default_strategy='zero'):
    """
    Return a copy of ``df`` with missing values filled column by column.

    Numeric columns are filled according to a per-column strategy; every
    other column has its missing entries replaced by the string 'Unknown'.
    The frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.
    strategy_map : dict, optional
        Maps column name -> strategy for numeric columns, e.g.
        {'Age': 'median', 'Score': 'mean'}. Recognised strategies are
        'median', 'mean' and 'zero'; any other value falls back to filling
        with 0. Entries for non-numeric columns are ignored.
    default_strategy : str
        Strategy applied to numeric columns that are absent from
        ``strategy_map``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    strategy_map = strategy_map or {}

    # Work on a copy: callers commonly pass a filtered slice, and assigning
    # columns on it would mutate their data and can raise
    # SettingWithCopyWarning.
    df = df.copy()

    for col in df.columns:
        column = df[col]

        if not column.isna().any():
            # Nothing to fill; leave the column and its dtype untouched.
            continue

        if pd.api.types.is_numeric_dtype(column):
            strategy = strategy_map.get(col, default_strategy)
            if strategy == 'median':
                fill_value = column.median()
            elif strategy == 'mean':
                fill_value = column.mean()
            else:
                # 'zero' and any unrecognised strategy both fill with 0.
                fill_value = 0

            if pd.isna(fill_value):
                # The statistic of an all-missing column is itself missing;
                # fall back to 0 so the column does not stay unfilled.
                fill_value = 0

            df[col] = column.fillna(fill_value)
        else:
            # A categorical column only accepts values that are declared
            # categories, so register the placeholder before filling.
            if (isinstance(column.dtype, pd.CategoricalDtype)
                    and 'Unknown' not in column.cat.categories):
                column = column.cat.add_categories('Unknown')
            df[col] = column.fillna('Unknown')

    return df

In [26]:
# Keep only rows where both building_id and timestamp are present.
df_train = df_train.dropna(subset=['building_id', 'timestamp'])

# Both listed columns are imputed with their median; columns not listed here
# fall through to fill_missing_values' default strategy.
train_impute_map = dict.fromkeys(['meter_reading', 'anomaly'], 'median')

df_train = fill_missing_values(df_train, strategy_map=train_impute_map)

In [27]:

# Keep only rows where both site_id and timestamp are present.
df_weather = df_weather.dropna(subset=['site_id', 'timestamp'])

# Every listed measurement is imputed with its column mean.
# NOTE(review): wind_direction is presumably an angle; an arithmetic mean is
# not a circular mean - confirm this is the intended imputation.
weather_impute_map = dict.fromkeys(
    [
        'air_temperature',
        'cloud_coverage',
        'dew_temperature',
        'precip_depth_1_hr',
        'sea_level_pressure',
        'wind_direction',
        'wind_speed',
    ],
    'mean',
)
df_weather = fill_missing_values(df_weather, strategy_map=weather_impute_map)



In [28]:
# Keep only rows where both site_id and building_id are present. The explicit
# .copy() detaches the result from the original frame so the column
# assignment at the end of this cell writes to an independent frame instead
# of a filtered slice (which raises SettingWithCopyWarning).
has_keys = df_building['site_id'].notnull() & df_building['building_id'].notnull()
df_building = df_building[has_keys].copy()

# Numeric columns imputed with their median. primary_use is deliberately not
# listed: fill_missing_values ignores the strategy map for non-numeric columns
# and always fills them with 'Unknown'.
building_impute_map = {
    'floor_count': 'median',
    'year_built': 'median',
    'square_feet': 'median'}
df_building = fill_missing_values(df_building, strategy_map=building_impute_map)

# Integer-encode primary_use; `le` is kept so the codes can be mapped back
# to labels with le.inverse_transform / le.classes_.
le = LabelEncoder()
df_building['primary_use_enc'] = le.fit_transform(df_building['primary_use'].astype(str))

# Output

In [29]:
# Write each cleaned table to the output directory under its original file
# name, without persisting the index.
for cleaned_frame, parquet_name in (
    (df_train, "train.parquet"),
    (df_weather, "weather_train.parquet"),
    (df_building, "building_metadata.parquet"),
):
    cleaned_frame.to_parquet(output_path + parquet_name, engine='pyarrow', index=False)