In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestClassifier

# --------------------------
# 1. Load data
# --------------------------
# Building metadata: repeated building_id rows are dropped (the first one is
# kept) and the index is renumbered from zero.
meta = (
    pd.read_csv("/kaggle/input/short-term-load-forecasting/metadata.csv")
    .drop_duplicates(subset=["building_id"])
    .reset_index(drop=True)
)

# Train and test tables, loaded unchanged.
train = pd.read_csv("/kaggle/input/short-term-load-forecasting/train.csv")
test = pd.read_csv("/kaggle/input/short-term-load-forecasting/test.csv")

# --------------------------
# 2. Imputation for metadata
# --------------------------
def impute_metadata(meta):
    """Return a copy of ``meta`` with missing metadata values filled in.

    ``area_in_sqft`` is predicted with a ``HistGradientBoostingRegressor`` and
    ``inverter`` with a ``RandomForestClassifier``, each trained on the other
    numeric columns of the rows where the value is known.  Afterwards every
    numeric column still containing NaN is filled with its own median.

    Parameters
    ----------
    meta : pandas.DataFrame
        Building metadata.  The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``meta`` with the same columns.

    Notes
    -----
    * ``building_id`` is never used as a predictor: it is an identifier, not a
      property of the building.
    * A model is only fitted when more than 5 rows have the value and at least
      one predictor column exists; otherwise the median (``area_in_sqft``) or
      the most frequent value (``inverter``) is used.
    * A column with no observed value at all is left as NaN instead of raising.
    * Non-numeric columns are not imputed.
    """
    df = meta.copy()

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # Candidate predictors: every numeric column except the row identifier.
    predictor_pool = [c for c in num_cols if c != "building_id"]

    # Impute numeric (example: area_in_sqft) using HistGradientBoosting
    for col in ["area_in_sqft"]:
        if col not in df.columns or not df[col].isnull().any():
            continue
        missing = df[col].isnull()
        features = [c for c in predictor_pool if c != col]

        if features and (~missing).sum() > 5:
            # Histogram gradient boosting accepts NaN in the predictors, so the
            # feature columns are passed through untouched.
            hgb = HistGradientBoostingRegressor(random_state=42)
            hgb.fit(df.loc[~missing, features], df.loc[~missing, col])
            df.loc[missing, col] = hgb.predict(df.loc[missing, features])
        else:
            # Too little data for a model; an all-NaN column has a NaN median
            # and is therefore left unchanged.
            df[col] = df[col].fillna(df[col].median())

    # Impute categorical/binary (example: inverter) with RandomForestClassifier
    for col in ["inverter"]:
        if col not in df.columns or not df[col].isnull().any():
            continue
        missing = df[col].isnull()
        observed = df.loc[~missing, col]
        features = [c for c in predictor_pool if c != col]

        if features and len(observed) > 5:
            # Older scikit-learn releases reject NaN inputs in random forests,
            # so predictor gaps are filled with column medians for this model
            # only (df itself keeps its NaNs until the final fallback).
            predictors = df[features].fillna(df[features].median())
            rf = RandomForestClassifier(n_estimators=100, random_state=42)
            rf.fit(predictors.loc[~missing], observed.astype(int))
            df.loc[missing, col] = rf.predict(predictors.loc[missing])
        elif not observed.empty:
            df[col] = df[col].fillna(observed.mode().iloc[0])
        # else: nothing observed, so there is no mode to fall back on.

    # Fallback: median for any remaining numeric NaNs
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())
    return df

# Overwrite `meta` with the imputed copy returned by impute_metadata; the
# train/test merges further down join against this version.
meta = impute_metadata(meta)

# --------------------------
# 3. Feature Engineering
# --------------------------
def feature_engineering(data, is_train=True):
    """Add calendar, lag, rolling, ratio and aggregate features to a copy of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``building_id``, ``timestamp``, ``area_in_sqft``,
        ``inverter``, ``air_conditioners``, ``fridge``, ``water_heaters``,
        ``lights`` and ``ceiling_fans``.  When both ``area_in_sqft`` and
        ``rooms`` are present the per-room / per-area ratios are added, which
        additionally needs ``no_of_people`` and the remaining appliance-count
        columns.  ``meter_reading`` and ``region`` are optional.
    is_train : bool, default True
        Not consulted; kept so existing call sites keep working.  Reading-based
        features are added whenever ``meter_reading`` is a column.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by ``building_id`` then ``timestamp`` with the feature
        columns appended.  ``data`` itself is not modified.

    Notes
    -----
    The rolling mean/std are taken over the readings *before* each row (the
    one-step lag), so that, like the lag columns, they never contain the
    row's own ``meter_reading``.
    """
    df = data.copy()

    # timestamp features
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    stamp = df["timestamp"].dt
    df["hour"] = stamp.hour
    df["dayofweek"] = stamp.dayofweek
    df["month"] = stamp.month
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)

    # Lags and rolling windows are positional, so rows must be in time order
    # within each building.
    df = df.sort_values(["building_id", "timestamp"])
    has_readings = "meter_reading" in df.columns

    if has_readings:
        readings = df.groupby("building_id")["meter_reading"]
        df["meter_lag1"] = readings.shift(1)
        df["meter_lag24"] = readings.shift(24)
        df["meter_lag168"] = readings.shift(168)

        # Rolling stats over the previous readings only: rolling the lag-1
        # series keeps the current row's value out of its own features.
        previous = df.groupby("building_id")["meter_lag1"]
        df["meter_roll_mean_24h"] = previous.transform(
            lambda s: s.rolling(24, min_periods=1).mean()
        )
        df["meter_roll_std_24h"] = previous.transform(
            lambda s: s.rolling(24, min_periods=1).std()
        )
        df["meter_roll_mean_7d"] = previous.transform(
            lambda s: s.rolling(24 * 7, min_periods=1).mean()
        )

    # normalized static features
    if "area_in_sqft" in df.columns and "rooms" in df.columns:
        # Zero denominators become NaN so the ratios are NaN instead of inf.
        rooms = df["rooms"].replace(0, np.nan)
        area = df["area_in_sqft"].replace(0, np.nan)
        people = df["no_of_people"].replace(0, np.nan)

        df["area_per_room"] = df["area_in_sqft"] / rooms
        df["people_per_room"] = df["no_of_people"] / rooms
        df["people_per_sqft"] = df["no_of_people"] / area

        df["lights_per_room"] = df["lights"] / rooms
        df["fans_per_room"] = df["ceiling_fans"] / rooms
        df["ac_per_room"] = df["air_conditioners"] / rooms
        df["fridge_per_person"] = df["fridge"] / people

        df["appliance_density"] = (
            df["lights"] + df["ceiling_fans"] + df["air_coolers"] +
            df["air_conditioners"] + df["fridge"] + df["tv"] +
            df["water_heaters"] + df["washing_machine"] +
            df["mixer"] + df["iron"] + df["micro_wave"]
        ) / area

    # interaction features
    df["has_ac"] = (df["air_conditioners"] > 0).astype(int)
    df["has_fridge"] = (df["fridge"] > 0).astype(int)
    df["has_inverter"] = (df["inverter"] > 0).astype(int)

    # Hand-weighted sum of appliance counts.
    df["electricity_load_score"] = (
        2 * df["air_conditioners"] +
        1.5 * df["water_heaters"] +
        1 * df["fridge"] +
        0.5 * df["lights"] +
        0.3 * df["ceiling_fans"]
    )

    # region-based features
    if "region" in df.columns:
        # Median is taken over the rows of this frame, not over distinct buildings.
        region_stats = df.groupby("region")["area_in_sqft"].median().rename("region_median_area")
        df = df.merge(region_stats, on="region", how="left")
        df["deviation_from_region_area"] = df["area_in_sqft"] - df["region_median_area"]

    # building-based stats
    if has_readings:
        # NOTE(review): these aggregates span every row of the building in this
        # frame, including each row's own reading - confirm that is intended.
        bldg_stats = df.groupby("building_id")["meter_reading"].agg(["mean", "std", "max"])
        # The small constant avoids division by zero for an all-zero building.
        bldg_stats["peak_to_mean_ratio"] = bldg_stats["max"] / (bldg_stats["mean"] + 1e-6)
        bldg_stats = bldg_stats.add_prefix("bldg_").reset_index()
        df = df.merge(bldg_stats, on="building_id", how="left")

    # missingness indicators
    # These flag NaNs still present when this function runs; values imputed
    # before the call are reported as not missing.
    df["is_area_missing"] = df["area_in_sqft"].isnull().astype(int)
    df["is_inverter_missing"] = df["inverter"].isnull().astype(int)

    return df

# --------------------------
# 4. Apply transformations
# --------------------------
# Attach the building metadata to every train/test row (left join, so rows
# without a metadata match are kept), then derive the features for each frame.
train, test = (
    frame.merge(meta, on="building_id", how="left") for frame in (train, test)
)

final_train = feature_engineering(train, is_train=True)
final_test = feature_engineering(test, is_train=False)

# --------------------------
# 5. Save results
# --------------------------
# Write both engineered frames to the working directory without the row index.
final_train.to_csv("/kaggle/working/final_train.csv", index=False)
final_test.to_csv("/kaggle/working/final_test.csv", index=False)


# Report the (rows, columns) of each saved frame.
for _label, _frame in (
    ("✅ Final train shape:", final_train),
    ("✅ Final test shape :", final_test),
):
    print(_label, _frame.shape)


✅ Final train shape: (2582976, 51)
✅ Final test shape : (676800, 52)


In [4]:
# Reload the engineered train/test frames from the CSV files in /kaggle/working/.
# NOTE(review): read_csv is called without parse_dates, so `timestamp`
# presumably comes back as plain strings rather than datetimes - confirm
# before using it as a date.
final_train = pd.read_csv("/kaggle/working/final_train.csv")
final_test = pd.read_csv("/kaggle/working/final_test.csv")

building_id                       0
window_id                         0
timestamp                         0
meter_reading                     0
role                              0
region                            0
rooms                             0
no_of_people                      0
area_in_sqft                      0
inverter                          0
lights                            0
ceiling_fans                      0
air_coolers                       0
air_conditioners                  0
fridge                            0
tv                                0
water_heaters                     0
washing_machine                   0
mixer                             0
iron                              0
micro_wave                        0
hour                              0
dayofweek                         0
month                             0
is_weekend                        0
meter_lag1                       91
meter_lag24                    2184
meter_lag168                