In [1]:
import os
from datetime import datetime as dt

import icecream
import jax.numpy as jnp
import numpy as np
import pandas as pd
from flax import nnx
from icecream import ic
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm
from transformers import BertTokenizerFast, FlaxBertModel

import hephaestus as hp
import hephaestus.training.training as ht

icecream.install()
ic_disable = False  # Global variable to disable ic
if ic_disable:
    ic.disable()
ic.configureOutput(includeContext=True, contextAbsPath=True)
pd.options.mode.copy_on_write = True
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = FlaxBertModel.from_pretrained(model_name)
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Get the embeddings matrix
embeddings = model.params["embeddings"]["word_embeddings"]["embedding"]

# Now you can access specific embeddings like this:
# For example, to get embeddings for tokens 23, 293, and 993:
selected_embeddings = jnp.take(embeddings, jnp.array([23, 293, 993]), axis=0)

# If you want to get embeddings for specific words:
words = ["hello", "world", "example"]
tokens = tokenizer.convert_tokens_to_ids(words)
word_embeddings = jnp.take(embeddings, jnp.array(tokens), axis=0)
word_embeddings.shape

Some weights of FlaxBertModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: {('pooler', 'dense', 'bias'), ('pooler', 'dense', 'kernel')}
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(3, 768)

In [3]:
df = pd.read_csv("data/energy/smart_home_energy_usage_dataset.csv")

df.describe()

Unnamed: 0,home_id,energy_consumption_kWh,temperature_setting_C,usage_duration_minutes,holiday
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,50.019812,2.548839,19.999284,59.505089,0.099588
std,28.605155,1.415527,2.887678,34.65189,0.29945
min,1.0,0.1,15.0,0.0,0.0
25%,25.0,1.32,17.5,30.0,0.0
50%,50.0,2.55,20.0,59.0,0.0
75%,75.0,3.78,22.5,90.0,0.0
max,99.0,5.0,25.0,119.0,1.0


In [4]:
df.timestamp = pd.to_datetime(df.timestamp)
# set timestamp to be epoch seconds
df["since_epoch"] = df.timestamp.astype(int) // 10**9
df.sort_values(["home_id", "timestamp"], inplace=True)
df.drop(columns=["timestamp"], inplace=True)
# convert timestamp to sin and cos features and make them start from 0 to 1
df["sin_time"] = np.sin(2 * np.pi * df.since_epoch / df.since_epoch.max())
df["cos_time"] = np.cos(2 * np.pi * df.since_epoch / df.since_epoch.max())
df.drop(columns=["since_epoch"], inplace=True)

In [5]:
df.head()

Unnamed: 0,home_id,energy_consumption_kWh,temperature_setting_C,occupancy_status,appliance,usage_duration_minutes,season,day_of_week,holiday,sin_time,cos_time
76,1,4.48,18.0,Occupied,Lighting,11,Winter,Wednesday,0,0.912002,-0.410186
187,1,1.4,24.1,Unoccupied,Refrigerator,54,Spring,Sunday,0,0.911806,-0.41062
218,1,2.68,18.2,Unoccupied,Refrigerator,68,Summer,Tuesday,0,0.911752,-0.410741
270,1,2.54,23.0,Unoccupied,Lighting,82,Winter,Thursday,0,0.91166,-0.410945
279,1,1.39,25.0,Occupied,Lighting,59,Summer,Thursday,0,0.911644,-0.41098


In [6]:
df = df.reset_index(drop=True)

In [7]:
df.groupby("home_id").size().min()
df["idx"] = df.index // 100

In [8]:
# Get train test split at 80/20
time_series_config = hp.TimeSeriesConfig.generate(df=df)
train_idx = int(df.idx.max() * 0.8)
train_df = df.loc[df.idx < train_idx].copy()
test_df = df.loc[df.idx >= train_idx].copy()
# del df
train_ds = hp.TimeSeriesDS(train_df, time_series_config)
test_ds = hp.TimeSeriesDS(test_df, time_series_config)
len(train_ds), len(test_ds)

(7999, 2001)

In [9]:
time_series_config.n_tokens

35

In [10]:
def make_batch(ds: hp.TimeSeriesDS, start: int, length: int):
    numeric = []
    categorical = []
    for i in range(start, length + start):
        numeric.append(ds[i][0])
        categorical.append(ds[i][1])
    # print index of None values
    return {"numeric": jnp.array(numeric), "categorical": jnp.array(categorical)}


batch = make_batch(train_ds, 0, 4)
print(batch["numeric"].shape, batch["categorical"].shape)

# (4, 27, 59) (4, 3, 59)
# batch

(4, 7, 100) (4, 4, 100)


In [11]:
multiplier = 4
time_series_regressor = hp.TimeSeriesDecoder(
    time_series_config, d_model=512, n_heads=8 * multiplier, rngs=nnx.Rngs(0)
)
# nnx.display(time_series_regressor)

In [12]:
res = time_series_regressor(
    numeric_inputs=batch["numeric"],
    categorical_inputs=batch["categorical"],
    deterministic=False,
)

ic| /Users/kailukowiak/Hephaestus/hephaestus/models/time_series_decoder.py:697 in __call__()
    numeric_inputs.shape: (4, 7, 100)
    categorical_inputs.shape: (4, 4, 100)
ic| /Users/kailukowiak/Hephaestus/hephaestus/models/time_series_decoder.py:544 in process_numeric()
    "col_token type": 'col_token type'
    numeric_col_embeddings.dtype: dtype('float32')
ic| /Users/kailukowiak/Hephaestus/hephaestus/models/time_series_decoder.py:548 in process_numeric()
    numeric_embedding.shape: (512,)
ic| /Users/kailukowiak/Hephaestus/hephaestus/models/time_series_decoder.py:551 in process_numeric()
    numeric_embedding.shape: (4, 7, 100, 512)
ic| /Users/kailukowiak/Hephaestus/hephaestus/models/time_series_decoder.py:559 in process_numeric()
    numeric_embedding.shape: (4, 7, 100, 512)
ic| /Users/kailukowiak/Hephaestus/hephaestus/models/time_series_decoder.py:587 in process_categorical()
    "Issue here": 'Issue here'
    categorical_inputs.shape: (4, 4, 100)
    "args": 'args'
    categoric

In [13]:
res["numeric_out"].shape, res["categorical_out"].shape

((4, 7, 100), (4, 4, 100, 35))

In [14]:
ic.disable()

In [15]:
causal_mask = True
# time_series_regressor.train()

In [None]:
metric_history = ht.create_metric_history()

learning_rate = 1e-4
momentum = 0.9
optimizer = ht.create_optimizer(time_series_regressor, learning_rate, momentum)

metrics = ht.create_metrics()
writer_name = "HouseHeating"

writer_time = dt.now().strftime("%Y-%m-%dT%H:%M:%S")
model_name = writer_time + writer_name
summary_writer = SummaryWriter("runs/" + model_name)


train_data_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
train_step = ht.create_train_step(
    model=time_series_regressor, optimizer=optimizer, metrics=metrics
)

for step, batch in enumerate(tqdm(train_data_loader)):
    batch = {"numeric": jnp.array(batch[0]), "categorical": jnp.array(batch[1])}
    train_step(time_series_regressor, batch, optimizer, metrics)
    for metric, value in metrics.compute().items():
        # Only shows `loss`

        metric_history[metric].append(value)
        if jnp.isnan(value).any():
            raise ValueError("Nan Values")
        summary_writer.add_scalar(f"train/{metric}", np.array(value), step)
    metrics.reset()

  0%|          | 0/500 [00:00<?, ?it/s]