In [None]:
# tensorboard --logdir='runs/'
# %pip install seaborn

In [None]:
import sys
# Detect a Google Colab runtime by checking whether the google.colab package
# has already been imported into this interpreter.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    # Register the TensorBoard notebook extension.
    %load_ext tensorboard
    from google.colab import drive
    # Mount Google Drive, then make the project folder the working directory
    # so the relative paths used later in the notebook resolve against it.
    drive.mount('/content/drive')
    %cd '/content/drive/MyDrive/Colab Notebooks/Hephaestus/Transformers'
    # %tensorboard \
        # --logdir '/content/drive/MyDrive/Colab Notebooks/Hephaestus/Transformers/runs' \


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Hephaestus/Transformers


In [None]:
from datetime import datetime as dt
from itertools import chain
import os
import numpy as np
import pandas as pd
import torch

from tqdm.notebook import tqdm, trange
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
import seaborn as sns
import hephaestus as hp


# Seed torch's global RNG.
torch.manual_seed(4242)

# Load every CSV found in the air-quality data directory and stack them into
# a single frame.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the order in which the files are concatenated reproducible.
data_dir = "../data/air_quality/"
csvs = [
    os.path.join(data_dir, f)
    for f in sorted(os.listdir(data_dir))
    if f.endswith(".csv")
]
dfs = [pd.read_csv(csv) for csv in csvs]
df = pd.concat(dfs, ignore_index=True)
# Free the per-file frames; only the concatenated frame is used from here on.
del dfs

# Order rows chronologically, rebuild a clean 0..n-1 index, and drop the
# "No" column.
df = (
    df.sort_values(["year", "month", "day", "hour"])
    .reset_index(drop=True)
    .drop(columns="No")
)
# Normalise column names: "." becomes "_" and everything is lower-cased.
df.columns = [c.replace(".", "_").lower() for c in df.columns]
# Copy with every row containing any missing value removed; `df` itself keeps
# its missing values.
df_no_na = df.dropna()
print(df.shape)
df.head()

(420768, 17)


Unnamed: 0,year,month,day,hour,pm2_5,pm10,so2,no2,co,o3,temp,pres,dewp,rain,wd,wspm,station
0,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Aotizhongxin
1,2013,3,1,0,3.0,6.0,13.0,7.0,300.0,85.0,-2.3,1020.8,-19.7,0.0,E,0.5,Changping
2,2013,3,1,0,4.0,4.0,3.0,,200.0,82.0,-2.3,1020.8,-19.7,0.0,E,0.5,Dingling
3,2013,3,1,0,9.0,9.0,3.0,17.0,300.0,89.0,-0.5,1024.5,-21.4,0.0,NNW,5.7,Dongsi
4,2013,3,1,0,4.0,4.0,14.0,20.0,300.0,69.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Guanyuan


In [None]:
# Wrap the full frame (missing values included) in the project's tabular
# dataset, with "pm2_5" as the target column.
# NOTE(review): what TabularDS does with the frame (encoding, train/test
# split, device placement) is not visible here - confirm in hephaestus.
dataset = hp.TabularDS(df, target_column="pm2_5")

In [None]:
# Build the transformer and move it to the device the dataset reports.
model = hp.TabTransformer(dataset, n_heads=8).to(dataset.device)

# Replace NaN entries in the categorical tensors (train and test) with the
# model's categorical mask token. The numeric tensors are left untouched.
dataset.X_train_categorical[dataset.X_train_categorical.isnan()] = model.cat_mask_token
dataset.X_test_categorical[dataset.X_test_categorical.isnan()] = model.cat_mask_token

# Small batch used to smoke-test the forward pass in the following cells.
# Both halves are sliced from the TRAIN split so that row i of the numeric
# batch and row i of the categorical batch describe the same sample
# (previously the categorical half was taken from the test split).
batch_size = 50
test_num = dataset.X_train_numeric[0:batch_size, :]
test_num_mask = hp.mask_tensor(test_num, model, probability=0.8)
test_cat = dataset.X_train_categorical[0:batch_size, :]
test_cat_mask = hp.mask_tensor(test_cat, model, probability=0.8)
type(test_num)

torch.Tensor

In [None]:
# Forward pass on the masked smoke-test batch with the "mlm" task; gradient
# tracking is disabled because nothing is being trained here.
with torch.no_grad():
    x = model(test_num_mask, test_cat_mask, task="mlm")

# Show the shapes of the first two outputs.
x[0].shape, x[1].shape

(torch.Size([50, 16, 49]), torch.Size([50, 14]))

In [None]:
# Run the masked smoke-test batch through the model again (no gradients) and
# display the first row of the second output.
# The shape expression that used to precede the last line was removed: only
# the final expression of a cell is displayed, so its value was discarded.
with torch.no_grad():
    x = model(
        test_num_mask,
        test_cat_mask,
        task="mlm",
    )
x[1][0]

tensor([ 0.4806,  0.0632, -1.0811, -0.1427, -0.7734, -0.7243, -2.3233,  1.1817,
         0.2252, -0.5767,  2.3254,  1.9085,  2.0615,  0.0175], device='cuda:0')

In [None]:
# Optionally export the model graph to TensorBoard (off by default).
write_graph = False
if write_graph:
    board_writer = SummaryWriter(log_dir="runs/ModelGraph/first_graph")
    try:
        board_writer.add_graph(model, (test_num_mask, test_cat_mask))
    finally:
        # Close the writer so pending events are flushed to disk and the
        # underlying file handle is released even if add_graph raises.
        board_writer.close()

In [None]:
# hp.show_mask_pred(0, model, dataset, probability=0.8)

In [None]:
# Masked Tabular Modeling
base_model_name = "air_quality"

# Suffix the run name with the current timestamp (second resolution).
model_time = dt.now()
model_time = model_time.strftime("%Y-%m-%dT%H:%M:%S")
model_name = f"{base_model_name}_{model_time}"

model_save_path = "./models/inf2nan.pt"
remove_old_checkpoints = True
# Delete a stale checkpoint so the next cell does not load it.
# os.remove is used instead of shelling out to `rm`: it works on every
# platform, is unaffected by spaces in the path, and the existence check
# avoids an error when there is nothing to delete.
if remove_old_checkpoints and os.path.isfile(model_save_path):
    os.remove(model_save_path)

In [None]:
# Decide whether a checkpoint is already on disk.
# The directory is checked first so a missing ./models folder is treated as
# "no checkpoint" instead of raising FileNotFoundError from os.listdir.
model_list = os.listdir("./models") if os.path.isdir("./models") else []
model_exists = os.path.basename(model_save_path) in model_list
if model_exists:
    print("Model already exists")
else:
    print("Model does not exist")

if model_exists:
    # map_location remaps the stored tensors onto the dataset's device, so a
    # checkpoint saved on a GPU still loads on a CPU-only machine.
    model.load_state_dict(torch.load(model_save_path, map_location=dataset.device))
    # model = torch.compile(model)
else:
    # model = torch.compile(model)
    # No checkpoint: run masked tabular modeling pre-training on the dataset.
    hp.mtm(
        model,
        dataset,
        model_name,
        epochs=150,
        batch_size=100_000,
        lr=0.001,
        patience=20,
        training_size=None,
    )
    # NOTE(review): the save below is disabled, so nothing in this cell writes
    # model_save_path - confirm whether hp.mtm persists weights itself.
    # torch.save(model.state_dict(), model_save_path)

Model does not exist


Epochs:   0%|          | 0/150 [00:00<?, ?it/s]

In [None]:
# Remove rows whose target value is missing; missing values in the other
# columns are kept.
# The keyword was previously misspelled ("subet"), which makes dropna raise
# TypeError for an unexpected keyword argument.
df_no_na = df.dropna(subset=["pm2_5"])
# Rebuild the dataset from the filtered frame for the fine-tuning runs below.
dataset = hp.TabularDS(df_no_na, target_column="pm2_5")



In [None]:
# Training-set sizes to sweep over: a fixed ladder of subset sizes followed by
# the full training split. (10 and 40_000 were tried previously and are
# currently left out.)
n_train_rows = [100, 1_000, 2_000, 5_000, 10_000, 15_000, 30_000]
n_train_rows.append(dataset.X_train.shape[0])

In [None]:
def train_multiple_sizes(pt_model_path, dataset, n_train_rows, n_epochs=100):
    """Build a fresh TabTransformer, optionally load weights, and fine-tune it.

    Args:
        pt_model_path: Path to a state-dict checkpoint read with ``torch.load``,
            or ``None`` to start from the freshly constructed model.
        dataset: Dataset handed to ``hp.TabTransformer`` and
            ``hp.fine_tune_model``; its ``device`` attribute decides where the
            model and any loaded weights are placed.
        n_train_rows: Forwarded to ``hp.fine_tune_model`` as ``n_rows`` and
            embedded in the run name (presumably the number of training rows
            to use - confirm against ``hp.fine_tune_model``).
        n_epochs: Forwarded to ``hp.fine_tune_model`` as ``epochs``.

    Returns:
        Whatever ``hp.fine_tune_model`` returns for this run.
    """
    model = hp.TabTransformer(dataset, n_heads=8).to(dataset.device)
    if pt_model_path is not None:
        # map_location remaps the stored tensors onto the dataset's device, so
        # a checkpoint saved on a GPU still loads on a CPU-only machine.
        state_dict = torch.load(pt_model_path, map_location=dataset.device)
        model.load_state_dict(state_dict)

    regression_performance = hp.fine_tune_model(
        model,
        dataset,
        model_name=f"ft_{n_train_rows}",
        n_rows=n_train_rows,
        epochs=n_epochs,
        patience=20,
    )

    return regression_performance

In [None]:
# Fine-tune from scratch (no pre-trained checkpoint) once per training-set
# size and collect whatever each run returns.
hephaestus_results_no_pre_train = []
pbar = tqdm(n_train_rows)
for n_rows in pbar:
    # Show the size currently being trained on the progress bar.
    pbar.set_description(f"n_rows: {n_rows}")
    hephaestus_results_no_pre_train.append(
        train_multiple_sizes(None, dataset, n_rows, n_epochs=250)
    )

  0%|          | 0/8 [00:00<?, ?it/s]

Epochs, Model: ft_100_100_2023-09-15T01:39:49:   0%|          | 0/250 [00:00<?, ?it/s]

Epochs, Model: ft_1000_1000_2023-09-15T01:40:49:   0%|          | 0/250 [00:00<?, ?it/s]

Epochs, Model: ft_2000_2000_2023-09-15T01:41:50:   0%|          | 0/250 [00:00<?, ?it/s]

In [None]:
# Tabulate the per-size results and tag every row with a model label.
# NOTE(review): these runs skip pre-training but ARE fine-tuned, so the label
# text looks inconsistent with the variable names - confirm before renaming,
# since later cells may filter on this exact string.
no_pt_df = pd.DataFrame(hephaestus_results_no_pre_train).assign(
    model="Hephaestus No Fine Tune"
)
no_pt_df

In [None]:
# loss = regression_performance = hp.fine_tune_model(
#     model,
#     dataset,
#     model_name=f"ft_{n_train_rows}",
#     n_rows=1000,
#     epochs=150,
#     early_stop=True,
# )
# loss  # 15_629_481.0