# MTM Air Quality

## Load Libraries


In [None]:
import os
from datetime import datetime as dt

import icecream
import pandas as pd
import torch
from icecream import ic
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import BertModel, BertTokenizerFast

import hephaestus as hp
from hephaestus.analysis.plots import plot_training_history
from hephaestus.models import TimeSeriesDecoder

# %%
# Use single precision as the default floating-point dtype for new tensors.
torch.set_default_dtype(torch.float32)
# %%
# Turn off parallelism in the tokenizers library via its environment switch.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Install `ic` as a builtin and have it report the absolute call site.
icecream.install()
ic.configureOutput(includeContext=True, contextAbsPath=True)

ic_disable = True  # Notebook-wide switch: set to False to see ic() output
if ic_disable:
    ic.disable()
# NOTE(review): left disabled; the OptionError shown in this cell's output
# presumably came from this line on the installed pandas version.
# pd.options.mode.copy_on_write = True

2025-03-09 21:11:05.131597: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741554665.139990   17540 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741554665.143758   17540 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


OptionError: You can only set the value of existing options

In [None]:
# Resolve the tokenizer and the encoder from one checkpoint name so the two
# cannot drift apart.
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertModel.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    attn_implementation="sdpa",
)

## Load and Process Data


In [None]:
# Load every CSV in the air-quality data directory into a single frame, clean
# it, cut it into fixed-length row groups, and standardise the numeric columns.
data_dir = "./data/air_quality/"

# os.listdir returns names in arbitrary order; sort them so the concatenation
# order (and everything below that depends on row order) is reproducible.
csvs = [
    os.path.join(data_dir, f)
    for f in sorted(os.listdir(data_dir))
    if f.endswith(".csv")
]
if not csvs:
    # Fail with a clear message instead of pd.concat's generic complaint.
    raise FileNotFoundError(f"No .csv files found in {data_dir!r}")
dfs = [pd.read_csv(csv) for csv in csvs]
df = pd.concat(dfs, ignore_index=True)
del dfs  # the per-file frames are no longer needed once concatenated

# Order rows chronologically while the time columns are still numeric, then
# drop the per-file row counter column "No".
time_cols = ["year", "month", "day", "hour"]
df = df.sort_values(time_cols).reset_index(drop=True).drop("No", axis=1)
# Cast the time columns to strings so the float64/int64 selection below (and
# therefore the scaler) skips them.
for col in time_cols:
    df[col] = df[col].astype(str)
# Replace "." with "_" and lower-case the column names (e.g. "PM2.5" -> "pm2_5").
df.columns = [c.replace(".", "_").lower() for c in df.columns]

# Fully NA-free copy; not used by the cells visible in this notebook.
df_no_na = df.dropna()

# Only rows without a pm2_5 value are discarded; shapes are printed before and
# after so the number of dropped rows is visible.
print(df.shape)
df = df.dropna(subset=["pm2_5"])
print(df.shape)
df = df.reset_index(drop=True)

# Group consecutive rows into blocks of 32 (the last block may be shorter).
df["idx"] = df.index // 32
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns.tolist()
numeric_cols.remove("idx")  # idx is a group id, not a feature to scale

# Standardise the numeric columns.
# NOTE(review): the scaler is fit on all rows, including those that later end
# up in the test split, so test-set statistics leak into training. Consider
# fitting on the training rows only.
scale_data = True
if scale_data:
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
df.head()

## Initialize Model


In [None]:
# Derive the shared config from the full frame, then split the row groups
# roughly 80/20 by their `idx` value.
time_series_config = hp.TimeSeriesConfig.generate(df=df)

train_idx = int(df["idx"].max() * 0.8)  # first idx that belongs to the test split
is_train = df["idx"] < train_idx
train_df = df[is_train].copy()
test_df = df[~is_train].copy()

train_ds = hp.TimeSeriesDS(train_df, time_series_config)
test_ds = hp.TimeSeriesDS(test_df, time_series_config)

# Cell output: number of items in each dataset.
(len(train_ds), len(test_ds))

In [None]:
# Attention head count; also embedded in the TensorBoard run name later on.
N_HEADS = 32
tabular_decoder = TimeSeriesDecoder(
    time_series_config,
    d_model=512,
    n_heads=N_HEADS,
)

In [None]:
# Pick the best available backend: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

batch_size = 64

train_loader = DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,  # load in the main process to avoid multiprocessing issues
    collate_fn=hp.tabular_collate_fn,
    pin_memory=torch.cuda.is_available(),  # pinned host memory only with CUDA
)

# The model must be on the device before it is fed device-resident tensors.
tabular_decoder = tabular_decoder.to(device)

# Take the slice 0:6 of the training dataset as a small sample batch and move
# its numeric and categorical parts to the device.
example_batch = train_ds[0:6]
numeric_data = example_batch.numeric.to(device)
categorical_data = example_batch.categorical.to(device)

In [None]:
# Smoke-test a forward pass on the sample batch; no gradients are needed.
with torch.no_grad():
    # Bring the result back to the CPU right away.
    prediction = tabular_decoder(numeric_data, categorical_data).to("cpu")

    # Swap axes 1 and 2 of both outputs.
    prediction.numeric = prediction.numeric.transpose(1, 2)
    prediction.categorical = prediction.categorical.permute(0, 2, 1, 3)

numeric_has_nan = torch.isnan(prediction.numeric).any().item()
categorical_has_nan = torch.isnan(prediction.categorical).any().item()

# Print a summary (shapes and NaN flags) rather than the raw values.
print("Prediction numeric shape:", prediction.numeric.shape)
print("Prediction categorical shape:", prediction.categorical.shape)
print("Prediction contains NaN (numeric):", numeric_has_nan)
print("Prediction contains NaN (categorical):", categorical_has_nan)

In [None]:
# Switch icecream's ic() output off (again) before the training cells run.
ic.disable()

In [None]:
def init_weights(m, scale=0.05):
    """Conservatively re-initialise one module's parameters in place.

    Written to be passed to ``torch.nn.Module.apply``, which calls it once for
    every sub-module.

    * Modules without a ``weight`` attribute (or with ``weight is None``) are
      left untouched, including their bias.
    * A weight with more than one dimension gets Kaiming-normal initialisation
      (``fan_in``, ReLU gain) and is then multiplied by ``scale`` so the
      initial weights are small.
    * One-dimensional weights are left as they are.
    * If the module also has a non-``None`` ``bias``, it is set to zero.

    Args:
        m: The module to initialise.
        scale: Factor applied to the Kaiming-initialised weights. Defaults to
            0.05.
    """
    weight = getattr(m, "weight", None)
    if weight is None:
        return

    if weight.dim() > 1:
        torch.nn.init.kaiming_normal_(weight, mode="fan_in", nonlinearity="relu")
        # Scale in place under no_grad instead of going through `.data`, which
        # sidesteps autograd's in-place bookkeeping.
        with torch.no_grad():
            weight.mul_(scale)

    bias = getattr(m, "bias", None)
    if bias is not None:
        torch.nn.init.zeros_(bias)


# Re-initialise every sub-module with init_weights, then make sure the model
# is on the training device.
tabular_decoder.apply(init_weights)
tabular_decoder.to(device)

# Optimisation hyper-parameters.
num_epochs = 8
learning_rate = 1e-3
gradient_accumulation_steps = 4
max_grad_norm = 0.1  # tight gradient-clipping norm

# Gradient-explosion guard, forwarded to train_model as explosion_threshold and
# max_explosions_per_epoch (presumably how many explosions are tolerated before
# the learning rate is reduced; confirm in hp.train_model).
max_gradient_norm_allowed = 10.0
max_explosion_count = 5

# Output locations: one timestamped TensorBoard run directory plus a directory
# that is passed to train_model as save_dir.
timestamp = dt.now().strftime("%Y-%m-%dT%H-%M-%S")
model_name = "LongerTraining"
log_dir = f"runs/{timestamp}_Heads_{N_HEADS}_Batch_{batch_size}_{model_name}"
save_dir = "images/MTMAirQuality"
for out_dir in (log_dir, save_dir):
    os.makedirs(out_dir, exist_ok=True)
print(f"TensorBoard logs will be saved to: {log_dir}")
print("To view logs, run: tensorboard --logdir=runs")
writer = SummaryWriter(log_dir)

# Gather the arguments in one place, then launch training.
train_kwargs = dict(
    model=tabular_decoder,
    train_dataset=train_ds,
    val_dataset=test_ds,
    batch_size=batch_size,
    epochs=num_epochs,
    learning_rate=learning_rate,
    log_dir=log_dir,
    save_dir=save_dir,
    device=device,
    gradient_accumulation_steps=gradient_accumulation_steps,
    max_grad_norm=max_grad_norm,
    explosion_threshold=max_gradient_norm_allowed,
    max_explosions_per_epoch=max_explosion_count,
    writer=writer,
)
history = hp.train_model(**train_kwargs)

In [None]:
# Visualise the training history returned by hp.train_model (save_dir is
# handed to the plotting helper, presumably as the figure output directory).
plot_training_history(history, save_dir)

# Restore the best checkpoint from save_dir, if one exists, so the evaluation
# cells below use those weights.
best_model_path = os.path.join(save_dir, "best_model.pt")
if os.path.exists(best_model_path):
    checkpoint = torch.load(best_model_path, map_location=device)
    tabular_decoder.load_state_dict(checkpoint["model_state_dict"])
    print(
        f"Loaded best model from epoch {checkpoint['epoch']} with validation loss {checkpoint['val_loss']:.4f}"
    )
else:
    # Make the fallback explicit instead of silently evaluating whatever
    # weights happen to be in memory.
    print(
        f"No checkpoint found at {best_model_path}; keeping the weights currently in memory."
    )

In [None]:
# Build a comparison object for the item at position 0 of the dataset; the
# cells below read its input_df / output_df attributes.
# NOTE(review): this inspects train_ds, i.e. data the model was trained on.
# Pass test_ds instead to look at held-out performance; confirm which split
# is intended.
df_comp = hp.show_results_df(
    model=tabular_decoder,
    time_series_config=time_series_config,
    dataset=train_ds,
    idx=0,
)

In [None]:
# First rows of df_comp.output_df, restricted to the columns listed in
# time_series_config.categorical_col_tokens.
df_comp.output_df.loc[:, time_series_config.categorical_col_tokens].head()

In [None]:
# Display the input_df side of the comparison object for reference.
df_comp.input_df

In [None]:
# Plot the error for the pm2_5 column (the exact error metric is whatever
# hp.plot_col_error defines; confirm there).
hp.plot_col_error(df_comp, "pm2_5")

In [None]:
# Plot the pm2_5 column comparison from df_comp (presumably input vs. model
# output; confirm in hp.plot_col_comparison).
hp.plot_col_comparison(df_comp, "pm2_5")

Performance is mediocre.
