In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Clone the Pristi repository clone 
!git clone https://github.com/LMZZML/PriSTI.git

# Navigate into the directory
!cd PriSTI

# Install the required packages
!pip install -r /kaggle/working/PriSTI/requirements.txt



In [None]:
!cd PriSTI
!pip install -r /kaggle/working/PriSTI/requirements.txt

In [None]:
import sys

# Make the cloned PriSTI sources importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry.
if '/kaggle/working/PriSTI' not in sys.path:
    sys.path.append('/kaggle/working/PriSTI')

In [None]:
# Imports and paths 
import os, sys, json, yaml, pickle, numpy as np, pandas as pd, torch

# Paths – adjust if your layout differs
# Root of the cloned repository; the config and weight paths are derived from it.
PRISTI_ROOT = "/kaggle/working/PriSTI"
CONFIG_PATH = PRISTI_ROOT + "/config/base.yaml"
WEIGHTS_PATH = PRISTI_ROOT + "/save/aqi36/model.pth"   # expected inside the working tree
MEANSTD_PK = "/kaggle/input/airq36/pm25/pm25_meanstd.pk"  # shipped with the AQI-36 dataset

# The PriSTI modules are plain files under PRISTI_ROOT, so the directory has to
# be on the import path before `main_model` can be imported.
sys.path.append(PRISTI_ROOT)

from main_model import PriSTI_aqi36

In [None]:
import os, shutil
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the base PriSTI configuration once (it used to be read twice), then
# override the fields needed for conditional imputation on AQI-36.
with open(CONFIG_PATH, "r") as f:
    config = yaml.safe_load(f)

config["model"]["is_unconditional"] = False
config["model"]["target_strategy"] = "hybrid"
config["diffusion"]["adj_file"] = "AQI36"
config["seed"] = 42

# Setting config["seed"] alone does not visibly seed anything in this notebook,
# so seed the NumPy and PyTorch generators explicitly to make sampling repeatable.
np.random.seed(config["seed"])
torch.manual_seed(config["seed"])

# Load the statistics stored alongside the AQI-36 data.
# SECURITY: pickle.load can execute arbitrary code - only use it on a dataset
# file you trust.
with open(MEANSTD_PK, "rb") as f:
    meanstd = pickle.load(f)

# The pickle is indexed as (mean, std) here.
# NOTE(review): confirm this layout against the dataset's own loader.
mean = np.asarray(meanstd[0], dtype=np.float32)
std  = np.asarray(meanstd[1], dtype=np.float32)
# Replace zero standard deviations by 1 so that dividing by std_safe is always defined.
std_safe = np.where(std == 0, 1.0, std)

# Copy the sensor coordinate file to a path relative to the working directory.
# NOTE(review): presumably PriSTI reads it from ./data/pm25/SampleData - confirm.
os.makedirs("./data/pm25/SampleData", exist_ok=True)
shutil.copy(
    "/kaggle/input/airq36/pm25/SampleData/pm25_latlng.txt",
    "./data/pm25/SampleData/pm25_latlng.txt"
)

def scale_window(x_2d: np.ndarray) -> np.ndarray:
    """Standardise a window with the module-level `mean` and `std_safe`.

    `x_2d` is expected to have shape (T, N); the statistics are broadcast
    against it.
    """
    centered = x_2d - mean
    return centered / std_safe

def inv_scale_vec(x_1d: np.ndarray) -> np.ndarray:
    """Undo `scale_window` for a vector, using the module-level `mean` and `std_safe`.

    `x_1d` is expected to have shape (N,).
    """
    rescaled = x_1d * std_safe
    return rescaled + mean

# Build the PriSTI AQI-36 network on the selected device, restore the saved
# weights and switch to evaluation mode.
model = PriSTI_aqi36(config, DEVICE)
model = model.to(DEVICE)

state = torch.load(WEIGHTS_PATH, map_location=DEVICE)
model.load_state_dict(state)

model.eval()
print("PriSTI AQI-36 model loaded.")


In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import json


# Both time-series files are indexed by their parsed "datetime" column.
_ts_csv_kwargs = dict(index_col="datetime", parse_dates=True)

# Sensor metadata (provides the "sensor_id" column used for column ordering).
latlng = pd.read_csv("/kaggle/input/airq36/pm25/SampleData/pm25_latlng.txt")

# Readings with missing entries, and the corresponding ground-truth readings.
missing_df_raw = pd.read_csv(
    "/kaggle/input/airq36/pm25/SampleData/pm25_missing.txt", **_ts_csv_kwargs
)
ground_df = pd.read_csv(
    "/kaggle/input/airq36/pm25/SampleData/pm25_ground.txt", **_ts_csv_kwargs
)

def normalize_cols(cols):
    """Return the labels in `cols` as a list of canonical strings.

    Each label is converted to text, stripped of surrounding whitespace and
    of leading "0" characters, so that e.g. "001001" and 1001 map to the
    same identifier.
    """
    as_text = pd.Index(cols).astype(str)
    trimmed = as_text.str.strip()
    without_leading_zeros = trimmed.str.lstrip("0")
    return without_leading_zeros.tolist()

# Canonical sensor identifiers, in the order given by the lat/lng file.
sensor_ids = normalize_cols(latlng["sensor_id"])

# Bring the column labels of both frames into the same canonical form ...
for _frame in (missing_df_raw, ground_df):
    _frame.columns = normalize_cols(_frame.columns)

# ... and reorder the columns so that they follow sensor_ids.
missing_df_raw = missing_df_raw[sensor_ids]
ground_df      = ground_df[sensor_ids]

# Min-max scaler parameters, read from a JSON file in the working directory.
# NOTE(review): this notebook also loads mean/std statistics for AQI-36 but
# feeds the model with this min-max scaling instead - confirm that the
# pretrained weights were trained with the same normalisation.
with open("scaler_params.json") as f:
    sp = json.load(f)

min_val, max_val = sp["min_val"], sp["max_val"]
if max_val == min_val:
    # With numpy inputs a zero range would silently produce inf/NaN values.
    raise ValueError(
        "scaler_params.json: max_val equals min_val, min-max scaling is undefined"
    )


def scaler(x):
    """Scale raw values with the min-max parameters: (x - min) / (max - min)."""
    return (x - min_val) / (max_val - min_val)


def inv_scaler(x):
    """Invert `scaler`: map scaled values back to the raw value range."""
    return x * (max_val - min_val) + min_val

# Mask building: True marks an entry that was removed artificially and can
# therefore be scored against the ground truth.
try:
    artificial_mask = pd.read_csv(
        "/kaggle/input/airq36/pm25/SampleData/artificial_mask.csv",
        index_col="datetime", parse_dates=True
    ).astype(bool)
    # Normalise the labels the same way as for the other frames; otherwise
    # selecting by sensor_ids fails when the file keeps leading zeros.
    artificial_mask.columns = normalize_cols(artificial_mask.columns)
    artificial_mask = artificial_mask[sensor_ids]
except FileNotFoundError:
    # No mask file: derive it as "missing in the input but present in the ground truth".
    artificial_mask = missing_df_raw.isna() & ground_df.notna()


# Keep only the timestamps whose calendar month is one of the test months.
TEST_MONTHS = {3, 6, 9, 12}
month_mask = missing_df_raw.index.month.isin(TEST_MONTHS)

# The same row selection is applied to all three aligned frames.
missing_df_raw, ground_df, artificial_mask = [
    frame.loc[month_mask]
    for frame in (missing_df_raw, ground_df, artificial_mask)
]

print(f"After month filter: {len(missing_df_raw)} timestamps remain")

# Window selection: keep a window only when its last row holds at least one
# artificially missing value, i.e. when there is something to score.
EVAL_LEN = 36
T = len(missing_df_raw)

# After the month filter, rows of non-adjacent months sit next to each other.
# A window is therefore accepted only when its first and last timestamps fall
# in the same calendar month; otherwise it would splice two unrelated periods
# together. This assumes the index is sorted chronologically. As a consequence
# the first EVAL_LEN - 1 rows of each month can no longer end a window.
_stamps = missing_df_raw.index
_year_month = np.asarray(_stamps.year) * 12 + np.asarray(_stamps.month)
_row_has_artificial = artificial_mask.to_numpy(dtype=bool).any(axis=1)

valid_ends = [
    t
    for t in range(EVAL_LEN - 1, T)
    if _row_has_artificial[t] and _year_month[t] == _year_month[t - (EVAL_LEN - 1)]
]

print("Number of valid test windows:", len(valid_ends))


class PriSTITestDataset(Dataset):
    """Fixed-length evaluation windows over a frame with missing values.

    Parameters
    ----------
    data_df : DataFrame of shape [T, N] with NaN at missing entries.
    gt_df   : DataFrame of shape [T, N] with the ground-truth values.
    ends    : sequence of positional row indices; each one is the last row
              of a window.
    eval_len: number of rows per window.
    scaler  : callable applied to the float32 arrays built from both frames.

    Each item is a tuple ``(x, m, y)``:
    ``x`` - scaled, gap-filled window, transposed to [N, eval_len];
    ``m`` - observation mask of the window (1.0 = observed in ``data_df``),
            transposed to [N, eval_len];
    ``y`` - scaled ground truth of the window's last row, shape [N].

    Raises
    ------
    ValueError
        If ``eval_len`` is smaller than 1, or an entry of ``ends`` does not
        leave room for a full window inside ``data_df``.
    """

    def __init__(self, data_df, gt_df, ends, eval_len, scaler):
        if eval_len < 1:
            raise ValueError(f"eval_len must be >= 1, got {eval_len}")

        # A too-small end index would give a negative slice start and silently
        # return a window of the wrong content/length, so reject it up front.
        n_rows = len(data_df)
        out_of_range = [e for e in ends if not (eval_len - 1 <= e < n_rows)]
        if out_of_range:
            raise ValueError(
                f"window end indices out of range for eval_len={eval_len} "
                f"and {n_rows} rows: {out_of_range[:5]}"
            )

        # Forward/backward fill the gaps; a column that is missing everywhere
        # stays NaN after that, so it is set to 0.0 to keep the inputs finite
        # (those entries are flagged as unobserved in the mask anyway).
        filled = data_df.ffill().bfill().fillna(0.0)
        self.data_np   = filled.to_numpy(dtype=np.float32)            # [T, N]
        self.mask_np   = (~data_df.isna()).to_numpy(dtype=np.float32) # [T, N]
        # Missing ground truth is stored as 0.0; callers must not score it.
        self.gt_np     = gt_df.fillna(0.0).to_numpy(dtype=np.float32) # [T, N]

        self.scaled    = scaler(self.data_np)     # [T, N]
        self.gt_scaled = scaler(self.gt_np)       # [T, N]

        self.ends     = ends
        self.eval_len = eval_len

    def __len__(self):
        """Number of windows (one per entry of ``ends``)."""
        return len(self.ends)

    def __getitem__(self, idx):
        """Return ``(x, m, y)`` for the window ending at ``ends[idx]``."""
        end   = self.ends[idx]
        start = end - (self.eval_len - 1)

        window_x = self.scaled   [start : end+1]  # [eval_len, N]
        window_m = self.mask_np  [start : end+1]  # [eval_len, N]
        y_true   = self.gt_scaled[end]            # [N]

        # Transpose to the [N, T] layout used by the evaluation loop; copy()
        # gives torch a contiguous, independent buffer.
        x = torch.from_numpy(window_x.T.copy())
        m = torch.from_numpy(window_m.T.copy())
        y = torch.from_numpy(y_true.copy())

        return x, m, y


# cuDNN is switched off globally for the rest of the session.
torch.backends.cudnn.enabled = False

# One dataset item per valid window end; shuffle=False keeps the batches in
# the same order as valid_ends.
test_ds = PriSTITestDataset(missing_df_raw, ground_df, valid_ends, EVAL_LEN, scaler)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

print("Final dataset length:", len(test_ds))


In [None]:
import numpy as np
import torch
from sklearn.metrics import mean_absolute_error, mean_squared_error
import time
from tqdm import tqdm

# Evaluate the model on the last time step of every test window and report
# MAE / RMSE on the artificially removed entries only.
N_SAMPLES = 10  # number of samples requested from impute() per window

t0 = time.perf_counter()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

# Use the wrapped network when the object exposes one as `.model`,
# otherwise the object itself (looked up once instead of per batch).
inner = getattr(model, "model", model)

all_preds, all_truth, all_mask = [], [], []

with torch.no_grad():
    for x_batch, m_batch, y_batch in tqdm(test_loader, desc="Evaluating", unit="batch"):
        # Move the model inputs to the device; y_batch is only used on the CPU.
        x_batch = x_batch.to(device)
        m_batch = m_batch.to(device)

        # Descriptive local names: unpacking into B, N, T used to overwrite the
        # notebook-level `T` (number of timestamps) defined in the windowing cell.
        batch_size, n_nodes, n_steps = x_batch.shape

        # side_info: time positions 0..n_steps-1, repeated for every batch item
        tp = torch.arange(n_steps, device=device).unsqueeze(0).expand(batch_size, -1)
        side_info = inner.get_side_info(tp, m_batch)

        # An all-zero guide tensor is passed only when the model asks for one.
        itp_info = None
        if getattr(inner, "use_guide", False):
            itp_info = torch.zeros(
                (batch_size, 1, n_nodes, n_steps), device=device, dtype=torch.float32
            )

        # Model inference
        y_pred = inner.impute(
            x_batch, m_batch, side_info,
            n_samples=N_SAMPLES,
            itp_info=itp_info
        )

        # A 4-D result is treated as carrying a sample axis at dim 1, which is
        # averaged away. NOTE(review): confirm the axis order returned by impute().
        if y_pred.dim() == 4:
            y_pred = y_pred.mean(dim=1)

        # Extract the values of the last time step and map them back to raw units.
        last_scaled = y_pred[:, :, -1]                      # [B, N]
        preds       = inv_scaler(last_scaled.cpu().numpy()) # [B, N]
        truth       = inv_scaler(y_batch.cpu().numpy())     # [B, N]
        mask_last   = m_batch[:, :, -1].cpu().numpy()       # [B, N]

        all_preds.append(preds)
        all_truth.append(truth)
        all_mask.append(mask_last)

if not all_preds:
    raise RuntimeError("test_loader produced no batches; nothing to evaluate")

# Stacking
all_preds = np.vstack(all_preds)  # [num_windows, N]
all_truth = np.vstack(all_truth)  # [num_windows, N]
all_mask  = np.vstack(all_mask)   # [num_windows, N]

# Evaluate only on artificially missing values that have a ground truth.
# The observation mask alone is not enough: entries that are missing in the
# ground truth as well were stored as 0.0 by the dataset and would be scored
# against that placeholder. The loader is not shuffled, so row i of the
# stacked arrays belongs to valid_ends[i].
end_idx = np.asarray(valid_ends, dtype=np.int64)
if len(end_idx) != all_mask.shape[0]:
    raise RuntimeError(
        f"{all_mask.shape[0]} evaluated windows but {len(end_idx)} entries in valid_ends"
    )
artificial_last = artificial_mask.to_numpy(dtype=bool)[end_idx]
has_truth       = ground_df.notna().to_numpy()[end_idx]
miss_pos = (all_mask == 0) & artificial_last & has_truth
if not miss_pos.any():
    raise RuntimeError("no artificially missing entries with ground truth to score")

mae  = mean_absolute_error(all_truth[miss_pos], all_preds[miss_pos])
rmse = np.sqrt(mean_squared_error(all_truth[miss_pos], all_preds[miss_pos]))

t1 = time.perf_counter()
print(f"Total inference time: {t1 - t0:.3f} seconds")
print(f"PriSTI Test MAE:  {mae:.4f}")
print(f"PriSTI Test RMSE: {rmse:.4f}")
# When running on a cluster, return or write out every result you need.