In [1]:

import pywt
from copy import deepcopy
from ttknn.light_utility import Utility
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import sys
import os
parent_dir = os.path.join(os.getcwd(), '..')
if parent_dir not in sys.path: sys.path.append(parent_dir)
from torch import nn
from mkit.torch_support.tensor_utils import (
    sequential_x_y_split, xy_to_tensordataset,
)
from mkit.torch_support.nn_utils import training_loop
import geobleu
from IPython.display import clear_output
# from model.CNN import FlexibleCNN
from model.NN import NN
from module.utility import LabelEncoder
from dotenv import load_dotenv

In [2]:

df = pd.read_csv('./cityD-dataset.csv')

In [3]:
ax_series = df['x'].values
ay_series = df['y'].values
def wavelet_denoise(data, wavelet='db4', level=None, alpha=1.0):
    coeffs = pywt.wavedec(data, wavelet, mode="per", level=level)
    sigma = np.median(np.abs(coeffs[-1])) / 0.6745
    # Multiply the universal threshold by "alpha" (explained below)
    uthresh = alpha * sigma * np.sqrt(2 * np.log(len(data))) 
    
    # Soft-threshold detail coefficients
    coeffs[1:] = [pywt.threshold(c, value=uthresh, mode='soft') for c in coeffs[1:]]
    
    return pywt.waverec(coeffs, wavelet, mode="per")

# Example usage
ax_denoised = wavelet_denoise(ax_series, wavelet='db4', level=5, alpha=5)[:len(ax_series)]
ay_denoised = wavelet_denoise(ay_series, wavelet='db4', level=5, alpha=5)[:len(ay_series)]
df['denoised_x'] = np.round(ax_denoised).astype(int)
df['denoised_y'] = np.round(ay_denoised).astype(int)


In [94]:


# Load .env variables
load_dotenv()
SPLIT_DATE = 60
END_DATE = 75
NUM_OF_TIMESTAMPS = 48
SAMPLE_NUM = 3_000
VOCAB=40401
LENGTHS = [NUM_OF_TIMESTAMPS * 7]
EPOCHS = 20
DEVICE = torch.device("cuda")
uid = 35
uid_df = df[df.uid == uid]
train_df, test_df = uid_df[uid_df.d < 60], uid_df[uid_df.d > 60]

encoder = LabelEncoder()
train_template = pd.MultiIndex.from_product([train_df.uid.unique(), range(SPLIT_DATE), range(48)])
train_template = pd.DataFrame(index=train_template).reset_index()
train_template.columns = ['uid', 'd', 't']
test_template = pd.DataFrame(index=pd.MultiIndex.from_product([range(SPLIT_DATE, END_DATE), range(NUM_OF_TIMESTAMPS)])).reset_index()
test_template.columns = ['d', 't']
train_df = train_template.merge(train_df, on=['uid', 'd', 't'], how='left')
train_df = train_df.fillna(method='ffill').fillna(method='bfill')
'''Inference'''
AHEAD = test_template.d.unique().__len__() * test_template.t.unique().__len__() # length of target sequences
'''Training Data Processing'''
tmp_train_df = train_df[train_df.columns[-2:].values].copy()
tmp_train_df = tmp_train_df.rename(columns={'denoised_x': "x", "denoised_y": 'y'})
seq_train = train_df.dropna().apply(encoder.transform, axis=1)

In [95]:

class SingleNN(nn.Module):
    def __init__(self, window_size, embed_dim, vocab):
        super(SingleNN, self).__init__()

        self.embed = nn.Embedding(vocab, embed_dim)
        self.window_size = window_size
        self.net = nn.Sequential(
            nn.Linear(embed_dim * window_size, 16),
            nn.LayerNorm(16),
            nn.Tanh(),
            nn.Linear(16, 32),
            nn.LayerNorm(32),
            nn.Tanh(),
            nn.Linear(32, vocab),
        )
    def forward(self, x):
        x = self.embed(x)
        x = x.reshape(len(x), -1)
        x = self.net(x)
        return x
def get_model(df, look_back, col='x'):

    train_x, train_y = sequential_x_y_split(
        df[col].values,
        look_back=look_back,
    )
    train_y = train_y.ravel()
    model = SingleNN(train_x.shape[1], 100, VOCAB)
    train_loader = xy_to_tensordataset(train_x, train_y, return_loader=True)
    model = model.cuda()
    final_segment = train_x[-1]
    losses = []
    optim = torch.optim.Adamax(model.parameters())
    criterion = nn.CrossEntropyLoss()

    for epoch in range(20):
        avg_loss = 0.0
        for loader in train_loader:
            x, y = loader
            x = x.cuda().to(torch.int64)
            y = y.cuda().to(torch.int64)
            out = model(x)
            loss = criterion(out, y)
            optim.zero_grad()
            loss.backward()
            optim.step()
            avg_loss += loss.item()
        losses.append(avg_loss / len(train_loader))
    return model, final_segment

x_model, x_final_segment = get_model(LENGTHS[0], col='x')
y_model, y_final_segment = get_model(LENGTHS[0], col='y')


TypeError: get_model() missing 1 required positional argument: 'look_back'

In [89]:

import torch
from tqdm import tqdm

def predict_sequence(model, initial_tokens, ahead, device):
    """
    Generate 'ahead' tokens from the 'model' given an initial token sequence.
    
    Args:
        model (nn.Module): The trained model.
        initial_tokens (list or np.array): The initial token sequence.
        ahead (int): Number of tokens to predict.
        device (torch.device): The device (CPU or GPU).
    
    Returns:
        list: A list of predicted tokens.
    """

    # Convert initial tokens to a tensor of shape (1, sequence_length)
    in_x = torch.tensor(initial_tokens, dtype=torch.int64).to(device).unsqueeze(0)
    print("Initial in_x shape:", in_x.shape)
    
    sequences = []
    for _ in tqdm(range(ahead)):
        # Forward pass to get the model's output
        out = model(in_x)
        
        # Take the index of the most likely token
        out_token = torch.argmax(out, dim=1).unsqueeze(0)
        in_x = torch.concat([in_x, out_token], dim=1)[:, 1:]
        
        # Store the predicted token
        sequences.append(out_token[0].item())
    
    return sequences
x_segments = predict_sequence(x_model, x_final_segment, AHEAD, DEVICE)
y_segments = predict_sequence(y_model, y_final_segment, AHEAD, DEVICE)
tmp_template = test_template
tmp_template['x'] = x_segments
tmp_template['y'] = y_segments
tmp_template = test_df[['uid', 'd', 't']].merge(tmp_template, on=['d', 't'], how='left')

dtw_score = geobleu.calc_dtw(
    Utility.to_eval_format(tmp_template),
    Utility.to_eval_format(test_df)
)
dtw_score

Initial in_x shape: torch.Size([1, 336])


100%|██████████| 720/720 [00:00<00:00, 786.57it/s]


Initial in_x shape: torch.Size([1, 336])


100%|██████████| 720/720 [00:00<00:00, 743.37it/s]


113.20664650155058

154.30421415852805

In [None]:
raise Exception
length = LENGTHS[0]



train_x, train_y = sequential_x_y_split(
    seq_train.values,
    look_back=length,
)
train_y = train_y.ravel()
train_x.shape, train_y.shape
# 3) Convert to Torch loaders
train_loader, val_loader = xy_to_tensordataset(
    train_x,
    train_y,
    shuffle=False,
    input_dtype=torch.float,
    output_dtype=torch.long,
    return_loader=True,
    val_ratio=0.2
)
nn_model = NN(
    input_size=length,
    start_dim=32,
    n_layers=5,
    output_size=VOCAB
)
# 4) Train the model
training_loop(
    nn_model,
    device=DEVICE,
    train_loader=train_loader,
    optimizer=torch.optim.Adam(nn_model.parameters()),
    criterion=torch.nn.CrossEntropyLoss(),
    epochs=10,
    val_loader=val_loader,
    early_stopping=True
)

# If you're in a notebook, you might want to clear the output each iteration:
# clear_output(wait=True)
print(f"Finished training for length={length}. Now predicting...")

# 5) Model prediction
nn_model.eval()
nn_model.cpu()

sequences = []
# Start input from the last training example
in_x = torch.tensor(np.array([train_x[-1]]))

for _ in tqdm(range(AHEAD)):
    # Forward pass
    out = torch.argmax(nn_model(in_x.float()), dim=1).unsqueeze(-1)
    # Append the newly predicted token
    in_x = torch.concat([in_x, out], dim=1)[:, 1:]
    sequences.append(out[0])

# Decode the entire generated sequence
predicted_tokens = torch.concat(sequences).tolist()
target_val = [encoder.decode(i) for i in predicted_tokens]

# 6) Format the prediction output
tmp_df = test_template.copy()
tmp_df[['x', 'y']] = np.array(target_val)

test_uid_df = test_df[test_df.uid == uid]  # Testing subset for current uid
sub_df = test_uid_df[['d', 't']].merge(tmp_df, how='left')

# 7) Calculate score (DTW) and store results
dtw_score = geobleu.calc_dtw(
    Utility.to_eval_format(sub_df),
    Utility.to_eval_format(test_uid_df)
)

Exception: 

In [None]:
def toxy(data):
    return data.values[:, 0], data.values[:, 1]
x, y = toxy(test_uid_df[['x', 'y']])
plt.scatter(x, y)
x, y = toxy(sub_df[['x', 'y']])
plt.scatter(x, y, s=10)

In [None]:
def toxy(data):
    return data.values[:, 0], data.values[:, 1]
x, y = toxy(test_uid_df[['x', 'y']])
plt.scatter(x, y)
x, y = toxy(sub_df[['x', 'y']])
plt.scatter(x, y, s=10)

In [None]:

results = []  # List to store outcomes for each length

for length in LENGTHS:
    # 1) Define the NN model
    nn_model = NN(
        input_size=length,
        start_dim=32,
        n_layers=5,
        output_size=VOCAB
    )

    # 2) Prepare the training data
    train_x, train_y = sequential_x_y_split(
        seq_train.values,
        look_back=length,
    )
    train_y = train_y.ravel()

    # 3) Convert to Torch loaders
    train_loader, val_loader = xy_to_tensordataset(
        train_x,
        train_y,
        shuffle=False,
        input_dtype=torch.float,
        output_dtype=torch.long,
        return_loader=True,
        val_ratio=0.2
    )

    # 4) Train the model
    training_loop(
        nn_model,
        device=DEVICE,
        train_loader=train_loader,
        optimizer=torch.optim.Adam(nn_model.parameters()),
        criterion=torch.nn.CrossEntropyLoss(),
        epochs=EPOCHS,
        val_loader=val_loader,
        early_stopping=True
    )

    # If you're in a notebook, you might want to clear the output each iteration:
    # clear_output(wait=True)
    print(f"Finished training for length={length}. Now predicting...")

    # 5) Model prediction
    nn_model.eval()
    nn_model.cpu()

    sequences = []
    # Start input from the last training example
    in_x = torch.tensor(np.array([train_x[-1]]))

    for _ in tqdm(range(AHEAD)):
        # Forward pass
        out = torch.argmax(nn_model(in_x.float()), dim=1).unsqueeze(-1)
        # Append the newly predicted token
        in_x = torch.concat([in_x, out], dim=1)[:, 1:]
        sequences.append(out[0])

    # Decode the entire generated sequence
    predicted_tokens = torch.concat(sequences).tolist()
    target_val = [encoder.decode(i) for i in predicted_tokens]

    # 6) Format the prediction output
    tmp_df = test_template.copy()
    tmp_df[['x', 'y']] = np.array(target_val)

    test_uid_df = test_df[test_df.uid == uid]  # Testing subset for current uid
    sub_df = test_uid_df[['d', 't']].merge(tmp_df, how='left')

    # 7) Calculate score (DTW) and store results
    dtw_score = geobleu.calc_dtw(
        Utility.to_eval_format(sub_df),
        Utility.to_eval_format(test_uid_df)
    )

    # Store everything of interest in a dictionary
    iteration_result = {
        'length': length,
        'predicted_sequence': target_val,
        'dtw_score': dtw_score
    }
    results.append(iteration_result)

# After the loop, you can analyze all results:
# for r in results:
#     print(f"Length: {r['length']}, DTW: {r['dtw_score']}, Sequence sample: {r['predicted_sequence'][:5]}")

# Or convert to a pandas DataFrame if you like:
# import pandas as pd
# results_df = pd.DataFrame(results)
# display(results_df)


In [None]:
out.shape, in_x.shape