## Define the model




In [1]:
%%capture installs
%pip install polars
%pip install torchviz
# %conda install -y conda install -c  python-graphviz
# %conda install -c fastchan python-graphviz -y

In this tutorial, we train a ``nn.TransformerEncoder`` model on a
language modeling task. The language modeling task is to assign a
probability for the likelihood of a given word (or a sequence of words)
to follow a sequence of words. A sequence of tokens are passed to the embedding
layer first, followed by a positional encoding layer to account for the order
of the word (see the next paragraph for more details). The
``nn.TransformerEncoder`` consists of multiple layers of
[nn.TransformerEncoderLayer](https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html)_.
Along with the input sequence, a square attention mask is required because the
self-attention layers in ``nn.TransformerEncoder`` are only allowed to attend
the earlier positions in the sequence. For the language modeling task, any
tokens on the future positions should be masked. To produce a probability
distribution over output words, the output of the ``nn.TransformerEncoder``
model is passed through a linear layer followed by a log-softmax function.




In [2]:
import math
import os
import time
from tempfile import TemporaryDirectory
from typing import Tuple

import re
from numbers import Number

from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from tqdm import trange, tqdm

import torch
import polars as pl
import numpy as np

from torch.utils.data import DataLoader, Dataset

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset



In [3]:
torch.__version__

'2.1.0.dev20230623'

In [4]:
if torch.backends.mps.is_built():
    device_name = 'mps'
elif torch.cuda.is_available():
    device_name = 'cuda'
else:
    device_name = 'cpu'
device = torch.device(device_name)
print(device)

mps


In [5]:
weather = pl.read_parquet("~/Hephaestus/data/weather_clean.parquet")
weather.head()


x,y,station_name,climate_identifier,province_code,local_year,local_month,local_day,local_hour,temp,temp_flag,dew_point_temp,dew_point_temp_flag,humidex,precip_amount,precip_amount_flag,relative_humidity,relative_humidity_flag,station_pressure,station_pressure_flag,wind_chill,wind_direction,wind_direction_flag,wind_speed,wind_speed_flag
f64,f64,str,str,str,str,str,str,str,f64,str,f64,str,f64,f64,str,f64,str,f64,str,f64,f64,str,f64,str
-114.000297,51.109447,"""CALGARY INT'L …","""3031094""","""AB""","""2010""","""1""","""1""","""0""",-21.6,"""missing""",-23.9,"""missing""",,,"""missing""",82.0,"""missing""",89.38,"""missing""",,,"""M""",,"""M"""
-114.000297,51.109447,"""CALGARY INT'L …","""3031094""","""AB""","""2010""","""1""","""1""","""1""",-21.2,"""missing""",-23.5,"""missing""",,,"""missing""",82.0,"""missing""",89.25,"""missing""",,,"""M""",,"""M"""
-114.000297,51.109447,"""CALGARY INT'L …","""3031094""","""AB""","""2010""","""1""","""1""","""2""",-20.8,"""missing""",-23.0,"""missing""",,,"""missing""",82.0,"""missing""",89.21,"""missing""",,,"""M""",,"""M"""
-114.000297,51.109447,"""CALGARY INT'L …","""3031094""","""AB""","""2010""","""1""","""1""","""3""",-20.4,"""missing""",-22.6,"""missing""",,,"""missing""",83.0,"""missing""",89.12,"""missing""",,,"""M""",,"""M"""
-114.000297,51.109447,"""CALGARY INT'L …","""3031094""","""AB""","""2010""","""1""","""1""","""4""",-20.4,"""missing""",-22.7,"""missing""",,,"""missing""",82.0,"""missing""",89.04,"""missing""",,,"""M""",,"""M"""


``PositionalEncoding`` module injects some information about the
relative or absolute position of the tokens in the sequence. The
positional encodings have the same dimension as the embeddings so that
the two can be summed. Here, we use ``sine`` and ``cosine`` functions of
different frequencies.




In [6]:
def scale_numeric(df):
    for col in df.columns:
        if df[col].dtype == pl.Float64 or df[col].dtype == pl.Int64:
            df = df.with_columns(
                ((pl.col(col) - pl.col(col).mean()) / pl.col(col).std()).alias(col)
            )  # .select(pl.col(["dew_point_temp", "NewCOL"]))
    return df


weather = scale_numeric(weather)


In [7]:
def make_lower_remove_special_chars(df):
    df = df.with_columns(
        pl.col(pl.Utf8).str.to_lowercase().str.replace_all("[^a-zA-Z0-9]", " ")
    )
    return df


weather = make_lower_remove_special_chars(weather)

In [49]:
weather.select(pl.col(pl.Utf8)).columns

['station_name',
 'climate_identifier',
 'province_code',
 'local_year',
 'local_month',
 'local_day',
 'local_hour',
 'temp_flag',
 'dew_point_temp_flag',
 'precip_amount_flag',
 'relative_humidity_flag',
 'station_pressure_flag',
 'wind_direction_flag',
 'wind_speed_flag']

In [60]:
x = weather["station_name"].unique().to_numpy()
np.concatenate([np.array(i.split(" ")) for i in x])

array(['lethbridge', 'cda', 'fort', 'mcmurray', 'cs', 'pincher', 'creek',
       'climate', 'sundre', 'a', 'edmonton', 'international', 'cs',
       'calgary', 'int', 'l', 'cs'], dtype='<U13')

In [61]:
def get_unique_utf8_values(df):
    arr = np.array([])
    for col in df.select(pl.col(pl.Utf8)).columns:
        col_vals = df[col].unique().to_numpy()
        arr = np.append(arr, np.concatenate([np.array(val.split(" ")) for val in col_vals]))

    return np.unique(arr)


weather_val_tokens = get_unique_utf8_values(weather)
weather_val_tokens

array(['0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18',
       '19', '2', '20', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019', '2020', '2021', '2022', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', '3', '30', '3012206',
       '3026knq', '3031094', '3033890', '3035208', '3062696', '31', '4',
       '5', '6', '7', '8', '9', 'a', 'ab', 'calgary', 'cda', 'climate',
       'creek', 'cs', 'edmonton', 'fort', 'int', 'international', 'l',
       'lethbridge', 'm', 'mcmurray', 'missing', 'pincher', 'sundre'],
      dtype='<U32')

In [62]:
def get_col_tokens(df):
    tokens = []
    for col_name in df.columns:
        sub_strs = re.split(r"[^a-zA-Z0-9]", col_name)
        tokens.extend(sub_strs)
    return np.unique(np.array(tokens))


weather_col_tokens = get_col_tokens(weather)
weather_col_tokens

array(['amount', 'chill', 'climate', 'code', 'day', 'dew', 'direction',
       'flag', 'hour', 'humidex', 'humidity', 'identifier', 'local',
       'month', 'name', 'point', 'precip', 'pressure', 'province',
       'relative', 'speed', 'station', 'temp', 'wind', 'x', 'y', 'year'],
      dtype='<U10')

In [10]:
special_tokens = np.array(
    [
        "missing",
        "<mask>",
        "<pad>",
        "<unk>",
        "<numeric>",
        ":",
        ",",
        "<row-start>",
        "<row-end>",
    ]
)
tokens = np.unique(
    np.concatenate(
        (
            weather_val_tokens,
            weather_col_tokens,
            special_tokens,
        )
    )
)
tokens


array([',', '0', '1', '10', '11', '12', '13', '14', '15', '16', '17',
       '18', '19', '2', '20', '2010', '2011', '2012', '2013', '2014',
       '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022',
       '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30',
       '3012206', '3026knq', '3031094', '3033890', '3035208', '3062696',
       '31', '4', '5', '6', '7', '8', '9', ':', '<mask>', '<numeric>',
       '<pad>', '<row-end>', '<row-start>', '<unk>', 'ab', 'amount',
       'calgary int l cs', 'chill', 'climate', 'code', 'day', 'dew',
       'direction', 'edmonton international cs', 'flag',
       'fort mcmurray cs', 'hour', 'humidex', 'humidity', 'identifier',
       'lethbridge cda', 'local', 'm', 'missing', 'month', 'name',
       'pincher creek climate', 'point', 'precip', 'pressure', 'province',
       'relative', 'speed', 'station', 'sundre a', 'temp', 'wind', 'x',
       'y', 'year'], dtype=object)

In [11]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 100_000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [12]:
@dataclass
class StringNumeric:
    value: Union[str, float]
    # all_tokens: np.array
    is_numeric: bool = field(default=None, repr=True)
    embedding_idx: int = field(default=None, repr=True)
    is_special: bool = field(default=False, repr=True)
    def __post_init__(self):
        if isinstance(self.value, str):
            self.is_numeric = False
        else:
            self.is_numeric = True
            self.embedding_idx = 0

    def gen_embed_idx(self, tokens: np.array, special_tokens: np.array):
        if not self.is_numeric:
            try:
                self.embedding_idx = np.where(tokens == self.value)[0][0] + 1
            except IndexError:
                self.embedding_idx = np.where(tokens == "<unk>")[0][0] + 1
            if self.value in special_tokens:
                self.is_special = True


In [13]:
class TabularDataset(Dataset):
    # def __init__(self, df: pl.DataFrame, vocab_dict: Dict, m_dim: int) -> Dataset:
    def __init__(
        self,
        df: pl.DataFrame,
        vocab,
        special_tokens: np.array,
        shuffle_cols=False,
        max_row_length=512,
    ) -> Dataset:
        self.df = df
        self.vocab = vocab
        self.special_tokens = special_tokens
        self.vocab_len = vocab.shape[0]
        self.shuffle_cols = shuffle_cols
        self.max_row_length = max_row_length
        # self.vocab_dict = vocab_dict
        # self.embedding = nn.Embedding(len(self.string_vocab), m_dim)
        # Numeric Scale

        # self.col_vocab = self.df.columns

    def __len__(self):
        """Returns the number of sequences in the dataset."""
        length = self.df.shape[0]
        return length

    def __getitem__(self, idx):
        """Returns a tuple of (input, target) at the given index."""
        row = self.df[idx]
        row = self.splitter(row)
        return row



    def splitter(self, row: pl.DataFrame) -> List[Union[str, float, None]]:
        vals = ["<row-start>"]
        cols = row.columns
        if self.shuffle_cols:
            np.random.shuffle(cols)

        for col in cols:
            value = row[col][0]
            col = col.split("_")
            vals.extend(col)
            vals.append(":")
            if isinstance(value, Number):
                vals.append(value)
            elif value is None:
                vals.append("missing")
                # Nones are only for numeric columns, others are "None"
            elif isinstance(value, str):
                vals.extend(value.split(" "))
            else:
                raise ValueError("Unknown type")
            vals.append(",")
        vals.append("<row-end>")

        val_len = len(vals)
        if val_len < self.max_row_length:
            diff = self.max_row_length - val_len
            vals.extend(["<pad>"] * diff)
        elif val_len > self.max_row_length:
            vals = vals[:self.max_row_length - 1]
            # add warning

            vals = np.append(vals, ["<row-end>"])
            print("Row too long, truncating")
            Warning("Row too long, truncating")
        vals = [StringNumeric(value=val) for val in vals]
        for val in vals:
            val.gen_embed_idx(self.vocab, self.special_tokens)

        return vals


weather_ds = TabularDataset(weather, tokens, special_tokens=special_tokens, shuffle_cols=False, max_row_length=140 )

print(len(weather_ds[0]))

140


In [14]:
class StringNumericEmbedding(nn.Module):
    def __init__(self, n_token: int, d_model: int):
        super().__init__()
        self.embedding = nn.Embedding(n_token+1, d_model, padding_idx=0).to(device)

    def forward(self, input: StringNumeric):
        embedding_index = torch.tensor([i.embedding_idx for i in input]).to(device)
        embed = self.embedding(embedding_index)
        with torch.no_grad():
            for idx, value in enumerate(input):
                if value.is_numeric:
                    embed[idx][0] = value.value
        return embed


In [15]:
def mask_row(row):
    row = row[:]
    prob = 0.15
    for idx, val in enumerate(row):
        if val.is_special:
            continue
        if np.random.rand() < prob:
            val = StringNumeric(value="<mask>")
            val.gen_embed_idx(tokens, special_tokens)
            row[idx] = val
    return row


## Load and batch data




In [16]:
def batch_data(ds, idx: int, n_row=4):
    target = []
    if len(ds) > n_row+idx:
        end_idx = n_row+idx
    else:
        end_idx = len(ds) - 1
    for i in range(idx, end_idx):
        target.extend(ds[i])
    
    batch = mask_row(target)

    return batch, target


The model hyperparameters are defined below. The ``vocab`` size is
equal to the length of the vocab object.




In [17]:
data, targets = batch_data(weather_ds, idx=635650, n_row=50)
print(len(data), len(targets))
# data[0]

1820 1820


In [18]:

class TransformerModel(nn.Module):

    def __init__(self, n_token: int, d_model: int, n_head: int, d_hid: int,
                n_layers: int, dropout: float = 0.15):
        super().__init__()
        n_token = n_token +1
        self.n_token = n_token
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, n_head, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, n_layers)
        self.encoder = StringNumericEmbedding(n_token, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, n_token)
        self.numeric_decoder = nn.Linear(d_model, n_token)
        self.numeric_flattener = nn.Linear(n_token, 1)

        # self.numeric_decoder = nn.Linear(d_model)


        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.embedding.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
        # self.numeric_decoder.data.uniform(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        # src_shape = src.shape
        # print(f"raw src_shape: {len(src)}")
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = torch.unsqueeze(src, dim=1)
        # print(f"encoded src_shape: {src.shape}")

        src_shape = src.shape
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        # print(f"output_shape: {output.shape}")
        numeric_output = self.numeric_decoder(output)  # .flatten()
        numeric_output = torch.squeeze(numeric_output, dim=1)
        # numeric_output = torch.mean(numeric_output, [1])
        numeric_output = self.numeric_flattener(numeric_output)
        # numeric_output = nn.flatten(numeric_output)
        output = self.decoder(output)
        output = torch.squeeze(output, dim=1)
        numeric_output = numeric_output.view(output.shape[0])

        # print(f"output_shape decoded: {output.shape}")
        # output = output.view(-1, self.n_token+1)
        # output = output.view(-1, src_shape[0]).T
        # print(f"output_shape view: {output.shape}")

        return output, numeric_output


In [19]:
n_token = len(weather_ds.vocab)  # size of vocabulary
d_model = 32  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in ``nn.TransformerEncoder``
n_layers = 4  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
n_head = 4  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability
model = TransformerModel(n_token, d_model, n_head, d_hid, n_layers, dropout).to(device)
model.load_state_dict(torch.load("model_path.pt"))
model.eval()



TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
        )
        (linear1): Linear(in_features=32, out_features=200, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=200, out_features=32, bias=True)
        (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (encoder): StringNumericEmbedding(
    (embedding): Embedding(97, 32, padding_idx=0)
  )
  (decoder): Linear(in_features=32, out_features=96, bias=True)
  (numeric_decoder): Linear(in

## Run the model




In [20]:
def custom_loss(class_preds, numeric_preds, raw_data):
    cross_entropy = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()
    raw_data_numeric_class = raw_data[:]
    
    for idx, val in enumerate(raw_data_numeric_class):
        if val.is_numeric:
            val = StringNumeric(value="<numeric>")
            val.gen_embed_idx(tokens, special_tokens)
            raw_data_numeric_class[idx] = val

    class_target = torch.tensor([i.embedding_idx for i in raw_data_numeric_class]).to(device)
    class_loss = cross_entropy(class_preds, class_target)
    
    actual_num_idx = torch.tensor([idx for idx, j in enumerate(raw_data) if j.is_numeric]).to(device)
    pred_nums = numeric_preds[actual_num_idx]
    # print(actual_num_idx.shape)
    actual_nums = torch.tensor([i.value for i in raw_data if i.is_numeric]).to(device)
    # print(actual_nums.shape)
    # print(pred_nums.shape)
    reg_loss = mse_loss(pred_nums, actual_nums)
    reg_loss_adjuster = 1 # class_loss/reg_loss
    
    return reg_loss*reg_loss_adjuster+class_loss, {"reg_loss": reg_loss,
                                                  "class_loss": class_loss}



In [21]:
data, targets = batch_data(weather_ds, idx=0, n_row=3)
print(len(data), len(targets))
# data[0]

420 420


In [22]:
cat, num = model(data)


In [23]:
def custom_loss_manual(class_preds, numeric_preds, raw_data):
    cross_entropy = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()
    raw_data_numeric_class = raw_data[:]
    
    for idx, val in enumerate(raw_data_numeric_class):
        if val.is_numeric:
            val = StringNumeric(value="<numeric>")
            val.gen_embed_idx(tokens, special_tokens)
            raw_data_numeric_class[idx] = val

    class_target = torch.tensor([i.embedding_idx for i in raw_data_numeric_class]).to(device)
    class_loss = cross_entropy(class_preds, class_target)
    
    actual_num_idx = torch.tensor([idx for idx, j in enumerate(raw_data) if j.is_numeric]).to(device)
    pred_nums = numeric_preds[actual_num_idx]
    # print(actual_num_idx.shape)
    actual_nums = torch.tensor([i.value for i in raw_data if i.is_numeric]).to(device)
    # print(actual_nums.shape)
    # print(pred_nums.shape)
    reg_loss = mse_loss(pred_nums, actual_nums)
    reg_loss_adjuster = 1 # class_loss/reg_loss
    
    return reg_loss*reg_loss_adjuster+class_loss, {"reg_loss": reg_loss,
                                                  "class_loss": class_loss}

custom_loss_manual(cat, num, targets)

(tensor(0.4961, device='mps:0', grad_fn=<AddBackward0>),
 {'reg_loss': tensor(0.1621, device='mps:0', grad_fn=<MseLossBackward0>),
  'class_loss': tensor(0.3340, device='mps:0', grad_fn=<NllLossBackward0>)})

In [26]:
lsm = nn.Softmax(dim=0)
l_cat = lsm(cat.T)
l_cat = torch.argmax(l_cat, dim=0)

In [27]:
gen_tokens = []
for idx, pred in enumerate(l_cat):
    token = tokens[pred-1]
    if token == "<numeric>":
        gen_tokens.append("num_" + str(num[idx].item()))
    else:
        gen_tokens.append(token)
preds = " ".join(gen_tokens)
print(f"""Predicted Row 1:\n\n{preds.split("<row-end>")[0]}""")

Predicted Row 1:

<row-start> x : num_-0.4832974970340729 , num_-0.48560890555381775 : num_-0.47894158959388733 , station name : <unk> <unk> num_-0.47448650002479553 <unk> , climate identifier : 3031094 , province code : ab , local year : num_-0.4874148666858673 , num_-0.4813879430294037 month : 1 , local day : 1 , local hour : 0 , temp : num_-1.9443422555923462 , temp num_-0.47571173310279846 : missing , dew point temp : num_-0.4813378155231476 , dew point temp flag : missing , num_-0.4862686097621918 : missing , precip amount : missing , precip num_-0.4852672517299652 flag : missing , relative humidity : num_0.4904561936855316 , relative humidity num_-0.4821159541606903 : missing , station num_-0.47669723629951477 : num_-0.45177289843559265 , station pressure num_-0.4791191518306732 : missing , wind chill : missing , wind num_-0.4888112246990204 : missing , num_-0.47605130076408386 direction flag : m , wind num_-0.4834161102771759 : missing , wind speed flag : m , 


In [28]:
actuals = [str(i.value) for i in targets]
actuals_ = " ".join(actuals)
print(f"""Actual Row 1:\n\n{actuals_.split("<row-end>")[0]}""")

Actual Row 1:

<row-start> x : -0.551099305737714 , y : -0.37817406811183396 , station name : calgary int l cs , climate identifier : 3031094 , province code : ab , local year : 2010 , local month : 1 , local day : 1 , local hour : 0 , temp : -2.034580198428538 , temp flag : missing , dew point temp : -2.0356676557539686 , dew point temp flag : missing , humidex : missing , precip amount : missing , precip amount flag : missing , relative humidity : 0.6997231826522904 , relative humidity flag : missing , station pressure : -0.39734550502172294 , station pressure flag : missing , wind chill : missing , wind direction : missing , wind direction flag : m , wind speed : missing , wind speed flag : m , 


In [38]:
t1 = targets[0]

In [42]:
t1.embedding_idx

58

In [44]:
actual_idx = [str(i.value) + "=>" + str(i.embedding_idx) for i in targets]


In [45]:
actual_idx

['<row-start>=>58',
 'x=>93',
 ':=>53',
 '-0.551099305737714=>0',
 ',=>1',
 'y=>94',
 ':=>53',
 '-0.37817406811183396=>0',
 ',=>1',
 'station=>89',
 'name=>81',
 ':=>53',
 'calgary=>59',
 'int=>59',
 'l=>59',
 'cs=>59',
 ',=>1',
 'climate=>64',
 'identifier=>75',
 ':=>53',
 '3031094=>42',
 ',=>1',
 'province=>86',
 'code=>65',
 ':=>53',
 'ab=>60',
 ',=>1',
 'local=>77',
 'year=>95',
 ':=>53',
 '2010=>16',
 ',=>1',
 'local=>77',
 'month=>80',
 ':=>53',
 '1=>3',
 ',=>1',
 'local=>77',
 'day=>66',
 ':=>53',
 '1=>3',
 ',=>1',
 'local=>77',
 'hour=>72',
 ':=>53',
 '1=>3',
 ',=>1',
 'temp=>91',
 ':=>53',
 '-2.002552022767961=>0',
 ',=>1',
 'temp=>91',
 'flag=>70',
 ':=>53',
 'missing=>79',
 ',=>1',
 'dew=>67',
 'point=>83',
 'temp=>91',
 ':=>53',
 '-1.9971246260246964=>0',
 ',=>1',
 'dew=>67',
 'point=>83',
 'temp=>91',
 'flag=>70',
 ':=>53',
 'missing=>79',
 ',=>1',
 'humidex=>73',
 ':=>53',
 'missing=>79',
 ',=>1',
 'precip=>84',
 'amount=>61',
 ':=>53',
 'missing=>79',
 ',=>1',
 'precip=>

In [30]:
print(f"""Actual Row 1:\n{actuals_.split("<row-end>")[0]}""")
print(f"""Predicted Row 1:\n{preds.split("<row-end>")[0]}""")

Actual Row 1:

<row-start> x : -0.551099305737714 , y : -0.37817406811183396 , station name : calgary int l cs , climate identifier : 3031094 , province code : ab , local year : 2010 , local month : 1 , local day : 1 , local hour : 0 , temp : -2.034580198428538 , temp flag : missing , dew point temp : -2.0356676557539686 , dew point temp flag : missing , humidex : missing , precip amount : missing , precip amount flag : missing , relative humidity : 0.6997231826522904 , relative humidity flag : missing , station pressure : -0.39734550502172294 , station pressure flag : missing , wind chill : missing , wind direction : missing , wind direction flag : m , wind speed : missing , wind speed flag : m , 
Predicted Row 1:

<row-start> x : num_-0.4832974970340729 , num_-0.48560890555381775 : num_-0.47894158959388733 , station name : <unk> <unk> num_-0.47448650002479553 <unk> , climate identifier : 3031094 , province code : ab , local year : num_-0.4874148666858673 , num_-0.4813879430294037 mon

In [31]:
f1 = actuals_.split("<row-end>")[0]

In [41]:
f1.embedding_idx

AttributeError: 'str' object has no attribute 'embedding_idx'

In [29]:
def evaluate(model, data, i):
    model.eval
    with torch.no_grad():
        data, targets = batch_data(weather_ds, i, n_row=4)
        class_output, numeric_output = model(data)
        loss, loss_dict = custom_loss(class_output, numeric_output, targets)

        return data, targets, class_output, numeric_output, loss, loss_dict
data, targets, class_output, numeric_output, loss, loss_dict = evaluate(model, weather_ds, 1)


In [None]:
torch.softmax(class_output, 1).shape

In [None]:
m = nn.LogSoftmax(dim=1)
input = torch.randn(2, 3)
test_out = m(input)

In [None]:
test_out.sum(dim=1)

In [None]:
input, test_out

In [None]:
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            seq_len = data.size(0)
            if seq_len != bptt:
                src_mask = src_mask[:seq_len, :seq_len]
            output = model(data, src_mask)
            output_flat = output.view(-1, n_token)
            total_loss += seq_len * criterion(output_flat, targets).item()
    return total_loss / (len(eval_data) - 1)

In [None]:
best_val_loss = float('inf')
epochs = 3

with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(model)
        val_loss = evaluate(model, val_data)
        val_ppl = math.exp(val_loss)
        elapsed = time.time() - epoch_start_time
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
            f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
        print('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_params_path)

        scheduler.step()
    model.load_state_dict(torch.load(best_model_params_path)) # load best model states

## Evaluate the best model on the test dataset




In [None]:
test_loss = evaluate(model, test_data)
test_ppl = math.exp(test_loss)
print('=' * 89)
print(f'| End of training | test loss {test_loss:5.2f} | '
      f'test ppl {test_ppl:8.2f}')
print('=' * 89)

Now we want to make a fake dataset with 