# Industrial Transormers

## Libraries


In [19]:
import re

from numbers import Number
import math
import polars as pl
import torch
from torch import nn, Tensor
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

# from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt

from dataclasses import dataclass
import typing as ty

# Data Preprocessing

- Drop all completely empty columns.
- Make local_* variables strings.
- Scale numeric variables.
- Drop id and date variables.
- Separate strings using.


In [2]:
weather = pl.read_parquet("./data/ab_weather.parquet")

# drop_cols = [weather.select]
keep_cols = [col for col in weather.columns if not weather[col].is_null().all()]
weather = weather.select(keep_cols)
weather = weather.select(pl.all().exclude(["id", "local_date"]))
weather = weather.with_columns(pl.col("^local.*$").cast(pl.Utf8))
weather = weather.with_columns(
    pl.col(pl.Utf8).fill_null("None")
)  # Replace nulls with "None"
weather = weather.with_columns(pl.col("^*_flag$").cast(pl.Utf8))
weather = weather.with_columns(
    pl.col([pl.Int64, pl.Float64]).map(lambda x: (x - x.mean()) / x.std())
)
weather = weather.with_columns(
    pl.col(pl.Utf8).str.replace_all("\s|_|\.|,|-", " ")
    # .str.split("<sep>")
)

rename_map = {}
for col in weather.columns:
    rename_map[col] = re.sub(r"\s|_|\.|,|-", " ", col)

weather = weather.rename(rename_map)

weather.head()

x,y,station name,climate identifier,province code,local year,local month,local day,local hour,temp,temp flag,dew point temp,dew point temp flag,humidex,precip amount,precip amount flag,relative humidity,relative humidity flag,station pressure,station pressure flag,windchill,wind direction,wind direction flag,wind speed,wind speed flag
f64,f64,str,str,str,str,str,str,str,f64,str,f64,str,f64,f64,str,f64,str,f64,str,f64,f64,str,f64,str
-0.551099,-0.378174,"""CALGARY INT'L ...","""3031094""","""AB""","""2010""","""1""","""1""","""0""",-2.03458,"""None""",-2.035668,"""None""",,,"""None""",0.699723,"""None""",-0.397346,"""None""",,,"""M""",,"""M"""
-0.551099,-0.378174,"""CALGARY INT'L ...","""3031094""","""AB""","""2010""","""1""","""1""","""1""",-2.002552,"""None""",-1.997125,"""None""",,,"""None""",0.699723,"""None""",-0.43779,"""None""",,,"""M""",,"""M"""
-0.551099,-0.378174,"""CALGARY INT'L ...","""3031094""","""AB""","""2010""","""1""","""1""","""2""",-1.970524,"""None""",-1.948946,"""None""",,,"""None""",0.699723,"""None""",-0.450234,"""None""",,,"""M""",,"""M"""
-0.551099,-0.378174,"""CALGARY INT'L ...","""3031094""","""AB""","""2010""","""1""","""1""","""3""",-1.938496,"""None""",-1.910403,"""None""",,,"""None""",0.74665,"""None""",-0.478234,"""None""",,,"""M""",,"""M"""
-0.551099,-0.378174,"""CALGARY INT'L ...","""3031094""","""AB""","""2010""","""1""","""1""","""4""",-1.938496,"""None""",-1.920039,"""None""",,,"""None""",0.699723,"""None""",-0.503122,"""None""",,,"""M""",,"""M"""


In [69]:
# get device type, either cuda, cpu or mps
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using {device} device")

Using mps device


In [293]:
@dataclass
class KV:
    key: str
    value: ty.Union[str, float, None]
    is_numeric: ty.Union[bool, None] = None
    # is_numeric: bool = False
    def __post_init__(self):
        if self.value:
            try:
                self.value = float(self.value)
                self.is_numeric = True
            except ValueError:
                self.is_numeric = False
                self.value = "None"


kvn = KV("temp_flag", "1")
kvs = KV("temp_flag", "cloudy")
print(kvn, kvs)

KV(key='temp_flag', value=1.0, is_numeric=True) KV(key='temp_flag', value='None', is_numeric=False)


In [42]:
class TabularDataset(Dataset):
    # def __init__(self, df: pl.DataFrame, vocab_dict: ty.Dict, m_dim: int) -> Dataset:
    def __init__(self, df: pl.DataFrame) -> Dataset:
        self.df = df
        self.string_vocab = self.unique_strings()
        vocab_dict = {}
        for i, string in enumerate(ds.string_vocab):
            vocab_dict[string] = i + 1  # 0 is reserved for numerical values
        self.vocab_dict = vocab_dict
        # self.vocab_dict = vocab_dict
        # self.embedding = nn.Embedding(len(self.string_vocab), m_dim)
        # Numeric Scale

        # self.col_vocab = self.df.columns

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Returns a tuple of (input, target) at the given index."""
        row = self.df[idx]
        row = self.splitter(row)
        return row

    def splitter(self, row: pl.DataFrame) -> ty.List[ty.Union[str, float, None]]:
        vals = []
        for col in row.columns:
            value = row[col][0]
            col = col.split(" ")
            vals.extend(col)
            vals.append("::")
            if isinstance(value, Number):
                vals.append(value)
            elif value is None:
                vals.append("NumericNone")
                # Nones are only for numeric columns, others are "None"
            elif isinstance(value, str):
                vals.extend(value.split(" "))
            else:
                raise ValueError("Unknown type")
        return vals

    def unique_strings(self) -> ty.List[str]:
        unique_strings = []
        list_df = self.df.select(pl.col(pl.Utf8).map(lambda x: x.unique().to_list()))

        for col in list_df.columns:
            unique_strings.extend(list_df[col].to_list()[0])

        unique_strings.extend(self.df.columns)
        unique_strings.extend(["::", "<unk>", "NumericNone"])

        unique_strings = set(unique_strings)

        unique_strings = list(unique_strings)

        # Split elements in list
        unique_strings = [x.split(" ") for x in unique_strings]
        # Flatten list
        unique_strings = [item for sublist in unique_strings for item in sublist]

        unique_strings = list(set(unique_strings))

        return unique_strings


ds = TabularDataset(weather)
print(ds[0])
print(ds.string_vocab[:10])

['x', '::', -0.551099305737714, 'y', '::', -0.37817406811183396, 'station', 'name', '::', 'CALGARY', "INT'L", 'CS', 'climate', 'identifier', '::', '3031094', 'province', 'code', '::', 'AB', 'local', 'year', '::', '2010', 'local', 'month', '::', '1', 'local', 'day', '::', '1', 'local', 'hour', '::', '0', 'temp', '::', -2.034580198428538, 'temp', 'flag', '::', 'None', 'dew', 'point', 'temp', '::', -2.0356676557539686, 'dew', 'point', 'temp', 'flag', '::', 'None', 'humidex', '::', 'NumericNone', 'precip', 'amount', '::', 'NumericNone', 'precip', 'amount', 'flag', '::', 'None', 'relative', 'humidity', '::', 0.6997231826522904, 'relative', 'humidity', 'flag', '::', 'None', 'station', 'pressure', '::', -0.39734550502172294, 'station', 'pressure', 'flag', '::', 'None', 'windchill', '::', 'NumericNone', 'wind', 'direction', '::', 'NumericNone', 'wind', 'direction', 'flag', '::', 'M', 'wind', 'speed', '::', 'NumericNone', 'wind', 'speed', 'flag', '::', 'M']
['6', 'CS', 'MCMURRAY', 'temp', 'mont

In [None]:
for i in range(weather.shape[0]):
    x = ds[i]

In [17]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[: x.size(0)]
        return self.dropout(x)

In [74]:
class TransformerModel(nn.Module):
    def __init__(
        self,
        vocab_dict: ty.Dict,
        d_model: int,
        nhead: int,
        d_hid: int,
        dropout,
        nlayers: int,
    ) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_dict = vocab_dict
        self.model_type = "Transformer"
        self.pos_encoder = PositionalEncoding(
            d_model=self.d_model, dropout=0.1, max_len=5000
        )
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(len(ds.string_vocab), d_model, padding_idx=0)
        self.ntoken = len(ds.string_vocab) + 1  # +1 for numerical values
        self.decoder = nn.Linear(d_model, self.ntoken)

    def forward(self, data, src_mask=None):
        src = self.parse_data(data)
        src = src * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output

    def parse_data(self, data):
        # Parse data into key value pairs
        # Key is column name
        # Value is value in column
        # Return list of KV objects
        numeric_vals = {}
        emb_vals = []
        for idx, d in enumerate(data):
            if isinstance(d, Number):
                numeric_vals[idx] = d
                emb_vals.append(0)
            else:
                emb_vals.append(self.vocab_dict[d])

        embeddings = self.encoder(torch.tensor(emb_vals))
        with torch.no_grad():
            for k, v in numeric_vals.items():
                embeddings[k][0] = v
        return embeddings
        # return embeddings.to.device(device)

In [75]:
model = TransformerModel(ds.vocab_dict, 512, 8, 2048, 0.1, 6)
# model = model.to(device)

In [77]:
model(ds[0:64])

tensor([[[ 0.7033, -0.5937, -0.4107,  ..., -0.1170,  0.0665,  0.8095],
         [ 0.3185,  0.0763, -1.2642,  ..., -0.1733, -0.3627, -0.1041],
         [ 0.5054,  0.1952,  0.1314,  ...,  0.2567,  0.2464,  0.7114],
         ...,
         [ 0.2919, -0.7702, -1.2383,  ...,  0.3820,  0.3375, -0.5542],
         [ 0.1968,  0.2290, -1.1693,  ..., -0.0277, -0.4681, -0.1580],
         [-0.2841, -0.0459, -0.4463,  ..., -0.0657, -0.4043,  0.1095]],

        [[ 0.3597, -0.3131, -0.2096,  ..., -0.0300, -0.0609,  0.6938],
         [ 0.1326,  0.2464, -1.1619,  ..., -0.1640, -0.4352, -0.3636],
         [ 0.7163, -0.2130,  0.1734,  ...,  0.2355,  0.4513,  0.7043],
         ...,
         [-0.0032, -0.5143, -0.9356,  ..., -0.1864,  0.5341, -0.3860],
         [ 0.3703,  0.0186, -1.2528,  ..., -0.2863, -0.3800,  0.0389],
         [-0.0103, -0.0815, -0.3973,  ..., -0.2231, -0.2249, -0.0037]],

        [[ 0.4737, -0.5036, -0.3257,  ..., -0.0314, -0.0600,  0.7890],
         [ 0.5292,  0.0239, -1.4047,  ..., -0

In [22]:
import torch.nn.functional as F

In [56]:
torch.tensor([1, 2, 3], dtype=torch.int32)

tensor([1, 2, 3], dtype=torch.int32)