# Industrial Transormers

## Libraries


In [98]:
import re

import polars as pl
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt

from dataclasses import dataclass
import typing as ty

# Data Preprocessing

- Drop all completely empty columns.
- Make local_* variables strings.
- Scale numeric variables.
- Drop id and date variables.
- Separate strings using.


In [318]:
weather = pl.read_parquet("./data/ab_weather.parquet")

# drop_cols = [weather.select]
keep_cols = [col for col in weather.columns if not weather[col].is_null().all()]
weather = weather.select(keep_cols)
weather = weather.select(pl.all().exclude(["id", "local_date"]))
weather = weather.with_columns(pl.col("^local.*$").cast(pl.Utf8))
weather = weather.with_columns(
    pl.col(pl.Utf8).fill_null("None")
)  # Replace nulls with "None"
weather = weather.with_columns(pl.col("^*_flag$").cast(pl.Utf8))
weather = weather.with_columns(
    pl.col([pl.Int64, pl.Float64]).map(lambda x: (x - x.mean()) / x.std())
)
weather = weather.with_columns(
    pl.col(pl.Utf8).str.replace_all("\s|_|\.|,|-", " ")
    # .str.split("<sep>")
)

rename_map = {}
for col in weather.columns:
    rename_map[col] = re.sub(r"\s|_|\.|,|-", " ", col)

weather = weather.rename(rename_map)

weather.head()

x,y,station name,climate identifier,province code,local year,local month,local day,local hour,temp,temp flag,dew point temp,dew point temp flag,humidex,precip amount,precip amount flag,relative humidity,relative humidity flag,station pressure,station pressure flag,windchill,wind direction,wind direction flag,wind speed,wind speed flag
f64,f64,str,str,str,str,str,str,str,f64,str,f64,str,f64,f64,str,f64,str,f64,str,f64,f64,str,f64,str
-0.551099,-0.378174,"""CALGARY INT'L ...","""3031094""","""AB""","""2010""","""1""","""1""","""0""",-2.03458,"""None""",-2.035668,"""None""",,,"""None""",0.699723,"""None""",-0.397346,"""None""",,,"""M""",,"""M"""
-0.551099,-0.378174,"""CALGARY INT'L ...","""3031094""","""AB""","""2010""","""1""","""1""","""1""",-2.002552,"""None""",-1.997125,"""None""",,,"""None""",0.699723,"""None""",-0.43779,"""None""",,,"""M""",,"""M"""
-0.551099,-0.378174,"""CALGARY INT'L ...","""3031094""","""AB""","""2010""","""1""","""1""","""2""",-1.970524,"""None""",-1.948946,"""None""",,,"""None""",0.699723,"""None""",-0.450234,"""None""",,,"""M""",,"""M"""
-0.551099,-0.378174,"""CALGARY INT'L ...","""3031094""","""AB""","""2010""","""1""","""1""","""3""",-1.938496,"""None""",-1.910403,"""None""",,,"""None""",0.74665,"""None""",-0.478234,"""None""",,,"""M""",,"""M"""
-0.551099,-0.378174,"""CALGARY INT'L ...","""3031094""","""AB""","""2010""","""1""","""1""","""4""",-1.938496,"""None""",-1.920039,"""None""",,,"""None""",0.699723,"""None""",-0.503122,"""None""",,,"""M""",,"""M"""


In [322]:
unique_strings = []
list_df = weather.select(pl.col(pl.Utf8).map(lambda x: x.unique().to_list()))

for col in list_df.columns:
    unique_strings.extend(list_df[col].to_list()[0])

unique_strings.extend(weather.columns)
unique_strings.extend([":", "<unk>"])

unique_strings = set(unique_strings)

unique_strings = list(unique_strings)

# Split elements in list
unique_strings = [x.split(" ") for x in unique_strings]
# Flatten list
unique_strings = [item for sublist in unique_strings for item in sublist]

unique_strings = list(set(unique_strings))

unique_strings[:5]

['3035208', '2012', '2013', 'name', 'LETHBRIDGE']

In [None]:
# Create embedding layer
embedding = nn.Embedding(len(unique_strings), 32)

In [293]:
@dataclass
class KV:
    key: str
    value: ty.Union[str, float, None]
    is_numeric: ty.Union[bool, None] = None
    # is_numeric: bool = False
    def __post_init__(self):
        if self.value:
            try:
                self.value = float(self.value)
                self.is_numeric = True
            except ValueError:
                self.is_numeric = False
                self.value = "None"


kvn = KV("temp_flag", "1")
kvs = KV("temp_flag", "cloudy")
print(kvn, kvs)

KV(key='temp_flag', value=1.0, is_numeric=True) KV(key='temp_flag', value='None', is_numeric=False)


In [224]:
class TabularDataset(Dataset):
    def __init__(
        self,
        df: pl.DataFrame,
        scale_transform: bool = True,
        string_seps: str = r"-|:|_|\.|\s|\.",
    ) -> Dataset:
        self.df = df
        self.string_seps = string_seps
        self.df = self.tokenize_col_vals()
        self.scale_transform = scale_transform
        # Numeric Scale
        if self.scale_transform:
            self.df = self.df.with_columns(
                pl.col([pl.Int64, pl.Float64]).map(lambda x: (x - x.mean()) / x.std())
            )

        col_vocab = self.df.columns
        self.col_vocab = [tokenizer(col) for col in col_vocab]
        # self.col_vocab = self.df.columns
        self.item_vocab = self.get_categorical_tokens()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Returns a tuple of (input, target) at the given index."""
        return self.df[idx].to_dicts()

    def tokenize_col_vals(self):
        self.df = self.df.with_columns(
            pl.col(pl.Utf8).str.replace_all(
                "-|:|_|\.|\s|,", "<sep>"
            )  # .str.split("<sep>")
        )

        return self.df

    # Tokenzie column names
    def tokenize_col_names(self):
        columns = self.df.columns
        col_tokens = [tokenizer(col) for col in columns]
        return col_tokens

    def get_df_vocab(self):
        columns = self.df.columns
        utf8_row_vals = self.df.select(pl.col(pl.Utf8))

    def get_categorical_tokens(self):
        df = self.df.select(pl.col(pl.Utf8))
        df = df.with_columns(
            pl.col(pl.Utf8)
            .str.replace_all(self.string_seps, "<sep>")
            .str.split("<sep>")
        )

        all_vals = []
        for col in df.columns:
            vals = df[col]
            l = []
            for i in vals:
                if i is not None:
                    l.extend(i)
                else:
                    l.extend("None")
            l = set(l)
            all_vals.extend(l)

        return all_vals


ds = TabularDataset(weather)
ds[0]

ComputeError: TypeError: float() argument must be a string or a real number, not 'NoneType'

In [35]:
def get_categorical_tokens(df: pl.DataFrame, seps: str):
    df = df.select(pl.col(pl.Utf8))

    vals = []
    df = df.with_columns(
        pl.col(pl.Utf8).str.replace_all(seps, "<sep>").str.split("<sep>")
    )

    all_vals = []
    for col in df.columns:
        vals = df[col]
        l = []
        for i in vals:
            if i is not None:
                l.extend(i)
        l = set(l)
        all_vals.extend(l)

    return all_vals


# df = get_categorical_tokens(weather, r"-|:|_|\.\d+T\d+|\.")
df = get_categorical_tokens(weather, r"-|:|_|\.\d+T\d+|\.|\s")