# Diamond Transformer

This notebook investigates the use of a transformer to predict the price of diamonds. The dataset is from Kaggle and can be found [here](https://www.kaggle.com/shivam2503/diamonds).

While many traditional ML and DL techniques work on the dataset, our approach uses far less labeled data while achieving similar results. This is done by using a transformer.

--------------------------------------------------------------------------------

## Libraries


In [20]:
import hashlib
import math

import polars as pl


import polars as pl
import numpy as np


from torch import nn, Tensor
from tqdm import trange, tqdm

import hephaestus as hp
import torch
from torch.utils.tensorboard import SummaryWriter


The diamonds dataset: df = pl.read_csv("../data/diamonds.csv") df.head()


In [21]:
df = pl.read_csv("../data/diamonds.csv")
df.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price,x,y,z
i64,f64,str,str,str,f64,f64,i64,f64,f64,f64
1,0.23,"""Ideal""","""E""","""SI2""",61.5,55.0,326,3.95,3.98,2.43
2,0.21,"""Premium""","""E""","""SI1""",59.8,61.0,326,3.89,3.84,2.31
3,0.23,"""Good""","""E""","""VS1""",56.9,65.0,327,4.05,4.07,2.31
4,0.29,"""Premium""","""I""","""VS2""",62.4,58.0,334,4.2,4.23,2.63
5,0.31,"""Good""","""J""","""SI2""",63.3,58.0,335,4.34,4.35,2.75


In [22]:
df.describe()

describe,Unnamed: 1_level_0,carat,cut,color,clarity,depth,table,price,x,y,z
str,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64
"""count""",53940.0,53940.0,"""53940""","""53940""","""53940""",53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
"""null_count""",0.0,0.0,"""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",26970.5,0.79794,,,,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
"""std""",15571.281097,0.474011,,,,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
"""min""",1.0,0.2,"""Fair""","""D""","""I1""",43.0,43.0,326.0,0.0,0.0,0.0
"""max""",53940.0,5.01,"""Very Good""","""J""","""VVS2""",79.0,95.0,18823.0,10.74,58.9,31.8
"""median""",26970.5,0.7,,,,61.8,57.0,2401.0,5.7,5.71,3.53
"""25%""",13486.0,0.4,,,,61.0,56.0,950.0,4.71,4.72,2.91
"""75%""",40456.0,1.04,,,,62.5,59.0,5325.0,6.54,6.54,4.04


In [23]:
df = hp.scale_numeric(df)
df.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price,x,y,z
f64,f64,str,str,str,f64,f64,f64,f64,f64,f64
-1.732003,-1.198157,"""Ideal""","""E""","""SI2""",-0.17409,-1.099662,-0.904087,-1.587823,-1.536181,-1.571115
-1.731938,-1.24035,"""Premium""","""E""","""SI1""",-1.360726,1.585514,-0.904087,-1.64131,-1.658759,-1.741159
-1.731874,-1.198157,"""Good""","""E""","""VS1""",-3.384987,3.375631,-0.903836,-1.498677,-1.457382,-1.741159
-1.73181,-1.071577,"""Premium""","""I""","""VS2""",0.454129,0.242926,-0.902081,-1.364959,-1.317293,-1.287708
-1.731746,-1.029384,"""Good""","""J""","""SI2""",1.082348,0.242926,-0.901831,-1.240155,-1.212227,-1.117663


In [24]:
df = hp.make_lower_remove_special_chars(df)
val_tokens = hp.get_unique_utf8_values(df)
col_tokens = hp.get_col_tokens(df)

In [25]:
df.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price,x,y,z
f64,f64,str,str,str,f64,f64,f64,f64,f64,f64
-1.732003,-1.198157,"""ideal""","""e""","""si2""",-0.17409,-1.099662,-0.904087,-1.587823,-1.536181,-1.571115
-1.731938,-1.24035,"""premium""","""e""","""si1""",-1.360726,1.585514,-0.904087,-1.64131,-1.658759,-1.741159
-1.731874,-1.198157,"""good""","""e""","""vs1""",-3.384987,3.375631,-0.903836,-1.498677,-1.457382,-1.741159
-1.73181,-1.071577,"""premium""","""i""","""vs2""",0.454129,0.242926,-0.902081,-1.364959,-1.317293,-1.287708
-1.731746,-1.029384,"""good""","""j""","""si2""",1.082348,0.242926,-0.901831,-1.240155,-1.212227,-1.117663


In [26]:
special_tokens = np.array(
    [
        "missing",
        "<mask>",
        "<pad>",
        "<unk>",
        "<numeric>",
        ":",
        ",",
        "<row-start>",
        "<row-end>",
    ]
)

In [27]:
tokens = np.unique(
    np.concatenate(
        (
            val_tokens,
            col_tokens,
            special_tokens,
        )
    )
)
tokens

array(['', ',', ':', '<mask>', '<numeric>', '<pad>', '<row-end>',
       '<row-start>', '<unk>', 'carat', 'clarity', 'color', 'cut', 'd',
       'depth', 'e', 'f', 'fair', 'g', 'good', 'h', 'i', 'i1', 'ideal',
       'if', 'j', 'missing', 'premium', 'price', 'si1', 'si2', 'table',
       'very good', 'vs1', 'vs2', 'vvs1', 'vvs2', 'x', 'y', 'z'],
      dtype=object)

# Train Test Split

To show the actual model performance out of sample we split the data into a training and test set. The training set will be used to train the model and the test set will be used to evaluate the model performance. We will use 80% of the data for training and 20% for testing.

We also remove the price column from the training and test sets and will only use a tiny subset of the data to simulate an industrial process with lots of input data but expensive and limited labeled data.


In [28]:
df = (
    df.with_columns(
        pl.concat_str(pl.all().exclude("price").cast(pl.Utf8)).alias("all_cols")
    )
    .with_columns(
        pl.col("all_cols")
        .apply(lambda x: hashlib.md5(x.encode()).hexdigest())
        .alias("hash")
    )
    .drop("all_cols")
)
df.select(pl.col("hash").is_duplicated().sum())

hash
u32
0


In [29]:
# Shuffle for randomness
df = df.sample(fraction=1.0, seed=42)
df.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price,x,y,z,hash
f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,str
-1.732003,-1.198157,"""ideal""","""e""","""si2""",-0.17409,-1.099662,-0.904087,-1.587823,-1.536181,-1.571115,"""3921d9270c77e4…"
-1.731938,-1.24035,"""premium""","""e""","""si1""",-1.360726,1.585514,-0.904087,-1.64131,-1.658759,-1.741159,"""4e60d77efdbf6f…"
-1.731874,-1.198157,"""good""","""e""","""vs1""",-3.384987,3.375631,-0.903836,-1.498677,-1.457382,-1.741159,"""65066e09c94762…"
-1.73181,-1.071577,"""premium""","""i""","""vs2""",0.454129,0.242926,-0.902081,-1.364959,-1.317293,-1.287708,"""be9464ba76a36f…"
-1.731746,-1.029384,"""good""","""j""","""si2""",1.082348,0.242926,-0.901831,-1.240155,-1.212227,-1.117663,"""3562954f4de9ba…"


In [30]:
train_fraction = 0.8
n_train = int(train_fraction * len(df))
train_test_df = df.select(pl.all().exclude(["price", "hash"]))

train, test = train_test_df.head(n_train), train_test_df.tail(
    len(train_test_df) - n_train
)

In [31]:
train.head(), train.shape

(shape: (5, 10)
 ┌───────────┬───────────┬─────────┬───────┬───┬───────────┬───────────┬───────────┬───────────┐
 │           ┆ carat     ┆ cut     ┆ color ┆ … ┆ table     ┆ x         ┆ y         ┆ z         │
 │ ---       ┆ ---       ┆ ---     ┆ ---   ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
 │ f64       ┆ f64       ┆ str     ┆ str   ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64       │
 ╞═══════════╪═══════════╪═════════╪═══════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
 │ -1.732003 ┆ -1.198157 ┆ ideal   ┆ e     ┆ … ┆ -1.099662 ┆ -1.587823 ┆ -1.536181 ┆ -1.571115 │
 │ -1.731938 ┆ -1.24035  ┆ premium ┆ e     ┆ … ┆ 1.585514  ┆ -1.64131  ┆ -1.658759 ┆ -1.741159 │
 │ -1.731874 ┆ -1.198157 ┆ good    ┆ e     ┆ … ┆ 3.375631  ┆ -1.498677 ┆ -1.457382 ┆ -1.741159 │
 │ -1.73181  ┆ -1.071577 ┆ premium ┆ i     ┆ … ┆ 0.242926  ┆ -1.364959 ┆ -1.317293 ┆ -1.287708 │
 │ -1.731746 ┆ -1.029384 ┆ good    ┆ j     ┆ … ┆ 0.242926  ┆ -1.240155 ┆ -1.212227 ┆ -1.117663 │
 └───────────┴

In [32]:
train_test_df.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z
f64,f64,str,str,str,f64,f64,f64,f64,f64
-1.732003,-1.198157,"""ideal""","""e""","""si2""",-0.17409,-1.099662,-1.587823,-1.536181,-1.571115
-1.731938,-1.24035,"""premium""","""e""","""si1""",-1.360726,1.585514,-1.64131,-1.658759,-1.741159
-1.731874,-1.198157,"""good""","""e""","""vs1""",-3.384987,3.375631,-1.498677,-1.457382,-1.741159
-1.73181,-1.071577,"""premium""","""i""","""vs2""",0.454129,0.242926,-1.364959,-1.317293,-1.287708
-1.731746,-1.029384,"""good""","""j""","""si2""",1.082348,0.242926,-1.240155,-1.212227,-1.117663


In [33]:
ds = hp.TabularDataset(
    train,
    tokens,
    special_tokens=special_tokens,
    shuffle_cols=False,
    max_row_length=50,
)

print(len(ds[0]))

50


In [34]:
print([i.value for i in ds[0]])

['<row-start>', '', ':', -1.7320026420495056, ',', 'carat', ':', -1.1981566989627475, ',', 'cut', ':', 'ideal', ',', 'color', ':', 'e', ',', 'clarity', ':', 'si2', ',', 'depth', ':', -0.17408989455083768, ',', 'table', ':', -1.0996617971586031, ',', 'x', ':', -1.5878227303011756, ',', 'y', ':', -1.5361813230221135, ',', 'z', ':', -1.5711146235593887, ',', '<row-end>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [35]:
if torch.backends.mps.is_built():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)
print(device)

mps


In [36]:
n_token = len(ds.vocab)  # size of vocabulary
d_model = 32  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in ``nn.TransformerEncoder``
n_layers = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
n_head = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability
model = hp.TransformerModel(
    n_token, d_model, n_head, d_hid, n_layers, device, dropout
).to(device)

In [39]:
import copy
import time


lr = 0.24  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size =100, gamma=0.5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.9,
    patience=5,
    threshold=0.001,
    threshold_mode="rel",
    cooldown=0,
    min_lr=0.01,
    eps=1e-08,
    verbose=False,
)


def train(model: nn.Module) -> None:
    writer = SummaryWriter()
    model.train()  # turn on train mode
    total_loss = 0.0
    log_interval = 1000
    n_row = 10 # one because it's not time series
    start_time = time.time()
    for batch, i in enumerate(trange(0, len(ds) - 1, n_row)):
        data, targets = hp.batch_data(ds, i, n_row=n_row)
        class_output, numeric_output = model(data)
        loss, loss_dict = hp.hephaestus_loss(
            class_output, numeric_output, targets, tokens, special_tokens, device
        )
        num_loss = loss_dict["reg_loss"].item()
        # class_loss = loss_dict["class_loss"].item()
        writer.add_scalar("Loss/total_loss", loss, batch)
        writer.add_scalar("Loss/numeric_loss", num_loss, batch)
        # writer.add_scalar("Loss/class_loss", class_loss, batch)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            # lr = scheduler.get_last_lr()[0]
            lr = optimizer.param_groups[0]["lr"]

            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(  # f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                f"lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | ",
                f"loss {cur_loss:5.2f} | ppl {ppl:8.2f}",
                loss_dict,
            )
            total_loss = 0
            start_time = time.time()
            scheduler.step(loss)
    writer.close()

In [40]:
for i in range(2):
    train(model)

 13%|█▎        | 546/4316 [01:31<10:31,  5.97it/s]

Unexpected exception formatting exception. Falling back to standard exception



Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniconda/base/envs/pytorch/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/rs/qflxwtyx6kvfj8jcqx5zm5hr0000gn/T/ipykernel_68469/1732875328.py", line 2, in <module>
    train(model)
  File "/var/folders/rs/qflxwtyx6kvfj8jcqx5zm5hr0000gn/T/ipykernel_68469/242740888.py", line 31, in train
    class_output, numeric_output = model(data)
                                   ^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniconda/base/envs/pytorch/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1502, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniconda/base/envs/pytorch/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^

In [None]:
# %%
torch.save(model.state_dict(), "models/diamonds2.pth")

In [None]:
def evaluate(model: nn.Module, ds, idx) -> None:
    model.eval()  # turn on train mode
    n_row = 1  # one because it's not time series
    start_time = time.time()
    with torch.no_grad():
        data, targets = hp.batch_data(ds, idx, n_row=n_row)
        class_output, numeric_output = model(data)
        loss, loss_dict = hp.hephaestus_loss(
            class_output, numeric_output, targets, tokens, special_tokens, device
        )
        return {
            "loss": loss.item(),
            "loss_dict": loss_dict,
            "data": data,
            "targets": targets,
            "class_output": class_output,
            "numeric_output": numeric_output,
        }

In [None]:
ds_test = hp.TabularDataset(
    test,
    tokens,
    special_tokens=special_tokens,
    shuffle_cols=False,
    max_row_length=50,
)

In [None]:
res = evaluate(model, ds_test, 1)

In [None]:
res["loss_dict"]["reg_loss"].item()

0.07507946342229843

In [None]:
actuals = [str(i.value) for i in res["targets"]]
actuals_ = " ".join(actuals)
actual_str = actuals_.split("<row-end>")[0]
actual_str

'<row-start>  : 1.0393171826547831 , carat : -0.5863568663158119 , cut : ideal , color : e , clarity : si1 , depth : 0.24472280362154117 , table : -0.20460319486368622 , x : -0.5626486873617523 , y : -0.520539274600677 , z : -0.5083383369869077 , '

In [None]:
# preds = [str(i.value) for i in res["class_output"]]
# preds_ = " ".join(preds)
# preds_.split("<row-end>")[0]

AttributeError: 'Tensor' object has no attribute 'value'

In [None]:
res["class_output"]

tensor([[ 0.3549,  1.7148, -2.8090,  ..., -1.5442, -0.3651,  1.8637],
        [-0.0675,  9.0304, -4.3372,  ..., -2.9266,  1.5646, -0.6628],
        [-0.4244, -2.2803, -1.1936,  ...,  1.9629,  1.3588, -0.3464],
        ...,
        [-0.1164, -0.9520,  0.9857,  ...,  2.3107,  2.3813, -1.2798],
        [-0.0663, -0.6935,  0.8468,  ...,  2.0432,  2.1547, -0.9833],
        [ 0.0592, -0.0154,  0.5057,  ...,  1.2972,  2.0831, -0.7047]],
       device='mps:0')

In [None]:
lsm = nn.Softmax(dim=0)
softmax_cats = lsm(res["class_output"])
softmax_cats = torch.argmax(softmax_cats, dim=1)

In [None]:
softmax_cats, softmax_cats.shape

(tensor([ 8,  1,  3,  5, 17, 10,  3,  5,  2, 13,  3, 24,  2, 12,  3, 16,  2, 11,
          3, 30,  2, 15,  3,  5,  2, 32,  3,  5,  2, 38,  3,  5,  2, 39,  3,  5,
          2, 40,  3,  5,  2,  7,  6,  6,  6,  6,  6,  6,  6,  6],
        device='mps:0'),
 torch.Size([50]))

In [None]:
gen_tokens = []
for idx, pred in enumerate(softmax_cats):
    token = tokens[pred-1]
    if token == "<numeric>":
        gen_tokens.append(str(res["numeric_output"][idx].item()))
    else:
        gen_tokens.append(token)
preds = " ".join(gen_tokens)
print(f"""Predicted Row:\n\n{preds.split("<row-end>")[0]}""")
print(f"""\nActual Row:\n{actual_str}""")

Predicted Row:

<row-start>  : 0.4553162157535553 f carat : -0.3599178194999695 , cut : ideal , color : e , clarity : si1 , depth : 0.006437838077545166 , table : -0.05552786588668823 , x : -0.4822646975517273 , y : -0.4959062933921814 , z : -0.2911911606788635 , 

Actual Row:
<row-start>  : 1.0393171826547831 , carat : -0.5863568663158119 , cut : ideal , color : e , clarity : si1 , depth : 0.24472280362154117 , table : -0.20460319486368622 , x : -0.5626486873617523 , y : -0.520539274600677 , z : -0.5083383369869077 , 


In [None]:
ds.vocab_len

40

In [45]:
X = np.arange(25).reshape(5, 5)
X

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

In [52]:
e = np.eye(5, 5)
e

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

In [53]:
np.dot(X, e)

array([[ 0.,  1.,  2.,  3.,  4.],
       [ 5.,  6.,  7.,  8.,  9.],
       [10., 11., 12., 13., 14.],
       [15., 16., 17., 18., 19.],
       [20., 21., 22., 23., 24.]])

In [55]:
ee = e.copy()
ee[2,2] = 2
ee

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 2., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

In [56]:
np.dot(X, ee)

array([[ 0.,  1.,  4.,  3.,  4.],
       [ 5.,  6., 14.,  8.,  9.],
       [10., 11., 24., 13., 14.],
       [15., 16., 34., 18., 19.],
       [20., 21., 44., 23., 24.]])

In [57]:
from torch import Tensor, nn

In [58]:
embedding = nn.Embedding(5, 3)



In [59]:
embedding.weight

Parameter containing:
tensor([[ 1.7518,  1.1383, -1.7392],
        [-1.2163,  0.3151,  0.1371],
        [-1.4466,  0.6539, -1.9890],
        [ 0.5001,  0.3375,  0.2283],
        [-0.1947,  0.5245,  0.7784]], requires_grad=True)

In [60]:
embedding.

AttributeError: 'Embedding' object has no attribute 'bias'