In [2]:
#tensorboard --logdir='runs/'

In [3]:
from datetime import datetime as dt
from itertools import chain

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.tensorboard import SummaryWriter

# Load the diamonds dataset from a CSV path relative to the notebook's working
# directory and preview the first rows. Encoding and scaling happen in later
# cells, not here.
df = pd.read_csv("../data/diamonds.csv")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# List the column names of the loaded frame.
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [5]:
# Column groups used throughout the notebook.
cat_columns = ["cut", "color", "clarity"]
num_columns = ["carat", "depth", "table", "x", "y", "z"]
target_column = "price"

# Every distinct value found in the categorical columns (the underlying array
# is flattened with order "K" before de-duplication).
cat_values = pd.unique(df[cat_columns].values.ravel("K"))

# Shared vocabulary: category values, then column names, then the special
# tokens, then the target column. A token's id is its position in this list.
tokens = [
    *cat_values,
    *cat_columns,
    *num_columns,
    "PAD",
    "[NUMERIC_MASK]",
    "[MASK]",
    target_column,
]
token_dict = dict(zip(tokens, range(len(tokens))))

In [6]:
# Prefer the first CUDA GPU when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [7]:
# Split the features from the regression target.
X = df.drop("price", axis=1)
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Encode categorical features as ids from the shared token vocabulary.
# No LabelEncoder is fitted: the ids have to line up with the embedding table,
# so the mapping comes straight from `token_dict`.
label_encoders = {}  # kept so the name still exists; intentionally unused
X_train_cat = X_train[cat_columns].copy()
X_test_cat = X_test[cat_columns].copy()
for col in cat_columns:
    X_train_cat[col] = X_train_cat[col].map(token_dict)
    X_test_cat[col] = X_test_cat[col].map(token_dict)

# `Series.map` yields NaN for any value missing from `token_dict`. Fail loudly
# here instead of letting NaN be cast to a meaningless int64 id below.
for split_name, frame in (("train", X_train_cat), ("test", X_test_cat)):
    unmapped = frame.columns[frame.isna().any()].tolist()
    if unmapped:
        raise ValueError(
            f"Unmapped category values in {split_name} columns: {unmapped}"
        )

# Standardise numeric features; the scaler is fitted on the training split
# only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[num_columns])
X_test_num = scaler.transform(X_test[num_columns])

# Move everything to the target device. Categorical ids are int64 because they
# index an embedding table; targets are reshaped to a single column.
X_train_cat_tensor = torch.tensor(X_train_cat.values, dtype=torch.int64).to(device)
X_train_num_tensor = torch.tensor(X_train_num, dtype=torch.float32).to(device)
X_test_cat_tensor = torch.tensor(X_test_cat.values, dtype=torch.int64).to(device)
X_test_num_tensor = torch.tensor(X_test_num, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).to(device)

In [8]:
# Inspect the encoded categorical test features (values are token ids from
# `token_dict`).
X_test_cat

Unnamed: 0,cut,color,clarity
1388,0,10,17
50052,3,9,16
41645,0,5,16
42377,1,5,16
17244,0,5,12
...,...,...,...
44081,3,5,14
23713,3,9,16
31375,2,10,17
21772,0,9,14


In [9]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with separate q/k/v inputs.

    Args:
        d_model: Width of the input and output feature dimension.
        n_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``n_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        # Without this check the floor division below truncates d_head, and
        # the `.view(...)` calls in `forward` then either fail or silently
        # regroup features across sequence positions.
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_heads ({n_heads})."
            )
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_head = d_model // n_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, d_model) into (batch, n_heads, seq, d_head)."""
        return projected.view(batch_size, -1, self.n_heads, self.d_head).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q: Query tensor, (batch, len_q, d_model).
            k: Key tensor, (batch, len_k, d_model).
            v: Value tensor, (batch, len_k, d_model).
            mask: Optional tensor broadcastable to the attention logits
                (batch, n_heads, len_q, len_k). It is multiplied by -1e9 and
                added to the logits, so non-zero entries suppress a position.

        Returns:
            Tensor of shape (batch, len_q, d_model).
        """
        batch_size = q.size(0)

        q = self._split_heads(self.q_linear(q), batch_size)
        k = self._split_heads(self.k_linear(k), batch_size)
        v = self._split_heads(self.v_linear(v), batch_size)

        attn_output, _ = self.scaled_dot_product_attention(q, k, v, mask)

        # Merge the heads back into a single d_model-wide feature vector.
        attn_output = (
            attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        )
        return self.out_linear(attn_output)

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        """Return ``(softmax(q @ k^T / sqrt(d_k)) @ v, attention_weights)``."""
        d_k = q.size(-1)
        scaled_attention_logits = torch.matmul(q, k.transpose(-2, -1)) / (d_k**0.5)

        if mask is not None:
            # Additive mask: a large negative logit becomes ~0 after softmax.
            scaled_attention_logits += mask * -1e9

        attention_weights = F.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, v)

        return output, attention_weights


class TransformerEncoderLayer(nn.Module):
    """Attention block followed by a position-wise feed-forward block.

    Each sub-block is wrapped in a residual connection and followed by a
    LayerNorm. The residual of the attention block is taken from ``q``.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        hidden_dim = 4 * d_model

        self.multi_head_attention = MultiHeadAttention(d_model, n_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, d_model),
        )
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)

    def forward(self, q, k, v, mask=None):
        """Return the encoded queries; output has the same shape as ``q``."""
        attended = self.layernorm1(q + self.multi_head_attention(q, k, v, mask))
        return self.layernorm2(attended + self.feed_forward(attended))


# Parameters
# d_model = 64  # Embedding dimension
# n_heads = 4  # Number of attention heads
# seq_len_q = 10  # Sequence length for the query tensor
# seq_len_k = 20  # Sequence length for the key tensor
# batch_size = 32  # Batch size

# # Random data
# q = torch.rand((batch_size, seq_len_q, d_model))
# k = torch.rand((batch_size, seq_len_k, d_model))
# v = k  # Usually, value and key are the same in many applications

# # Model
# encoder_layer = TransformerEncoderLayer(d_model, n_heads)

# # Forward pass
# output = encoder_layer(q, k, v)
# print("Output shape:", output.shape)

In [10]:
def mask_tensor(tensor, model, propability=0.8):
    """Return a copy of ``tensor`` with a random subset of entries masked.

    Each entry is masked independently when a uniform draw in [0, 1) exceeds
    ``propability``, so ``propability`` is the chance an entry is *kept*
    (the default 0.8 masks roughly 20% of entries).

    Args:
        tensor: ``torch.float32`` (numeric features) or ``torch.int64``
            (categorical token ids). The input is not modified.
        model: Object exposing ``cat_mask_token`` (the id written into masked
            categorical entries) and ``device`` (where the result is moved).
        propability: Keep probability; the spelling is kept for existing
            keyword callers.

    Returns:
        The masked copy on ``model.device``. Masked float entries hold
        ``-inf``; masked integer entries hold ``model.cat_mask_token``.

    Raises:
        ValueError: If ``tensor`` has any other dtype.
    """
    if tensor.dtype == torch.float32:
        fill_value = torch.tensor(float("-Inf"))
    elif tensor.dtype == torch.int64:
        fill_value = model.cat_mask_token
    else:
        raise ValueError(f"Tensor dtype {tensor.dtype} not supported.")

    masked = tensor.clone()
    # Draw the mask on the tensor's own device so indexing needs no
    # host/device transfer.
    bit_mask = torch.rand(tensor.shape, device=tensor.device) > propability
    masked[bit_mask] = fill_value
    return masked.to(model.device)



In [11]:
class TabTransformer(nn.Module):
    """Transformer over tabular rows with regression and masked-modeling heads.

    Every column contributes one position. Column-name embeddings are used as
    attention queries; per-row value embeddings are the keys and values.

    Args:
        tokens: Vocabulary list; a token's position is its embedding id.
        numeric_col_tokens: Names of the numeric columns, in input order.
        cat_col_tokens: Names of the categorical columns, in input order.
        token_dict: Token -> id mapping; must contain "[MASK]" and
            "[NUMERIC_MASK]".
        d_model: Embedding width.
        n_heads: Number of attention heads.
        device: Device the sub-modules and index tensors are created on.
    """

    def __init__(
        self,
        tokens,
        numeric_col_tokens,
        cat_col_tokens,
        token_dict,
        d_model=64,
        n_heads=2,
        device=device
    ):
        super(TabTransformer, self).__init__()
        self.device = device
        self.d_model = d_model
        self.tokens = tokens
        self.token_dict = token_dict

        # Categorical columns come first, then numeric ones; `forward`
        # concatenates the value embeddings in the same order.
        self.col_tokens = cat_col_tokens + numeric_col_tokens
        self.n_tokens = len(tokens)  # number of rows in the embedding table
        self.n_numeric_cols = len(numeric_col_tokens)
        self.n_cat_cols = len(cat_col_tokens)
        self.n_columns = self.n_numeric_cols + self.n_cat_cols

        # Mask-token ids and column-index tensors are registered as
        # non-persistent buffers: they follow `.to(...)` like parameters but
        # stay out of `state_dict`, so existing checkpoints still load.
        self.register_buffer(
            "cat_mask_token",
            torch.tensor(self.token_dict["[MASK]"]).to(device),
            persistent=False,
        )
        self.register_buffer(
            "numeric_mask_token",
            torch.tensor(self.token_dict["[NUMERIC_MASK]"]).to(device),
            persistent=False,
        )

        # One embedding table shared by category values, column names and the
        # special tokens.
        self.embeddings = nn.Embedding(self.n_tokens, self.d_model).to(device)

        self.register_buffer(
            "col_indices",
            torch.tensor(
                [self.tokens.index(col) for col in self.col_tokens], dtype=torch.long
            ).to(device),
            persistent=False,
        )
        self.register_buffer(
            "numeric_indices",
            torch.tensor(
                [self.tokens.index(col) for col in numeric_col_tokens], dtype=torch.long
            ).to(device),
            persistent=False,
        )
        self.transformer_encoder = TransformerEncoderLayer(d_model, n_heads=n_heads).to(device)

        # Per-position scalar head; `flatten_layer` then mixes the positions.
        self.regressor = nn.Sequential(
            nn.Linear(d_model, d_model * 2),
            nn.ReLU(),
            nn.Linear(d_model * 2, 1),
            nn.ReLU(),
        ).to(device)

        # NOTE(review): the pre-training cell feeds this d_model-wide output to
        # CrossEntropyLoss as class logits, which only works while every token
        # id is < d_model. Consider projecting to n_tokens instead.
        self.mlm_decoder = nn.Sequential(
            nn.Linear(d_model, d_model)
        ).to(device)  # TODO try making more complex

        # Reconstructs one value per numeric column from the flattened
        # encoder output (the width was previously hard-coded to 6).
        self.mnm_decoder = nn.Sequential(
            nn.Linear(self.n_columns * self.d_model, self.d_model * 4),  # Try making more complex
            nn.ReLU(),
            nn.Linear(self.d_model * 4, self.n_numeric_cols),
        ).to(device)

        self.flatten_layer = nn.Linear(len(self.col_tokens), 1).to(device)

    def forward(self, num_inputs, cat_inputs, task="regression"):
        """Encode a batch of rows and apply the head selected by ``task``.

        Args:
            num_inputs: Float tensor (batch, n_numeric_cols). Entries equal to
                ``-inf`` are treated as masked.
            cat_inputs: Integer token ids (batch, n_cat_cols).
            task: "regression" or "mlm".

        Returns:
            "regression": tensor of shape (batch, 1).
            "mlm": ``(cat_out, numeric_out)`` with shapes
            (batch, n_columns, d_model) and (batch, n_numeric_cols).

        Raises:
            ValueError: If ``task`` is neither "regression" nor "mlm".
        """
        batch_size = num_inputs.size(0)

        # Column-name embeddings, identical for every row in the batch.
        col_embeddings = self.embeddings(
            self.col_indices.unsqueeze(0).expand(batch_size, -1)
        )
        numeric_col_embeddings = self.embeddings(
            self.numeric_indices.unsqueeze(0).expand(batch_size, -1)
        )

        cat_embeddings = self.embeddings(cat_inputs)

        # A numeric value is embedded as its column embedding scaled by the
        # scalar. Masked entries (-inf) get the [NUMERIC_MASK] embedding
        # instead and are never multiplied, which avoids inf/NaN products.
        inf_mask = num_inputs == float("-inf")
        base_numeric = num_inputs.new_zeros((*num_inputs.shape, self.d_model))
        base_numeric[~inf_mask] = (
            numeric_col_embeddings[~inf_mask] * num_inputs[~inf_mask].unsqueeze(-1)
        )
        base_numeric[inf_mask] = self.embeddings(self.numeric_mask_token)

        # Queries are the column names; keys/values are this row's values.
        query_embeddings = torch.cat([cat_embeddings, base_numeric], dim=1)
        out = self.transformer_encoder(
            col_embeddings,
            query_embeddings,
            query_embeddings
        )
        if task == "regression":
            out = self.regressor(out)
            out = self.flatten_layer(out.squeeze(-1))

            return out
        elif task == "mlm":
            cat_out = self.mlm_decoder(out)
            numeric_out = out.view(out.size(0), -1)
            numeric_out = self.mnm_decoder(numeric_out)
            return cat_out, numeric_out
        else:
            raise ValueError(f"Task {task} not supported.")


# Vocabulary handed to the model: every token except the regression target.
no_price_tokens = tokens.copy()
no_price_tokens.remove("price")

# Column names by kind, derived from the dtypes of a small sample of the frame.
sample_rows = df.head()
cat_col_tokens = sample_rows.select_dtypes(exclude=np.number).columns.to_list()
numeric_col_tokens = (
    sample_rows.drop("price", axis=1)
    .select_dtypes(include=np.number)
    .columns.to_list()
)

model = TabTransformer(
    no_price_tokens,
    numeric_col_tokens=numeric_col_tokens,
    cat_col_tokens=cat_col_tokens,
    token_dict=token_dict,
).to(device)

# Smoke test: push a tiny, randomly masked batch through the "mlm" task and
# look at the two output shapes.
batch_size = 3
test_num = X_train_num_tensor[:batch_size]
test_cat = X_train_cat_tensor[:batch_size]
test_num_mask = mask_tensor(test_num, model)
test_cat_mask = mask_tensor(test_cat, model)
with torch.no_grad():
    x = model(test_num_mask, test_cat_mask, task="mlm")
x[0].shape, x[1].shape

(torch.Size([3, 9, 64]), torch.Size([3, 6]))

In [12]:
# Reconstruction error of the numeric head on the smoke-test batch, measured
# against the unmasked numeric inputs.
mse_loss = nn.MSELoss()

mse_loss(input=x[1], target=test_num)

tensor(2.5147, device='cuda:0')

In [25]:
# Column tokens in the order the model uses them (categorical columns first,
# then numeric columns).
model.col_tokens

['cut', 'color', 'clarity', 'carat', 'depth', 'table', 'x', 'y', 'z']

In [None]:
def show_mask_pred(idx, batch_size, model, probability):
    """Run the masked-modeling task on one slice of the training data.

    Rows ``idx`` to ``idx + batch_size`` of the training tensors are randomly
    masked and passed through ``model`` with ``task="mlm"``, without tracking
    gradients.

    Args:
        idx: Index of the first training row in the slice.
        batch_size: Number of rows in the slice.
        model: Model exposing the "mlm" task.
        probability: Keep probability forwarded to ``mask_tensor``.

    Returns:
        ``(cat_preds, numeric_preds)`` exactly as returned by the model.
    """
    numeric_values = X_train_num_tensor[idx : idx + batch_size, :]
    categorical_values = X_train_cat_tensor[idx : idx + batch_size, :]
    numeric_masked = mask_tensor(numeric_values, model, propability=probability)
    categorical_masked = mask_tensor(categorical_values, model, propability=probability)

    # `torch.no_grad()` is the inference context manager; `torch.zero_grad`
    # does not exist.
    with torch.no_grad():
        cat_preds, numeric_preds = model(
            numeric_masked, categorical_masked, task="mlm"
        )
    return cat_preds, numeric_preds

In [27]:
# NOTE(review): no earlier cell in notebook order defines a global `cat_preds`.
# This relies on the variable left behind by the pre-training loop further
# down, so it fails after Restart & Run All.
cat_preds.shape

torch.Size([152, 64, 9])

In [24]:
# Masked Tabular Modeling (self-supervised pre-training)
epochs = 20
batch_size = 1000
lr = 0.001
mse_loss = nn.MSELoss()
ce_loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

model_time = dt.now()
model_time = model_time.strftime("%Y-%m-%dT%H:%M:%S")
model_name = f"TryAgainPretrain.8_{model_time}"

summary_writer = SummaryWriter("runs/" + model_name)

batch_count = 0
model.train()
try:
    for epoch in range(epochs):
        for i in range(0, X_train_num_tensor.size(0), batch_size):
            numeric_values = X_train_num_tensor[i : i + batch_size, :]
            categorical_values = X_train_cat_tensor[i : i + batch_size, :]
            numeric_masked = mask_tensor(numeric_values, model, propability=0.8)
            categorical_masked = mask_tensor(categorical_values, model, propability=0.8)
            optimizer.zero_grad()
            cat_preds, numeric_preds = model(
                numeric_masked, categorical_masked, task="mlm"
            )
            # One target per column position: the true category id for the
            # categorical columns, and each numeric column's own token id for
            # the numeric positions.
            cat_targets = torch.cat(
                (categorical_values, model.numeric_indices.expand(categorical_values.size(0), -1)), dim=1
            )
            # CrossEntropyLoss expects the class dimension at index 1.
            cat_preds = cat_preds.permute(0, 2, 1)
            cat_loss = ce_loss(cat_preds, cat_targets)
            numeric_loss = mse_loss(numeric_preds, numeric_values)
            loss = cat_loss + numeric_loss
            loss.backward()
            optimizer.step()
            batch_count += 1
            learning_rate = optimizer.param_groups[0]["lr"]
            summary_writer.add_scalar("Loss/masked_loss", loss.item(), batch_count)
            summary_writer.add_scalar("Loss/mlm_loss", cat_loss.item(), batch_count)
            summary_writer.add_scalar("Loss/mnm_loss", numeric_loss.item(), batch_count)
            summary_writer.add_scalar("Metrics/LearningRate", learning_rate, batch_count)
            if batch_count % 100 == 0:
                print(f"Epoch {epoch+1}/{epochs} Loss: {loss.item():,.4f}")
finally:
    # Flush pending events and release the event file, even if training is
    # interrupted.
    summary_writer.close()

Epoch 3/20 Loss: 0.8996
Epoch 5/20 Loss: 0.5680
Epoch 7/20 Loss: 0.4508
Epoch 10/20 Loss: 0.3985
Epoch 12/20 Loss: 0.3511
Epoch 14/20 Loss: 0.3219
Epoch 16/20 Loss: 0.2886
Epoch 19/20 Loss: 0.2807


Fine-tuning the previous model seems to work, but when I pre-train first, we run into issues. Let's try again.


In [23]:
# Regression Model (supervised training on price)

epochs = 20
batch_size = 1000
lr = 0.01
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

model_time = dt.now()
model_time = model_time.strftime("%Y-%m-%dT%H:%M:%S")
model_name = f"AfterAlreadyPreTrain.8_{model_time}"

summary_writer = SummaryWriter("runs/" + model_name)


batch_count = 0
model.train()
try:
    for epoch in range(epochs):
        for i in range(0, X_train_num_tensor.size(0), batch_size):
            num_inputs = X_train_num_tensor[i : i + batch_size, :]
            cat_inputs = X_train_cat_tensor[i : i + batch_size, :]
            optimizer.zero_grad()
            # No masking here: the model sees the complete row.
            y_pred = model(num_inputs, cat_inputs)
            loss = loss_fn(y_pred, y_train_tensor[i : i + batch_size, :])
            loss.backward()
            optimizer.step()
            batch_count += 1
            learning_rate = optimizer.param_groups[0]["lr"]
            summary_writer.add_scalar("Loss/train", loss.item(), batch_count)
            summary_writer.add_scalar("Metrics/LearningRate", learning_rate, batch_count)
            if batch_count % 100 == 0:
                print(f"Epoch {epoch+1}/{epochs} Loss: {loss.item():,.2f}")
finally:
    # Flush pending events and release the event file, even if training is
    # interrupted.
    summary_writer.close()

Epoch 3/20 Loss: 790,781.75
Epoch 5/20 Loss: 555,031.38
Epoch 7/20 Loss: 639,944.38
Epoch 10/20 Loss: 475,933.25
Epoch 12/20 Loss: 554,887.69
Epoch 14/20 Loss: 562,919.56
Epoch 16/20 Loss: 403,517.44
Epoch 19/20 Loss: 454,190.78


```
Epoch 3/20 Loss: 416,668.31
Epoch 5/20 Loss: 308,602.41
Epoch 7/20 Loss: 375,060.84
Epoch 10/20 Loss: 346,249.38
Epoch 12/20 Loss: 421,598.84
Epoch 14/20 Loss: 362,426.00
Epoch 16/20 Loss: 339,481.22
Epoch 19/20 Loss: 379,399.06
```

In [15]:
# Score the model on the first ten held-out rows only; no gradients needed.
with torch.no_grad():
    y_pred = model(X_test_num_tensor[:10], X_test_cat_tensor[:10])
    loss = loss_fn(y_pred, y_test_tensor[:10])
print(f"Test loss: {loss.item():,.2f}")

Test loss: 660,564.94


In [16]:
# Show each of the ten predictions next to its true price and the signed error.
for i in range(10):
    predicted = y_pred[i].item()
    actual = y_test_tensor[i].item()
    print(
        f"Predicted: {predicted:,.2f} Actual: {actual:,.2f}",
        f"Diff: {predicted - actual:,.2f}",
    )

Predicted: 643.20 Actual: 559.00 Diff: 84.20
Predicted: 2,318.71 Actual: 2,201.00 Diff: 117.71
Predicted: 1,285.72 Actual: 1,238.00 Diff: 47.72
Predicted: 1,374.23 Actual: 1,304.00 Diff: 70.23
Predicted: 8,949.88 Actual: 6,901.00 Diff: 2,048.88
Predicted: 4,276.42 Actual: 3,011.00 Diff: 1,265.42
Predicted: 1,850.08 Actual: 1,765.00 Diff: 85.08
Predicted: 1,819.77 Actual: 1,679.00 Diff: 140.77
Predicted: 2,033.86 Actual: 2,102.00 Diff: -68.14
Predicted: 5,653.07 Actual: 4,789.00 Diff: 864.07


```
Predicted: 2,085.14 Actual: 559.00 Diff: 1,526.14
Predicted: 3,381.02 Actual: 2,201.00 Diff: 1,180.02
Predicted: 1,725.17 Actual: 1,238.00 Diff: 487.17
Predicted: 1,914.37 Actual: 1,304.00 Diff: 610.37
Predicted: 15,271.41 Actual: 6,901.00 Diff: 8,370.41
Predicted: 7,173.91 Actual: 3,011.00 Diff: 4,162.91
Predicted: 947.38 Actual: 1,765.00 Diff: -817.62
Predicted: 604.91 Actual: 1,679.00 Diff: -1,074.09
Predicted: 989.06 Actual: 2,102.00 Diff: -1,112.94
Predicted: 9,508.21 Actual: 4,789.00 Diff: 4,719.21
```


In [17]:
# Show each of the ten predictions next to its true price.
for i in range(10):
    predicted = y_pred[i].item()
    actual = y_test_tensor[i].item()
    print(f"Predicted: {predicted:,.2f} Actual: {actual:,.2f}")

Predicted: 643.20 Actual: 559.00
Predicted: 2,318.71 Actual: 2,201.00
Predicted: 1,285.72 Actual: 1,238.00
Predicted: 1,374.23 Actual: 1,304.00
Predicted: 8,949.88 Actual: 6,901.00
Predicted: 4,276.42 Actual: 3,011.00
Predicted: 1,850.08 Actual: 1,765.00
Predicted: 1,819.77 Actual: 1,679.00
Predicted: 2,033.86 Actual: 2,102.00
Predicted: 5,653.07 Actual: 4,789.00


In [18]:
# Show each of the ten predictions next to its true price.
for i in range(10):
    predicted = y_pred[i].item()
    actual = y_test_tensor[i].item()
    print(f"Predicted: {predicted:,.2f} Actual: {actual:,.2f}")

Predicted: 643.20 Actual: 559.00
Predicted: 2,318.71 Actual: 2,201.00
Predicted: 1,285.72 Actual: 1,238.00
Predicted: 1,374.23 Actual: 1,304.00
Predicted: 8,949.88 Actual: 6,901.00
Predicted: 4,276.42 Actual: 3,011.00
Predicted: 1,850.08 Actual: 1,765.00
Predicted: 1,819.77 Actual: 1,679.00
Predicted: 2,033.86 Actual: 2,102.00
Predicted: 5,653.07 Actual: 4,789.00


In [19]:
# Predicted: 688.47 Actual: 559.00
# Predicted: 2,547.17 Actual: 2,201.00
# Predicted: 1,044.95 Actual: 1,238.00
# Predicted: 1,790.95 Actual: 1,304.00
# Predicted: 10,160.45 Actual: 6,901.00
# Predicted: 3,790.90 Actual: 3,011.00
# Predicted: 1,729.77 Actual: 1,765.00
# Predicted: 1,749.63 Actual: 1,679.00
# Predicted: 2,328.92 Actual: 2,102.00
# Predicted: 5,928.75 Actual: 4,789.00

In [20]:
# Scratch check: a boolean tensor that is True wherever a uniform draw exceeds
# 0.8, the same kind of comparison `mask_tensor` uses to pick entries to mask.
torch.rand(3, 4, 5) > 0.8

tensor([[[False,  True, False, False, False],
         [False, False, False, False,  True],
         [False, False, False, False, False],
         [False, False,  True, False, False]],

        [[False,  True, False, False,  True],
         [False,  True, False,  True, False],
         [False,  True,  True, False, False],
         [ True,  True, False, False, False]],

        [[False, False, False, False, False],
         [False, False,  True,  True, False],
         [False, False, False, False, False],
         [False,  True, False,  True,  True]]])

In [21]:
# NOTE(review): `torch.rand` requires a size argument, so this scratch cell
# raises TypeError (see the traceback below). Pass a shape or delete the cell.
torch.rand()

TypeError: rand() received an invalid combination of arguments - got (), but expected one of:
 * (tuple of ints size, *, torch.Generator generator, tuple of names names, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, torch.Generator generator, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, tuple of names names, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
