In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import torch.nn.functional as F

import numpy as np

# Read the diamonds CSV (path is relative to the notebook's working
# directory) and preview the first rows.
DIAMONDS_CSV = "../data/diamonds.csv"
df = pd.read_csv(DIAMONDS_CSV)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [4]:
from itertools import chain

In [5]:
# Column groups used throughout the notebook.
target_column = "price"
cat_columns = ["cut", "color", "clarity"]
num_columns = ["carat", "depth", "table", "x", "y", "z"]

# Distinct category labels across all categorical columns, in the order
# pd.unique encounters them in the raveled value array.
cat_values = pd.unique(df[cat_columns].values.ravel("K"))

# One shared vocabulary: category labels first, then the categorical column
# names, then the numeric column names, and finally the target column name.
tokens = [*cat_values, *cat_columns, *num_columns, target_column]
token_dict = dict(zip(tokens, range(len(tokens))))

In [6]:
# Scratch embedding table: one 64-wide vector per vocabulary token.
embedding = nn.Embedding(num_embeddings=len(token_dict), embedding_dim=64)

# Token ids for every categorical cell of the frame (one row per record).
cat_values_emb = torch.tensor(
    [[token_dict[label] for label in record] for record in df[cat_columns].values],
    dtype=torch.long,
)

# Token ids of the column names themselves, then a shape check on their embeddings.
column_token_ids = [token_dict[name] for name in df.columns]
col_names_emb = torch.tensor(column_token_ids, dtype=torch.long)
embedding(col_names_emb).shape

torch.Size([10, 64])

In [7]:
# Separate the regression target from the features.
X = df.drop("price", axis=1)
y = df["price"]

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocess categorical features: replace every label with its id from the
# shared token vocabulary (token_dict).
X_train_cat = X_train[cat_columns].copy()
X_test_cat = X_test[cat_columns].copy()

# Kept only so the name still exists for any later cell; nothing is stored in
# it (the per-column LabelEncoder that used to be built here was never used).
label_encoders = {}
for col in cat_columns:
    X_train_cat[col] = X_train_cat[col].map(token_dict)
    X_test_cat[col] = X_test_cat[col].map(token_dict)

# Series.map yields NaN for labels missing from token_dict. Casting NaN to
# int64 below would silently produce garbage ids, so fail loudly instead.
for split_name, encoded in (("train", X_train_cat), ("test", X_test_cat)):
    unmapped_cols = encoded.columns[encoded.isna().any()].tolist()
    if unmapped_cols:
        raise ValueError(
            f"Unmapped categorical values in {split_name} columns: {unmapped_cols}"
        )

# Preprocess numeric features: standardise with statistics fitted on the
# training split only, then apply the same transform to the test split.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[num_columns].copy())
X_test_num = scaler.transform(X_test[num_columns].copy())

# Tensors consumed by the model. Categorical ids are int64 because they are
# used as embedding indices; targets are reshaped to a column vector.
X_train_cat_tensor = torch.tensor(X_train_cat.values, dtype=torch.int64)
X_train_num_tensor = torch.tensor(X_train_num, dtype=torch.float32)
X_test_cat_tensor = torch.tensor(X_test_cat.values, dtype=torch.int64)
X_test_num_tensor = torch.tensor(X_test_num, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [8]:
# Display the categorical test features after mapping labels to token ids.
X_test_cat

Unnamed: 0,cut,color,clarity
1388,0,10,17
50052,3,9,16
41645,0,5,16
42377,1,5,16
17244,0,5,12
...,...,...,...
44081,3,5,14
23713,3,9,16
31375,2,10,17
21772,0,9,14


In [9]:
# Shape experiment: insert a new axis at position 1 and tile it four times,
# turning a (10, 6) tensor into (10, 4, 6).
input_tensor = torch.randn(10, 6)
repeated_tensor = input_tensor[:, None, :].repeat(1, 4, 1)

print(repeated_tensor.shape)

torch.Size([10, 4, 6])


In [10]:
# Shape check: unsqueeze(2) appends a trailing singleton axis to the first ten rows.
X_train_num_tensor[0:10, :].unsqueeze(2).shape

torch.Size([10, 6, 1])

In [11]:
class NumericEmbedding(nn.Module):
    """Turn scalar numeric features into ``d_model``-wide vectors.

    ``forward`` has two modes:

    * default: each scalar is passed through a small MLP
      (``1 -> d_model -> d_model``);
    * ``first_four=True``: no learned weights are involved. The raw value is
      copied into the first four slots of the vector and the remaining
      ``d_model - 4`` slots are zero.

    Args:
        d_model: Width of the produced embedding (also the MLP hidden width).
    """

    # Number of leading slots that carry the raw value in ``first_four`` mode.
    _RAW_SLOTS = 4

    def __init__(self, d_model=64):
        super().__init__()
        self.d_model = d_model
        self.linear = nn.Sequential(
            nn.Linear(1, d_model),
            nn.ReLU(),
            # Project to d_model rather than a literal 64 so that both forward
            # modes return the same width for every d_model.
            nn.Linear(d_model, d_model),
        )

    def forward(self, numeric_tensor, first_four=False):
        """Embed ``numeric_tensor``.

        Args:
            numeric_tensor: 2-D ``(batch, features)`` tensor. In
                ``first_four`` mode a 1-D ``(features,)`` tensor is also
                accepted and treated as a single row.
            first_four: Select the parameter-free raw-value encoding.

        Returns:
            Tensor of shape ``(batch, features, d_model)``.

        Raises:
            ValueError: In ``first_four`` mode, if the input is not 1-D or 2-D.
        """
        if first_four:
            if numeric_tensor.ndim == 1:
                # A lone feature vector is a batch of one row.
                numeric_tensor = numeric_tensor.unsqueeze(0)
            elif numeric_tensor.ndim != 2:
                raise ValueError("numeric_tensor must be 1D or 2D")

            # (batch, features) -> (batch, features, 4): each value repeated.
            numeric_tensor = numeric_tensor.unsqueeze(2).repeat(
                1, 1, self._RAW_SLOTS
            )
            # Allocate the padding on the input's device so torch.cat works
            # when the data lives on an accelerator.
            zero_embd = torch.zeros(
                numeric_tensor.size(0),
                numeric_tensor.size(1),
                self.d_model - numeric_tensor.size(2),
                device=numeric_tensor.device,
            )
            return torch.cat([numeric_tensor, zero_embd], dim=2)

        # Learned path: add a trailing axis so Linear(1, ...) sees one scalar
        # per position.
        return self.linear(numeric_tensor.unsqueeze(2))


# Smoke test: push the first ten training rows through the learned (MLP)
# path and look at the resulting shape.
numeric_embedding = NumericEmbedding()
test_embd = numeric_embedding(X_train_num_tensor[:10])
test_embd.shape

torch.Size([10, 6, 64])

In [12]:
# Random (3, 4, 5) tensor used for the broadcasting experiments.
test_mat = torch.rand((3, 4, 5))
test_mat

tensor([[[0.7997, 0.1061, 0.6729, 0.9715, 0.2032],
         [0.0363, 0.9206, 0.4723, 0.1466, 0.1256],
         [0.9140, 0.8157, 0.3155, 0.2908, 0.4379],
         [0.8437, 0.3004, 0.7294, 0.3675, 0.5573]],

        [[0.5491, 0.7337, 0.5207, 0.9817, 0.2760],
         [0.4689, 0.2343, 0.8757, 0.0412, 0.1125],
         [0.3713, 0.6183, 0.3796, 0.7878, 0.3109],
         [0.5083, 0.3049, 0.2495, 0.6776, 0.6589]],

        [[0.7199, 0.8538, 0.0758, 0.1217, 0.2505],
         [0.8753, 0.2642, 0.2211, 0.4952, 0.3732],
         [0.5731, 0.4509, 0.3424, 0.4213, 0.0531],
         [0.2671, 0.0692, 0.2255, 0.4098, 0.9692]]])

In [13]:
# Compare one row of test_mat with the same row multiplied by 10.
test_mat[0, 0], test_mat[0, 0] * 10

(tensor([0.7997, 0.1061, 0.6729, 0.9715, 0.2032]),
 tensor([7.9969, 1.0608, 6.7286, 9.7145, 2.0316]))

In [14]:
# Shapes of the first ten numeric rows before and after adding a trailing axis.
(X_train_num_tensor[0:10, :].shape, X_train_num_tensor[0:10, :].unsqueeze(2).shape)

(torch.Size([10, 6]), torch.Size([10, 6, 1]))

In [15]:
# Four multipliers shaped (1, 4, 1) so they broadcast against a tensor whose
# middle axis has length 4.
scalars = torch.tensor([10, 2, 3, 4])[None, :, None]
scalars.shape

torch.Size([1, 4, 1])

In [16]:
# Broadcast multiply: `scalars` has shape (1, 4, 1), so row j of every
# matrix in test_mat is scaled by the j-th multiplier (10, 2, 3, 4).
double_test_mat = scalars * test_mat
double_test_mat

tensor([[[7.9969, 1.0608, 6.7286, 9.7145, 2.0316],
         [0.0727, 1.8412, 0.9445, 0.2931, 0.2512],
         [2.7419, 2.4472, 0.9464, 0.8724, 1.3137],
         [3.3749, 1.2016, 2.9176, 1.4700, 2.2291]],

        [[5.4908, 7.3373, 5.2070, 9.8168, 2.7600],
         [0.9378, 0.4687, 1.7513, 0.0825, 0.2250],
         [1.1139, 1.8548, 1.1388, 2.3635, 0.9327],
         [2.0333, 1.2194, 0.9980, 2.7105, 2.6357]],

        [[7.1989, 8.5382, 0.7580, 1.2169, 2.5052],
         [1.7507, 0.5283, 0.4421, 0.9905, 0.7463],
         [1.7194, 1.3526, 1.0271, 1.2638, 0.1593],
         [1.0685, 0.2767, 0.9020, 1.6392, 3.8767]]])

In [17]:
# First row of the first matrix after scaling; its multiplier is 10.
double_test_mat[0, 0]

tensor([7.9969, 1.0608, 6.7286, 9.7145, 2.0316])

In [18]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each linearly projected, split into
    ``num_heads`` heads of width ``d_model // num_heads``, attended per head,
    merged back together and passed through an output projection.

    Args:
        d_model: Width of the input and output vectors.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Without this check the ``view(batch, -1, heads, d_head)`` calls in
        # forward can still succeed for some sequence lengths while silently
        # mixing elements from different positions.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_head = d_model // num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, heads, seq, d_head)``."""
        return projected.view(
            batch_size, -1, self.num_heads, self.d_head
        ).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q: Query tensor; its last axis has width ``d_model``.
            k: Key tensor; its last axis has width ``d_model``.
            v: Value tensor; its last axis has width ``d_model``.
            mask: Optional tensor added to the attention logits as
                ``mask * -1e9``; entries equal to 1 are therefore suppressed.

        Returns:
            Tensor with the query's batch and sequence sizes and a last axis
            of width ``d_model``.
        """
        batch_size = q.size(0)

        q = self._split_heads(self.q_linear(q), batch_size)
        k = self._split_heads(self.k_linear(k), batch_size)
        v = self._split_heads(self.v_linear(v), batch_size)

        attn_output, _ = self.scaled_dot_product_attention(q, k, v, mask)

        # (batch, heads, seq, d_head) -> (batch, seq, d_model)
        attn_output = (
            attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        )
        return self.out_linear(attn_output)

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        """Compute ``softmax(q @ k^T / sqrt(d_k)) @ v``.

        Returns:
            Tuple ``(output, attention_weights)``; the weights are normalised
            over the last (key) axis.
        """
        d_k = q.size(-1)
        scaled_attention_logits = torch.matmul(q, k.transpose(-2, -1)) / (d_k**0.5)

        if mask is not None:
            # Out-of-place add: avoids mutating a tensor autograd is tracking.
            scaled_attention_logits = scaled_attention_logits + mask * -1e9

        attention_weights = F.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, v)

        return output, attention_weights


class TransformerEncoderLayer(nn.Module):
    """Attention sub-layer followed by a position-wise MLP.

    Each sub-layer is wrapped in a residual connection with layer
    normalisation applied after the sum. The MLP hidden width is
    ``4 * d_model``.

    Args:
        d_model: Width of the token vectors.
        num_heads: Number of attention heads handed to ``MultiHeadAttention``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        hidden_dim = 4 * d_model

        self.multi_head_attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, d_model),
        )
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)

    def forward(self, q, k, v, mask=None):
        """Run one encoder step; the residual path follows the query ``q``."""
        attended = self.multi_head_attention(q, k, v, mask)
        normed_attention = self.layernorm1(q + attended)
        return self.layernorm2(normed_attention + self.feed_forward(normed_attention))


# Demo configuration
d_model = 64  # width of each token vector
num_heads = 4  # attention heads
seq_len_q = 10  # number of query positions
seq_len_k = 20  # number of key/value positions
batch_size = 32  # examples per batch

# Random inputs; keys double as values.
q = torch.rand(batch_size, seq_len_q, d_model)
k = torch.rand(batch_size, seq_len_k, d_model)
v = k

# Build the layer and run a single forward pass; the result takes the
# query's sequence length.
encoder_layer = TransformerEncoderLayer(d_model=d_model, num_heads=num_heads)
output = encoder_layer(q, k, v)
print("Output shape:", output.shape)

Output shape: torch.Size([32, 10, 64])


In [19]:
# Preview of the first rows with the target column `price` dropped.
df.head().drop("price", axis=1)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [1]:
# `+` on two lists concatenates them.
[1, 2, 3] + [4, 5, 6]

[1, 2, 3, 4, 5, 6]

In [78]:
class TabTransformer(nn.Module):
    """Attention-based regressor over one row of tabular data.

    Every column name and every categorical value is a token in one shared
    embedding table. For each row:

    * a categorical cell is represented by the embedding of its value token;
    * a numeric cell is represented by the embedding of its column token
      scaled by the cell's numeric value;
    * the column-name embeddings act as queries that attend over those cell
      representations.

    Each attended column vector is reduced to one scalar, and a final linear
    layer combines the per-column scalars into a single prediction.

    Args:
        tokens: Ordered vocabulary. A token's position in this list is its row
            in the embedding table, so ids in ``cat_inputs`` must index this
            same list.
        numeric_col_tokens: Names of the numeric columns, in the column order
            of ``num_inputs``.
        cat_col_tokens: Names of the categorical columns, in the column order
            of ``cat_inputs``.
        d_model: Embedding width.
        num_heads: Number of attention heads in the encoder layer.
    """

    def __init__(
        self,
        tokens,
        numeric_col_tokens,
        cat_col_tokens,
        d_model=64,
        num_heads=4,
    ):
        super().__init__()
        self.d_model = d_model
        self.tokens = tokens
        # Categorical columns first, then numeric: forward concatenates the
        # cell representations in the same order.
        self.col_tokens = list(cat_col_tokens) + list(numeric_col_tokens)
        self.n_tokens = len(tokens)
        # Shared table for column-name tokens and categorical-value tokens.
        self.embeddings = nn.Embedding(self.n_tokens, self.d_model)
        self.n_num_embeddings = len(numeric_col_tokens)

        # Registered as non-persistent buffers so they follow the module
        # across devices (`model.to(...)`) without adding state_dict keys.
        self.register_buffer(
            "col_indices",
            torch.tensor(
                [self.tokens.index(col) for col in self.col_tokens], dtype=torch.long
            ),
            persistent=False,
        )
        self.register_buffer(
            "numeric_indices",
            torch.tensor(
                [self.tokens.index(col) for col in numeric_col_tokens],
                dtype=torch.long,
            ),
            persistent=False,
        )
        self.transformer_encoder = TransformerEncoderLayer(d_model, num_heads=num_heads)

        # Reduces each attended column vector to one non-negative scalar.
        self.numeric_predictor = nn.Sequential(
            nn.Linear(d_model, d_model * 2),
            nn.ReLU(),
            nn.Linear(d_model * 2, 1),
            nn.ReLU(),
        )

        # Combines the per-column scalars into the final prediction.
        self.flatten_layer = nn.Linear(len(self.col_tokens), 1)

    def forward(self, num_inputs, cat_inputs):
        """Predict one value per row.

        Args:
            num_inputs: ``(batch, n_numeric)`` float tensor of numeric features.
            cat_inputs: ``(batch, n_categorical)`` integer tensor of token ids.

        Returns:
            ``(batch, 1)`` tensor of predictions.
        """
        batch_size = num_inputs.size(0)

        # Column-name embeddings are identical for every row: look them up
        # once and expand (no copy) across the batch.
        col_embeddings = (
            self.embeddings(self.col_indices).unsqueeze(0).expand(batch_size, -1, -1)
        )

        cat_embeddings = self.embeddings(cat_inputs)

        # Scale each numeric column's embedding by the cell value.
        # Broadcasting (1, n_num, d_model) * (batch, n_num, 1) works for any
        # d_model, unlike tiling the inputs to a fixed width of 64.
        numeric_col_embeddings = self.embeddings(self.numeric_indices).unsqueeze(0)
        num_embeddings = numeric_col_embeddings * num_inputs.unsqueeze(-1)

        # Cell representations, ordered like self.col_tokens.
        cell_embeddings = torch.cat([cat_embeddings, num_embeddings], dim=1)

        # Column names query the row's cells (cells serve as keys and values).
        out = self.transformer_encoder(col_embeddings, cell_embeddings, cell_embeddings)
        out = self.numeric_predictor(out)
        out = self.flatten_layer(out.squeeze(-1))
        return out


# Model vocabulary: every token except the regression target.
no_price_tokens = list(tokens)
no_price_tokens.remove("price")

# Derive the column groups from dtypes on a small preview of the frame.
feature_preview = df.head()
numeric_col_tokens = (
    feature_preview.drop("price", axis=1)
    .select_dtypes(include=np.number)
    .columns.to_list()
)
cat_col_tokens = feature_preview.select_dtypes(exclude=np.number).columns.to_list()

model = TabTransformer(
    no_price_tokens,
    numeric_col_tokens=numeric_col_tokens,
    cat_col_tokens=cat_col_tokens,
)

# Smoke test on the first 32 training rows.
x = model(X_train_num_tensor[:32], X_train_cat_tensor[:32])
x.shape

torch.Size([32, 1])

In [81]:
# Training configuration
epochs = 100
batch_size = 1000
lr = 0.001
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Running count of optimiser steps across all epochs; drives the logging cadence.
batch_count = 0
model.train()
for epoch in range(epochs):
    # Consecutive mini-batches in fixed order; the last one may be shorter.
    mini_batches = zip(
        X_train_num_tensor.split(batch_size),
        X_train_cat_tensor.split(batch_size),
        y_train_tensor.split(batch_size),
    )
    for num_batch, cat_batch, target_batch in mini_batches:
        optimizer.zero_grad()
        y_pred = model(num_batch, cat_batch)
        loss = loss_fn(y_pred, target_batch)
        loss.backward()
        optimizer.step()

        batch_count += 1
        # Report the current mini-batch loss every 100 steps.
        if batch_count % 100 == 0:
            print(f"Epoch {epoch+1}/{epochs} Loss: {loss.item():,.2f}")

Epoch 3/100 Loss: 26,664,342.00
Epoch 5/100 Loss: 26,189,848.00
Epoch 7/100 Loss: 12,412,362.00
Epoch 10/100 Loss: 2,980,646.75
Epoch 12/100 Loss: 970,601.81
Epoch 14/100 Loss: 480,428.09
Epoch 16/100 Loss: 292,571.12
Epoch 19/100 Loss: 377,992.75
Epoch 21/100 Loss: 320,729.69
Epoch 23/100 Loss: 290,103.44
Epoch 25/100 Loss: 368,362.25
Epoch 28/100 Loss: 396,008.84
Epoch 30/100 Loss: 276,316.81
Epoch 32/100 Loss: 372,379.69
Epoch 35/100 Loss: 271,976.69
Epoch 37/100 Loss: 299,415.38
Epoch 39/100 Loss: 335,881.06
Epoch 41/100 Loss: 290,430.78
Epoch 44/100 Loss: 347,811.00
Epoch 46/100 Loss: 301,464.31
Epoch 48/100 Loss: 272,074.50
Epoch 50/100 Loss: 394,978.00
Epoch 53/100 Loss: 356,123.31
Epoch 55/100 Loss: 259,912.48
Epoch 57/100 Loss: 326,279.12
Epoch 60/100 Loss: 267,084.22
Epoch 62/100 Loss: 281,352.78
Epoch 64/100 Loss: 322,211.56
Epoch 66/100 Loss: 287,824.53
Epoch 69/100 Loss: 356,516.47
Epoch 71/100 Loss: 287,055.81
Epoch 73/100 Loss: 247,519.22
Epoch 75/100 Loss: 327,964.88
Ep

In [75]:
# Score in inference mode: the training cell leaves the model in train mode,
# so switch explicitly before evaluating. The training cell calls
# model.train() again if it is re-run.
model.eval()
with torch.no_grad():
    # NOTE: only the first ten test rows are scored here, so the printed
    # figure is a spot check rather than the loss over the whole test split.
    y_pred = model(X_test_num_tensor[0:10, :], X_test_cat_tensor[0:10, :])
    loss = loss_fn(y_pred, y_test_tensor[0:10])
    print(f"Test loss: {loss.item():,.2f}")

Test loss: 985,492.12


In [77]:
# Side-by-side view of the ten scored rows: prediction, ground truth, signed error.
for row in range(10):
    predicted = y_pred[row].item()
    actual = y_test_tensor[row].item()
    print(
        f"Predicted: {predicted:,.2f} Actual: {actual:,.2f}",
        f"Diff: {predicted - actual:,.2f}",
    )

Predicted: 601.22 Actual: 559.00 Diff: 42.22
Predicted: 2,328.35 Actual: 2,201.00 Diff: 127.35
Predicted: 1,207.12 Actual: 1,238.00 Diff: -30.88
Predicted: 1,467.03 Actual: 1,304.00 Diff: 163.03
Predicted: 9,597.38 Actual: 6,901.00 Diff: 2,696.38
Predicted: 3,875.39 Actual: 3,011.00 Diff: 864.39
Predicted: 1,825.58 Actual: 1,765.00 Diff: 60.58
Predicted: 1,854.22 Actual: 1,679.00 Diff: 175.22
Predicted: 2,105.56 Actual: 2,102.00 Diff: 3.56
Predicted: 6,114.66 Actual: 4,789.00 Diff: 1,325.66


In [None]:
# Prediction vs. ground truth for the ten scored rows.
for row in range(10):
    predicted = y_pred[row].item()
    actual = y_test_tensor[row].item()
    print(f"Predicted: {predicted:,.2f} Actual: {actual:,.2f}")

Predicted: 754.16 Actual: 559.00
Predicted: 2,390.27 Actual: 2,201.00
Predicted: 993.24 Actual: 1,238.00
Predicted: 1,583.13 Actual: 1,304.00
Predicted: 9,786.17 Actual: 6,901.00
Predicted: 3,888.29 Actual: 3,011.00
Predicted: 1,574.12 Actual: 1,765.00
Predicted: 1,755.46 Actual: 1,679.00
Predicted: 2,334.63 Actual: 2,102.00
Predicted: 5,939.16 Actual: 4,789.00


In [None]:
# Prediction vs. ground truth for the ten scored rows.
for row in range(10):
    predicted = y_pred[row].item()
    actual = y_test_tensor[row].item()
    print(f"Predicted: {predicted:,.2f} Actual: {actual:,.2f}")

Predicted: 688.47 Actual: 559.00
Predicted: 2,547.17 Actual: 2,201.00
Predicted: 1,044.95 Actual: 1,238.00
Predicted: 1,790.95 Actual: 1,304.00
Predicted: 10,160.45 Actual: 6,901.00
Predicted: 3,790.90 Actual: 3,011.00
Predicted: 1,729.77 Actual: 1,765.00
Predicted: 1,749.63 Actual: 1,679.00
Predicted: 2,328.92 Actual: 2,102.00
Predicted: 5,928.75 Actual: 4,789.00


In [None]:
# Predicted: 688.47 Actual: 559.00
# Predicted: 2,547.17 Actual: 2,201.00
# Predicted: 1,044.95 Actual: 1,238.00
# Predicted: 1,790.95 Actual: 1,304.00
# Predicted: 10,160.45 Actual: 6,901.00
# Predicted: 3,790.90 Actual: 3,011.00
# Predicted: 1,729.77 Actual: 1,765.00
# Predicted: 1,749.63 Actual: 1,679.00
# Predicted: 2,328.92 Actual: 2,102.00
# Predicted: 5,928.75 Actual: 4,789.00