In [1]:
# tensorboard --logdir='runs/'

In [2]:
from datetime import datetime as dt
from itertools import chain

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch.nn.init as init

from torch.utils.tensorboard import SummaryWriter

# Load and preprocess the dataset (assuming you have a CSV file)
df = pd.read_csv("../data/diamonds.csv")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [4]:
def initialize_parameters(module):
    if isinstance(module, (nn.Linear, nn.Conv2d)):
        init.xavier_uniform_(module.weight, gain=5)
        if module.bias is not None:
            init.constant_(module.bias, 0.001)
    elif isinstance(module, nn.Embedding):
        init.uniform_(module.weight, -0.4, 0.4)
    elif isinstance(module, nn.LayerNorm):
        init.normal_(module.weight, mean=1, std=0.2)
        init.constant_(module.bias, 0.001)

In [5]:
cat_columns = ["cut", "color", "clarity"]
num_columns = ["carat", "depth", "table", "x", "y", "z"]
cat_values = pd.unique(df[cat_columns].values.ravel("K"))
target_column = "price"
tokens = list(
    chain(
        cat_values,
        cat_columns,
        num_columns,
        ["PAD", "[NUMERIC_MASK]", "[MASK]"],
        [target_column],
    )
)
token_dict = {token: i for i, token in enumerate(tokens)}

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [7]:
X = df.drop("price", axis=1)
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocess categorical features
X_train_cat = X_train[cat_columns].copy()
X_test_cat = X_test[cat_columns].copy()

label_encoders = {}
for col in cat_columns:
    le = LabelEncoder()
    X_train_cat[col] = X_train_cat[col].map(token_dict)
    X_test_cat[col] = X_test_cat[col].map(token_dict)
    # label_encoders[col] = le

# Preprocess numeric features

scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[num_columns].copy())
X_test_num = scaler.transform(X_test[num_columns].copy())

X_train_cat_tensor = torch.tensor(X_train_cat.values, dtype=torch.int64).to(
    device
)  # Use int64 dtype for categorical indices
X_train_num_tensor = torch.tensor(X_train_num, dtype=torch.float32).to(device)
X_test_cat_tensor = torch.tensor(X_test_cat.values, dtype=torch.int64).to(
    device
)  # Use int64 dtype for categorical indices
X_test_num_tensor = torch.tensor(X_test_num, dtype=torch.float32).to(device)
y_train_tensor = (
    torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)
)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).to(device)

In [8]:
nn.Embedding

torch.nn.modules.sparse.Embedding

In [9]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_head = d_model // n_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)

        # self.initialize_parameters()
        self.apply(initialize_parameters)

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)

        q = (
            self.q_linear(q)
            .view(batch_size, -1, self.n_heads, self.d_head)
            .transpose(1, 2)
        )
        k = (
            self.k_linear(k)
            .view(batch_size, -1, self.n_heads, self.d_head)
            .transpose(1, 2)
        )
        v = (
            self.v_linear(v)
            .view(batch_size, -1, self.n_heads, self.d_head)
            .transpose(1, 2)
        )

        attn_output, _ = self.scaled_dot_product_attention(q, k, v, mask)

        attn_output = (
            attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        )
        out = self.out_linear(attn_output)
        return out

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        matmul_qk = torch.matmul(q, k.transpose(-2, -1))
        d_k = q.size(-1)
        scaled_attention_logits = matmul_qk / (d_k**0.5)

        if mask is not None:
            scaled_attention_logits += mask * -1e9

        attention_weights = F.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, v)

        return output, attention_weights


class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads):
        super(TransformerEncoderLayer, self).__init__()

        self.multi_head_attention = MultiHeadAttention(d_model, n_heads)

        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.ReLU(),
            nn.Linear(4 * d_model, d_model),
        )

        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        # self.initialize_parameters()
        self.apply(initialize_parameters)

    def forward(self, q, k, v, mask=None):
        attn_output = self.multi_head_attention(q, k, v, mask)
        out1 = self.layernorm1(q + attn_output)

        ff_output = self.feed_forward(out1)
        out2 = self.layernorm2(out1 + ff_output)

        return out2


# Parameters
d_model = 64  # Embedding dimension
n_heads = 4  # Number of attention heads
seq_len_q = 10  # Sequence length for the query tensor
seq_len_k = 20  # Sequence length for the key tensor
batch_size = 32  # Batch size

# Random data
q = torch.rand((batch_size, seq_len_q, d_model))
k = torch.rand((batch_size, seq_len_k, d_model))
v = k  # Usually, value and key are the same in many applications

# Model
encoder_layer = TransformerEncoderLayer(d_model, n_heads)

# Forward pass
output = encoder_layer(q, k, v)
print("Output shape:", output.shape)

Output shape: torch.Size([32, 10, 64])


In [10]:
def mask_tensor(tensor, model, probability=0.8):
    if tensor.dtype == torch.float32:
        is_numeric = True
    elif tensor.dtype == torch.int64:
        is_numeric = False
    else:
        raise ValueError(f"Task {tensor.dtype} not supported.")

    tensor = tensor.clone()
    bit_mask = torch.rand(tensor.shape) > probability
    if is_numeric:
        tensor[bit_mask] = torch.tensor(float("-Inf"))
    else:
        tensor[bit_mask] = model.cat_mask_token
    return tensor.to(model.device)

In [11]:
class TabTransformer(nn.Module):
    def __init__(
        self,
        tokens,
        numeric_col_tokens,
        cat_col_tokens,
        token_dict,
        d_model=64,
        n_heads=4,
        device=device,
    ):
        super(TabTransformer, self).__init__()
        self.device = device
        self.d_model = d_model
        self.tokens = tokens
        self.token_dict = token_dict
        self.decoder_dict = {v: k for k, v in token_dict.items()}
        # Masks
        self.cat_mask_token = torch.tensor(self.token_dict["[MASK]"]).to(device)
        self.numeric_mask_token = torch.tensor(self.token_dict["[NUMERIC_MASK]"]).to(
            device
        )

        self.col_tokens = cat_col_tokens + numeric_col_tokens
        self.n_tokens = len(tokens)  # TODO Make this
        # Embedding layers for categorical features
        self.embeddings = nn.Embedding(self.n_tokens, self.d_model).to(device)
        self.n_numeric_cols = len(numeric_col_tokens)
        self.n_cat_cols = len(cat_col_tokens)
        self.n_columns = self.n_numeric_cols + self.n_cat_cols
        # self.numeric_embeddings = NumericEmbedding(d_model=self.d_model)
        self.col_indices = torch.tensor(
            [self.tokens.index(col) for col in self.col_tokens], dtype=torch.long
        ).to(device)
        self.numeric_indices = torch.tensor(
            [self.tokens.index(col) for col in numeric_col_tokens], dtype=torch.long
        ).to(device)
        self.transformer_encoder = TransformerEncoderLayer(d_model, n_heads=n_heads).to(
            device
        )

        self.regressor = nn.Sequential(
            nn.Linear(d_model, d_model * 2),
            nn.ReLU(),
            nn.Linear(d_model * 2, 1),
            nn.ReLU(),
        ).to(device)

        self.mlm_decoder = nn.Sequential(nn.Linear(d_model, self.n_tokens)).to(
            device
        )  # TODO try making more complex

        self.mnm_decoder = nn.Sequential(
            nn.Linear(
                self.n_columns * self.d_model, self.d_model * 4
            ),  # Try making more complex
            nn.ReLU(),
            nn.Linear(self.d_model * 4, self.n_numeric_cols),
        ).to(device)

        self.flatten_layer = nn.Linear(len(self.col_tokens), 1).to(device)
        self.apply(initialize_parameters)

    def forward(self, num_inputs, cat_inputs, task="regression"):
        # Embed column indices
        repeated_col_indices = self.col_indices.unsqueeze(0).repeat(
            num_inputs.size(0), 1
        )
        col_embeddings = self.embeddings(repeated_col_indices)

        repeated_numeric_indices = self.numeric_indices.unsqueeze(0).repeat(
            num_inputs.size(0), 1
        )
        numeric_col_embeddings = self.embeddings(repeated_numeric_indices)

        cat_embeddings = self.embeddings(cat_inputs)

        expanded_num_inputs = num_inputs.unsqueeze(2).repeat(1, 1, self.d_model)
        inf_mask = (expanded_num_inputs == float("-inf")).all(dim=2)
        base_numeric = torch.zeros_like(expanded_num_inputs)

        num_embeddings = (
            numeric_col_embeddings[~inf_mask] * expanded_num_inputs[~inf_mask]
        )
        base_numeric[~inf_mask] = num_embeddings
        base_numeric[inf_mask] = self.embeddings(self.numeric_mask_token)

        query_embeddings = torch.cat([cat_embeddings, base_numeric], dim=1)
        out = self.transformer_encoder(
            col_embeddings,
            # query_embeddings,
            query_embeddings,
            query_embeddings
            # col_embeddings, query_embeddings, query_embeddings
        )
        if task == "regression":
            out = self.regressor(out)
            out = self.flatten_layer(out.squeeze(-1))

            return out
        elif task == "mlm":
            cat_out = self.mlm_decoder(out)
            # print(f"Out shape: {out.shape}, cat_out shape: {cat_out.shape}")
            numeric_out = out.view(out.size(0), -1)
            # print(f"numeric_out shape: {numeric_out.shape}")
            numeric_out = self.mnm_decoder(numeric_out)
            return cat_out, numeric_out
        else:
            raise ValueError(f"Task {task} not supported.")


no_price_tokens = tokens.copy()
no_price_tokens.remove("price")

numeric_col_tokens = (
    df.head().drop("price", axis=1).select_dtypes(include=np.number).columns.to_list()
)
cat_col_tokens = df.head().select_dtypes(exclude=np.number).columns.to_list()

model = TabTransformer(
    no_price_tokens,
    numeric_col_tokens=numeric_col_tokens,
    cat_col_tokens=cat_col_tokens,
    token_dict=token_dict,
).to(device)
batch_size = 3
test_num = X_train_num_tensor[0:batch_size, :]
test_num_mask = mask_tensor(test_num, model)
test_cat = X_train_cat_tensor[0:batch_size, :]
test_cat_mask = mask_tensor(test_cat, model)
with torch.no_grad():
    x = model(
        test_num_mask,
        test_cat_mask,
        task="mlm",
    )
x[0].shape, x[1].shape

(torch.Size([3, 9, 32]), torch.Size([3, 6]))

In [12]:
def show_mask_pred(i, model, probability):
    numeric_values = X_train_num_tensor[i : i + 1, :]
    categorical_values = X_train_cat_tensor[i : i + 1, :]
    numeric_masked = mask_tensor(numeric_values, model, probability=probability)
    categorical_masked = mask_tensor(categorical_values, model, probability=probability)
    # Predictions
    with torch.no_grad():
        cat_preds, numeric_preds = model(numeric_masked, categorical_masked, task="mlm")
    # Get the predicted tokens from cat_preds
    cat_preds = cat_preds.argmax(dim=2)
    # Get the words from the tokens
    decoder_dict = model.decoder_dict
    cat_preds = [decoder_dict[i.item()] for i in cat_preds[0]]

    results_dict = {k: cat_preds[i] for i, k in enumerate(model.col_tokens)}
    for i, k in enumerate(model.col_tokens[model.n_cat_cols :]):
        results_dict[k] = numeric_preds[0][i].item()
    # Get the masked values
    categorical_masked = [decoder_dict[i.item()] for i in categorical_masked[0]]
    numeric_masked = numeric_masked[0].tolist()
    masked_values = categorical_masked + numeric_masked
    # zip the masked values with the column names
    masked_values = dict(zip(model.col_tokens, masked_values))
    # Get the original values
    categorical_values = [decoder_dict[i.item()] for i in categorical_values[0]]
    numeric_values = numeric_values[0].tolist()
    original_values = categorical_values + numeric_values
    # zip the original values with the column names
    original_values = dict(zip(model.col_tokens, original_values))
    # print(numeric_masked)
    # print(categorical_masked)
    result_dict = {
        "actual": original_values,
        "masked": masked_values,
        "pred": results_dict,
    }

    return result_dict


show_mask_pred(0, model, 0.8)

{'actual': {'cut': 'Good',
  'color': 'F',
  'clarity': 'SI2',
  'carat': 2.56005597114563,
  'depth': -2.5507476329803467,
  'table': 2.9338605403900146,
  'x': 2.229450225830078,
  'y': 2.138209104537964,
  'z': 1.7382067441940308},
 'masked': {'cut': 'Good',
  'color': 'F',
  'clarity': 'SI2',
  'carat': 2.56005597114563,
  'depth': -2.5507476329803467,
  'table': -inf,
  'x': -inf,
  'y': -inf,
  'z': -inf},
 'pred': {'cut': 'VS1',
  'color': 'carat',
  'clarity': 'carat',
  'carat': 23.145444869995117,
  'depth': 10.518811225891113,
  'table': 24.12872314453125,
  'x': 43.176856994628906,
  'y': -1.0302671194076538,
  'z': -14.537619590759277}}

In [13]:
# Masked Tabualr Modeling
base_model_name = "OneHundred"

model_time = dt.now()
model_time = model_time.strftime("%Y-%m-%dT%H:%M:%S")
model_name = f"{base_model_name}_{model_time}"

In [14]:
# Masked Tabualr Modeling
epochs = 20
batch_size = 1000
lr = 0.01
mse_loss = nn.MSELoss()
ce_loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

summary_writer = SummaryWriter("runs/" + model_name)

batch_count = 0
model.train()
for epoch in range(epochs):
    for i in range(0, X_train_num_tensor.size(0), batch_size):
        numeric_values = X_train_num_tensor[i : i + batch_size, :]
        categorical_values = X_train_cat_tensor[i : i + batch_size, :]
        numeric_masked = mask_tensor(numeric_values, model, probability=0.4)
        categorical_masked = mask_tensor(categorical_values, model, probability=0.4)
        optimizer.zero_grad()
        cat_preds, numeric_preds = model(numeric_masked, categorical_masked, task="mlm")
        cat_targets = torch.cat(
            (
                categorical_values,
                model.numeric_indices.expand(categorical_values.size(0), -1),
            ),
            dim=1,
        )

        cat_preds = cat_preds.permute(0, 2, 1)
        # print(
        #     f"cat_preds.shape: {cat_preds.shape}, cat_targets.shape: {cat_targets.shape}"
        # )
        cat_loss = ce_loss(cat_preds, cat_targets)
        numeric_loss = mse_loss(numeric_preds, numeric_values)
        loss = cat_loss + numeric_loss
        loss.backward()
        optimizer.step()
        batch_count += 1
        learning_rate = optimizer.param_groups[0]["lr"]
        summary_writer.add_scalar("LossTrain/agg_mask", loss.item(), batch_count)
        summary_writer.add_scalar("LossTrain/mlm_loss", cat_loss.item(), batch_count)
        summary_writer.add_scalar(
            "LossTrain/mnm_loss", numeric_loss.item(), batch_count
        )
        summary_writer.add_scalar("Metrics/mtm_lr", learning_rate, batch_count)
        if batch_count % 100 == 0:
            print(f"Epoch {epoch+1}/{epochs} Loss: {loss.item():,.4f}")
            # Test set
            with torch.no_grad():
                numeric_values = X_test_num_tensor
                categorical_values = X_test_cat_tensor
                numeric_masked = mask_tensor(numeric_values, model, probability=0.8)
                categorical_masked = mask_tensor(
                    categorical_values, model, probability=0.8
                )
                optimizer.zero_grad()
                cat_preds, numeric_preds = model(
                    numeric_masked, categorical_masked, task="mlm"
                )
                cat_targets = torch.cat(
                    (
                        categorical_values,
                        model.numeric_indices.expand(categorical_values.size(0), -1),
                    ),
                    dim=1,
                )

                cat_preds = cat_preds.permute(0, 2, 1)
                # print(
                #     f"cat_preds.shape: {cat_preds.shape}, cat_targets.shape: {cat_targets.shape}"
                # )
                cat_loss = ce_loss(cat_preds, cat_targets)
                numeric_loss = mse_loss(numeric_preds, numeric_values)
                loss = cat_loss + numeric_loss
                summary_writer.add_scalar("LossTest/agg_loss", loss.item(), batch_count)
            summary_writer.add_scalar("LossTest/mlm_loss", cat_loss.item(), batch_count)
            summary_writer.add_scalar(
                "LossTest/mnm_loss", numeric_loss.item(), batch_count
            )

Epoch 3/20 Loss: 2.8567
Epoch 5/20 Loss: 2.0133
Epoch 7/20 Loss: 1.5894
Epoch 10/20 Loss: 1.2841
Epoch 12/20 Loss: 1.1439
Epoch 14/20 Loss: 1.0028
Epoch 16/20 Loss: 0.9855
Epoch 19/20 Loss: 0.9056


In [15]:
show_mask_pred(0, model, 0.8)  # Check for learning... XFKAT

{'actual': {'cut': 'Good',
  'color': 'F',
  'clarity': 'SI2',
  'carat': 2.56005597114563,
  'depth': -2.5507476329803467,
  'table': 2.9338605403900146,
  'x': 2.229450225830078,
  'y': 2.138209104537964,
  'z': 1.7382067441940308},
 'masked': {'cut': 'Good',
  'color': 'F',
  'clarity': 'SI2',
  'carat': 2.56005597114563,
  'depth': -2.5507476329803467,
  'table': 2.9338605403900146,
  'x': 2.229450225830078,
  'y': 2.138209104537964,
  'z': -inf},
 'pred': {'cut': 'Premium',
  'color': 'I',
  'clarity': 'SI2',
  'carat': 0.8274498581886292,
  'depth': -0.6303593516349792,
  'table': 3.284083366394043,
  'x': 3.764676332473755,
  'y': 1.7112181186676025,
  'z': 1.3544758558273315}}

Finetuning the previous model seems to work but when I pre-train, we run into issues. Let's try again


In [16]:
1000 / df.shape[0]

0.01853911753800519

In [17]:
# Regression Model
epochs = 40
batch_size = 1000
lr = 0.1
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# model_time = dt.now()
# model_time = model_time.strftime("%Y-%m-%dT%H:%M:%S")
# model_name = f"FullDSNFT{model_time}"

summary_writer = SummaryWriter("runs/" + model_name)

small_test = False

batch_count = 0
model.train()
for epoch in range(epochs):
    # for i in range(0, batch_size, batch_size):
    for i in range(0, X_train_num_tensor.size(0), batch_size):
        num_inputs = X_train_num_tensor[i : i + batch_size, :]
        cat_inputs = X_train_cat_tensor[i : i + batch_size, :]
        optimizer.zero_grad()
        y_pred = model(num_inputs, cat_inputs)
        loss = loss_fn(y_pred, y_train_tensor[i : i + batch_size, :])
        loss.backward()
        optimizer.step()
        batch_count += 1
        learning_rate = optimizer.param_groups[0]["lr"]
        summary_writer.add_scalar("LossTrain/regression_loss", loss.item(), batch_count)
        summary_writer.add_scalar("Metrics/regression_lr", learning_rate, batch_count)
        if (batch_count % 100 == 0) or small_test:
            # Test set
            with torch.no_grad():
                y_pred = model(X_test_num_tensor, X_test_cat_tensor)
                loss = loss_fn(y_pred, y_test_tensor)
                summary_writer.add_scalar(
                    "LossTest/regression_loss", loss.item(), batch_count
                )
                print(
                    f"Epoch {epoch+1}/{epochs} Loss: {loss.item():,.2f} "
                    + f"Test loss: {loss.item():,.2f}"
                )

Epoch 3/40 Loss: 1,349,244.12 Test loss: 1,349,244.12
Epoch 5/40 Loss: 790,227.94 Test loss: 790,227.94
Epoch 7/40 Loss: 599,433.25 Test loss: 599,433.25
Epoch 10/40 Loss: 546,980.00 Test loss: 546,980.00
Epoch 12/40 Loss: 798,963.94 Test loss: 798,963.94
Epoch 14/40 Loss: 503,871.94 Test loss: 503,871.94
Epoch 16/40 Loss: 502,906.34 Test loss: 502,906.34
Epoch 19/40 Loss: 844,923.19 Test loss: 844,923.19
Epoch 21/40 Loss: 1,544,781.12 Test loss: 1,544,781.12
Epoch 23/40 Loss: 824,364.50 Test loss: 824,364.50
Epoch 25/40 Loss: 646,225.62 Test loss: 646,225.62
Epoch 28/40 Loss: 2,756,673.00 Test loss: 2,756,673.00
Epoch 30/40 Loss: 2,407,429.75 Test loss: 2,407,429.75
Epoch 32/40 Loss: 1,916,535.62 Test loss: 1,916,535.62
Epoch 35/40 Loss: 1,555,317.62 Test loss: 1,555,317.62
Epoch 37/40 Loss: 1,528,491.75 Test loss: 1,528,491.75
Epoch 39/40 Loss: 1,501,334.50 Test loss: 1,501,334.50


```
Epoch 3/20 Loss: 416,668.31
Epoch 5/20 Loss: 308,602.41
Epoch 7/20 Loss: 375,060.84
Epoch 10/20 Loss: 346,249.38
Epoch 12/20 Loss: 421,598.84
Epoch 14/20 Loss: 362,426.00
Epoch 16/20 Loss: 339,481.22
Epoch 19/20 Loss: 379,399.06
```


In [18]:
with torch.no_grad():
    y_pred = model(X_test_num_tensor[0:10, :], X_test_cat_tensor[0:10, :])
    loss = loss_fn(y_pred, y_test_tensor[0:10])
    print(f"Test loss: {loss.item():,.2f}")

Test loss: 1,659,003.25


In [19]:
for i in range(10):
    print(
        f"Predicted: {y_pred[i].item():,.2f} Actual: {y_test_tensor[i].item():,.2f}",
        f"Diff: {y_pred[i].item() - y_test_tensor[i].item():,.2f}",
    )

Predicted: 983.38 Actual: 559.00 Diff: 424.38
Predicted: 3,007.07 Actual: 2,201.00 Diff: 806.07
Predicted: 1,324.15 Actual: 1,238.00 Diff: 86.15
Predicted: 1,280.24 Actual: 1,304.00 Diff: -23.76
Predicted: 10,587.29 Actual: 6,901.00 Diff: 3,686.29
Predicted: 2,165.83 Actual: 3,011.00 Diff: -845.17
Predicted: 1,630.87 Actual: 1,765.00 Diff: -134.13
Predicted: 2,078.92 Actual: 1,679.00 Diff: 399.92
Predicted: 2,313.34 Actual: 2,102.00 Diff: 211.34
Predicted: 5,896.50 Actual: 4,789.00 Diff: 1,107.50


```
Predicted: 1,711.44 Actual: 559.00 Diff: 1,152.44
Predicted: 1,711.50 Actual: 2,201.00 Diff: -489.50
Predicted: 1,711.50 Actual: 1,238.00 Diff: 473.50
Predicted: 1,711.61 Actual: 1,304.00 Diff: 407.61
Predicted: 1,710.44 Actual: 6,901.00 Diff: -5,190.56
Predicted: 1,711.26 Actual: 3,011.00 Diff: -1,299.74
Predicted: 1,711.60 Actual: 1,765.00 Diff: -53.40
Predicted: 1,711.57 Actual: 1,679.00 Diff: 32.57
Predicted: 1,711.03 Actual: 2,102.00 Diff: -390.97
Predicted: 1,711.64 Actual: 4,789.00 Diff: -3,077.36
```


```
Predicted: 2,085.14 Actual: 559.00 Diff: 1,526.14
Predicted: 3,381.02 Actual: 2,201.00 Diff: 1,180.02
Predicted: 1,725.17 Actual: 1,238.00 Diff: 487.17
Predicted: 1,914.37 Actual: 1,304.00 Diff: 610.37
Predicted: 15,271.41 Actual: 6,901.00 Diff: 8,370.41
Predicted: 7,173.91 Actual: 3,011.00 Diff: 4,162.91
Predicted: 947.38 Actual: 1,765.00 Diff: -817.62
Predicted: 604.91 Actual: 1,679.00 Diff: -1,074.09
Predicted: 989.06 Actual: 2,102.00 Diff: -1,112.94
Predicted: 9,508.21 Actual: 4,789.00 Diff: 4,719.21
```


In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error

# Load and preprocess the dataset (assuming you have a CSV file)
data = pd.read_csv("../data/diamonds.csv")

# Separate features and target variable
X = data.drop("price", axis=1)
y = data["price"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocess categorical features
cat_columns = ["cut", "color", "clarity"]
X_train_cat = X_train[cat_columns]
X_test_cat = X_test[cat_columns]

label_encoders = {}
for col in cat_columns:
    le = LabelEncoder()
    X_train_cat[col] = le.fit_transform(X_train_cat.loc[:, col])
    X_test_cat[col] = le.transform(X_test_cat.loc[:, col])
    label_encoders[col] = le

# Preprocess numeric features
num_columns = ["carat", "depth", "table", "x", "y", "z"]
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[num_columns])
X_test_num = scaler.transform(X_test[num_columns])


# Convert data to PyTorch tensors
X_train_cat_tensor = torch.tensor(X_train_cat.values, dtype=torch.int64)
X_train_num_tensor = torch.tensor(X_train_num, dtype=torch.float32)
X_test_cat_tensor = torch.tensor(X_test_cat.values, dtype=torch.int64)
X_test_num_tensor = torch.tensor(X_test_num, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)


# Define the neural network model
class DiamondPricePredictor(nn.Module):
    def __init__(self, num_input_dim, cat_embedding_sizes, hidden_dim):
        super(DiamondPricePredictor, self).__init__()

        # Embedding layers for categorical features
        self.embeddings = nn.ModuleList(
            [
                nn.Embedding(num_classes, emb_size)
                for num_classes, emb_size in cat_embedding_sizes
            ]
        )

        total_emb_dim = sum(emb_size for _, emb_size in cat_embedding_sizes)
        self.predictor = nn.Sequential(
            nn.Linear(num_input_dim + total_emb_dim, hidden_dim * 4),
            nn.ReLU(),
            nn.Linear(hidden_dim * 4, hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )
        # self.fc1 = nn.Linear(num_input_dim + total_emb_dim, hidden_dim)
        # self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, num_inputs, cat_inputs):
        embeddings = [
            embedding(cat_inputs[:, i]) for i, embedding in enumerate(self.embeddings)
        ]
        cat_features = torch.cat(embeddings, dim=1)
        x = torch.cat([num_inputs, cat_features], dim=1)

        x = self.predictor(x)
        return x


# Initialize the model
num_input_dim = X_train_num.shape[1]
cat_embedding_sizes = [
    (len(le.classes_), min(50, (len(le.classes_) + 1) // 2))
    for le in label_encoders.values()
]
hidden_dim = 64
simple_model = DiamondPricePredictor(num_input_dim, cat_embedding_sizes, hidden_dim)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(simple_model.parameters(), lr=0.01)

# Training loop
epochs = 2000
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = simple_model(
        X_train_num_tensor,  # [0:batch_size],
        X_train_cat_tensor,  # [0:batch_size],
    )  # Pass numeric and categorical tensors separately
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    if epoch % 200 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():,.2f}")

print("Training complete!")
# Evaluate the model

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cat[col] = le.fit_transform(X_train_cat.loc[:, col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_cat[col] = le.transform(X_test_cat.loc[:, col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cat[col] = le.fit_transform(X_train_cat.loc[:, col])
A value is trying to be set 

Epoch 1/5000, Loss: 31,439,204.00
Epoch 201/5000, Loss: 516,823.56
Epoch 401/5000, Loss: 368,840.97
Epoch 601/5000, Loss: 326,413.62
Epoch 801/5000, Loss: 302,466.31
Epoch 1001/5000, Loss: 279,538.16
Epoch 1201/5000, Loss: 275,397.34
Epoch 1401/5000, Loss: 267,349.38
Epoch 1601/5000, Loss: 258,712.78
Epoch 1801/5000, Loss: 252,671.48
Epoch 2001/5000, Loss: 249,290.19
Epoch 2201/5000, Loss: 253,728.94
Epoch 2401/5000, Loss: 253,446.72
Epoch 2601/5000, Loss: 249,049.94
Epoch 2801/5000, Loss: 243,626.77
Epoch 3001/5000, Loss: 242,147.86
Epoch 3201/5000, Loss: 253,064.92
Epoch 3401/5000, Loss: 233,698.25
Epoch 3601/5000, Loss: 235,164.19
Epoch 3801/5000, Loss: 234,696.61
Epoch 4001/5000, Loss: 231,627.31
Epoch 4201/5000, Loss: 229,746.69
Epoch 4401/5000, Loss: 228,175.61
Epoch 4601/5000, Loss: 234,943.69
Epoch 4801/5000, Loss: 225,193.94
Training complete!


In [26]:
model.eval()
with torch.no_grad():
    test_predictions = model(X_test_num_tensor, X_test_cat_tensor)
    mse = mean_squared_error(y_test_tensor, test_predictions)
    print(f"Mean Squared Error on Test Data: {mse:,.2f}")  # 735,349.69

Mean Squared Error on Test Data: 4,832,811.50


In [27]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(
    f"Number of Params in Tabular Model:{count_parameters(model):,}",
    f"Number of Params in Simple Model:{count_parameters(simple_model):,}",
)

Number of Params in Tabular Model:211,825 Number of Params in Simple Model:45,900


In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Load the diamonds dataset
diamonds_data = pd.read_csv("../data/diamonds.csv")

# Encode categorical features using LabelEncoder
label_encoders = {}
categorical_features = ["cut", "color", "clarity"]
for feature in categorical_features:
    le = LabelEncoder()
    diamonds_data[feature] = le.fit_transform(diamonds_data[feature])
    label_encoders[feature] = le

# Split the dataset into features (X) and target (y)
X = diamonds_data.drop("price", axis=1)
y = diamonds_data["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and train the XGBoost regressor
xgb_regressor = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_regressor.fit(
    X_train,  # <[0:batch_size],
    y_train,  # [0:batch_size],
)

# Predict on the test set
y_pred = xgb_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:,.2f}")

# You can also access feature importance scores
# feature_importances = xgb_regressor.feature_importances_
# print("Feature Importance:")
# for feature, importance in zip(X.columns, feature_importances):
#     print(f"{feature}: {importance:.4f}")

Mean Squared Error: 278,657.75


In [None]:
# %pip install pandas==2.0.3

In [None]:
f"{1.7e6:,.2f}"

'1,700,000.00'