In [1]:
# tensorboard --logdir='runs/'

In [2]:
from datetime import datetime as dt
from itertools import chain

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch.nn.init as init

from torch.utils.tensorboard import SummaryWriter

# Load and preprocess the dataset (assuming you have a CSV file)
df = pd.read_csv("../data/diamonds.csv")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [4]:
def initialize_parameters(module):
    if isinstance(module, (nn.Linear, nn.Conv2d)):
        init.xavier_uniform_(module.weight, gain=5)
        if module.bias is not None:
            init.constant_(module.bias, 0.001)
    elif isinstance(module, nn.Embedding):
        init.uniform_(module.weight, -0.4, 0.4)
    elif isinstance(module, nn.LayerNorm):
        init.normal_(module.weight, mean=1, std=0.2)
        init.constant_(module.bias, 0.001)

In [5]:
cat_columns = ["cut", "color", "clarity"]
num_columns = ["carat", "depth", "table", "x", "y", "z"]
cat_values = pd.unique(df[cat_columns].values.ravel("K"))
target_column = "price"
tokens = list(
    chain(
        cat_values,
        cat_columns,
        num_columns,
        ["PAD", "[NUMERIC_MASK]", "[MASK]"],
        [target_column],
    )
)
token_dict = {token: i for i, token in enumerate(tokens)}
token_dict

{'Ideal': 0,
 'Premium': 1,
 'Good': 2,
 'Very Good': 3,
 'Fair': 4,
 'E': 5,
 'I': 6,
 'J': 7,
 'H': 8,
 'F': 9,
 'G': 10,
 'D': 11,
 'SI2': 12,
 'SI1': 13,
 'VS1': 14,
 'VS2': 15,
 'VVS2': 16,
 'VVS1': 17,
 'I1': 18,
 'IF': 19,
 'cut': 20,
 'color': 21,
 'clarity': 22,
 'carat': 23,
 'depth': 24,
 'table': 25,
 'x': 26,
 'y': 27,
 'z': 28,
 'PAD': 29,
 '[NUMERIC_MASK]': 30,
 '[MASK]': 31,
 'price': 32}

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [7]:
rnd_df = df.sample(frac=1, random_state=42)
scaled_df = rnd_df.loc[:, cat_columns + num_columns + [target_column]]
for col in cat_columns:
    scaled_df[col] = scaled_df[col].map(token_dict)

scaler = StandardScaler()
numeric_scaled = scaler.fit_transform(scaled_df[num_columns].copy())
scaled_df[num_columns] = numeric_scaled

X = scaled_df.drop("price", axis=1)
y = scaled_df["price"]
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# Categorical columns
X_train_cat_tensor = torch.tensor(X_train[cat_columns].values, dtype=torch.int32).to(
    device
)
X_test_cat_tensor = torch.tensor(X_test[cat_columns].values, dtype=torch.int32).to(
    device
)
# Numeric columns
X_train_num_tensor = torch.tensor(X_train[num_columns].values, dtype=torch.float32).to(
    device
)
X_test_num_tensor = torch.tensor(X_test[num_columns].values, dtype=torch.float32).to(
    device
)
y_train_tensor = (
    torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)
)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).to(device)

In [8]:
X_train.head(1)

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z
1388,0,10,17,-1.177071,0.244725,-0.652139,-1.570008,-1.518684,-1.514447


In [9]:
X_train_cat_tensor[0], X_train_num_tensor[0], y_train_tensor[0]

(tensor([ 0, 10, 17], dtype=torch.int32),
 tensor([-1.1771,  0.2447, -0.6521, -1.5700, -1.5187, -1.5144]),
 tensor([559.]))

In [10]:
reverse_token_dict = {v: k for k, v in token_dict.items()}

In [11]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_head = d_model // n_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)

        # self.initialize_parameters()
        self.apply(initialize_parameters)

    def forward(self, q, k, v, mask=None, input_feed_forward=False):
        batch_size = q.size(0)

        if input_feed_forward:
            q = (
                self.q_linear(q)
                .view(batch_size, -1, self.n_heads, self.d_head)
                .transpose(1, 2)
            )
            k = (
                self.k_linear(k)
                .view(batch_size, -1, self.n_heads, self.d_head)
                .transpose(1, 2)
            )
            v = (
                self.v_linear(v)
                .view(batch_size, -1, self.n_heads, self.d_head)
                .transpose(1, 2)
            )

        attn_output, _ = self.scaled_dot_product_attention(q, k, v, mask)

        attn_output = (
            attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        )
        out = self.out_linear(attn_output)
        return out

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        matmul_qk = torch.matmul(q, k.transpose(-2, -1))
        d_k = q.size(-1)
        scaled_attention_logits = matmul_qk / (d_k**0.5)

        if mask is not None:
            scaled_attention_logits += mask * -1e9

        attention_weights = F.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, v)

        return output, attention_weights


class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads):
        super(TransformerEncoderLayer, self).__init__()

        self.multi_head_attention = MultiHeadAttention(d_model, n_heads)

        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.ReLU(),
            nn.Linear(4 * d_model, d_model),
        )

        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        # self.initialize_parameters()
        self.apply(initialize_parameters)

    def forward(self, q, k, v, mask=None, input_feed_forward=False):
        attn_output = self.multi_head_attention(q, k, v, mask, input_feed_forward)
        out1 = self.layernorm1(q + attn_output)

        ff_output = self.feed_forward(out1)
        out2 = self.layernorm2(out1 + ff_output)

        return out2


# Parameters
d_model = 64  # Embedding dimension
n_heads = 4  # Number of attention heads
seq_len_q = 10  # Sequence length for the query tensor
seq_len_k = 20  # Sequence length for the key tensor
batch_size = 32  # Batch size

# Random data
q = torch.rand((batch_size, seq_len_q, d_model))
k = torch.rand((batch_size, seq_len_k, d_model))
v = k  # Usually, value and key are the same in many applications

# Model
encoder_layer = TransformerEncoderLayer(d_model, n_heads)

# Forward pass
output = encoder_layer(q, k, v)
print("Output shape:", output.shape)

Output shape: torch.Size([32, 10, 64])


In [12]:
def mask_tensor(tensor, model, probability=0.8):
    if tensor.dtype == torch.float32:
        is_numeric = True
    elif tensor.dtype == torch.int32:
        is_numeric = False
    else:
        raise ValueError(f"Task {tensor.dtype} not supported.")

    tensor = tensor.clone()
    bit_mask = torch.rand(tensor.shape) > probability
    if is_numeric:
        tensor[bit_mask] = torch.tensor(float("-Inf"))
    else:
        tensor[bit_mask] = model.cat_mask_token
    return tensor.to(model.device)

In [13]:
class TabTransformer(nn.Module):
    def __init__(
        self,
        tokens,
        numeric_col_tokens,
        cat_col_tokens,
        token_dict,
        d_model=64,
        n_heads=4,
        device=device,
    ):
        super(TabTransformer, self).__init__()
        self.device = device
        self.d_model = d_model
        self.tokens = tokens
        self.token_dict = token_dict
        self.decoder_dict = {v: k for k, v in token_dict.items()}
        # Masks
        self.cat_mask_token = torch.tensor(self.token_dict["[MASK]"]).to(device)
        self.numeric_mask_token = torch.tensor(self.token_dict["[NUMERIC_MASK]"]).to(
            device
        )

        self.col_tokens = cat_col_tokens + numeric_col_tokens
        self.n_tokens = len(tokens)  # TODO Make this
        # Embedding layers for categorical features
        self.embeddings = nn.Embedding(self.n_tokens, self.d_model).to(device)
        self.n_numeric_cols = len(numeric_col_tokens)
        self.n_cat_cols = len(cat_col_tokens)
        self.n_columns = self.n_numeric_cols + self.n_cat_cols
        # self.numeric_embeddings = NumericEmbedding(d_model=self.d_model)
        self.col_indices = torch.tensor(
            [self.tokens.index(col) for col in self.col_tokens], dtype=torch.long
        ).to(device)
        self.numeric_indices = torch.tensor(
            [self.tokens.index(col) for col in numeric_col_tokens], dtype=torch.long
        ).to(device)
        self.transformer_encoder1 = TransformerEncoderLayer(
            d_model, n_heads=n_heads
        ).to(device)
        self.transformer_encoder2 = TransformerEncoderLayer(
            d_model, n_heads=n_heads
        ).to(device)
        self.regressor = nn.Sequential(
            nn.Linear(d_model, d_model * 2),
            nn.ReLU(),
            nn.Linear(d_model * 2, 1),
            # nn.ReLU(),
        ).to(device)

        self.mlm_decoder = nn.Sequential(nn.Linear(d_model, self.n_tokens)).to(
            device
        )  # TODO try making more complex

        self.mnm_decoder = nn.Sequential(
            nn.Linear(
                self.n_columns * self.d_model, self.d_model * 4
            ),  # Try making more complex
            nn.ReLU(),
            nn.Linear(self.d_model * 4, self.n_numeric_cols),
        ).to(device)

        self.flatten_layer = nn.Linear(len(self.col_tokens), 1).to(device)
        self.apply(initialize_parameters)

    def forward(self, num_inputs, cat_inputs, task="regression"):
        # Embed column indices
        repeated_col_indices = self.col_indices.unsqueeze(0).repeat(
            num_inputs.size(0), 1
        )
        col_embeddings = self.embeddings(repeated_col_indices)

        repeated_numeric_indices = self.numeric_indices.unsqueeze(0).repeat(
            num_inputs.size(0), 1
        )
        numeric_col_embeddings = self.embeddings(repeated_numeric_indices)

        cat_embeddings = self.embeddings(cat_inputs)

        expanded_num_inputs = num_inputs.unsqueeze(2).repeat(1, 1, self.d_model)
        inf_mask = (expanded_num_inputs == float("-inf")).all(dim=2)
        base_numeric = torch.zeros_like(expanded_num_inputs)

        num_embeddings = (
            numeric_col_embeddings[~inf_mask] * expanded_num_inputs[~inf_mask]
        )
        base_numeric[~inf_mask] = num_embeddings
        base_numeric[inf_mask] = self.embeddings(self.numeric_mask_token)

        query_embeddings = torch.cat([cat_embeddings, base_numeric], dim=1)
        out = self.transformer_encoder1(
            col_embeddings,
            # query_embeddings,
            query_embeddings,
            query_embeddings
            # col_embeddings, query_embeddings, query_embeddings
        )
        out = self.transformer_encoder2(out, out, out)

        if task == "regression":
            out = self.regressor(out)
            out = self.flatten_layer(out.squeeze(-1))

            return out
        elif task == "mlm":
            cat_out = self.mlm_decoder(out)
            # print(f"Out shape: {out.shape}, cat_out shape: {cat_out.shape}")
            numeric_out = out.view(out.size(0), -1)
            # print(f"numeric_out shape: {numeric_out.shape}")
            numeric_out = self.mnm_decoder(numeric_out)
            return cat_out, numeric_out
        else:
            raise ValueError(f"Task {task} not supported.")


no_price_tokens = tokens.copy()
no_price_tokens.remove("price")

numeric_col_tokens = (
    df.head().drop("price", axis=1).select_dtypes(include=np.number).columns.to_list()
)
cat_col_tokens = df.head().select_dtypes(exclude=np.number).columns.to_list()

model = TabTransformer(
    no_price_tokens,
    numeric_col_tokens=numeric_col_tokens,
    cat_col_tokens=cat_col_tokens,
    token_dict=token_dict,
).to(device)
batch_size = 3
test_num = X_train_num_tensor[0:batch_size, :]
test_num_mask = mask_tensor(test_num, model)
test_cat = X_train_cat_tensor[0:batch_size, :]
test_cat_mask = mask_tensor(test_cat, model)
with torch.no_grad():
    x = model(
        test_num_mask,
        test_cat_mask,
        task="mlm",
    )
x[0].shape, x[1].shape

(torch.Size([3, 9, 32]), torch.Size([3, 6]))

In [14]:
[model.decoder_dict[i.item()] for i in X_train_cat_tensor[0 : 0 + 1, :][0]]

['Ideal', 'G', 'VVS1']

In [15]:
def show_mask_pred(i, model, probability):
    numeric_values = X_train_num_tensor[i : i + 1, :]
    categorical_values = X_train_cat_tensor[i : i + 1, :]
    numeric_masked = mask_tensor(numeric_values, model, probability=probability)
    categorical_masked = mask_tensor(categorical_values, model, probability=probability)
    # Predictions
    with torch.no_grad():
        cat_preds, numeric_preds = model(numeric_masked, categorical_masked, task="mlm")
    # Get the predicted tokens from cat_preds
    cat_preds = cat_preds.argmax(dim=2)
    # Get the words from the tokens
    decoder_dict = model.decoder_dict
    cat_preds = [decoder_dict[i.item()] for i in cat_preds[0]]

    results_dict = {k: cat_preds[i] for i, k in enumerate(model.col_tokens)}
    for i, k in enumerate(model.col_tokens[model.n_cat_cols :]):
        results_dict[k] = numeric_preds[0][i].item()
    # Get the masked values
    categorical_masked = [decoder_dict[i.item()] for i in categorical_masked[0]]
    numeric_masked = numeric_masked[0].tolist()
    masked_values = categorical_masked + numeric_masked
    # zip the masked values with the column names
    masked_values = dict(zip(model.col_tokens, masked_values))
    # Get the original values
    categorical_values = [decoder_dict[i.item()] for i in categorical_values[0]]
    numeric_values = numeric_values[0].tolist()
    original_values = categorical_values + numeric_values
    # zip the original values with the column names
    original_values = dict(zip(model.col_tokens, original_values))
    # print(numeric_masked)
    # print(categorical_masked)
    result_dict = {
        "actual": original_values,
        "masked": masked_values,
        "pred": results_dict,
    }

    return result_dict


show_mask_pred(0, model, 0.8)

{'actual': {'cut': 'Ideal',
  'color': 'G',
  'clarity': 'VVS1',
  'carat': -1.1770710945129395,
  'depth': 0.2447250783443451,
  'table': -0.6521385312080383,
  'x': -1.5700081586837769,
  'y': -1.5186843872070312,
  'z': -1.5144472122192383},
 'masked': {'cut': 'Ideal',
  'color': 'G',
  'clarity': '[MASK]',
  'carat': -1.1770710945129395,
  'depth': -inf,
  'table': -0.6521385312080383,
  'x': -1.5700081586837769,
  'y': -1.5186843872070312,
  'z': -inf},
 'pred': {'cut': 'E',
  'color': 'J',
  'clarity': 'IF',
  'carat': -26.44237518310547,
  'depth': -0.8387407660484314,
  'table': 0.017967773601412773,
  'x': 39.614593505859375,
  'y': 66.12740325927734,
  'z': -68.3674545288086}}

In [16]:
# Masked Tabualr Modeling
base_model_name = "BetterPreTrain"

model_time = dt.now()
model_time = model_time.strftime("%Y-%m-%dT%H:%M:%S")
model_name = f"{base_model_name}_{model_time}"

In [17]:
# Masked Tabualr Modeling
epochs = 100
batch_size = 1000
lr = 0.001
mse_loss = nn.MSELoss()
ce_loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

summary_writer = SummaryWriter("runs/" + model_name)

batch_count = 0
model.train()
for epoch in range(epochs):
    for i in range(0, X_train_num_tensor.size(0), batch_size):
        numeric_values = X_train_num_tensor[i : i + batch_size, :]
        categorical_values = X_train_cat_tensor[i : i + batch_size, :]
        numeric_masked = mask_tensor(numeric_values, model, probability=0.8)
        categorical_masked = mask_tensor(categorical_values, model, probability=0.8)
        optimizer.zero_grad()
        cat_preds, numeric_preds = model(numeric_masked, categorical_masked, task="mlm")
        cat_targets = torch.cat(
            (
                categorical_values,
                model.numeric_indices.expand(categorical_values.size(0), -1),
            ),
            dim=1,
        )

        cat_preds = cat_preds.permute(0, 2, 1)  # TODO investigate as possible bug
        # print(
        #     f"cat_preds.shape: {cat_preds.shape}, cat_targets.shape: {cat_targets.shape}"
        # )
        cat_loss = ce_loss(cat_preds, cat_targets)
        numeric_loss = mse_loss(numeric_preds, numeric_values)
        loss = cat_loss + numeric_loss  # TODO Look at scaling
        loss.backward()
        optimizer.step()
        batch_count += 1
        learning_rate = optimizer.param_groups[0]["lr"]
        summary_writer.add_scalar("LossTrain/agg_mask", loss.item(), batch_count)
        summary_writer.add_scalar("LossTrain/mlm_loss", cat_loss.item(), batch_count)
        summary_writer.add_scalar(
            "LossTrain/mnm_loss", numeric_loss.item(), batch_count
        )
        summary_writer.add_scalar("Metrics/mtm_lr", learning_rate, batch_count)
        if batch_count % 100 == 0:
            print(f"Epoch {epoch+1}/{epochs} Loss: {loss.item():,.4f}")
            # Test set
            with torch.no_grad():
                numeric_values = X_test_num_tensor
                categorical_values = X_test_cat_tensor
                numeric_masked = mask_tensor(numeric_values, model, probability=0.8)
                categorical_masked = mask_tensor(
                    categorical_values, model, probability=0.8
                )
                optimizer.zero_grad()
                cat_preds, numeric_preds = model(
                    numeric_masked, categorical_masked, task="mlm"
                )
                cat_targets = torch.cat(
                    (
                        categorical_values,
                        model.numeric_indices.expand(categorical_values.size(0), -1),
                    ),
                    dim=1,
                )

                cat_preds = cat_preds.permute(0, 2, 1)
                # print(
                #     f"cat_preds.shape: {cat_preds.shape}, cat_targets.shape: {cat_targets.shape}"
                # )
                cat_loss = ce_loss(cat_preds, cat_targets)
                numeric_loss = mse_loss(numeric_preds, numeric_values)
                loss = cat_loss + numeric_loss
                summary_writer.add_scalar("LossTest/agg_loss", loss.item(), batch_count)
            summary_writer.add_scalar("LossTest/mlm_loss", cat_loss.item(), batch_count)
            summary_writer.add_scalar(
                "LossTest/mnm_loss", numeric_loss.item(), batch_count
            )

Epoch 3/100 Loss: 14.4455
Epoch 5/100 Loss: 5.9586
Epoch 7/100 Loss: 3.5236
Epoch 10/100 Loss: 2.4071
Epoch 12/100 Loss: 1.9122
Epoch 14/100 Loss: 1.5690
Epoch 16/100 Loss: 1.3454
Epoch 19/100 Loss: 1.1029
Epoch 21/100 Loss: 1.0029
Epoch 23/100 Loss: 0.9342
Epoch 25/100 Loss: 0.8361
Epoch 28/100 Loss: 0.8184
Epoch 30/100 Loss: 0.7799
Epoch 32/100 Loss: 0.7069
Epoch 35/100 Loss: 0.6857
Epoch 37/100 Loss: 0.6665
Epoch 39/100 Loss: 0.6344
Epoch 41/100 Loss: 0.6081
Epoch 44/100 Loss: 0.5640
Epoch 46/100 Loss: 0.5319
Epoch 48/100 Loss: 0.5379
Epoch 50/100 Loss: 0.5033
Epoch 53/100 Loss: 0.5069
Epoch 55/100 Loss: 0.5129
Epoch 57/100 Loss: 0.4745
Epoch 60/100 Loss: 0.4816
Epoch 62/100 Loss: 0.4588
Epoch 64/100 Loss: 0.4521
Epoch 66/100 Loss: 0.4237
Epoch 69/100 Loss: 0.3926
Epoch 71/100 Loss: 0.3906
Epoch 73/100 Loss: 0.4563
Epoch 75/100 Loss: 0.3339
Epoch 78/100 Loss: 0.3578
Epoch 80/100 Loss: 0.3725
Epoch 82/100 Loss: 0.4078
Epoch 85/100 Loss: 0.3512
Epoch 87/100 Loss: 0.3696
Epoch 89/100 L

In [18]:
show_mask_pred(
    0, model, 0.8
)  # Check for learning... XFKAT # Why is color `GOOD`????? TODO

{'actual': {'cut': 'Ideal',
  'color': 'G',
  'clarity': 'VVS1',
  'carat': -1.1770710945129395,
  'depth': 0.2447250783443451,
  'table': -0.6521385312080383,
  'x': -1.5700081586837769,
  'y': -1.5186843872070312,
  'z': -1.5144472122192383},
 'masked': {'cut': 'Ideal',
  'color': 'G',
  'clarity': 'VVS1',
  'carat': -1.1770710945129395,
  'depth': 0.2447250783443451,
  'table': -0.6521385312080383,
  'x': -1.5700081586837769,
  'y': -1.5186843872070312,
  'z': -1.5144472122192383},
 'pred': {'cut': 'Ideal',
  'color': 'G',
  'clarity': 'VVS1',
  'carat': -1.3263070583343506,
  'depth': 0.4353136718273163,
  'table': -0.547584056854248,
  'x': -1.5333627462387085,
  'y': -2.008849620819092,
  'z': -2.2112772464752197}}

Finetuning the previous model seems to work but when I pre-train, we run into issues. Let's try again


In [19]:
# Regression Model
epochs = 40
batch_size = 1000
lr = 0.1
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# model_time = dt.now()
# model_time = model_time.strftime("%Y-%m-%dT%H:%M:%S")
# model_name = f"FullDSNFT{model_time}"

summary_writer = SummaryWriter("runs/" + model_name)

small_test = True
train_set_size = 1000  # X_train_num_tensor.size(0)
batch_count = 0
model.train()
for epoch in range(epochs):
    for i in range(0, train_set_size, batch_size):
        # for i in range(0, batch_size, batch_size):
        num_inputs = X_train_num_tensor[i : i + batch_size, :]
        cat_inputs = X_train_cat_tensor[i : i + batch_size, :]
        optimizer.zero_grad()
        y_pred = model(num_inputs, cat_inputs)
        loss = loss_fn(y_pred, y_train_tensor[i : i + batch_size, :])
        loss.backward()
        optimizer.step()
        batch_count += 1
        learning_rate = optimizer.param_groups[0]["lr"]
        summary_writer.add_scalar("LossTrain/regression_loss", loss.item(), batch_count)
        summary_writer.add_scalar("Metrics/regression_lr", learning_rate, batch_count)

    # Test set
    with torch.no_grad():
        y_pred = model(X_test_num_tensor, X_test_cat_tensor)
        loss = loss_fn(y_pred, y_test_tensor)
        summary_writer.add_scalar("LossTest/regression_loss", loss.item(), batch_count)
        print(
            f"Epoch {epoch+1}/{epochs} Loss: {loss.item():,.2f} "
            + f"Test loss: {loss.item():,.2f}"
        )

Epoch 3/40 Loss: 421,813.50 Test loss: 421,813.50
Epoch 5/40 Loss: 349,646.06 Test loss: 349,646.06
Epoch 7/40 Loss: 337,694.91 Test loss: 337,694.91
Epoch 10/40 Loss: 349,826.78 Test loss: 349,826.78
Epoch 12/40 Loss: 328,518.94 Test loss: 328,518.94
Epoch 14/40 Loss: 310,538.12 Test loss: 310,538.12
Epoch 16/40 Loss: 325,130.81 Test loss: 325,130.81
Epoch 19/40 Loss: 313,784.94 Test loss: 313,784.94
Epoch 21/40 Loss: 389,621.28 Test loss: 389,621.28
Epoch 23/40 Loss: 322,446.25 Test loss: 322,446.25
Epoch 25/40 Loss: 309,423.19 Test loss: 309,423.19
Epoch 28/40 Loss: 318,806.47 Test loss: 318,806.47
Epoch 30/40 Loss: 403,155.91 Test loss: 403,155.91
Epoch 32/40 Loss: 358,183.34 Test loss: 358,183.34
Epoch 35/40 Loss: 315,748.34 Test loss: 315,748.34
Epoch 37/40 Loss: 309,868.69 Test loss: 309,868.69
Epoch 39/40 Loss: 317,470.12 Test loss: 317,470.12


In [20]:
with torch.no_grad():
    y_pred = model(X_test_num_tensor[0:10, :], X_test_cat_tensor[0:10, :])
    loss = loss_fn(y_pred, y_test_tensor[0:10])
    print(f"Test loss: {loss.item():,.2f}")

Test loss: 290,563.16


In [21]:
for i in range(10):
    print(
        f"Predicted: {y_pred[i].item():,.2f} Actual: {y_test_tensor[i].item():,.2f}",
        f"Diff: {y_pred[i].item() - y_test_tensor[i].item():,.2f}",
    )

Predicted: 1,780.74 Actual: 1,754.00 Diff: 26.74
Predicted: 6,250.25 Actual: 6,927.00 Diff: -676.75
Predicted: 1,176.17 Actual: 1,264.00 Diff: -87.83
Predicted: 2,540.79 Actual: 2,278.00 Diff: 262.79
Predicted: 3,224.54 Actual: 2,858.00 Diff: 366.54
Predicted: 6,740.07 Actual: 8,133.00 Diff: -1,392.93
Predicted: 813.26 Actual: 840.00 Diff: -26.74
Predicted: 16,256.74 Actual: 16,792.00 Diff: -535.26
Predicted: 827.36 Actual: 815.00 Diff: 12.36
Predicted: 643.58 Actual: 734.00 Diff: -90.42


In [22]:
1 / 0

ZeroDivisionError: division by zero

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error

# Load and preprocess the dataset (assuming you have a CSV file)
data = pd.read_csv("../data/diamonds.csv")

# Separate features and target variable
X = data.drop("price", axis=1)
y = data["price"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocess categorical features
cat_columns = ["cut", "color", "clarity"]
X_train_cat = X_train[cat_columns]
X_test_cat = X_test[cat_columns]

label_encoders = {}
for col in cat_columns:
    le = LabelEncoder()
    X_train_cat[col] = le.fit_transform(X_train_cat.loc[:, col])
    X_test_cat[col] = le.transform(X_test_cat.loc[:, col])
    label_encoders[col] = le

# Preprocess numeric features
num_columns = ["carat", "depth", "table", "x", "y", "z"]
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[num_columns])
X_test_num = scaler.transform(X_test[num_columns])


# Convert data to PyTorch tensors
X_train_cat_tensor = torch.tensor(X_train_cat.values, dtype=torch.int64)
X_train_num_tensor = torch.tensor(X_train_num, dtype=torch.float32)
X_test_cat_tensor = torch.tensor(X_test_cat.values, dtype=torch.int64)
X_test_num_tensor = torch.tensor(X_test_num, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)


# Define the neural network model
class DiamondPricePredictor(nn.Module):
    def __init__(self, num_input_dim, cat_embedding_sizes, hidden_dim):
        super(DiamondPricePredictor, self).__init__()

        # Embedding layers for categorical features
        self.embeddings = nn.ModuleList(
            [
                nn.Embedding(num_classes, emb_size)
                for num_classes, emb_size in cat_embedding_sizes
            ]
        )

        total_emb_dim = sum(emb_size for _, emb_size in cat_embedding_sizes)
        self.predictor = nn.Sequential(
            nn.Linear(num_input_dim + total_emb_dim, hidden_dim * 4),
            nn.ReLU(),
            nn.Linear(hidden_dim * 4, hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )
        # self.fc1 = nn.Linear(num_input_dim + total_emb_dim, hidden_dim)
        # self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, num_inputs, cat_inputs):
        embeddings = [
            embedding(cat_inputs[:, i]) for i, embedding in enumerate(self.embeddings)
        ]
        cat_features = torch.cat(embeddings, dim=1)
        x = torch.cat([num_inputs, cat_features], dim=1)

        x = self.predictor(x)
        return x


# Initialize the model
num_input_dim = X_train_num.shape[1]
cat_embedding_sizes = [
    (len(le.classes_), min(50, (len(le.classes_) + 1) // 2))
    for le in label_encoders.values()
]
hidden_dim = 64
simple_model = DiamondPricePredictor(num_input_dim, cat_embedding_sizes, hidden_dim)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(simple_model.parameters(), lr=0.01)

# Training loop
epochs = 2000
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = simple_model(
        X_train_num_tensor,  # [0:batch_size],
        X_train_cat_tensor,  # [0:batch_size],
    )  # Pass numeric and categorical tensors separately
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    if epoch % 200 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():,.2f}")

print("Training complete!")
# Evaluate the model

Epoch 1/2000, Loss: 31,439,220.00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cat[col] = le.fit_transform(X_train_cat.loc[:, col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_cat[col] = le.transform(X_test_cat.loc[:, col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_cat[col] = le.fit_transform(X_train_cat.loc[:, col])
A value is trying to be set 

Epoch 201/2000, Loss: 669,738.62
Epoch 401/2000, Loss: 439,377.12


KeyboardInterrupt: 

In [None]:
model.eval()
with torch.no_grad():
    test_predictions = model(X_test_num_tensor, X_test_cat_tensor)
    mse = mean_squared_error(y_test_tensor, test_predictions)
    print(f"Mean Squared Error on Test Data: {mse:,.2f}")  # 735,349.69

Mean Squared Error on Test Data: 30,910,758.00


In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(
    f"Number of Params in Tabular Model:{count_parameters(model):,}",
    f"Number of Params in Simple Model:{count_parameters(simple_model):,}",
)

Number of Params in Tabular Model:261,809 Number of Params in Simple Model:45,900


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Load the diamonds dataset
diamonds_data = pd.read_csv("../data/diamonds.csv")

# Encode categorical features using LabelEncoder
label_encoders = {}
categorical_features = ["cut", "color", "clarity"]
for feature in categorical_features:
    le = LabelEncoder()
    diamonds_data[feature] = le.fit_transform(diamonds_data[feature])
    label_encoders[feature] = le

# Split the dataset into features (X) and target (y)
X = diamonds_data.drop("price", axis=1)
y = diamonds_data["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and train the XGBoost regressor
xgb_regressor = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_regressor.fit(
    X_train,  # <[0:batch_size],
    y_train,  # [0:batch_size],
)

# Predict on the test set
y_pred = xgb_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:,.2f}")

# You can also access feature importance scores
# feature_importances = xgb_regressor.feature_importances_
# print("Feature Importance:")
# for feature, importance in zip(X.columns, feature_importances):
#     print(f"{feature}: {importance:.4f}")

Mean Squared Error: 278,657.75


In [None]:
# %pip install pandas==2.0.3

In [None]:
f"{1.7e6:,.2f}"

'1,700,000.00'