In [1]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# Load the diamonds dataset
df = pd.read_csv(
    "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/diamonds.csv"
)
df = df.sample(1_000, random_state=42)
# Use all columns except for 'price' as features
features = df.drop("price", axis=1)
labels = df["price"]

In [14]:
features_str

1388          0.24 Ideal G VVS1 62.1 56.0 3.97 4.0 2.47
50052    0.58 Very Good F VVS2 60.0 57.0 5.44 5.42 3.26
41645         0.4 Ideal E VVS2 62.1 55.0 4.76 4.74 2.95
42377      0.43 Premium E VVS2 60.8 57.0 4.92 4.89 2.98
17244         1.55 Ideal E SI2 62.3 55.0 7.44 7.37 4.61
                              ...                      
35207          0.33 Ideal I IF 61.6 55.0 4.47 4.46 2.75
15806          1.0 Ideal E SI1 62.4 55.0 6.34 6.42 3.98
45884          0.58 Ideal G VS2 61.1 56.0 5.4 5.43 3.31
22681         0.38 Ideal J VVS2 62.0 55.0 4.67 4.69 2.9
21429       1.18 Premium G VVS2 59.7 58.0 6.94 6.9 4.13
Length: 1000, dtype: object

In [3]:
# Convert features to string and concatenate
features_str = features.applymap(str).apply(lambda row: " ".join(row), axis=1)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize features
tokens = tokenizer(
    features_str.tolist(), padding=True, truncation=True, return_tensors="pt"
)

# Standardize labels (price)
scaler = StandardScaler()
labels_scaled = scaler.fit_transform(labels.values.reshape(-1, 1))

# Convert labels to tensor
labels_tensor = torch.tensor(labels_scaled, dtype=torch.float32)

In [4]:
class BertRegressor(nn.Module):
    def __init__(self):
        super(BertRegressor, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooler_output = outputs.pooler_output
        return self.regressor(pooler_output)

In [5]:
# Create TensorDataset
dataset = TensorDataset(tokens.input_ids, tokens.attention_mask, labels_tensor)

# Split data into training and test sets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [6]:
# Initialize model and optimizer
from tqdm.notebook import tqdm

model = BertRegressor()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Loss function
criterion = nn.MSELoss()

# Training loop
for epoch in range(3):
    print(epoch)  # Number of epochs can be adjusted
    model.train()
    for batch in tqdm(train_loader):
        input_ids, attention_mask, y = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask).squeeze()
        # loss = criterion(outputs, y.squeeze())
        # outputs = model(input_ids, attention_mask).squeeze()
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

0


  0%|          | 0/25 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)


1


  0%|          | 0/25 [00:00<?, ?it/s]

2


  0%|          | 0/25 [00:00<?, ?it/s]

In [15]:
loss

tensor(1.3075, grad_fn=<MseLossBackward0>)

In [10]:
# Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, y = batch
        outputs = model(input_ids, attention_mask).squeeze()
        all_preds.extend(outputs.tolist())
        all_labels.extend(y.tolist())

# Calculate RMSE
rmse = mean_squared_error(all_labels, all_preds, squared=False)
print(f"RMSE: {rmse}")

RMSE: 0.997130391336721


In [11]:
outputs

tensor([0.0906, 0.0755, 0.0768, 0.0922, 0.0760, 0.0631, 0.0727, 0.0883])

In [12]:
y

tensor([[-0.8692],
        [-0.8181],
        [-0.8596],
        [ 2.0640],
        [-0.6108],
        [-0.5678],
        [-0.5316],
        [-0.7376]])

In [13]:
outputs

tensor([0.0906, 0.0755, 0.0768, 0.0922, 0.0760, 0.0631, 0.0727, 0.0883])