In [1]:
# Train a transformer on 1d regression

In [2]:
import transformers
import torch

In [61]:
# Create a transformer model on numerical data
# So no embedding layers
# Use GPT architecture

# Initializing a GPT2 configuration
configuration = transformers.GPT2Config()

# Remove the embedding layer
configuration.n_embd = 128
configuration.n_layer = 3
configuration.n_head = 4

# Initializing a model (with random weights) from the configuration
model = transformers.GPT2Model(configuration)

# Add a linear layer
model.lm_head = torch.nn.Linear(128, 1)

# Replace the embedding layer by a linear layer
model.wte = torch.nn.Linear(1, 128)

# Make positional encoding 1d
model.wpe = torch.nn.Embedding(100, 128)


In [62]:
# Create a 1d regression dataset, with different slopes for each sequence

# Create a dataset with 1000 sequences of length 100
# Each sequence has a different slope
# The slope is a random number between 0 and 1
import numpy as np

n_sequences = 1000
seq_len = 100
slopes = np.random.rand(n_sequences)
intercepts = np.zeros(n_sequences)

x = np.array([np.arange(seq_len) for i in range(n_sequences)])
y = slopes[:,np.newaxis]*x + intercepts[:,np.newaxis]

# shuffle the sequences
idx = np.random.permutation(seq_len)
x = x[:,idx]
y = y[:,idx]


# Convert into a torch dataset
import torch

x = torch.tensor(x, dtype=torch.float32).reshape(-1, seq_len, 1)
y = torch.tensor(y, dtype=torch.float32).reshape(-1, seq_len, 1)

# Create a dataloader
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(x, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [63]:
# Train the model

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Define the loss function
loss_fn = torch.nn.MSELoss()

# Train the model
for epoch in range(100):
    for batch in dataloader:
        x, y = batch
        optimizer.zero_grad()
        print(x.shape)
        output = model(x)[0]
        loss = loss_fn(output, y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch}: {loss.item()}")
    

torch.Size([32, 100, 1])


RuntimeError: The size of tensor a (1024) must match the size of tensor b (3200) at non-singleton dimension 3