In [1]:
%pip install datasets torch==2.1.0 torchtext==0.16.0 numpy

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting torch==2.1.0
  Downloading torch-2.1.0-cp311-cp311-win_amd64.whl.metadata (25 kB)
Collecting torchtext==0.16.0
  Downloading torchtext-0.16.0-cp311-cp311-win_amd64.whl.metadata (7.5 kB)
Collecting torchdata==0.7.0 (from torchtext==0.16.0)
  Downloading torchdata-0.7.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch==2.1.0)
  Using cached fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading torch-2.1.0-cp311-cp311-win_amd64

In [2]:
from datasets import load_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torch

# Load the natural-reasoning corpus via `datasets.load_dataset`
DATASET_ID = "facebook/natural_reasoning"
dataset = load_dataset(DATASET_ID)

# Format data: question -> reference_answer
def format_data(example):
    """Build the prompt/target text pair for a single dataset record.

    Args:
        example: Mapping that provides ``question`` and ``reference_answer``.

    Returns:
        dict with two keys: ``input`` (the question prefixed with
        ``"Question: "``) and ``output`` (the reference answer prefixed with
        ``"Answer: "``).
    """
    question = example["question"]
    answer = example["reference_answer"]
    return dict(input=f"Question: {question}", output=f"Answer: {answer}")

# Apply format_data to every record of the "train" split; the resulting
# records expose the "input" and "output" text fields used below.
train_data = dataset["train"].map(format_data)

# Tokenizer and vocabulary
# The same "basic_english" tokenizer is used both to build the vocabulary
# (yield_tokens) and to encode text (text_to_tensor).
tokenizer = get_tokenizer("basic_english")

def yield_tokens(data):
    """Yield one token list per text field, for vocabulary construction.

    For every record in ``data`` the tokenised ``input`` is yielded first,
    followed by the tokenised ``output``. Tokenisation uses the module-level
    ``tokenizer``.
    """
    for record in data:
        for field in ("input", "output"):
            yield tokenizer(record[field])

# Special tokens are listed explicitly; the vocabulary is capped at 10000 entries.
SPECIAL_TOKENS = ["<unk>", "<pad>", "<sos>", "<eos>"]
vocab = build_vocab_from_iterator(
    yield_tokens(train_data),
    specials=SPECIAL_TOKENS,
    max_tokens=10000,
)
# Tokens missing from the vocabulary resolve to the <unk> index.
unk_index = vocab["<unk>"]
vocab.set_default_index(unk_index)

# Convert text to tensors
def text_to_tensor(text, vocab, max_len=100):
    """Tokenise ``text`` and encode it as a 1-D tensor of vocabulary ids.

    The token sequence is wrapped in ``<sos>`` / ``<eos>`` markers and the
    tokens in between are truncated so the result never holds more than
    ``max_len`` ids, markers included.

    Args:
        text: Raw string to encode; it is split with the module-level
            ``tokenizer``.
        vocab: Token-to-index mapping that supports ``vocab[token]`` lookups.
        max_len: Upper bound on the length of the returned tensor, counting
            the two markers. Must be at least 2.

    Returns:
        ``torch.Tensor`` of dtype ``torch.long`` and shape ``(n,)`` with
        ``2 <= n <= max_len``.

    Raises:
        ValueError: If ``max_len`` is smaller than 2. Without this check the
            slice bound ``max_len - 2`` turns negative and would silently drop
            tokens from the end instead of truncating.
    """
    if max_len < 2:
        raise ValueError(
            f"max_len must be >= 2 to fit <sos> and <eos>, got {max_len}"
        )
    token_budget = max_len - 2  # room left once both markers are accounted for
    tokens = ["<sos>"] + tokenizer(text)[:token_budget] + ["<eos>"]
    return torch.tensor([vocab[token] for token in tokens], dtype=torch.long)

# Encode each record's question and answer in a single pass over the dataset.
train_inputs, train_outputs = [], []
for ex in train_data:
    train_inputs.append(text_to_tensor(ex["input"], vocab))
    train_outputs.append(text_to_tensor(ex["output"], vocab))

# Verify
first_example = train_data[0]
print("Sample input:", first_example["input"])
print("Sample output:", first_example["output"])
print("Vocab size:", len(vocab))

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 1145824/1145824 [00:11<00:00, 103652.52 examples/s]
Map: 100%|██████████| 1145824/1145824 [01:14<00:00, 15363.63 examples/s]


Sample input: Question: What is the total work done on an object when it is moved upwards against gravity, considering both the change in kinetic energy and potential energy? Use the Work-Energy Theorem and the principle of conservation of mechanical energy to derive your answer.
Sample output: Answer: W = delta ME = delta KE + delta PE
Vocab size: 10000


In [3]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

class ReasoningDataset(Dataset):
    """Map-style dataset that pairs encoded inputs with encoded outputs.

    Item ``i`` is the tuple ``(inputs[i], outputs[i])``; the dataset length is
    the length of ``inputs``.

    Args:
        inputs: Indexable, sized collection of encoded source sequences.
        outputs: Indexable, sized collection of encoded target sequences,
            aligned with ``inputs`` by position.

    Raises:
        ValueError: If ``inputs`` and ``outputs`` differ in length, which
            would otherwise only surface later as an ``IndexError`` (or as
            silently ignored targets) during iteration.
    """

    def __init__(self, inputs, outputs):
        if len(inputs) != len(outputs):
            raise ValueError(
                f"inputs and outputs must have the same length, "
                f"got {len(inputs)} and {len(outputs)}"
            )
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        """Return the number of (input, output) pairs."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, output)`` pair stored at position ``idx``."""
        return self.inputs[idx], self.outputs[idx]

def collate_fn(batch):
    """Pad a list of ``(input, output)`` tensor pairs into two batch tensors.

    Both groups of sequences are right-padded, batch-first, with the index of
    ``<pad>`` taken from the module-level ``vocab``.

    Returns:
        Tuple ``(inputs, outputs)`` of padded tensors.
    """
    pad_id = vocab["<pad>"]
    src_seqs, tgt_seqs = zip(*batch)
    return (
        pad_sequence(src_seqs, batch_first=True, padding_value=pad_id),
        pad_sequence(tgt_seqs, batch_first=True, padding_value=pad_id),
    )

# Use a dedicated name for the tensor dataset: rebinding `dataset` here would
# clobber the Hugging Face dataset loaded earlier, and re-running the
# preprocessing cell (`dataset["train"]`) would then fail.
train_dataset = ReasoningDataset(train_inputs, train_outputs)
dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

In [4]:
import torch.nn as nn

class Seq2SeqModel(nn.Module):
    """GRU encoder-decoder that shares one embedding table between both sides.

    Args:
        vocab_size: Number of rows in the embedding and number of output logits.
        embed_dim: Embedding width.
        hidden_dim: GRU hidden size for encoder and decoder.
        num_layers: Number of stacked GRU layers, applied to encoder and
            decoder alike so the encoder's final hidden state can seed the
            decoder. Defaults to 2.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.encoder = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.decoder = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Encode ``src`` and decode step by step against ``tgt``.

        Args:
            src: Batch-first tensor of source token ids, ``(batch, src_len)``.
            tgt: Batch-first tensor of target token ids, ``(batch, tgt_len)``;
                column 0 is fed to the decoder as its first input.
            teacher_forcing_ratio: Probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead of
                the model's own argmax prediction.

        Returns:
            Tensor of shape ``(batch, tgt_len, vocab_size)`` holding logits for
            steps ``1 .. tgt_len - 1``; the slice at step 0 is left as zeros.
        """
        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        vocab_size = self.fc.out_features

        # Encode: only the final hidden state is carried over to the decoder.
        _, hidden = self.encoder(self.embedding(src))

        # Decode: allocate the logits buffer directly on the target device.
        outputs = torch.zeros(batch_size, tgt_len, vocab_size, device=src.device)
        step_input = tgt[:, 0]  # <sos>

        for t in range(1, tgt_len):
            embedded = self.embedding(step_input).unsqueeze(1)
            dec_output, hidden = self.decoder(embedded, hidden)
            logits = self.fc(dec_output.squeeze(1))
            outputs[:, t, :] = logits
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            step_input = tgt[:, t] if teacher_force else logits.argmax(1)

        return outputs

# Build the model with its default sizes and report the total parameter count
model = Seq2SeqModel(vocab_size=len(vocab))
param_count = sum(p.numel() for p in model.parameters())
print(f"Parameters: {param_count}")

Parameters: 5232400


In [None]:
import torch.nn as nn

class Seq2SeqModel(nn.Module):
    """Single-layer GRU encoder-decoder sharing one embedding table.

    Args:
        vocab_size: Number of rows in the embedding and number of output logits.
        embed_dim: Embedding width (reduced from 128).
        hidden_dim: GRU hidden size (reduced from 256).
        pad_idx: Optional id of the padding token. When given, the padding
            embedding is fixed at zero and the encoder skips padded positions,
            so the encoded state of a sequence no longer depends on how much
            padding its batch added. ``None`` (default) keeps the plain
            behaviour of running the encoder over every position.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, pad_idx=None):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.encoder = nn.GRU(embed_dim, hidden_dim, num_layers=1, batch_first=True)  # Reduced to 1 layer
        self.decoder = nn.GRU(embed_dim, hidden_dim, num_layers=1, batch_first=True)  # Reduced to 1 layer
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def _encode(self, src):
        """Return the encoder's final hidden state for ``src``."""
        embedded = self.embedding(src)
        if self.pad_idx is None:
            _, hidden = self.encoder(embedded)
            return hidden
        # Count real tokens per row; assumes padding only trails the sequence
        # (as produced by pad_sequence). Lengths must be >= 1 and live on CPU.
        lengths = (src != self.pad_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, hidden = self.encoder(packed)
        return hidden

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Encode ``src`` and decode step by step against ``tgt``.

        Args:
            src: Batch-first tensor of source token ids, ``(batch, src_len)``.
            tgt: Batch-first tensor of target token ids, ``(batch, tgt_len)``;
                column 0 is fed to the decoder as its first input.
            teacher_forcing_ratio: Probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead of
                the model's own argmax prediction.

        Returns:
            Tensor of shape ``(batch, tgt_len, vocab_size)`` holding logits for
            steps ``1 .. tgt_len - 1``; the slice at step 0 is left as zeros.
        """
        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        vocab_size = self.fc.out_features

        # Encode
        hidden = self._encode(src)

        # Decode: allocate the logits buffer directly on the target device.
        outputs = torch.zeros(batch_size, tgt_len, vocab_size, device=src.device)
        step_input = tgt[:, 0]  # <sos>

        for t in range(1, tgt_len):
            embedded = self.embedding(step_input).unsqueeze(1)
            dec_output, hidden = self.decoder(embedded, hidden)
            logits = self.fc(dec_output.squeeze(1))
            outputs[:, t, :] = logits
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            step_input = tgt[:, t] if teacher_force else logits.argmax(1)

        return outputs

# Instantiate the reduced-size model. This rebinds `model`, replacing any
# instance created by an earlier cell.
vocab_size = len(vocab)  # 10,000 for the vocabulary built above (see the printed "Vocab size")
model = Seq2SeqModel(vocab_size=vocab_size, embed_dim=64, hidden_dim=128)
# Report the total number of parameters across all model tensors.
print(f"Parameters: {sum(p.numel() for p in model.parameters())}")

Parameters: 2078992


In [6]:
device = torch.device("cpu")
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Padded target positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<pad>"])

epochs = 10
checkpoint_path = "reasoning_model.pt"

try:
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            optimizer.zero_grad()
            output = model(src, tgt)
            # Step 0 holds no prediction (it corresponds to <sos>), so it is
            # dropped from both the logits and the targets before flattening.
            output = output[:, 1:, :].reshape(-1, output.size(-1))
            targets = tgt[:, 1:].reshape(-1)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")
        # Checkpoint after every epoch so finished epochs survive a later failure.
        torch.save(model.state_dict(), checkpoint_path)
except KeyboardInterrupt:
    # A manual stop should not throw away the weights learned so far: persist
    # them, then let the interrupt propagate as usual.
    torch.save(model.state_dict(), checkpoint_path)
    print(f"Training interrupted; current weights saved to {checkpoint_path}")
    raise

# Save model
torch.save(model.state_dict(), checkpoint_path)

KeyboardInterrupt: 

In [None]:
def generate_answer(question, model, vocab, tokenizer, max_len=50):
    """Greedily decode an answer for ``question``.

    The question is prefixed with ``"Question: "``, encoded with
    ``text_to_tensor`` and passed through the model's encoder; the decoder is
    then unrolled from ``<sos>``, always feeding back its own argmax
    prediction, until ``<eos>`` is produced or ``max_len`` tokens have been
    generated.

    Args:
        question: Question text, without the ``"Question: "`` prefix.
        model: Module exposing ``embedding``, ``encoder``, ``decoder`` and
            ``fc``. It is switched to eval mode and left in that mode.
        vocab: Vocabulary supporting ``vocab[token]`` and ``lookup_tokens``.
        tokenizer: Not used by this function (``text_to_tensor`` tokenises
            with the module-level tokenizer); kept so existing calls work.
        max_len: Maximum number of tokens to generate.

    Returns:
        The generated tokens joined by single spaces, without ``<sos>`` and
        without the terminating ``<eos>``. Every generated token is kept when
        decoding stops because ``max_len`` was reached.
    """
    model.eval()
    # Run on whatever device the model's weights live on.
    model_device = next(model.parameters()).device
    input_text = f"Question: {question}"
    src = text_to_tensor(input_text, vocab, max_len=100).unsqueeze(0).to(model_device)

    sos_id = vocab["<sos>"]
    eos_id = vocab["<eos>"]
    generated = []

    with torch.no_grad():
        _, hidden = model.encoder(model.embedding(src))
        step_input = torch.tensor([sos_id], dtype=torch.long, device=model_device)

        for _ in range(max_len):
            embedded = model.embedding(step_input).unsqueeze(1)
            dec_output, hidden = model.decoder(embedded, hidden)
            pred = model.fc(dec_output.squeeze(1)).argmax(1).item()
            if pred == eos_id:
                # Stop before recording <eos>, so nothing has to be trimmed
                # afterwards and a real last token is never dropped.
                break
            generated.append(pred)
            step_input = torch.tensor([pred], dtype=torch.long, device=model_device)

    return " ".join(vocab.lookup_tokens(generated))

# Smoke test: generate an answer for the question that is printed as the
# first training sample above.
question = "What is the total work done on an object when it is moved upwards against gravity, considering both the change in kinetic energy and potential energy? Use the Work-Energy Theorem and the principle of conservation of mechanical energy to derive your answer."
print("Generated answer:", generate_answer(question, model, vocab, tokenizer))