# Fine-tune GPT-2 on Math Problems

In [4]:
# Import necessary libraries
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW

# Define the MathDataset class
class MathDataset(Dataset):
    """Dataset view over a DataFrame that has "problem" and "solution" columns.

    Items are returned untokenized; tokenization and padding are left to the
    DataLoader's collate function.
    """

    def __init__(self, dataframe):
        # Keep a reference to the frame; rows are looked up lazily per item.
        self.dataframe = dataframe

    def __len__(self):
        """Number of rows (problem/solution pairs) in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the (problem, solution) pair stored at positional index ``idx``."""
        row = self.dataframe.iloc[idx]
        return row["problem"], row["solution"]

# collate_fn function to handle padding and tokenization for a whole batch
def collate_fn(batch):
    """Tokenize, pad and label-mask a batch of (problem, solution) pairs.

    Each pair is joined as "<problem> <solution><eos>" and encoded with the
    module-level ``tokenizer`` (left padding, truncation to 20 tokens). The
    labels are a copy of the input ids in which every position up to and
    including the first " =" token is set to -100, so only the solution
    tokens (and the EOS token) contribute to the language-modelling loss.

    Args:
        batch: sequence of (problem, solution) string pairs, e.g. the items
            yielded by ``MathDataset``.

    Returns:
        dict with "input_ids", "attention_mask" and "labels" tensors.

    Notes:
        * Padding positions are always masked out of the loss.
        * If a row contains no " =" token (for instance because truncation
          cut it off) the whole row is masked, since the start of the
          solution cannot be located.
        * If " =" occurs more than once, the first occurrence is taken as the
          end of the problem. NOTE(review): this assumes problems contain a
          single " =" - confirm against the dataset.
    """
    problems, solutions = zip(*batch)
    split_token = " ="  # the leading space is part of the token being searched for
    split_token_id = tokenizer.encode(split_token)[0]

    # Concatenate problem and solution and terminate every sample with EOS.
    questions = [f"{p} {s}{tokenizer.eos_token}" for p, s in zip(problems, solutions)]

    encoder = tokenizer(
        questions,
        padding=True,
        padding_side="left",
        truncation=True,
        max_length=20,  # TODO: Adjust max_length based on model
        return_tensors="pt"
    )
    input_ids = encoder["input_ids"]
    attention_mask = encoder["attention_mask"]

    # Mask the *problem* part of the labels; the solution part stays as the target.
    labels = input_ids.clone()
    # Padding must never be a training target, regardless of the padding side.
    labels[attention_mask == 0] = -100
    for row in range(input_ids.size(0)):
        split_positions = (input_ids[row] == split_token_id).nonzero(as_tuple=True)[0]
        if split_positions.numel() == 0:
            # No separator found: nothing can be identified as the solution.
            labels[row, :] = -100
            continue
        # Use a plain int so zero or multiple matches can never break the slice.
        boundary = int(split_positions[0])
        labels[row, : boundary + 1] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,  # Use the masked labels for loss calculation
    }


# Load the math dataset
def load_math_data(problem_filename="math_problems.txt", solution_filename="math_solutions.txt"):
    """Read line-aligned problem and solution files into a DataFrame.

    Line ``i`` of the problem file is paired with line ``i`` of the solution
    file; surrounding whitespace is stripped from every line.

    Args:
        problem_filename: path of the text file holding one problem per line.
        solution_filename: path of the text file holding one solution per line.

    Returns:
        pandas.DataFrame with string columns "problem" and "solution".

    Raises:
        ValueError: if the two files do not have the same number of lines.
        OSError: if either file cannot be opened.
    """
    import pandas as pd

    def _read_stripped_lines(path):
        # The context manager closes the file handle even if reading fails.
        with open(path, "r") as handle:
            return [line.strip() for line in handle]

    problems = _read_stripped_lines(problem_filename)
    solutions = _read_stripped_lines(solution_filename)
    if len(problems) != len(solutions):
        raise ValueError(
            f"{problem_filename} has {len(problems)} lines but "
            f"{solution_filename} has {len(solutions)}; the files must be line-aligned"
        )
    return pd.DataFrame({"problem": problems, "solution": solutions})

# Tunables for the split and the loaders
TRAIN_FRACTION = 0.8
BATCH_SIZE = 32

data = load_math_data("math_problems.txt", "math_solutions.txt")

# Sequential split: the first 80% of the rows train, the remainder is held out
train_size = int(TRAIN_FRACTION * len(data))
test_size = len(data) - train_size
train_data, test_data = data.iloc[:train_size], data.iloc[train_size:]

# GPT-2 tokenizer; a dedicated padding token is registered when none is set
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    pad_token = '<|pad|>'
    tokenizer.add_special_tokens({'pad_token': pad_token})  # Explicitly add a special padding token
    tokenizer.pad_token = pad_token

# Wrap both splits and build the loaders; only the training loader shuffles
train_math_dataset, test_math_dataset = MathDataset(train_data), MathDataset(test_data)

loader_options = {"batch_size": BATCH_SIZE, "collate_fn": collate_fn}
train_data_loader = DataLoader(train_math_dataset, shuffle=True, **loader_options)
test_data_loader = DataLoader(test_math_dataset, shuffle=False, **loader_options)

In [None]:
# Smoke test for the DataLoader: pull one training batch and inspect it
batch = next(iter(train_data_loader))

# Unpack the three tensors produced by the collate function
input_ids, attention_mask, labels = (
    batch[key] for key in ("input_ids", "attention_mask", "labels")
)

# Show the decoded text (special tokens kept visible) next to the raw masks
decoded_inputs = tokenizer.batch_decode(input_ids.tolist(), skip_special_tokens=False)
print("Input IDs:", decoded_inputs)
print("Attention Mask:", attention_mask)
print("Labels:", labels)

NameError: name 'data_loader' is not defined

In [6]:
from tqdm.auto import tqdm
from transformers import get_scheduler

num_epochs = 4

# Load the pretrained GPT-2 weights and resize the embedding matrix so it
# covers every id of the tokenizer (which may have gained a padding token).
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Optimizer and a linear learning-rate decay over all training steps (no warmup).
# NOTE(review): `AdamW` comes from `transformers` here; upstream deprecated it in
# favour of `torch.optim.AdamW` - confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = num_epochs * len(train_data_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# select device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
model.to(device)

# Training loop: one optimisation pass and one validation pass per epoch

for epoch in range(num_epochs):
    progress_bar = tqdm(train_data_loader, desc=f"Epoch {epoch+1} Training Progress") # Add progress bar
    model.train()
    for batch in progress_bar:
        # Move batch to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass; the model computes the loss from the masked labels
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Update progress bar
        progress_bar.set_postfix(loss=loss.item())

    # validation loss calculation (no gradients needed)
    model.eval()
    total_val_loss = 0
    total_val_samples = 0
    with torch.no_grad():
        for batch in test_data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss = outputs.loss

            # Weight each batch loss by its size so a smaller last batch does not skew the mean
            batch_size = input_ids.size(0)
            total_val_loss += val_loss.item() * batch_size
            total_val_samples += batch_size
    # Calculate average validation loss
    mean_val_loss = total_val_loss / total_val_samples if total_val_samples > 0 else 0

    print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {mean_val_loss:.4f}")
    # Save the finetuned model together with its tokenizer: the embeddings were
    # resized to the tokenizer above, so the checkpoint directory is only
    # self-contained when both are stored side by side.
    checkpoint_dir = "finetuned_gpt2_math_epoch_{}".format(epoch + 1)
    model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)



Using device: cuda


Epoch 1 Training Progress: 100%|██████████| 1000/1000 [01:11<00:00, 14.04it/s, loss=1.87]


Epoch 1/4, Validation Loss: 1.5739


Epoch 2 Training Progress: 100%|██████████| 1000/1000 [01:10<00:00, 14.14it/s, loss=1.4]


Epoch 2/4, Validation Loss: 1.2184


Epoch 3 Training Progress: 100%|██████████| 1000/1000 [01:10<00:00, 14.10it/s, loss=1.19]


Epoch 3/4, Validation Loss: 1.0605


Epoch 4 Training Progress: 100%|██████████| 1000/1000 [01:10<00:00, 14.17it/s, loss=1.24]


Epoch 4/4, Validation Loss: 0.9926


# Load the Finetuned Model and Run Inference
This section demonstrates how to load the finetuned GPT-2 model and use it to generate solutions for new math problems.

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Load the finetuned model and tokenizer
model_path = "finetuned_gpt2_math_epoch_4"
model = GPT2LMHeadModel.from_pretrained(model_path)

# Rebuild the tokenizer exactly as it was configured for training (stock GPT-2
# vocabulary plus a dedicated padding token) so this cell also works on a fresh
# kernel instead of relying on a `tokenizer` left over from the training cells.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

# Ensure the model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate a solution for a given math problem
def generate_solution(problem, max_length=50):
    """Generate a completion for ``problem`` with the finetuned model.

    Uses the module-level ``model``, ``tokenizer`` and ``device``.

    Args:
        problem: prompt text, e.g. "35 * 28 =".
        max_length: cap applied both to the tokenized prompt (truncation) and
            to the total length of the generated sequence (prompt included).

    Returns:
        The decoded output sequence with special tokens removed. The prompt is
        part of the returned text because the full sequence is decoded.
    """
    model.eval()
    with torch.no_grad():
        # Tokenize the input problem
        input_enc = tokenizer(
            problem,
            return_tensors="pt",
            padding=False,
            truncation=True,
            max_length=max_length
        )
        input_ids = input_enc["input_ids"].to(device)
        attention_mask = input_enc["attention_mask"].to(device)

        # Pass the special-token ids explicitly instead of letting `generate`
        # guess them; fall back to EOS when the tokenizer has no padding token.
        eos_id = tokenizer.eos_token_id
        pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id

        # Generate output with beam search
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_beams=5,
            early_stopping=True,
            pad_token_id=pad_id,
            eos_token_id=eos_id,
        )

        # Decode the generated output
        solution = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return solution

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Problem: 35 * 28 =
Solution: 35 * 28 = 940


In [49]:
import random
# Draw one held-out problem at random and let the model complete it
random_idx = random.randrange(len(test_data))
example_problem = test_data["problem"].iloc[random_idx]
solution = generate_solution(example_problem)
for report_line in (f"Problem: {example_problem}", f"Solution: {solution}"):
    print(report_line)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Problem: 24 * 22 =
Solution: 24 * 22 = 576
