<a href="https://colab.research.google.com/github/Leotzu/transformer-arxiv-classification/blob/main/arxiv_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 1)** Set up the environment and give your notebook access to Google Drive

In [None]:
# Install the third-party libraries used by this notebook.
# The leading "!" is an IPython/Colab shell escape, so this line only works inside a notebook cell.
# NOTE(review): matplotlib is imported later but is not installed here - presumably preinstalled on Colab; confirm.
!pip install torch pandas numpy tqdm

In [2]:
# Mount Google Drive at /content/drive so the notebook can read the arXiv json file
# and save/load models and vocab files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt

**Step 2)** Define your model and functions for data loading and preprocessing

- In Config, make sure to change the *project_dir* and *data_path* to where you have this project in your drive and where the json arXiv data is stored.

- Change *prefix* to differentiate this training run from any others you do (it will be added to the beginning of every model, checkpoint, and vocab file saved during preprocessing and training)

In [6]:
class Config:
    """Notebook-wide settings: file locations, dataset size and hyperparameters."""

    # ---- locations on the mounted drive ----
    project_dir = '/content/drive/MyDrive/your_project_path'
    data_path = f'{project_dir}/data/arxiv-metadata-oai-snapshot.json'
    models_path = f'{project_dir}/models'
    vocab_path = f'{project_dir}/vocab'
    # Prepended to every saved model, checkpoint and vocab file name so that
    # different training sessions can be told apart.
    prefix = '50k'

    # ---- dataset size (number of abstracts sampled) ----
    num_data_points = 50_000

    # ---- model hyperparameters ----
    d_model = 256
    nhead = 8
    num_encoder_layers = 3
    num_decoder_layers = 3
    dim_feedforward = 1024
    max_seq_length = 256

    # ---- training hyperparameters ----
    batch_size = 64
    learning_rate = 1e-3
    dropout_rate = 0.3
    num_epochs = 30


In [7]:
class Vocabulary:
    """Two-way mapping between whitespace-separated words and integer ids.

    Ids 0, 1 and 2 are reserved for the special tokens ``<pad>``, ``<unk>``
    and ``<eos>``. ``stoi`` maps word -> id and ``itos`` maps id -> word.
    """

    def __init__(self):
        self.stoi = {"<pad>": 0, "<unk>": 1, "<eos>": 2}
        self.itos = {0: "<pad>", 1: "<unk>", 2: "<eos>"}

    def build_vocab(self, texts, min_freq=2):
        """Add every word that occurs at least ``min_freq`` times in ``texts``.

        Args:
            texts: Iterable of strings; each is split on whitespace.
            min_freq: Minimum total occurrence count for a word to get an id.

        Words that already have an id (including the special tokens, should
        they appear literally in the corpus) keep their existing id, so the
        reserved ids are never overwritten and calling this method again only
        appends new words.
        """
        from collections import Counter  # function-scope import keeps the class self-contained

        counter = Counter()
        for text in texts:
            counter.update(text.split())

        # New ids continue right after the highest id handed out so far.
        idx = len(self.stoi)
        # Counter keeps first-seen order, so ids follow corpus order.
        for word, count in counter.items():
            if count < min_freq or word in self.stoi:
                continue
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1

class ArxivDataset(Dataset):
    """Abstracts encoded as fixed-length id tensors, served as shifted pairs.

    Every abstract is encoded once at construction time. Each item is the
    pair ``(sequence without its last id, sequence without its first id)``.
    """

    def __init__(self, abstracts, vocab):
        self.vocab = vocab
        # Encode eagerly so that __getitem__ is just two slices.
        self.data = list(map(self.vectorize, abstracts))

    def vectorize(self, text):
        """Turn ``text`` into a LongTensor of exactly Config.max_seq_length ids."""
        lookup = self.vocab.stoi
        ids = [lookup.get(word, lookup['<unk>']) for word in text.split()]
        # Mark the end of the abstract before truncating / padding.
        ids.append(lookup['<eos>'])

        limit = Config.max_seq_length
        shortfall = limit - len(ids)
        if shortfall < 0:
            # Too long: keep only the first `limit` ids.
            ids = ids[:limit]
        else:
            # Too short (or exact): right-fill with <pad>.
            ids.extend([lookup['<pad>']] * shortfall)
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        sequence = self.data[idx]
        inputs, shifted = sequence[:-1], sequence[1:]
        return inputs, shifted

    def __len__(self):
        return len(self.data)

def collate_batch(batch):
    """Collate ``(source, target)`` tensor pairs into two batch-first tensors.

    Sources and targets are padded independently with id 0 up to the longest
    sequence of their respective group.
    """
    sources, targets = zip(*batch)

    def _pad(sequences):
        # Right-pad with 0 and stack along a leading batch dimension.
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    return _pad(sources), _pad(targets)

def load_data(file_path, num_rows=Config.num_data_points, split_ratio=0.8):
    """Sample abstracts from a JSON-lines file and split them into train/test.

    Args:
        file_path: Path to a UTF-8 file holding one JSON object per line.
        num_rows: Number of lines to sample without replacement. Capped at the
            number of lines in the file, so a small file no longer raises.
        split_ratio: Fraction of the sampled lines assigned to the train split.

    Returns:
        ``(train_abstracts, test_abstracts)``: two lists of stripped abstract
        strings, each in file order. Sampled lines without an ``'abstract'``
        key are skipped, so the lists can be shorter than requested.
    """
    # First pass: count lines so that indices can be sampled up front.
    with open(file_path, 'r', encoding='utf-8') as file:
        total_rows = sum(1 for _ in file)

    num_rows = min(num_rows, total_rows)
    if num_rows == 0:
        return [], []

    # A private, fixed-seed generator keeps the sample reproducible without
    # reseeding NumPy's global RNG as a side effect.
    rng = np.random.RandomState(5)
    indices_to_keep = rng.choice(total_rows, num_rows, replace=False)

    train_idx = int(len(indices_to_keep) * split_ratio)
    # Plain-int sets give O(1) membership tests against enumerate()'s ints.
    train_indices = set(indices_to_keep[:train_idx].tolist())
    test_indices = set(indices_to_keep[train_idx:].tolist())

    train_abstracts = []
    test_abstracts = []
    # Second pass: parse only the sampled lines.
    with open(file_path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            if i in train_indices:
                bucket = train_abstracts
            elif i in test_indices:
                bucket = test_abstracts
            else:
                continue
            data_line = json.loads(line)
            if 'abstract' in data_line:
                bucket.append(data_line['abstract'].strip())
    return train_abstracts, test_abstracts

def get_data():
    """Build the train/test DataLoaders and the vocabulary they share.

    Returns:
        ``(train_loader, test_loader, vocab)``.
    """
    train_abstracts, test_abstracts = load_data(Config.data_path)

    vocab = Vocabulary()
    # The vocabulary is built from both splits so they share one id space.
    vocab.build_vocab(train_abstracts + test_abstracts)

    def _make_loader(abstracts, shuffle):
        return DataLoader(
            ArxivDataset(abstracts, vocab),
            batch_size=Config.batch_size,
            shuffle=shuffle,
            collate_fn=collate_batch,
        )

    # Only the training split is shuffled.
    train_loader = _make_loader(train_abstracts, shuffle=True)
    test_loader = _make_loader(test_abstracts, shuffle=False)
    return train_loader, test_loader, vocab

In [9]:
class TransformerModel(nn.Module):
    """Encoder-decoder transformer producing per-position vocabulary logits.

    Token ids are embedded, a learned positional table is added, the result
    is passed through ``nn.Transformer`` and projected to ``vocab_size``.

    Args:
        vocab_size: Number of rows in the embedding table and output logits.
        d_model: Embedding / hidden width.
        nhead: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        dim_feedforward: Width of the feed-forward sublayers.
        max_seq_length: Number of rows in the learned positional table.
        dropout_rate: Dropout probability inside the transformer.
    """

    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, dropout_rate):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.randn(max_seq_length, d_model))
        self.transformer = nn.Transformer(
            d_model,
            nhead,
            num_encoder_layers,
            num_decoder_layers,
            dim_feedforward,
            dropout=dropout_rate,
            # forward() receives (batch, seq) ids and slices the positional
            # table by size(1), i.e. tensors are batch-first. Without this
            # flag nn.Transformer treats dim 0 as the sequence axis and
            # attends across the samples of a batch.
            batch_first=True,
        )
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        """Return logits of shape ``(batch, tgt_len, vocab_size)``.

        Args:
            src: LongTensor of ids, ``(batch, src_len)``, fed to the encoder.
            tgt: LongTensor of ids, ``(batch, tgt_len)``, fed to the decoder.

        NOTE(review): no causal or padding masks are passed to the
        transformer, so every position can attend to every other position -
        confirm this is intended for next-token prediction.
        """
        # The positional rows broadcast over the batch dimension.
        src = self.embedding(src) + self.positional_encoding[:src.size(1), :]
        tgt = self.embedding(tgt) + self.positional_encoding[:tgt.size(1), :]
        output = self.transformer(src, tgt)
        return self.fc_out(output)

**Step 3)** Train the model

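- This function saves the vocab to *vocab_path* before training, then saves a model file and a checkpoint to *models_path* after each epoch. After the final epoch it also saves the history of training/testing losses, together with the final model weights, the vocab, and the loss criterion, for later evaluation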

In [None]:
def train():
    """Train the transformer on the arXiv abstracts and save all artifacts.

    Builds the data loaders and vocabulary, pickles the vocabulary to
    ``Config.vocab_path``, then trains for ``Config.num_epochs`` epochs. After
    every epoch the model is evaluated on the test split and both a weights
    file and a checkpoint are written to ``Config.models_path``. After the
    last epoch the loss histories, final weights, vocab and criterion are
    saved together in ``<prefix>_evaluation_data.pth``.
    """
    import os  # function-scope import: os is not part of the notebook's import cell

    # Fall back to the CPU so the function also runs on a runtime without a GPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # open() and torch.save() do not create missing directories; create them
    # before the expensive preprocessing so a bad path fails fast.
    os.makedirs(Config.vocab_path, exist_ok=True)
    os.makedirs(Config.models_path, exist_ok=True)

    train_loader, test_loader, vocab = get_data()
    vocab_size = len(vocab.stoi)

    # Save vocab for later use during evaluation / inference.
    with open(Config.vocab_path + f'/{Config.prefix}_vocab.pkl', 'wb') as f:
        pickle.dump(vocab, f)

    model = TransformerModel(
        vocab_size=vocab_size,
        d_model=Config.d_model,
        nhead=Config.nhead,
        num_encoder_layers=Config.num_encoder_layers,
        num_decoder_layers=Config.num_decoder_layers,
        dim_feedforward=Config.dim_feedforward,
        max_seq_length=Config.max_seq_length,
        dropout_rate=Config.dropout_rate
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=Config.learning_rate)
    # Padding positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi['<pad>'])

    # Per-epoch average losses, saved for later evaluation.
    training_losses = []
    testing_losses = []

    print("training started...")
    for epoch in range(Config.num_epochs):
        model.train()
        total_train_loss = 0
        for src, tgt in train_loader:
            src, tgt = src.to(device), tgt.to(device)
            optimizer.zero_grad()
            # NOTE(review): the decoder input and the loss target are the
            # same tensor (tgt), whereas generate_text feeds the prompt to
            # both encoder and decoder - confirm this is the intended
            # training objective.
            output = model(src, tgt)
            loss = criterion(output.reshape(-1, vocab_size), tgt.reshape(-1))
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        avg_train_loss = total_train_loss / max(len(train_loader), 1)
        training_losses.append(avg_train_loss)

        # Evaluate on the test set.
        model.eval()
        total_test_loss = 0
        with torch.no_grad():
            for src, tgt in test_loader:
                src, tgt = src.to(device), tgt.to(device)
                output = model(src, tgt)
                loss = criterion(output.reshape(-1, vocab_size), tgt.reshape(-1))
                total_test_loss += loss.item()
        avg_test_loss = total_test_loss / max(len(test_loader), 1)
        testing_losses.append(avg_test_loss)

        # Report epoch averages (the same numbers that are stored and plotted)
        # rather than the loss of whichever batch happened to come last.
        print(f'Epoch {epoch+1}: train loss {avg_train_loss:.4f} | test loss {avg_test_loss:.4f}')

        # Save the model weights and a checkpoint for this epoch.
        model_save_path = f'{Config.models_path}/{Config.prefix}_model_epoch_{epoch+1}.pth'
        checkpoint_path = f'{Config.models_path}/{Config.prefix}_checkpoint_epoch_{epoch+1}.pth'
        save_model(model, model_save_path)
        save_checkpoint(model, optimizer, epoch, checkpoint_path)

    # Save loss histories, final weights, vocab and criterion for evaluation.
    torch.save({
        'training_losses': training_losses,
        'testing_losses': testing_losses,
        'model': model.state_dict(),
        'vocab': vocab,
        'criterion': criterion
    }, f'{Config.models_path}/{Config.prefix}_evaluation_data.pth')

def save_model(model, filename):
    """Serialize only the model's weights (its ``state_dict``) to ``filename``."""
    weights = model.state_dict()
    torch.save(weights, filename)

def save_checkpoint(model, optimizer, epoch, filepath):
    """Write the epoch index plus model and optimizer state to ``filepath``."""
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(checkpoint, filepath)

# Run the training loop; it writes the vocab, the per-epoch model/checkpoint files
# and the final evaluation data under the Config paths.
train()

**Step 4)** Evaluate the training of your model and perform inference in generate_text to create new abstracts from a prompt

In [None]:
def evaluate():
    """Plot the per-epoch training and testing losses saved by train().

    Reads ``<prefix>_evaluation_data.pth`` from ``Config.models_path``; change
    the path below to evaluate a different run.
    """
    # The file stores pickled Python objects (the vocab and the criterion)
    # next to the tensors, so it cannot be read in weights-only mode.
    # SECURITY: full unpickling can execute arbitrary code - only load
    # evaluation files you produced yourself.
    # map_location='cpu' lets the plot be drawn on a runtime without a GPU.
    data = torch.load(
        f'{Config.models_path}/{Config.prefix}_evaluation_data.pth',
        map_location='cpu',
        weights_only=False,
    )
    training_losses = data['training_losses']
    testing_losses = data['testing_losses']

    plt.plot(training_losses, label='Training Loss')
    plt.plot(testing_losses, label='Testing Loss', linestyle='--')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Testing Losses Over Epochs')
    plt.legend()
    plt.show()

# Plot the loss curves of the run identified by Config.prefix.
evaluate()

In [None]:
def generate_text(prompt, vocab_path, model_path, top_k=5, max_output_length=50):
    """Continue ``prompt`` by repeatedly sampling from the model's top-k logits.

    Args:
        prompt: Whitespace-separated seed text; must contain at least one word.
        vocab_path: Path to the pickled Vocabulary written during training.
        model_path: Path to a weights file written by save_model().
        top_k: Number of highest-scoring candidates to sample from at each
            step (capped at the vocabulary size).
        max_output_length: Maximum number of tokens in the result, prompt
            included (capped at Config.max_seq_length, the size of the
            model's positional table).

    Returns:
        The prompt plus the generated words, joined by single spaces, with
        ``<eos>`` and ``<pad>`` removed.

    Raises:
        ValueError: If the prompt is empty or ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError('top_k must be at least 1')

    # Fall back to the CPU so inference also works without a GPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # SECURITY: pickle can execute arbitrary code on load - only open vocab
    # files you created yourself.
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab.stoi)

    model = TransformerModel(
        vocab_size=vocab_size,
        d_model=Config.d_model,
        nhead=Config.nhead,
        num_encoder_layers=Config.num_encoder_layers,
        num_decoder_layers=Config.num_decoder_layers,
        dim_feedforward=Config.dim_feedforward,
        max_seq_length=Config.max_seq_length,
        dropout_rate=Config.dropout_rate
    ).to(device)
    # map_location moves weights saved on a GPU to whatever device is in use.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    tokens = [vocab.stoi.get(word, vocab.stoi['<unk>']) for word in prompt.split()]
    if not tokens:
        raise ValueError('prompt must contain at least one word')

    eos_id = vocab.stoi['<eos>']
    pad_id = vocab.stoi['<pad>']
    k = min(top_k, vocab_size)
    # Longer sequences would run past the learned positional table.
    length_cap = min(max_output_length, Config.max_seq_length)

    generated_tokens = tokens[:]
    with torch.no_grad():
        while len(generated_tokens) < length_cap:
            # Re-encode everything generated so far and feed it to both the
            # encoder and the decoder.
            input_tensor = torch.tensor([generated_tokens], dtype=torch.long, device=device)
            output = model(input_tensor, input_tensor)
            logits = output[:, -1, :]  # only the last position's logits
            values, indices = torch.topk(logits, k)
            probabilities = F.softmax(values, dim=-1)
            choice = torch.multinomial(probabilities, 1).item()  # position within the top-k
            next_token = indices[0][choice].item()  # the corresponding vocabulary id

            if next_token == eos_id:
                break
            generated_tokens.append(next_token)

    generated_text = ' '.join(vocab.itos[token] for token in generated_tokens if token not in (eos_id, pad_id))
    return generated_text


prompt = 'due to destructive interference between different paths for' # Enter custom prompt
vocab_path = Config.vocab_path + f'/{Config.prefix}_vocab.pkl'
# Default to the weights saved after the final epoch so this cell keeps working
# when Config.num_epochs changes; edit the epoch number to use an earlier model.
model_path = Config.models_path + f'/{Config.prefix}_model_epoch_{Config.num_epochs}.pth'
print(f'vocab_path: {vocab_path}')
print(f'model_path: {model_path}')
print('Model output:')
print(generate_text(prompt, vocab_path, model_path, max_output_length=50))

The next step is to load the model you've trained and finetune it on a new dataset that determines whether an abstract is AI-related or not. This is done in a different notebook, titled ***finetuned_classifier.ipynb***