# Data Modeling

Notebook that generates a `datasets.Dataset` for model training and saves it to disk.

For GPT-2 the columns will be:
- input_ids
- attention_mask

For InfiniTransformer the columns will be:
- input_ids
- labels
    

In [6]:
import tiktoken
import torch
from datasets import Dataset, load_dataset
from transformers import GPT2Config, GPT2Tokenizer

### Dataset generation

In [7]:
def generate_dataset_tiktoken(dataset, tokenizer, config):
    """Build a fixed-length ``input_ids``/``labels`` dataset from ``dataset["text"]``.

    All texts are joined with a single space, encoded with
    ``tokenizer.encode`` and cut into consecutive, non-overlapping chunks of
    ``config.n_positions`` tokens. ``labels`` is the same token stream shifted
    left by one position, so ``labels[i]`` is the token that follows
    ``input_ids[i]``.

    Every complete chunk is kept; only a trailing chunk shorter than
    ``config.n_positions`` is discarded.

    Args:
        dataset: Object whose ``"text"`` entry is an iterable of strings.
        tokenizer: Object whose ``encode(str)`` returns a sequence of token
            ids (e.g. a tiktoken encoding).
        config: Object exposing the chunk length as ``n_positions``.

    Returns:
        A ``datasets.Dataset`` with the columns ``input_ids`` and ``labels``;
        every row holds exactly ``config.n_positions`` tokens.

    Raises:
        ValueError: If the encoded text is too short to fill a single chunk.
    """
    block_size = config.n_positions

    # Concatenate the whole corpus and encode it in one pass
    text = " ".join(dataset["text"])
    token_ids = torch.tensor(tokenizer.encode(text), dtype=torch.long)

    # Shift by one: labels drop the first token, inputs drop the last one
    shifted_labels = token_ids[1:]
    shifted_inputs = token_ids[:-1]

    # Count complete chunks explicitly so that a full final chunk is kept
    # (blindly dropping the last split loses data when the length divides
    # evenly by the chunk size).
    n_chunks = shifted_inputs.shape[0] // block_size
    if n_chunks == 0:
        raise ValueError(
            f"Encoded text has {token_ids.shape[0]} tokens; at least "
            f"{block_size + 1} are required to build one chunk."
        )

    # Trim the ragged tail, then split into equally sized chunks
    usable = n_chunks * block_size
    input_ids = torch.split(shifted_inputs[:usable], block_size, dim=0)
    labels = torch.split(shifted_labels[:usable], block_size, dim=0)

    train_ds = Dataset.from_dict(
        {
            "input_ids": input_ids,
            "labels": labels,
        }
    )

    return train_ds


def generate_dataset_gpt2tokenizer(dataset, tokenizer, config):
    """Build a fixed-length ``input_ids``/``attention_mask`` dataset.

    All texts in ``dataset["text"]`` are joined with a single space and
    tokenized in one call with ``return_tensors="pt"``. The resulting
    ``input_ids`` and ``attention_mask`` tensors are cut along dimension 1
    into consecutive, non-overlapping chunks of ``config.n_positions`` tokens.

    Every complete chunk is kept; only a trailing chunk shorter than
    ``config.n_positions`` is discarded.

    Args:
        dataset: Object whose ``"text"`` entry is an iterable of strings.
        tokenizer: Callable that accepts ``(text, return_tensors="pt")`` and
            returns a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors (e.g. a ``GPT2Tokenizer``).
        config: Object exposing the chunk length as ``n_positions``.

    Returns:
        A ``datasets.Dataset`` with the columns ``input_ids`` and
        ``attention_mask``.

    Raises:
        ValueError: If the encoded text is too short to fill a single chunk.
    """
    block_size = config.n_positions

    # Concatenate the whole corpus and tokenize it in one pass
    encoded_text = tokenizer(" ".join(dataset["text"]), return_tensors="pt")
    input_ids = encoded_text["input_ids"]
    attention_mask = encoded_text["attention_mask"]

    # Count complete chunks explicitly so that a full final chunk is kept
    # (blindly dropping the last split loses data when the length divides
    # evenly by the chunk size).
    total_tokens = input_ids.shape[1]
    n_chunks = total_tokens // block_size
    if n_chunks == 0:
        raise ValueError(
            f"Encoded text has {total_tokens} tokens; at least "
            f"{block_size} are required to build one chunk."
        )

    # Trim the ragged tail, then split along the token dimension.
    # NOTE(review): each chunk keeps the tokenizer's leading dimension
    # (presumably a batch dimension of size 1) -- confirm that consumers of
    # this dataset expect rows shaped that way.
    usable = n_chunks * block_size
    input_ids = torch.split(input_ids[:, :usable], block_size, dim=1)
    attention_mask = torch.split(attention_mask[:, :usable], block_size, dim=1)

    train_ds = Dataset.from_dict(
        {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
    )

    return train_ds

### Generate Dataset

In [8]:
# --- Parameters -------------------------------------------------------------
SAMPLE = 0.0001  # Fraction of the corpus rows to keep

# --- Corpus -----------------------------------------------------------------
# Load the "wik" taxonomy of Corpus Carolina and take its "corpus" split
corpus_splits = load_dataset("carolina-c4ai/corpus-carolina", taxonomy="wik")
dataset = corpus_splits["corpus"]

# Shuffle with a fixed seed, then keep the first SAMPLE fraction of the rows
sample_size = int(len(dataset) * SAMPLE)
dataset = dataset.shuffle(seed=42)
dataset = dataset.select(range(sample_size))

# --- Model configuration ----------------------------------------------------
# Default GPT-2 configuration; its n_positions sets the chunk length
config = GPT2Config()

# --- Tokenizer (enable exactly one of the options below) ---------------------

# Option A: GPT-2 tokenizers
# tokenizer = GPT2Tokenizer(
#     vocab_file="./tokenizer/vocab.json", merges_file="./tokenizer/merges.txt"
# ) # Custom Tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2") # Original GPT-2 Tokenizer

# tokenizer.model_max_length = config.n_positions
# tokenizer.eos_token = tokenizer.bos_token
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = tokenizer.eos_token_id

# Option B: tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")

In [9]:
# Build the training dataset with the generator that matches the tokenizer
if isinstance(tokenizer, GPT2Tokenizer):
    train_ds = generate_dataset_gpt2tokenizer(dataset, tokenizer, config)
elif isinstance(tokenizer, tiktoken.core.Encoding):
    train_ds = generate_dataset_tiktoken(dataset, tokenizer, config)
else:
    # Fail here rather than leave train_ds undefined (or stale from an
    # earlier run of this cell) when an unsupported tokenizer is selected.
    raise TypeError(f"Unsupported tokenizer type: {type(tokenizer).__name__}")

In [10]:
# Display the dataset summary (features and number of rows)
train_ds

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 52
})

In [None]:
# Write the generated dataset to the relative path "data/train"
train_ds.save_to_disk("data/train")