In [1]:
"""
This notebook takes a flat text file of training
examples that are delimited by EOS tokens (<|endoftext|>)
and converts it into a training and evaluation dataset
to be used with the minimal_trainer.py training script.

It will filter, pad, and attention mask based on the
maximum length. This should match the block_size in the
minimal_trainer.py script.
"""

from transformers import AutoTokenizer
from datasets import load_from_disk, Dataset
import pandas as pd
import datasets
import torch
import random

In [2]:
# Load the tokenizer used to encode every training example.

tokenizer_checkpoint = "EleutherAI/gpt-neox-20b"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [3]:
# Read the whole text file into a single string.
# rallio_test.txt is just some random text examples.
# Caution: has not been carefully reviewed, could contain toxic materials.

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm for other corpora.
with open('rallio_test.txt', encoding='utf-8') as my_file:
    data = my_file.read()
print(len(data))

335035


In [5]:
# Split entries by EOS token and remove any unneeded newlines


def strip_edge_newlines(text):
    """Return ``text`` without one leading and one trailing newline, if present.

    Uses ``startswith``/``endswith`` instead of indexing so that an empty
    string (for example the piece that follows a trailing EOS token) does not
    raise ``IndexError``.
    """
    if text.startswith("\n"):
        text = text[1:]
    if text.endswith("\n"):
        text = text[:-1]
    return text


entries=data.split("<|endoftext|>")
fixed=[]
for raw_entry in entries:
    cleaned=strip_edge_newlines(raw_entry)
    # Pieces that contain nothing but whitespace (typically the remainder
    # after the final EOS token) carry no training signal, so they are
    # skipped instead of being turned into an EOS-only example.
    if cleaned.strip():
        fixed.append(cleaned)
print("You have this many training examples: "+str(len(fixed)))

You have this many training examples: 697


In [6]:
# Re-attach the EOS marker that the split removed. Two endoftext tokens are
# appended to every example; one is probably sufficient.
# Each element of fixed_tokens is a (text_with_eos, token_ids) tuple.

terminated_texts=[example+"<|endoftext|><|endoftext|>" for example in fixed]
fixed_tokens=[(text,tokenizer.encode(text)) for text in terminated_texts]

In [None]:
# Set the maximum token length per item.
# Entries longer than max_length are filtered out. Entries that fit are
# right-padded to exactly max_length and given a matching attention mask
# (1 = real token, 0 = padding).

max_length=280

# Token id used for padding (the value this notebook has always padded with).
# NOTE(review): presumably the "<|padding|>" token of the GPT-NeoX vocabulary;
# verify with tokenizer.convert_ids_to_tokens(1).
pad_token_id=1

attention_mask=[]
input_ids=[]
labels=[]

for _text, token_ids in fixed_tokens:
    length=len(token_ids)
    if length > max_length:
        # Too long for the training block size: drop the example.
        continue
    pad_count=max_length-length
    # Build new lists instead of appending to token_ids, so fixed_tokens is
    # left untouched and this cell gives the same result when re-run.
    padded=list(token_ids)+[pad_token_id]*pad_count
    attention_mask.append([1]*length+[0]*pad_count)
    input_ids.append(padded)
    # Independent copy so that labels and input_ids do not share list objects.
    # NOTE(review): labels still contain pad ids at padded positions. If the
    # training script does not mask them itself, the usual Hugging Face
    # convention is to set those label positions to -100 -- confirm against
    # minimal_trainer.py.
    labels.append(list(padded))

# Print out and inspect the first entry.
# fixed_tokens[0] is the first raw example; index 0 of the other lists is the
# first example that survived the length filter.
if input_ids:
    print(fixed_tokens[0])
    print(attention_mask[0])
    print(input_ids[0])
    print(labels[0])
else:
    print("No entries fit within max_length="+str(max_length))

In [8]:
# Assemble the three parallel lists into one pandas DataFrame,
# one row per training example.

column_data = {
    "attention_mask": attention_mask,
    "input_ids": input_ids,
    "labels": labels,
}
df = pd.DataFrame(column_data)

In [12]:
# Create dataset

# Fixed seed so the train/eval membership is identical on every run
# (train_test_split shuffles before splitting).
split_seed=42

new_dataset=datasets.Dataset.from_pandas(df)
# Hold out 1% of the examples for evaluation.
split_dataset = new_dataset.train_test_split(test_size=0.01, seed=split_seed)
train_dataset=split_dataset['train']
eval_dataset=split_dataset['test']

print("Training examples: "+str(len(train_dataset)))
print("Evaluation examples: "+str(len(eval_dataset)))

Training examples: 690
Evaluation examples: 7


In [13]:
# Write both splits to disk, each into its own directory.

for split, target_dir in (
    (train_dataset, "my_train_data"),
    (eval_dataset, "my_eval_data"),
):
    split.save_to_disk(target_dir)

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
# Read the saved splits back from disk (only needed in a fresh session).

my_train_dataset, my_eval_dataset = (
    load_from_disk(saved_dir) for saved_dir in ("my_train_data", "my_eval_data")
)

In [15]:
# Display the reloaded training split (its features and row count).
my_train_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 690
})

In [16]:
# Display the reloaded evaluation split (its features and row count).
my_eval_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 7
})