<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [12]</a>'.</span>

In [12]:
from transformers import set_seed

# Fixed seed handed to transformers' set_seed helper so repeated runs of the
# notebook start from the same random state.
seed = 42
set_seed(seed)

In [13]:
from transformers import GPT2Tokenizer, AutoTokenizer
from transformers import DataCollatorForLanguageModeling

def init_tokenizer():
    """Load the TinyLlama chat tokenizer and register this project's special tokens.

    ``<pad>`` is installed as the pad token, ``<func>`` as the BOS token and
    ``</func>`` as the EOS token. The configured tokenizer is printed for
    inspection and then returned.

    Returns:
        The tokenizer returned by ``AutoTokenizer.from_pretrained`` after the
        special tokens have been added.
    """
    special_tokens = {
        "pad_token": "<pad>",
        "bos_token": "<func>",
        "eos_token": "</func>",
    }
    tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    tok.add_special_tokens(special_tokens)
    print(tok)
    return tok

In [None]:
from transformers import GPT2Config, GPT2LMHeadModel

def init_model(tokenizer, n_positions=None):
    """Create a small, randomly initialised GPT-2 LM sized to match ``tokenizer``.

    The model is built from scratch (no pretrained weights) with a single
    layer, a single attention head and 128-dimensional embeddings.

    Args:
        tokenizer: Tokenizer the model will be trained with. ``len(tokenizer)``
            (which includes added special tokens) is used for the embedding
            table, and its bos/eos/pad ids are copied into the model config.
        n_positions: Size of the learned position-embedding table, i.e. the
            longest sequence the model can consume. When ``None`` it is taken
            from ``tokenizer.model_max_length`` (never below the historical
            1024), falling back to 1024 if the tokenizer reports no usable
            limit.

    Returns:
        A freshly initialised ``GPT2LMHeadModel``.
    """
    # len() counts added special tokens; tokenizer.vocab_size does not, so
    # using it would leave the new ids outside the embedding table.
    vocab_size = len(tokenizer)

    if n_positions is None:
        max_len = getattr(tokenizer, "model_max_length", None)
        # Tokenizers without a real limit report a huge sentinel value; only
        # trust limits that are plausible sequence lengths.
        if isinstance(max_len, int) and 0 < max_len <= 1_000_000:
            # Inputs padded to the tokenizer's max length must fit in the
            # position table, otherwise the position-embedding lookup is
            # indexed out of range.
            n_positions = max(1024, max_len)
        else:
            n_positions = 1024

    config = GPT2Config(
        vocab_size=vocab_size,
        n_positions=n_positions,
        n_embd=128,
        n_layer=1,
        n_head=1,
        # GPT2Config's default bos/eos ids (50256) are not valid ids for this
        # vocabulary, so take them from the tokenizer instead.
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    model = GPT2LMHeadModel(config)
    print(vocab_size)
    return model

In [15]:
def init_collator(tokenizer):
    """Build the batch collator used for language-model training.

    Args:
        tokenizer: Tokenizer forwarded to the collator.

    Returns:
        A ``DataCollatorForLanguageModeling`` constructed with ``mlm=False``
        (masked-language-modelling disabled).
    """
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return collator


In [16]:
from transformers import TrainingArguments

def init_args():
    """Assemble the ``TrainingArguments`` for the from-scratch training run.

    Settings are grouped below by concern: output/logging, optimisation,
    batching, hardware, and the checkpoint/evaluation schedule.

    Returns:
        The configured ``TrainingArguments`` instance.
    """
    output_and_logging = {
        "output_dir": "./results/scratch",
        "logging_dir": "./logs",
        "logging_strategy": "steps",
        "logging_steps": 250,
    }
    optimisation = {
        "optim": "adamw_torch",
        "learning_rate": 1e-4,
        "weight_decay": 0.01,
        "num_train_epochs": 3,
    }
    batching = {
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "gradient_accumulation_steps": 2,
        "dataloader_num_workers": 4,
        "label_names": ['input_ids'],
    }
    # NOTE(review): fp16 is requested together with use_cpu=True; whether that
    # combination is accepted depends on the installed versions - confirm.
    hardware = {
        "fp16": True,
        "use_cpu": True,
    }
    schedule = {
        "save_strategy": "steps",
        "save_steps": 6401,
        "eval_strategy": "steps",
        "eval_steps": 1500,
    }

    return TrainingArguments(
        **output_and_logging,
        **optimisation,
        **batching,
        **hardware,
        **schedule,
    )

In [17]:
from transformers import Trainer

def init_trainer(model, args, train, valid, tokenizer, data_collator):
    """Bundle the model, arguments, datasets and collator into a ``Trainer``.

    Args:
        model: Model to train.
        args: Training arguments passed as ``args``.
        train: Dataset passed as ``train_dataset``.
        valid: Dataset passed as ``eval_dataset``.
        tokenizer: Tokenizer handed to the trainer.
        data_collator: Collator used to build batches.

    Returns:
        The constructed ``Trainer``.
    """
    trainer_kwargs = dict(
        model=model,
        args=args,
        train_dataset=train,
        eval_dataset=valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    return Trainer(**trainer_kwargs)


In [18]:
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import gc

def add_special_tokens(example):
    """Wrap ``example['body']`` in the tokenizer's BOS and EOS markers.

    Reads the module-level ``tokenizer``. The example is modified in place
    (its ``'body'`` entry is overwritten) and the same object is returned.
    """
    wrapped_body = "{} {} {}".format(
        tokenizer.bos_token,
        example['body'],
        tokenizer.eos_token,
    )
    example['body'] = wrapped_body
    return example


def tokenize_function(examples, max_length=None):
    """Tokenize the ``body`` column of a batch into fixed-length NumPy arrays.

    Reads the module-level ``tokenizer``.

    Args:
        examples: Batch mapping that contains a ``'body'`` entry with the
            text(s) to tokenize.
        max_length: Length every sequence is padded and truncated to. ``None``
            leaves the choice to the tokenizer (its own maximum length).

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    return tokenizer(
        examples['body'],
        return_tensors="np",
        padding="max_length",
        # Without truncation, bodies longer than the padding length come back
        # longer than every other row: the batch is ragged (which cannot be
        # packed into one array) and may exceed the model's context window.
        truncation=True,
        max_length=max_length,
    )


def load_dataset(
    train_path="data/filtered_funcs_tokenized_llama_train.parquet",
    eval_path="data/filtered_funcs_tokenized_llama_eval.parquet",
):
    """Load the pre-tokenized train and eval splits from parquet files.

    Args:
        train_path: Parquet file holding the training split.
        eval_path: Parquet file holding the evaluation split.

    Returns:
        A ``DatasetDict`` with the keys ``'train'`` and ``'eval'``.

    NOTE(review): a disabled snippet previously kept in this function built
    the data inline instead: read ``data/filtered_funcs.parquet``, split it
    80/20 with ``train_test_split(random_state=42)``, then map
    ``add_special_tokens`` and ``tokenize_function`` over both splits.
    Presumably the parquet files read here were produced that way - confirm
    before regenerating them.
    """
    return DatasetDict({
        'train': Dataset.from_parquet(train_path),
        'eval': Dataset.from_parquet(eval_path),
    })

In [19]:
# Load the 'train' and 'eval' splits once; later cells read them from this
# module-level variable.
tokenized_ds = load_dataset()

Loading dataset shards:   0%|          | 0/112 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/28 [00:00<?, ?it/s]

In [20]:
import os

# Explicitly set the tokenizers parallelism flag for this process.
# NOTE(review): the training arguments request dataloader_num_workers=4;
# presumably "true" is intentional alongside worker processes - confirm, since
# "false" is the setting commonly recommended when workers are forked.
os.environ["TOKENIZERS_PARALLELISM"]="true"

In [21]:
# Wire everything together: tokenizer -> model -> collator -> arguments -> trainer.
# `tokenizer` and `model` are module-level names that other cells rely on.
tokenizer = init_tokenizer()
model = init_model(tokenizer)

data_collator = init_collator(tokenizer)

training_args = init_args()

# Train on the 'train' split and evaluate on the 'eval' split.
trainer = init_trainer(
    model,
    training_args,
    tokenized_ds["train"],
    tokenized_ds["eval"],
    tokenizer, 
    data_collator
)


# Diagnostics: compare the tokenizer's size with the model's embedding table.
# The disabled lines are alternative checks (largest token id in the training
# data, embedding layer size) kept for reference.
#max_index = max([max(input_ids) for input_ids in tokenized_ds['train']['input_ids']])
#print(f"Max token index in the dataset: {max_index}")
print(f"Tokenizer length: {len(tokenizer)}")
print(f"Model embedding size: {model.get_input_embeddings().num_embeddings}")
#embedding_layer = model.get_input_embeddings()
#print(f"Embedding layer size: {embedding_layer.num_embeddings}")


# Training itself is currently disabled.
#trainer.train()

LlamaTokenizerFast(name_or_path='TinyLlama/TinyLlama-1.1B-Chat-v1.0', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<func>', 'eos_token': '</func>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<func>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("</func>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
32003
Tokenizer lengt

IndexError: index out of range in self

In [22]:
import time

# Scan the training split for token ids that the model's embedding table
# cannot index, reporting progress every 10000 rows.
print("starting")
current_time = time.time()
# The embedding size does not change during the scan, so look it up once
# rather than once per token.
embedding_rows = model.get_input_embeddings().num_embeddings
i = 0
for i, input_ids in enumerate(tokenized_ds["train"]['input_ids'], start=1):
    # `%` binds tighter than `+`, so the earlier `not(i+1 % 10000)` evaluated
    # `i + 1` and never fired; test the running row count directly instead.
    if i % 10000 == 0:
        print(i, " took ", time.time()-current_time, " s")
        current_time = time.time()
    if any(token >= embedding_rows for token in input_ids):
        print(f"Out of bounds token found: {input_ids}")