In [1]:
import gc
import os
import torch
import pandas as pd
from torch.utils.data import Dataset, random_split
from transformers import TrainingArguments, Trainer, AutoModelForCausalLM, AutoTokenizer
output_path = 'Models/gpt-neo/125M-final-fantasy'
model_name = "google-t5/t5-large"

torch.manual_seed(42)
texts = pd.read_csv('final_fantasy.csv')
tokenizer = AutoTokenizer.from_pretrained(model_name, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
valid_dataset = []
for sentence in texts['sentence']:
    if len(tokenizer.encode(sentence)) < 1024:
        valid_dataset.append(sentence)
        
class TextDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.labels = []
        self.input_ids = []
        self.attn_masks = []        
        for sentence in txt_list:
            encodings_dict = tokenizer(sentence, truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    def __len__(self): return len(self.input_ids)
    def __getitem__(self, idx): return self.input_ids[idx], self.attn_masks[idx]

max_length = max([len(tokenizer.encode(sentence)) for sentence in valid_dataset])
text_dataset = TextDataset(valid_dataset, tokenizer, max_length=max_length)
train_size = int(0.8 * len(valid_dataset))
train_dataset, val_dataset = random_split(text_dataset, [train_size, len(text_dataset) - train_size])
# print(texts)
print('train_size', train_size)
print('valid_dataset', len(valid_dataset))
print('max_length', max_length)
# os.environ["WANDB_PROJECT"]='gpt-neo-125M-fantasy'
# os.environ["WANDB_LOG_MODEL"]="true"
# os.environ["WANDB_WATCH"]="false"
# os.environ["WANDB_NAME"]="gpt-neo-fantasy"
# os.environ["WANDB_API_KEY"] = "b689f7c91f1ec7520fa8da927f175f1efd587181"

Token indices sequence length is longer than the specified maximum sequence length for this model (5773 > 2048). Running this sequence through the model will result in indexing errors


train_size 1597
valid_dataset 1997
max_length 1023


In [3]:
# try:
#     model = AutoModelForCausalLM.from_pretrained(os.path.join(output_path, 'results', 'checkpoint-3184')).cuda()
#     print('saved')
# except:
model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
print('downloaded')


model.resize_token_embeddings(len(tokenizer))


downloaded


Embedding(50259, 768)

In [6]:
from transformers import EarlyStoppingCallback
torch.cuda.empty_cache()

training_args = TrainingArguments(output_dir=os.path.join(output_path, 'results'),
                                  num_train_epochs=8,
                                  load_best_model_at_end=True,
                                  overwrite_output_dir=True,
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  per_device_train_batch_size=3,
                                  per_device_eval_batch_size=3,
                                  warmup_steps=100,
                                  weight_decay=0.03,
                                  logging_dir=os.path.join(output_path, 'logs'),
                                  report_to = 'wandb')

trainer = Trainer(model=model,
        args=training_args,
        train_dataset = train_dataset, 
        eval_dataset = val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        data_collator = lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                      'attention_mask': torch.stack([f[1] for f in data]),
                                      'labels': torch.stack([f[0] for f in data])})

trainer.train()
# model.save_pretrained(os.path.join(output_path, 'results'))
# tokenizer.save_pretrained(os.path.join(output_path, 'results'))

# add t5 model to training
# add gpt-2-large 

Epoch,Training Loss,Validation Loss
1,6.4875,6.829645
2,10.8434,12.372838
3,8.7134,4.951364
4,8.0838,8.312641
5,8.0213,9.681642
6,7.1976,4.26756
7,5.0263,4.432347
8,4.5303,4.28886


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=4264, training_loss=7.179816326549308, metrics={'train_runtime': 2603.1589, 'train_samples_per_second': 4.908, 'train_steps_per_second': 1.638, 'total_flos': 6667845817982976.0, 'train_loss': 7.179816326549308, 'epoch': 8.0})

In [7]:
input_text = "Title: A Bargain Struck Dialogue:"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.cuda()

model.eval()
try:
    sample_outputs = model.generate(
        input_ids=input_ids,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,
        top_k=50,
        max_length=300,
        top_p=0.95,
        temperature=0.7,
        num_return_sequences=100
    )
    # Decode and print generated texts
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in sample_outputs]
    with open(os.path.join(output_path, 'results','output.txt'), 'w') as file:
        file.writelines([f"Generated text {i+1}:\n{text}\n" for i, text in enumerate(generated_texts)])

except RuntimeError as e:
    print("RuntimeError during generation:", e)

    # Additional Debugging: Check logits
    with torch.no_grad():
        outputs = model(input_ids=input_ids)
        logits = outputs.logits
        assert not torch.isnan(logits).any(), "logits contain NaNs"
        assert not torch.isinf(logits).any(), "logits contain Infs"
        print("Logits sample:", logits[0, -1, :10])
