In [2]:
import gc
import os
import torch
import pandas as pd
from torch.utils.data import Dataset, random_split
from transformers import TrainingArguments, Trainer, AutoModelForCausalLM, AutoTokenizer
# Base model to fine-tune, and the directory that receives checkpoints, logs
# and generated samples.
model_name = "gpt2-large"
output_path = 'Models/gpt2-large/final-fantasy'

# Seed torch's global RNG so that torch-driven randomness is repeatable.
torch.manual_seed(42)

texts = pd.read_csv('final_fantasy.csv')

# The tokenizer is loaded with explicit BOS/EOS/PAD special tokens.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)

# Keep only the sentences that encode to fewer than 1024 tokens.
valid_dataset = [
    sentence
    for sentence in texts['sentence']
    if len(tokenizer.encode(sentence)) < 1024
]
        
class TextDataset(Dataset):
    """Pre-tokenized corpus exposing ``(input_ids, attention_mask)`` pairs.

    Each text in ``txt_list`` is passed through ``tokenizer`` once, at
    construction time, with truncation enabled and ``padding="max_length"``
    for the given ``max_length``. The resulting id and mask lists are stored
    as tensors.
    """

    def __init__(self, txt_list, tokenizer, max_length):
        # Created for parity with the other attributes; nothing in this class
        # ever appends to it.
        self.labels = []
        encoded = [
            tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
            for text in txt_list
        ]
        self.input_ids = [torch.tensor(item['input_ids']) for item in encoded]
        self.attn_masks = [torch.tensor(item['attention_mask']) for item in encoded]

    def __len__(self):
        """Number of encoded examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors of example ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx]

# Fail with a clear message instead of max()'s generic "empty sequence" error
# when the length filter kept nothing.
if not valid_dataset:
    raise ValueError("valid_dataset is empty: no sentence passed the token-length filter")

# Pad every example to the token length of the longest kept sentence.
max_length = max(len(tokenizer.encode(sentence)) for sentence in valid_dataset)
text_dataset = TextDataset(valid_dataset, tokenizer, max_length=max_length)

# 80/20 train/validation split; the validation part takes the remainder so
# the two sizes always sum to len(text_dataset).
train_size = int(0.8 * len(valid_dataset))
train_dataset, val_dataset = random_split(text_dataset, [train_size, len(text_dataset) - train_size])
# print(texts)
print('train_size', train_size)
print('valid_dataset', len(valid_dataset))
print('max_length', max_length)

# Optional Weights & Biases configuration (kept disabled).
# os.environ["WANDB_PROJECT"]='gpt-neo-125M-fantasy'
# os.environ["WANDB_LOG_MODEL"]="true"
# os.environ["WANDB_WATCH"]="false"
# os.environ["WANDB_NAME"]="gpt-neo-fantasy"
# SECURITY: do not hardcode the W&B API key in the notebook. Export
# WANDB_API_KEY in the shell (or run `wandb login`) before starting the
# kernel. A literal key used to be written on this line; treat that key as
# compromised and rotate it.

Token indices sequence length is longer than the specified maximum sequence length for this model (1682 > 1024). Running this sequence through the model will result in indexing errors


train_size 1597
valid_dataset 1997
max_length 1023


In [6]:
# Prefer the fine-tuned checkpoint on disk; fall back to the pretrained base
# model when that checkpoint cannot be loaded.
checkpoint = 'checkpoint-3995'
checkpoint_dir = os.path.join(output_path, 'results', checkpoint)
try:
    # model = AutoModelForCausalLM.from_pretrained(os.path.join(output_path, 'results')).cuda()
    model = AutoModelForCausalLM.from_pretrained(checkpoint_dir).cuda()
    print('saved')
except (OSError, ValueError) as load_error:
    # Only "checkpoint missing / unreadable" style failures trigger the
    # fallback. A bare `except:` would also swallow KeyboardInterrupt and CUDA
    # out-of-memory errors and then silently continue with the base model.
    # NOTE(review): transformers is expected to report unloadable paths as
    # OSError (ValueError for malformed ids on some versions) - confirm for
    # the installed version.
    print('could not load', checkpoint_dir, '-', load_error)
    model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
    print('downloaded')

# The tokenizer was given extra special tokens, so the embedding matrix is
# resized to the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
print(max_length)

saved
1023


In [3]:
from transformers import EarlyStoppingCallback

# Release cached GPU memory held by the allocator before training starts.
torch.cuda.empty_cache()


def _collate_batch(data):
    """Stack ``(input_ids, attention_mask)`` pairs into one batch dict.

    The labels are a stacked copy of the input ids.
    NOTE(review): when the examples are padded, the padded positions end up
    in the labels as well - confirm that including them in the loss is
    intended.
    """
    batch_ids = torch.stack([pair[0] for pair in data])
    batch_masks = torch.stack([pair[1] for pair in data])
    batch_labels = torch.stack([pair[0] for pair in data])
    return {'input_ids': batch_ids,
            'attention_mask': batch_masks,
            'labels': batch_labels}


results_dir = os.path.join(output_path, 'results')
logs_dir = os.path.join(output_path, 'logs')

# Evaluate and checkpoint once per epoch; the best checkpoint is reloaded
# when training ends.
training_args = TrainingArguments(
    output_dir=results_dir,
    num_train_epochs=25,
    load_best_model_at_end=True,
    overwrite_output_dir=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=10,
    weight_decay=0.05,
    logging_dir=logs_dir,
    report_to='wandb',
)

# Early stopping is configured with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],
    data_collator=_collate_batch,
)

trainer.train()
# model.save_pretrained(os.path.join(output_path, 'results'))
# tokenizer.save_pretrained(os.path.join(output_path, 'results'))

[2024-07-31 10:06:24,439] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




[34m[1mwandb[0m: Currently logged in as: [33mgarbacik-mateusz[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,1.3714,1.21511
2,0.9769,1.213737
3,0.7671,1.271381
4,0.5029,1.363086
5,0.4239,1.439664


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=3995, training_loss=0.7830976923058119, metrics={'train_runtime': 7907.3459, 'train_samples_per_second': 5.049, 'train_steps_per_second': 2.526, 'total_flos': 3.4719580697472e+16, 'train_loss': 0.7830976923058119, 'epoch': 5.0})

In [3]:
# Epoch	Training Loss	Validation Loss
# 1	1.371400	1.215110
# 2	0.976900	1.213737
# 3	0.767100	1.271381
# 4	0.502900	1.363086
# 5	0.423900	1.439664
# https://wandb.ai/garbacik-mateusz/huggingface/runs/8qjsymw5
# TrainOutput(global_step=3995, training_loss=0.7830976923058119, metrics={'train_runtime': 7907.3459, 'train_samples_per_second': 5.049, 'train_steps_per_second': 2.526, 'total_flos': 3.4719580697472e+16, 'train_loss': 0.7830976923058119, 'epoch': 5.0})

0     A Bargain Struck
1    A Beeautiful Plan
Name: title, dtype: object


In [None]:
# Sweep over every saved checkpoint, prompt-set size and generation length,
# writing the sampled continuations to one text file per combination.
checkpoints = [799, 1598, 2397, 3196, 3995]

# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists before the sweep starts.
generated_dir = os.path.join(output_path, 'results/generated')
os.makedirs(generated_dir, exist_ok=True)

for checkpoint in checkpoints:
    print('checkpoint', checkpoint)
    checkpoint_name = 'checkpoint-' + str(checkpoint)

    # Drop this cell's reference to the previously loaded model before
    # loading the next one, so its GPU memory can be reclaimed first.
    model = None
    gc.collect()
    torch.cuda.empty_cache()

    try:
        model = AutoModelForCausalLM.from_pretrained(os.path.join(output_path, 'results', checkpoint_name)).cuda()
        print('saved')
    except (OSError, ValueError) as load_error:
        # Only "checkpoint missing / unreadable" failures fall back to the
        # base model; anything else (interrupts, CUDA errors) propagates.
        # NOTE(review): in this branch the output files are still named after
        # the checkpoint although the base model generated them.
        print('could not load', checkpoint_name, '-', load_error)
        model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
        print('downloaded')

    model.resize_token_embeddings(len(tokenizer))
    # Inference only: switch to eval mode once per loaded model.
    model.eval()
    print(max_length)

    # NOTE(review): nrows=0 reads no titles, so the files for that setting are
    # written empty - confirm range(10) rather than range(1, 11) is intended.
    for nrows in range(10):
        print('nrows', nrows)
        # The prompt titles depend only on nrows, so read them once here
        # instead of once per generation length.
        ff_texts = pd.read_csv('data2.csv', nrows=nrows)
        for length in range(100, 951, 50):
            print('length', length)
            output_name = checkpoint_name + '_output_nRows_' + str(nrows) + '_length_' + str(length) + '.txt'
            # Explicit UTF-8 so generated text is written the same way on
            # every platform.
            with open(os.path.join(generated_dir, output_name), 'w', encoding='utf-8') as file:
                for title in ff_texts['title']:
                    # input_text = "Title: Sharptalon's Claw\nDescription:"
                    input_text = 'Title: ' + title + '\nDialogue:'
                    # input_text = title
                    print(input_text)
                    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.cuda()
                    try:
                        # Draw 15 sampled sequences per prompt.
                        sample_outputs = model.generate(
                            input_ids=input_ids,
                            pad_token_id=tokenizer.pad_token_id,
                            do_sample=True,
                            top_k=50,
                            max_length=length,
                            top_p=0.95,
                            temperature=0.7,
                            num_return_sequences=15
                        )
                        # Decode the generated texts and append them to the file
                        generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in sample_outputs]
                        file.writelines([f"Generated text {i+1}:\n{text}\n" for i, text in enumerate(generated_texts)])
                        file.writelines(['\n', '\n', '\n', '\n'])

                    except RuntimeError as e:
                        print("RuntimeError during generation:", e)
                        print('ERROR_checkpoint', checkpoint)
                        print('ERROR_nrows', nrows)
                        print('ERROR_length', length)

                        # Additional Debugging: Check logits
                        with torch.no_grad():
                            outputs = model(input_ids=input_ids)
                            logits = outputs.logits
                            assert not torch.isnan(logits).any(), "logits contain NaNs"
                            assert not torch.isinf(logits).any(), "logits contain Infs"
                            print("Logits sample:", logits[0, -1, :10])

checkpoint 799


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


saved
1023
nrows 0
length 100
length 150
length 200
length 250
length 300
length 350
length 400
length 450
length 500
length 550
length 600
length 650
length 700
length 750
length 800
length 850
length 900
length 950
nrows 1
length 100
Title: A Bargain Struck
Dialogue:
length 150
Title: A Bargain Struck
Dialogue:
length 200
Title: A Bargain Struck
Dialogue:
length 250
Title: A Bargain Struck
Dialogue:
length 300
Title: A Bargain Struck
Dialogue:
length 350
Title: A Bargain Struck
Dialogue:
length 400
Title: A Bargain Struck
Dialogue:
length 450
Title: A Bargain Struck
Dialogue:
length 500
Title: A Bargain Struck
Dialogue:
length 550
Title: A Bargain Struck
Dialogue:
length 600
Title: A Bargain Struck
Dialogue:
length 650
Title: A Bargain Struck
Dialogue:
length 700
Title: A Bargain Struck
Dialogue:
length 750
Title: A Bargain Struck
Dialogue:
length 800
Title: A Bargain Struck
Dialogue:
length 850
Title: A Bargain Struck
Dialogue:
length 900
Title: A Bargain Struck
Dialogue:
length 950