In [1]:
import gc
import os
import torch
import pandas as pd
from torch.utils.data import Dataset, random_split
from transformers import TrainingArguments, Trainer, AutoModelForCausalLM, AutoTokenizer
output_path = 'Models/gpt-neo/125M-wow'
texts = pd.read_csv('data_wow.csv')

torch.manual_seed(42)
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')

class TextDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.labels = []
        self.input_ids = []
        self.attn_masks = []        
        for sentence in txt_list['sentence']:
            encodings_dict = tokenizer(sentence, truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    def __len__(self): return len(self.input_ids)
    def __getitem__(self, idx): return self.input_ids[idx], self.attn_masks[idx]

max_length = max([len(tokenizer.encode(sentence)) for sentence in texts['sentence']])
dataset = TextDataset(texts, tokenizer, max_length=max_length)
train_size = int(0.8 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])
print(texts)
print(train_size)
print(len(dataset) - train_size)

# os.environ["WANDB_PROJECT"]='gpt-neo-125M'
# os.environ["WANDB_LOG_MODEL"]="true"
# os.environ["WANDB_WATCH"]="false"
# os.environ["WANDB_NAME"]="gpt-neo-wow"
# os.environ["WANDB_API_KEY"] = "b689f7c91f1ec7520fa8da927f175f1efd587181"

                                                sentence
0      <|startoftext|>Title: Sharptalon's Claw Descri...
1      <|startoftext|>Title: Riverpaw Gnoll Bounty De...
2      <|startoftext|>Title: Give Gerard a Drink Desc...
3      <|startoftext|>Title: Ursangous' Paw Descripti...
4      <|startoftext|>Title: Shadumbra's Head Descrip...
...                                                  ...
25536  <|startoftext|>Title: Practice Makes Perfect: ...
25537  <|startoftext|>Title: Practice Makes Perfect: ...
25538  <|startoftext|>Title: Pedgi the Parched Descri...
25539  <|startoftext|>Title: Eon's Fringe Description...
25540  <|startoftext|>Title: To Soridormi's Aid Descr...

[25541 rows x 1 columns]
20432
5109


In [2]:
try:
    # model = AutoModelForCausalLM.from_pretrained(os.path.join(output_path, 'results')).cuda()
    model = AutoModelForCausalLM.from_pretrained(os.path.join(output_path, 'results', 'checkpoint-1825')).cuda()
    print('saved')
except:
    model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
    print('downloaded')

model.resize_token_embeddings(len(tokenizer))
print(max_length)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


OutOfMemoryError: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 

In [3]:
from transformers import EarlyStoppingCallback
torch.cuda.empty_cache()

training_args = TrainingArguments(output_dir=os.path.join(output_path, 'results'),
                                  num_train_epochs=25,
                                  load_best_model_at_end=True,
                                  overwrite_output_dir=True,
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  per_device_train_batch_size=56,
                                  per_device_eval_batch_size=56,
                                  warmup_steps=10,
                                  weight_decay=0.05,
                                  logging_dir=os.path.join(output_path, 'logs'),
                                  report_to = 'wandb')

trainer = Trainer(model=model,
        args=training_args,
        train_dataset = train_dataset, 
        eval_dataset = val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        data_collator = lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                      'attention_mask': torch.stack([f[1] for f in data]),
                                      'labels': torch.stack([f[0] for f in data])})

trainer.train()
model.save_pretrained(os.path.join(output_path, 'results'))
tokenizer.save_pretrained(os.path.join(output_path, 'results'))

[2024-07-29 09:40:23,792] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




[34m[1mwandb[0m: Currently logged in as: [33mgarbacik-mateusz[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,1.520463
2,1.652300,1.467383
3,1.393100,1.4414
4,1.393100,1.428227
5,1.294700,1.422623
6,1.205200,1.425824
7,1.140800,1.428631
8,1.140800,1.444082


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=2920, training_loss=1.2997734801409995, metrics={'train_runtime': 4592.453, 'train_samples_per_second': 111.226, 'train_steps_per_second': 1.987, 'total_flos': 1.5760779141316608e+16, 'train_loss': 1.2997734801409995, 'epoch': 8.0})

In [None]:
# Epoch 	Training Loss 	Validation Loss
# 1 	No log 	1.520463
# 2 	1.652300 	1.467383
# 3 	1.393100 	1.441400
# 4 	1.393100 	1.428227
# 5 	1.294700 	1.422623
# 6 	1.205200 	1.425824
# 7 	1.140800 	1.428631
# 8 	1.140800 	1.444082

# There were missing keys in the checkpoint model loaded: ['lm_head.weight'].

# TrainOutput(global_step=2920, training_loss=1.2997734801409995, metrics={'train_runtime': 4592.453, 'train_samples_per_second': 111.226, 'train_steps_per_second': 1.987, 'total_flos': 1.5760779141316608e+16, 'train_loss': 1.2997734801409995, 'epoch': 8.0})

In [4]:
input_text = "Title: Sharptalon's Claw \nDescription:"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.cuda()

model.eval()
try:
    sample_outputs = model.generate(
        input_ids=input_ids,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,
        top_k=50,
        max_length=300,
        top_p=0.95,
        temperature=0.7,
        num_return_sequences=100
    )
    # Decode and print generated texts
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in sample_outputs]
    with open(os.path.join(output_path, 'results','output2.txt'), 'w') as file:
        file.writelines([f"Generated text {i+1}:\n{text}\n" for i, text in enumerate(generated_texts)])

except RuntimeError as e:
    print("RuntimeError during generation:", e)

    # Additional Debugging: Check logits
    with torch.no_grad():
        outputs = model(input_ids=input_ids)
        logits = outputs.logits
        assert not torch.isnan(logits).any(), "logits contain NaNs"
        assert not torch.isinf(logits).any(), "logits contain Infs"
        print("Logits sample:", logits[0, -1, :10])
