In [1]:
import gc
import os
import torch
import pandas as pd
from torch.utils.data import Dataset, random_split
from transformers import TrainingArguments, Trainer, AutoModelForCausalLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
output_path = 'Models/t5-base/wow'
model_name = "google-t5/t5-base"

torch.manual_seed(42)
texts = pd.read_csv('data_wow.csv')
tokenizer = AutoTokenizer.from_pretrained(model_name, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
valid_dataset = []
for sentence in texts['sentence']:
    if len(tokenizer.encode(sentence)) < 1024:
        valid_dataset.append(sentence)
        
class TextDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.labels = []
        self.input_ids = []
        self.attn_masks = []        
        for sentence in txt_list:
            encodings_dict = tokenizer(sentence, truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    def __len__(self): return len(self.input_ids)
    def __getitem__(self, idx): return self.input_ids[idx], self.attn_masks[idx]

max_length = max([len(tokenizer.encode(sentence)) for sentence in valid_dataset])
text_dataset = TextDataset(valid_dataset, tokenizer, max_length=max_length)
train_size = int(0.8 * len(valid_dataset))
train_dataset, val_dataset = random_split(text_dataset, [train_size, len(text_dataset) - train_size])
# print(texts)
print('train_size', train_size)
print('valid_dataset', len(valid_dataset))
print('max_length', max_length)
os.environ["WANDB_PROJECT"]='t5-base-wow'
os.environ["WANDB_LOG_MODEL"]="true"
os.environ["WANDB_WATCH"]="false"
os.environ["WANDB_NAME"]="t5-base-wow"
os.environ["WANDB_API_KEY"] = "b689f7c91f1ec7520fa8da927f175f1efd587181"

train_size 20432
valid_dataset 25541
max_length 237


In [2]:
from transformers import AutoModelForSeq2SeqLM
try:
    model = AutoModelForSeq2SeqLM.from_pretrained(os.path.join(output_path, 'results', 'checkpoint-511')).cuda() #5621
    print('saved')
except:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()
    print('downloaded')


# model.resize_token_embeddings(len(tokenizer))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


saved


In [3]:
from transformers import EarlyStoppingCallback
torch.cuda.empty_cache()

training_args = Seq2SeqTrainingArguments(output_dir=os.path.join(output_path, 'results'),
                                  num_train_epochs=25,
                                  load_best_model_at_end=True,
                                  overwrite_output_dir=True,
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  per_device_train_batch_size=20,
                                  per_device_eval_batch_size=20,
                                  warmup_steps=100,
                                  weight_decay=0.03,
                                  gradient_accumulation_steps=2,
                                  logging_dir=os.path.join(output_path, 'logs'),
                                  report_to = 'wandb')

trainer = Seq2SeqTrainer(model=model,
        args=training_args,
        train_dataset = train_dataset, 
        eval_dataset = val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        data_collator = lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                      'attention_mask': torch.stack([f[1] for f in data]),
                                      'labels': torch.stack([f[0] for f in data])})

trainer.train()
# model.save_pretrained(os.path.join(output_path, 'results'))
# tokenizer.save_pretrained(os.path.join(output_path, 'results'))

# add t5 model to training
# add gpt-2-large 

[2024-07-31 06:02:02,179] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




[34m[1mwandb[0m: Currently logged in as: [33mgarbacik-mateusz[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,3.7526,0.301425
2,0.2705,0.160294
3,0.154,0.097471
4,0.098,0.063786
5,0.0664,0.0441
6,0.0471,0.031855
7,0.0347,0.023482
8,0.0262,0.017688
9,0.0201,0.013652
10,0.0157,0.010699


KeyboardInterrupt: 

In [3]:
input_text = "Title: Sharptalon's Claw"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.cuda()

model.eval()
try:
    sample_outputs = model.generate(
        input_ids=input_ids,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,
        top_k=50,
        max_length=300,
        top_p=0.95,
        temperature=0.7,
        num_return_sequences=50
    )
    print(sample_outputs[0])
    # Decode and print generated texts
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in sample_outputs]
    print(generated_texts)
    with open(os.path.join(output_path, 'results','output.txt'), 'w') as file:
        file.writelines([f"Generated text {i+1}:\n{text}\n" for i, text in enumerate(generated_texts)])

except RuntimeError as e:
    print("RuntimeError during generation:", e)

    # Additional Debugging: Check logits
    with torch.no_grad():
        outputs = model(input_ids=input_ids)
        logits = outputs.logits
        assert not torch.isnan(logits).any(), "logits contain NaNs"
        assert not torch.isinf(logits).any(), "logits contain Infs"
        print("Logits sample:", logits[0, -1, :10])


tensor([    0, 11029,    10, 22130,  1947,   106,    31,     7,   205,  4207,
           10, 22130,  1947,   106,    31,     7,   205,  4207, 11029,    10,
        22130,  1947,   106,    31,     7,   205,  4207,    10, 22130,  1947,
          106,    31,     7,   205,  4207,    10, 22130,  1947,   106,    31,
            7,   205,  4207,    10, 22130,  1947,   106,    31,     7,   205,
         4207,    10,   205,  4207,    10, 22130,  1947,   106,    31,     7,
          205,  4207,    10,   205,  4207,    10, 22130,  1947,   106,    31,
            7,   205,  4207,    10,   205,  4207,    10, 22130,  1947,   106,
           31,     7,   205,  4207,    10, 22130,  1947,   106,    31,     7,
          205,  4207,    10,   205,  4207,    10, 22130,  1947,   106,    31,
            7,   205,  4207,    10,   205,  4207,    10, 22130,  1947,   106,
           31,     7,   205,  4207,    10, 22130,  1947,   106,    31,     7,
          205,  4207,    10, 22130,  1947,   106,    31,     7, 