In [1]:
import gc
import os
import torch
import pandas as pd
from torch.utils.data import Dataset, random_split
from transformers import TrainingArguments, Trainer, AutoModelForCausalLM, AutoTokenizer
# Base directory for this run; later cells join it with 'results' and 'logs'.
output_path = 'Models/gpt2-large/wow-test'
# Only the first 2000 rows are loaded; the commented-out line below reads the whole file.
texts = pd.read_csv('data_wow.csv', nrows=2000)
# texts = pd.read_csv('data_wow.csv')

# Seed torch's global RNG before any random operation in this notebook runs
# (e.g. the random_split further down in this cell).
torch.manual_seed(42)
model_name = "gpt2-large"
# The tokenizer is created with explicit BOS/EOS/PAD special tokens. Any of them
# that are new to the vocabulary change len(tokenizer), which is why the model's
# embeddings are resized to len(tokenizer) right after the model is loaded.
tokenizer = AutoTokenizer.from_pretrained(model_name, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')

class TextDataset(Dataset):
    """Pre-tokenised text samples held in memory.

    Every entry of ``txt_list['sentence']`` is tokenised once at construction
    time, truncated to ``max_length`` and padded to ``max_length``, so all
    samples have the same length.

    Args:
        txt_list: mapping-like object whose ``'sentence'`` entry is an iterable
            of strings (the notebook passes a DataFrame).
        tokenizer: callable returning a dict with ``'input_ids'`` and
            ``'attention_mask'`` for a single string.
        max_length: length every sample is truncated/padded to.
    """

    def __init__(self, txt_list, tokenizer, max_length):
        # `labels` is created but never filled by this class.
        self.labels = []
        self.input_ids = []
        self.attn_masks = []
        for text in txt_list['sentence']:
            encoded = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
            ids = torch.tensor(encoded['input_ids'])
            mask = torch.tensor(encoded['attention_mask'])
            self.input_ids.append(ids)
            self.attn_masks.append(mask)

    def __len__(self):
        """Number of tokenised samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx]

# Every sample is padded/truncated to the longest tokenised sentence among the loaded rows.
max_length = max(len(tokenizer.encode(text)) for text in texts['sentence'])
dataset = TextDataset(texts, tokenizer, max_length=max_length)

# 90/10 train/validation split; the split is random (torch's global RNG was seeded above).
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(texts)
print(train_size)
print(val_size)

# os.environ["WANDB_PROJECT"]='gpt-neo-125M'
# os.environ["WANDB_LOG_MODEL"]="true"
# os.environ["WANDB_WATCH"]="false"
# os.environ["WANDB_NAME"]="gpt-neo-wow"
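# os.environ["WANDB_API_KEY"] = "<redacted>"  # never commit API keys: use `wandb login` or export WANDB_API_KEY in the shell, and rotate the key that was previously written here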

                                               sentence
0     <|startoftext|>Title: Sharptalon's Claw Descri...
1     <|startoftext|>Title: Riverpaw Gnoll Bounty De...
2     <|startoftext|>Title: Give Gerard a Drink Desc...
3     <|startoftext|>Title: Ursangous' Paw Descripti...
4     <|startoftext|>Title: Shadumbra's Head Descrip...
...                                                 ...
1995  <|startoftext|>Title: Invasion Point: Cataclys...
1996  <|startoftext|>Title: Invasion Point: Cataclys...
1997  <|startoftext|>Title: Tabards of the Illidari ...
1998  <|startoftext|>Title: Dissension Amongst the R...
1999  <|startoftext|>Title: Little Embers Descriptio...

[2000 rows x 1 columns]
1800
200


In [2]:
# try:
#     # model = AutoModelForCausalLM.from_pretrained(os.path.join(output_path, 'results')).cuda()
#     model = AutoModelForCausalLM.from_pretrained(os.path.join(output_path, 'results', 'checkpoint-1825')).cuda()
#     print('saved')
# except:
#     model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
#     print('downloaded')

# Load the pretrained checkpoint named by model_name and move it to the GPU (this line needs CUDA).
model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
# Make the embedding matrix match the tokenizer's current vocabulary size
# (the tokenizer was created with explicit special tokens in the first cell).
model.resize_token_embeddings(len(tokenizer))
# Padded sequence length computed in the first cell.
print(max_length)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


172


In [3]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

def decode(input_ids_tensor):
    """Turn a sequence of token ids into text with special tokens removed.

    Args:
        input_ids_tensor: object exposing ``.tolist()`` that yields the token ids.

    Returns:
        The string produced by the notebook-level ``tokenizer``.
    """
    return tokenizer.decode(input_ids_tensor.tolist(), skip_special_tokens=True)

import torch.nn.functional as F
def decodeLogits(logits):
    """Greedy-decode an array of vocabulary scores into text.

    Args:
        logits: array-like of scores whose last axis indexes the vocabulary
            (nested list, NumPy array or tensor).

    Returns:
        The text obtained by taking the highest-scoring token id along the last
        axis and decoding it with the notebook-level ``tokenizer``, special
        tokens skipped.
    """
    # as_tensor reuses the existing buffer when it can, instead of the forced
    # copy (and UserWarning on tensor input) that torch.tensor() performs.
    logits_tensor = torch.as_tensor(logits, dtype=torch.float)

    # Softmax is strictly monotonic, so the argmax of the raw scores is the
    # argmax of the probabilities; skipping it avoids a full pass over the
    # vocabulary axis for every position.
    token_ids = torch.argmax(logits_tensor, dim=-1)
    return tokenizer.decode(token_ids, skip_special_tokens=True)

# !pip install bert-score
import bert_score

def calculate_bertscore(predictions, references, lang='en',
                        model_type='distilbert-base-uncased', verbose=True):
    """Average BERTScore precision/recall/F1 over paired texts.

    Args:
        predictions: list of candidate strings.
        references: list of reference strings, paired with ``predictions``.
        lang: language code forwarded to ``bert_score.score``.
        model_type: scoring model forwarded to ``bert_score.score``. Defaults to
            the model this function previously hard-coded; pass ``None`` to let
            the library choose from ``lang``.
        verbose: forwarded to ``bert_score.score`` (defaults to the previously
            hard-coded ``True``).

    Returns:
        dict with the mean ``'precision'``, ``'recall'`` and ``'f1'`` as floats.
    """
    P, R, F1 = bert_score.score(predictions, references, lang=lang,
                                model_type=model_type, verbose=verbose)

    return {
        'precision': P.mean().item(),
        'recall': R.mean().item(),
        'f1': F1.mean().item(),
    }

from rouge_score import rouge_scorer

def compute_rouge_in_chunks(candidates, references, chunk_size=100):
    """Average ROUGE-1/2/L F-measures over paired samples, walked in chunks.

    Args:
        candidates: sliceable sequence of per-sample scores; each item is turned
            into text with ``decodeLogits``.
        references: sliceable sequence of per-sample token ids; each item is
            turned into text with ``decode``.
        chunk_size: number of pairs taken per slice.

    Returns:
        dict mapping 'rouge1', 'rouge2' and 'rougeL' to the mean F-measure.
        Every per-sample score dict is printed as it is computed.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    per_sample = {name: [] for name in metric_names}

    for start in range(0, len(candidates), chunk_size):
        stop = start + chunk_size
        candidate_chunk = candidates[start:stop]
        reference_chunk = references[start:stop]

        for logits, label_ids in zip(candidate_chunk, reference_chunk):
            reference_text = decode(label_ids)
            candidate_text = decodeLogits(logits)
            scores = scorer.score(reference_text, candidate_text)
            print(scores)
            for name in per_sample:
                per_sample[name].append(scores[name].fmeasure)

    return {name: sum(values) / len(values) for name, values in per_sample.items()}

from nltk.translate.bleu_score import sentence_bleu

def compute_metrics(pred):
    """Compute BLEU, ROUGE and BERTScore for one evaluation pass and log them to W&B.

    Args:
        pred: object exposing ``label_ids`` (reference token ids per sample) and
            ``predictions`` (per-sample scores that ``decodeLogits`` turns into
            text). The Trainer passes it in through its ``compute_metrics`` hook.

    Returns:
        dict with keys 'bleu', 'rouge1', 'rouge2', 'rougeL', 'precision',
        'recall' and 'f1', each averaged over the evaluated samples.
    """
    # Local import: nothing in the notebook imports wandb at top level, so the
    # bare `wandb.log` call would raise NameError on a fresh kernel.
    import wandb

    references = pred.label_ids
    generated_texts = pred.predictions

    # Decode every pair once up front.
    # NOTE(review): predictions at padded positions are decoded too, while pad
    # tokens are stripped from the reference text — confirm this is intended.
    reference_texts = []
    predicted_texts = []
    for reference, logits in zip(references, generated_texts):
        reference_texts.append(decode(reference))
        predicted_texts.append(decodeLogits(logits))

    # sentence_bleu expects token lists; given raw strings it iterates over
    # characters and yields a character-level score, so split into words first.
    bleu_scores = [
        sentence_bleu([reference_text.split()], predicted_text.split())
        for reference_text, predicted_text in zip(reference_texts, predicted_texts)
    ]

    # One batched call instead of one call per sample: calculate_bertscore
    # already returns the mean over its inputs, and every separate call pays the
    # scorer's full start-up cost again.
    bert = calculate_bertscore(predicted_texts, reference_texts)

    rouge = compute_rouge_in_chunks(generated_texts, references)

    metric = {
        'bleu': sum(bleu_scores) / len(bleu_scores),
        'rouge1': rouge['rouge1'],
        'rouge2': rouge['rouge2'],
        'rougeL': rouge['rougeL'],
        'precision': bert['precision'],
        'recall': bert['recall'],
        'f1': bert['f1'],
    }
    wandb.log(metric)
    return metric

In [None]:
from transformers import EarlyStoppingCallback
# Release cached GPU memory that PyTorch's allocator is holding but not using, before training starts.
torch.cuda.empty_cache()

from transformers import TrainerCallback

class CustomCallback(TrainerCallback):
    """Trainer callback that prints every log record handed to ``on_log``."""
    # NOTE(review): this class is not in the `callbacks=[...]` list passed to
    # the Trainer below, so it is currently unused.
    def on_log(self, args, state, control, logs=None, **kwargs):
        # `logs` is printed as received; it defaults to None.
        print(logs)

def collate_for_causal_lm(data):
    """Stack ``(input_ids, attention_mask)`` samples into one batch dict.

    The labels are a copy of the input ids, i.e. the model is trained to
    reproduce its own input.
    NOTE(review): padded positions stay in the labels, so padding appears to
    contribute to the loss. Masking them to -100 would also require the metric
    code to skip -100 when decoding references, so it is left unchanged here.
    """
    input_ids = torch.stack([sample[0] for sample in data])
    attention_mask = torch.stack([sample[1] for sample in data])
    return {'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': input_ids.clone()}


training_args = TrainingArguments(output_dir=os.path.join(output_path, 'results'),
                                  num_train_epochs=25,
                                  load_best_model_at_end=True,
                                  overwrite_output_dir=True,
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  per_device_train_batch_size=2, #56
                                  per_device_eval_batch_size=2,
                                  # compute_metrics needs the full per-token
                                  # logits; without this they all accumulate on
                                  # the GPU for the whole validation set. Move
                                  # them to the CPU every few eval steps instead.
                                  eval_accumulation_steps=8,
                                  warmup_steps=10,
                                  logging_steps=1,
                                  weight_decay=0.05,
                                  logging_dir=os.path.join(output_path, 'logs'),
                                  report_to = 'wandb')

trainer = Trainer(model=model,
        args=training_args,
        train_dataset = train_dataset,
        eval_dataset = val_dataset,
        compute_metrics=compute_metrics,
        # Stop once the monitored metric has not improved for 3 evaluations.
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        data_collator = collate_for_causal_lm)
# Fine-tune the model
trainer.train()
# Persist the final model and tokenizer next to the checkpoints.
model.save_pretrained(os.path.join(output_path, 'results'))
tokenizer.save_pretrained(os.path.join(output_path, 'results'))

[2024-09-02 17:29:04,916] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




[34m[1mwandb[0m: Currently logged in as: [33mgarbacik-mateusz[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


In [None]:
# Access a single example from the validation dataset
# print(trainer.evaluate())
# for i in val_dataset:
#     evalInput(i)

In [38]:
import re
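# evalInput (below) uses a regex to split a decoded sample into its "Title: ..." prefix and the text after "Description: ", regenerates text from the title prompt, and scores the generations against the full decoded sample.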
def evalInput(example):
    """Regenerate one dataset sample from its title and print quality metrics.

    The sample's token ids are decoded (special tokens skipped) and split into
    title and description with a regex. The prompt ``"Title: <title>
    Description: "`` is passed to ``generate_predictions`` and every generation
    is scored against the full decoded sample with ROUGE, BLEU and BERTScore.
    Everything is printed; nothing is returned.

    Args:
        example: indexable sample whose first element holds the token ids and
            exposes ``.tolist()``.
    """
    input_ids_tensor = example[0]
    decoded_text = tokenizer.decode(input_ids_tensor.tolist(), skip_special_tokens=True)

    # Group 1 is the title; group 2 (the description) is only needed to anchor the match.
    match = re.match(r'^Title: (.*?) Description: (.*)', decoded_text, re.DOTALL)
    if not match:
        print("No match found.")
        return

    title = match.group(1)
    input_text = f"Title: {title} Description: "
    print(input_text)
    predictions = generate_predictions(input_text)
    print(predictions)

    # generate_predictions yields nothing usable when generation fails; stop
    # here instead of crashing while iterating over None.
    if not predictions:
        print("No predictions were generated.")
        return

    # Each prediction is compared with the same reference. Passing the bare
    # string would make compute_rouge pair predictions with single characters.
    references = [decoded_text] * len(predictions)
    print(compute_rouge(predictions, references))

    # sentence_bleu works on token lists; raw strings would be scored character by character.
    reference_tokens = decoded_text.split()
    bleu_scores = []
    bert_scores = []
    for generated_text in predictions:
        bleu_scores.append(sentence_bleu([reference_tokens], generated_text.split()))
        bert_scores.append(calculate_bertscore([generated_text], [decoded_text]))

    for score in bert_scores:
        print(score)

    count = len(bert_scores)
    print({
        'bleu': sum(bleu_scores) / len(bleu_scores),
        'precision': sum(score['precision'] for score in bert_scores) / count,
        'recall': sum(score['recall'] for score in bert_scores) / count,
        'f1': sum(score['f1'] for score in bert_scores) / count,
    })

# !pip install bert-score
import bert_score

def calculate_bertscore(predictions, references, lang='en', model_type=None, verbose=False):
    """Average BERTScore precision/recall/F1 over paired texts.

    An earlier cell of this notebook defines a function with the same name that
    pins ``model_type='distilbert-base-uncased'``; once this cell runs, this
    definition replaces it. ``model_type`` and ``verbose`` are exposed so callers
    can request the same scoring model explicitly; their defaults keep this
    cell's previous behaviour (the library picks the model from ``lang``).

    Args:
        predictions: list of candidate strings.
        references: list of reference strings, paired with ``predictions``.
        lang: language code forwarded to ``bert_score.score``.
        model_type: scoring model forwarded to ``bert_score.score``; ``None``
            lets the library choose from ``lang``.
        verbose: forwarded to ``bert_score.score``.

    Returns:
        dict with the mean ``'precision'``, ``'recall'`` and ``'f1'`` as floats.
    """
    P, R, F1 = bert_score.score(predictions, references, lang=lang,
                                model_type=model_type, verbose=verbose)

    return {
        'precision': P.mean().item(),
        'recall': R.mean().item(),
        'f1': F1.mean().item(),
    }

# Example usage
# predictions = [
#     "The cat sat on the mat.",
#     "The quick brown fox jumps over the lazy dog."
# ]
# references = [
#     "A cat was sitting on a rug.",
#     "A speedy brown fox leaps over a lazy canine."
# ]

# scores = calculate_bertscore(predictions, references)
# print(scores)

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
def compute_rouge(predictions, references):
    """Average ROUGE-1 and ROUGE-L F-measures over prediction/reference pairs.

    Args:
        predictions: list of generated strings (a single string is treated as a
            one-element list).
        references: list of reference strings paired with ``predictions``, or a
            single string that is used as the reference for every prediction.

    Returns:
        dict with the mean ``"rouge1"`` and ``"rougeL"`` F-measure; both are
        0.0 when there is nothing to score.
    """
    # A bare string is itself iterable, so zip() would silently pair each
    # prediction with one *character* of it. Normalise to lists first.
    if isinstance(predictions, str):
        predictions = [predictions]
    if isinstance(references, str):
        references = [references] * len(predictions)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge1_scores = []
    rougeL_scores = []

    for ref, pred in zip(references, predictions):
        scores = scorer.score(ref, pred)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

    # Nothing scored: report zeros instead of dividing by zero.
    if not rouge1_scores:
        return {"rouge1": 0.0, "rougeL": 0.0}

    return {
        "rouge1": sum(rouge1_scores) / len(rouge1_scores),
        "rougeL": sum(rougeL_scores) / len(rougeL_scores)
    }

# Epoch 	Training Loss 	Validation Loss
# 1 	No log 	1.520463
# 2 	1.652300 	1.467383
# 3 	1.393100 	1.441400
# 4 	1.393100 	1.428227
# 5 	1.294700 	1.422623
# 6 	1.205200 	1.425824
# 7 	1.140800 	1.428631
# 8 	1.140800 	1.444082

# There were missing keys in the checkpoint model loaded: ['lm_head.weight'].

# TrainOutput(global_step=2920, training_loss=1.2997734801409995, metrics={'train_runtime': 4592.453, 'train_samples_per_second': 111.226, 'train_steps_per_second': 1.987, 'total_flos': 1.5760779141316608e+16, 'train_loss': 1.2997734801409995, 'epoch': 8.0})

In [3]:
def generate_predictions(input_text, num_return_sequences=10, max_length=300):
    """Sample continuations of ``input_text`` from the notebook-level model.

    Args:
        input_text: prompt string.
        num_return_sequences: how many samples to draw (default 10, the value
            this function previously hard-coded).
        max_length: value forwarded to ``model.generate`` as ``max_length``
            (default 300, as before).

    Returns:
        List of decoded strings (special tokens skipped), or ``None`` when
        generation raised a ``RuntimeError``. In that case the error and a
        sample of the logits are printed for debugging.

    Raises:
        AssertionError: if, after a failed generation, the logits of a plain
            forward pass contain NaN or Inf values.
    """
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.cuda()
    model.eval()
    try:
        sample_outputs = model.generate(
            input_ids=input_ids,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            top_k=50,
            max_length=max_length,
            top_p=0.95,
            temperature=0.7,
            num_return_sequences=num_return_sequences
        )
        return [tokenizer.decode(output, skip_special_tokens=True) for output in sample_outputs]

    except RuntimeError as e:
        print("RuntimeError during generation:", e)

        # Additional debugging: run a plain forward pass and check the logits.
        with torch.no_grad():
            outputs = model(input_ids=input_ids)
            logits = outputs.logits
            assert not torch.isnan(logits).any(), "logits contain NaNs"
            assert not torch.isinf(logits).any(), "logits contain Infs"
            print("Logits sample:", logits[0, -1, :10])

        # Made explicit: callers must handle the "no predictions" case.
        return None


In [4]:
# Sample 100 continuations for one fixed prompt and write them to a text file.
# NOTE(review): this prompt uses " \nDescription:" whereas evalInput in this
# notebook builds "Title: {title} Description: " — confirm which form matches
# the training data.
input_text = "Title: Sharptalon's Claw \nDescription:"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.cuda()

model.eval()
try:
    sample_outputs = model.generate(
        input_ids=input_ids,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,
        top_k=50,
        max_length=300,
        top_p=0.95,
        temperature=0.7,
        num_return_sequences=100
    )
    # Decode the generated texts and write them out.
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in sample_outputs]

    # The results directory only exists once something has been saved there;
    # create it so this cell also works on its own.
    results_dir = os.path.join(output_path, 'results')
    os.makedirs(results_dir, exist_ok=True)
    # Explicit UTF-8 so generated non-ASCII characters never depend on the platform's default encoding.
    with open(os.path.join(results_dir, 'output2.txt'), 'w', encoding='utf-8') as file:
        file.writelines([f"Generated text {i+1}:\n{text}\n" for i, text in enumerate(generated_texts)])

except RuntimeError as e:
    print("RuntimeError during generation:", e)

    # Additional debugging: run a plain forward pass and check the logits.
    with torch.no_grad():
        outputs = model(input_ids=input_ids)
        logits = outputs.logits
        assert not torch.isnan(logits).any(), "logits contain NaNs"
        assert not torch.isinf(logits).any(), "logits contain Infs"
        print("Logits sample:", logits[0, -1, :10])
