In [1]:
from datasets import load_dataset, load_metric
from torch.utils.data import Dataset
from tqdm import tqdm
import torch
import transformers
from transformers import AutoTokenizer
from tokenizers.processors import BertProcessing
import numpy as np
from transformers import MambaForCausalLM, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer

In [2]:
# Checkpoint used for both the tokenizer and (later) the model weights.
model_id = "EleutherAI/pythia-160m"

# Token budget for one feature (question and context together).
max_length = 512
# Overlap allowed between two consecutive pieces of a context when it has to be split.
doc_stride = 128

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Set the end-of-text token explicitly and reuse it as the padding token.
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token
# Disabled experiments kept for reference:
#   - tokenizer.padding_side = "left"
#   - adding "<CLS>"/"<SEP>" special tokens with a BertProcessing post-processor
pad_on_right = tokenizer.padding_side == "right"

# Load SQuAD and show the splits.
datasets = load_dataset("squad")
print(datasets)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})


In [3]:
def length_filter(example, max_tokens=None):
    """Tell whether an example's full prompt-plus-answer text fits the token budget.

    The text is built as ``<context>\\n\\nQ: <question>\\nA: <answer><|endoftext|>``
    using only the first reference answer, then tokenized with the module-level
    ``tokenizer``.

    Args:
        example: A mapping with ``context``, ``question`` and ``answers`` keys,
            where ``example["answers"]["text"]`` is a list of answer strings.
        max_tokens: Maximum number of tokens allowed. When ``None`` (the default)
            the module-level ``max_length`` is used, so the filter stays in sync
            with the generation budget instead of repeating a literal.

    Returns:
        ``True`` if the tokenized text has at most ``max_tokens`` tokens.
    """
    if max_tokens is None:
        max_tokens = max_length

    # Only the first answer is counted; fall back to an empty answer rather than
    # raising IndexError when an example carries no reference answer.
    answer_texts = example["answers"]["text"]
    first_answer = answer_texts[0] if answer_texts else ""

    full_text = f"{example['context']}\n\nQ: {example['question']}\nA: {first_answer}<|endoftext|>"
    total_length = len(tokenizer(full_text)["input_ids"])
    return total_length <= max_tokens

# Drop every example (in all splits) that length_filter rejects, then show the new sizes.
datasets = datasets.filter(function=length_filter)
print(datasets)

Filter:   0%|          | 0/87599 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10570 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87428
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10515
    })
})


In [4]:
# Load the causal LM weights for model_id and move them to the first CUDA device.
model = AutoModelForCausalLM.from_pretrained(model_id).to("cuda:0")
# model.config.keys_to_ignore_at_inference = ["cache_params", "hidden_states"]
print(model)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 768)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-11): 12 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=768, out_features=2304, bias=True)
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=768, out_features=3072, bias=True)
          (dense_4h_to_h): Linear(in_features=3072, out_features=768, bias=True)
          

In [5]:
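# Disabled: save the fine-tuned weights, then reload them for evaluation.
# While these lines stay commented out, `model` is still the checkpoint loaded from `model_id`.
# trainer.save_model("Pythia-finetuned-squadCausal/Final")
# model = AutoModelForCausalLM.from_pretrained("Pythia-finetuned-squadCausal/Final")
# model = model.to("cuda:0")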

In [6]:
def prepare_validation_features(example):
    """Generate an answer for one example and store it under ``example["guess"]``.

    The prompt is ``<context>\\n\\nQ: <question>\\nA:``. It is encoded with the
    module-level ``tokenizer`` and completed by the module-level ``model`` with a
    total length limit of ``max_length`` tokens. The guess is the text generated
    after the prompt, cut at the first blank line and stripped.

    Args:
        example: A mapping with at least ``context`` and ``question`` strings.
            It is modified in place: ``question`` loses its leading whitespace
            and a ``guess`` key is added.

    Returns:
        The same ``example`` mapping, with ``guess`` filled in.
    """
    # Leading whitespace in the question carries no information and only
    # lengthens the prompt, so drop it.
    example["question"] = example["question"].lstrip()

    text = f"{example['context']}\n\nQ: {example['question']}\nA:"
    # Put the prompt on the same device as the model instead of assuming CUDA.
    input_ids = torch.LongTensor([tokenizer.encode(text)]).to(model.device)
    prompt_len = input_ids.shape[1]

    out = model.generate(
        input_ids,
        max_length=max_length,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Decode only the tokens produced after the prompt. Removing the prompt by
    # string replacement on the decoded text is unreliable: if decoding does not
    # reproduce the prompt exactly, nothing is removed and the "guess" becomes
    # the first paragraph of the context.
    completion = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

    # Keep only the first paragraph of the continuation as the answer.
    guess = completion.split("\n\n")[0].strip()
    example["guess"] = guess

    return example

# Generate a guess for every validation example, one example per call.
validation_split = datasets["validation"]
tokenized_validsets = validation_split.map(prepare_validation_features, batched=False)
print(validation_split)
print(tokenized_validsets)

Map:   0%|          | 0/10515 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10515
})
Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'guess'],
    num_rows: 10515
})


In [7]:
from evaluate import load

# Score the generated guesses against the reference answers with the SQuAD metric.
squad_metric = load("squad")

predictions = []
references = []
for row in tokenized_validsets:
    predictions.append({"id": row["id"], "prediction_text": row["guess"]})
    references.append({"id": row["id"], "answers": row["answers"]})

results = squad_metric.compute(predictions=predictions, references=references)
print(results)

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

{'exact_match': 0.24726581074655254, 'f1': 5.387292106209826}
