In [394]:
from datasets import load_dataset, load_metric
from torch.utils.data import Dataset
from tqdm import tqdm
import torch
import transformers
from transformers import AutoTokenizer
from tokenizers.processors import BertProcessing
import numpy as np
from transformers import MambaForCausalLM, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer

In [164]:
# --- Configuration -----------------------------------------------------------
# Checkpoint identifier used to load the tokenizer here.
model_id = "state-spaces/mamba-130m-hf"

# Token budget for one tokenized example.
max_length = 512
# Overlap size kept from the extractive-QA recipe; not referenced by the cells visible here.
doc_stride = 128

# --- Tokenizer ---------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Use the end-of-text marker both as EOS and as the padding token.
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token

# Padding side is left at the tokenizer's default; record whether that default is "right".
pad_on_right = "right" == tokenizer.padding_side

# --- Data --------------------------------------------------------------------
datasets = load_dataset("squad")
print(datasets)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})


In [245]:
# Sanity check on one training example: build the prompt, locate the answer tokens
# inside the padded sequence, and build the label vector that keeps only the answer.
example = datasets["train"][0]
answer_text = example["answers"]["text"][0]

tokenized_example = tokenizer(
    f"{example['context']}\n\nQ: {example['question']}\nA: {answer_text}<|endoftext|>",
    max_length=max_length,
    truncation=True,
    padding="max_length",
)

x = tokenized_example["input_ids"]
a = np.array(tokenized_example["attention_mask"])

# Index one past the last attended token. A positive index is used on purpose:
# the previous `count_nonzero(a) - 512` became 0 for a sequence with no padding,
# which turned every `[start:0]` slice below into an empty slice. Taking the
# position of the last non-zero mask entry also works for either padding side.
a1 = int(np.flatnonzero(a)[-1]) + 1
print(x)

# The answer as it appears in the prompt: a leading space, the text, then EOS.
answer_code = tokenizer(" " + answer_text + "<|endoftext|>")["input_ids"]
answer_length = len(answer_code)
answer_start = max(a1 - answer_length, 0)

print(answer_length)
print(answer_code)
print(tokenizer.decode(answer_code))
# These two should print the same ids / text as the two lines above.
print(x[answer_start:a1])
print(tokenizer.decode(x[answer_start:a1]))

# Every position is -100 except the answer span, which keeps its token ids.
labels = np.ones_like(a) * -100
labels[answer_start:a1] = x[answer_start:a1]
print(labels)

[18551, 5671, 8572, 13, 253, 2143, 556, 247, 10503, 1894, 15, 2058, 412, 253, 11505, 16790, 434, 5328, 34074, 310, 247, 14072, 23957, 273, 253, 8237, 6393, 15, 41853, 275, 2914, 273, 253, 11505, 16790, 285, 10268, 352, 13, 310, 247, 14295, 23957, 273, 2828, 342, 6174, 598, 42750, 342, 253, 13691, 346, 39685, 614, 2006, 3189, 30542, 5210, 3446, 10209, 281, 253, 11505, 16790, 310, 253, 42385, 3737, 273, 253, 50179, 15382, 15, 41853, 3212, 253, 40683, 3737, 310, 253, 443, 8601, 936, 13, 247, 38005, 1659, 273, 15851, 285, 12906, 15, 733, 310, 247, 36804, 273, 253, 7753, 85, 936, 387, 418, 454, 3229, 13, 6181, 835, 253, 8237, 6393, 34719, 15376, 5420, 281, 11877, 10246, 324, 5464, 36920, 67, 343, 528, 275, 1283, 3680, 15, 2058, 253, 990, 273, 253, 2022, 4446, 313, 395, 275, 247, 1480, 1386, 326, 23417, 949, 495, 38490, 285, 253, 7284, 399, 485, 582, 310, 247, 2969, 13, 4980, 8805, 23957, 273, 6393, 15, 187, 187, 50, 27, 1916, 5207, 858, 253, 8237, 6393, 14163, 3176, 275, 1283, 3680, 275, 41

In [254]:
def length_filter(example):
    """Return True when the full training prompt fits in ``max_length`` tokens.

    The prompt is the context, the question and the first reference answer,
    terminated by the end-of-text marker. It is tokenized without truncation
    so the real token count is measured.

    Args:
        example: one SQuAD row with ``context``, ``question`` and ``answers`` fields.

    Returns:
        bool: whether the tokenized prompt is at most ``max_length`` tokens long.
    """
    prompt = f"{example['context']}\n\nQ: {example['question']}\nA: {example['answers']['text'][0]}<|endoftext|>"
    total_length = len(tokenizer(prompt)["input_ids"])
    # Compare against the shared constant rather than a literal 512 so the filter
    # cannot drift from the truncation length used when building features.
    return total_length <= max_length

datasets = datasets.filter(length_filter)
print(datasets)

Filter: 100%|████████████████████████████████████████████████| 87425/87425 [00:19<00:00, 4401.14 examples/s]
Filter: 100%|████████████████████████████████████████████████| 10514/10514 [00:02<00:00, 4206.55 examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87425
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10514
    })
})





In [255]:
def prepare_train_features(example):
    """Tokenize one SQuAD row into a padded causal-LM training example.

    The prompt is ``"{context}\\n\\nQ: {question}\\nA: {answer}<|endoftext|>"``,
    truncated/padded to ``max_length``. Labels are -100 everywhere except on the
    trailing answer tokens (answer text plus the end-of-text marker), which keep
    their input ids.

    Args:
        example: one SQuAD row with ``context``, ``question`` and ``answers`` fields.

    Returns:
        The tokenizer output with an added ``labels`` entry of length ``max_length``.
    """
    # Leading whitespace in the question carries no information and only costs tokens.
    example["question"] = example["question"].lstrip()
    answer_text = example["answers"]["text"][0]

    tokenized_example = tokenizer(
        f"{example['context']}\n\nQ: {example['question']}\nA: {answer_text}<|endoftext|>",
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    input_ids = tokenized_example["input_ids"]
    attention_mask = tokenized_example["attention_mask"]

    # Index one past the last attended token. The previous negative offset
    # (`count_nonzero - max_length`) evaluated to 0 for a sequence that exactly
    # fills the window, making `[start:0]` an empty slice and leaving such rows
    # with no supervised token at all. A positive index from the mask itself
    # avoids that and does not depend on the padding side.
    end_pos = int(np.flatnonzero(attention_mask)[-1]) + 1

    # Length of the answer as it appears in the prompt: " " + answer + EOS.
    answer_code = tokenizer(" " + answer_text + "<|endoftext|>")["input_ids"]
    start_pos = max(end_pos - len(answer_code), 0)

    labels = np.ones_like(attention_mask) * -100
    labels[start_pos:end_pos] = input_ids[start_pos:end_pos]
    tokenized_example["labels"] = labels

    return tokenized_example

tokenized_datasets = datasets.map(prepare_train_features, batched=False, remove_columns=datasets["train"].column_names)

Map: 100%|███████████████████████████████████████████████████| 87425/87425 [00:41<00:00, 2111.54 examples/s]
Map: 100%|███████████████████████████████████████████████████| 10514/10514 [00:05<00:00, 2052.57 examples/s]


In [268]:
# Inspect the tokenized validation split (column names and row count).
print(tokenized_datasets["validation"])

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10514
})


In [388]:
# Load the pretrained checkpoint and place it on the first GPU.
model = MambaForCausalLM.from_pretrained(model_id).to("cuda:0")

# Output fields to be ignored at inference time.
model.config.keys_to_ignore_at_inference = ["cache_params", "hidden_states"]

print(model)

MambaForCausalLM(
  (backbone): MambaModel(
    (embeddings): Embedding(50280, 768)
    (layers): ModuleList(
      (0-23): 24 x MambaBlock(
        (norm): MambaRMSNorm()
        (mixer): MambaMixer(
          (conv1d): Conv1d(1536, 1536, kernel_size=(4,), stride=(1,), padding=(3,), groups=1536)
          (act): SiLU()
          (in_proj): Linear(in_features=768, out_features=3072, bias=False)
          (x_proj): Linear(in_features=1536, out_features=80, bias=False)
          (dt_proj): Linear(in_features=48, out_features=1536, bias=True)
          (out_proj): Linear(in_features=1536, out_features=768, bias=False)
        )
      )
    )
    (norm_f): MambaRMSNorm()
  )
  (lm_head): Linear(in_features=768, out_features=50280, bias=False)
)


In [389]:
# Hyper-parameters for a single fine-tuning epoch with evaluation after each epoch.
args = TrainingArguments(
    output_dir="Mamba-finetuned-squadCausal",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-4,
    warmup_ratio=0.1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

# Turn off the generation cache on the model config.
model.config.use_cache = False

# Training is intentionally left disabled; uncomment to run it.
# trainer.train()

In [397]:
# The checkpoint below was written by this (now disabled) call:
# trainer.save_model("Mamba-finetuned-squadCausal/Final")

# Reload the fine-tuned weights from disk and place them on the first GPU.
model = AutoModelForCausalLM.from_pretrained("Mamba-finetuned-squadCausal/Final").to("cuda:0")

In [370]:
def prepare_validation_features(example):
    """Generate an answer for one SQuAD row and store it under ``guess``.

    The prompt is ``"{context}\\n\\nQ: {question}\\nA:"``; the model continues it
    until the end-of-text token or the token budget is reached.

    Args:
        example: one SQuAD row with ``context`` and ``question`` fields.

    Returns:
        The same row with the question left-stripped and a new ``guess`` string.
    """
    # Leading whitespace in the question carries no information and only costs tokens.
    example["question"] = example["question"].lstrip()

    text = f"{example['context']}\n\nQ: {example['question']}\nA:"
    # Put the prompt on whatever device the model lives on instead of assuming CUDA.
    input_ids = torch.tensor([tokenizer.encode(text)], dtype=torch.long, device=model.device)
    prompt_length = input_ids.shape[1]

    # Same overall budget as `max_length=max_length`, expressed as new tokens so a
    # prompt that already reaches the budget still yields at least one token
    # instead of failing inside `generate`.
    out = model.generate(
        input_ids,
        max_new_tokens=max(max_length - prompt_length, 1),
        eos_token_id=tokenizer.eos_token_id,
    )

    # Drop the prompt by token position. Removing it with a string replace on the
    # decoded text only works when decoding reproduces the prompt exactly; when it
    # does not, the whole context ends up in the guess.
    completion = tokenizer.decode(out[0, prompt_length:])
    completion = completion.replace("<|endoftext|>", "")
    # Keep only the first paragraph of the continuation.
    example["guess"] = completion.split("\n\n")[0].strip()

    return example

tokenized_validsets = datasets["validation"].map(prepare_validation_features, batched=False)
print(datasets["validation"])
print(tokenized_validsets)

Map: 100%|█████████████████████████████████████████████████████| 10514/10514 [08:11<00:00, 21.40 examples/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10514
})
Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'guess'],
    num_rows: 10514
})





In [398]:
# Spot-check generation on one validation row.
sample = datasets["validation"][2]
sample["question"] = sample["question"].lstrip()
print(sample["question"])

# `text` holds the prompt string only; the row itself stays in `sample`.
text = f"{sample['context']}\n\nQ: {sample['question']}\nA:"

# Put the prompt on whatever device the model lives on instead of assuming CUDA.
input_ids = torch.tensor([tokenizer.encode(text)], dtype=torch.long, device=model.device)
prompt_length = input_ids.shape[1]

# Same overall budget as `max_length=max_length`, but never less than one new token.
out = model.generate(
    input_ids,
    max_new_tokens=max(max_length - prompt_length, 1),
    eos_token_id=tokenizer.eos_token_id,
)

# Decode only the generated tokens; stripping the prompt with a string replace
# depends on the decoded text matching the prompt character for character.
decoded = tokenizer.decode(out[0, prompt_length:])
cleaned = decoded.replace("<|endoftext|>", "")
guess = cleaned.split("\n\n")[0].strip()
print(guess)


Where did Super Bowl 50 take place?
Levi's Stadium in the San Francisco Bay Area at Santa Clara, California


In [383]:
from evaluate import load

# Score the generated answers with the SQuAD metric.
squad_metric = load("squad")

# The metric takes two parallel lists keyed by example id.
predictions = [
    dict(id=row["id"], prediction_text=row["guess"])
    for row in tokenized_validsets
]
references = [
    dict(id=row["id"], answers=row["answers"])
    for row in tokenized_validsets
]

results = squad_metric.compute(references=references, predictions=predictions)
print(results)

{'exact_match': 58.37930378542895, 'f1': 67.6197732858846}


In [391]:
# Inspect one row (index 6) of the validation set after generation.
# Note: this rebinds the notebook-level name `text` to a row dict.
text = tokenized_validsets[6]
print(text)

{'id': '56be8e613aeaaa14008c90d2', 'title': 'Super_Bowl_50', 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.', 'question': 'What day was the game played on?', 'answers': {'text': ['February 7, 2016', 'February 7', 'February 7, 2016'], 'answer_start': [334, 33