In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def dataset_gen(data):
    """Turn every row of ``data`` into one chat-style training example.

    Args:
        data: pandas DataFrame whose rows are read through the ``context``,
            ``question`` and ``answer`` columns.

    Yields:
        dict: ``{"messages": [...]}`` holding three role/content entries in
        order: the fixed system instructions, a user turn built from the
        row's context and question, and an assistant turn containing the
        row's answer prefixed with ``"Answer: "``.
    """
    # The system instructions are identical for every example, so build them once.
    system_text = (
        "Using the information contained in the context,\n"
        "give a comprehensive and concise answer to the question.\n"
        "Respond only to the question asked, response should be concise and relevant to the question.\n"
        "Provide the number of the rule when relevant.\n"
        "If the answer cannot be deduced from the context, do not give an answer.\n"
        "The questions are related with Magic The Gathering card game."
    )

    for _, record in data.iterrows():
        user_text = (
            f"Context:\n{record['context']}\n---\n"
            "Now here is the question you need to answer.\n\n"
            f"Question: {record['question']}"
        )
        answer_text = f"Answer: {record['answer']}"

        yield {
            "messages": [
                {"role": "system", "content": system_text},
                {"role": "user", "content": user_text},
                {"role": "assistant", "content": answer_text},
            ]
        }


# Load the Reddit question/answer pairs; dataset_gen reads the
# context / question / answer columns of this frame.
reddit_df = pd.read_csv("data/reddit/reddit_qa_dataset_with_context.csv")

# Fixed seed for the shuffle inside train_test_split. Without it the
# train/test partition changes on every run, so evaluation numbers from
# different runs would be measured on different held-out examples.
SPLIT_SEED = 42

dataset = Dataset.from_generator(dataset_gen, gen_kwargs={"data": reddit_df})
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# Show the split summary and one formatted example as the cell output.
train_dataset, train_dataset[0]

(Dataset({
     features: ['messages'],
     num_rows: 1960
 }),
 {'messages': [{'content': 'Using the information contained in the context,\ngive a comprehensive and concise answer to the question.\nRespond only to the question asked, response should be concise and relevant to the question.\nProvide the number of the rule when relevant.\nIf the answer cannot be deduced from the context, do not give an answer.\nThe questions are related with Magic The Gathering card game.',
    'role': 'system'},
   {'content': "Context:\n\nExtracted documents:\nDocument 0:::\nName: Dermoplasm\nMana Cost: 2 colorless, blue\nType: Creature — Shapeshifter\nText: Flying\nMorph 2 colorless, blue, blue,                                         2 colorless 2 colorless              3 colorless. Turn it face up any time for its morph cost.)\nWhen Dermoplasm is turned face up, you may put a creature card with a morph ability from your hand onto the battlefield face up. If you do, return Dermoplasm to its owner's

In [3]:
# Checkpoint that is loaded for both the tokenizer and the model.
READER_MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"

# Tokenizer: pad on the right and reuse the end-of-sequence token for padding.
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# 4-bit bitsandbytes quantization: nf4 quant type with double quantization,
# computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the causal LM with the quantization settings above; device placement
# is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    READER_MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # attn_implementation="flash_attention_2",  # left disabled
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.30it/s]


In [4]:
from transformers import TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer
import evaluate

# Text-overlap metrics, loaded once here and used by compute_metrics.
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# LoRA adapter settings handed to the trainer as peft_config:
# rank 256 with alpha 128, 5% dropout, no bias terms, applied to all
# linear layers of a causal language model.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=256,
    lora_alpha=128,
    lora_dropout=0.05,
    bias="none",
)


def compute_metrics(pred):
    """Score decoded model predictions against the reference text with ROUGE and BLEU.

    The trainer hands over arrays, not text: ``pred.predictions`` (logits, or
    token ids if they were already reduced upstream) and ``pred.label_ids``
    (token ids with ``-100`` at ignored positions). Both are turned into
    strings with the notebook's ``tokenizer`` before the text metrics run.

    Args:
        pred: evaluation output passed in by the trainer; read through its
            ``predictions`` and ``label_ids`` attributes.

    Returns:
        dict: float scores under the keys ``"rouge1"``, ``"rouge2"``,
        ``"rougeL"`` and ``"bleu"``.
    """
    # Local import so this cell does not depend on a notebook-level numpy import.
    import numpy as np

    predictions = pred.predictions
    if isinstance(predictions, tuple):
        # Some models return extra outputs next to the logits; the logits come first.
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    label_ids = np.asarray(pred.label_ids)

    # (batch, seq, vocab) logits -> (batch, seq) greedy token ids.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # For a causal LM the output at position t predicts the token at t + 1,
    # so align predictions[:, :-1] with labels[:, 1:].
    predictions = predictions[:, :-1]
    label_ids = label_ids[:, 1:]

    # -100 marks ignored label positions (padding / masked tokens) and is not
    # a valid token id. Replace it, on both sides, with the pad token so that
    # decoding works and skip_special_tokens drops those positions.
    keep = label_ids != -100
    pad_id = tokenizer.pad_token_id
    predictions = np.where(keep, predictions, pad_id)
    label_ids = np.where(keep, label_ids, pad_id)

    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    ]

    # One ROUGE pass yields every variant; the `evaluate` metric returns
    # aggregated f-measures as plain floats.
    rouge_scores = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # BLEU divides by the total prediction/reference length, so report 0.0
    # instead of failing when either side decoded to nothing.
    if any(decoded_preds) and any(decoded_labels):
        bleu_value = bleu.compute(
            predictions=decoded_preds,
            references=[[reference] for reference in decoded_labels],
        )["bleu"]
    else:
        bleu_value = 0.0

    return {
        "rouge1": float(rouge_scores["rouge1"]),
        "rouge2": float(rouge_scores["rouge2"]),
        "rougeL": float(rouge_scores["rougeL"]),
        "bleu": float(bleu_value),
    }


# Hyperparameters for the fine-tuning run, grouped by concern.
args = TrainingArguments(
    # --- output and checkpoints ---
    output_dir="./results_modified",  # where checkpoints and logs are written
    save_strategy="epoch",  # write a checkpoint at the end of each epoch
    # --- optimisation schedule ---
    num_train_epochs=1,  # single pass over the training split
    optim="adamw_torch_fused",  # fused AdamW implementation
    learning_rate=2e-4,  # value attributed to the QLoRA paper
    max_grad_norm=0.3,  # gradient clipping threshold, also attributed to QLoRA
    lr_scheduler_type="constant",  # keep the learning rate flat
    # NOTE(review): with the plain "constant" scheduler the warmup ratio looks
    # like it is never applied - confirm whether "constant_with_warmup" was meant.
    warmup_ratio=0.03,
    # --- batching and memory ---
    per_device_train_batch_size=1,  # one example per device per training step
    per_device_eval_batch_size=1,  # one example per device per evaluation step
    gradient_accumulation_steps=4,  # accumulate 4 micro-batches per optimizer update
    # Evaluation-side setting, unrelated to backward passes: how many prediction
    # steps are accumulated before the outputs are moved off the accelerator.
    eval_accumulation_steps=4,
    gradient_checkpointing=True,  # trade recomputation for lower activation memory
    # --- numeric precision ---
    bf16=True,  # bfloat16 training
    tf32=True,  # allow TF32 kernels
    # --- logging and evaluation cadence ---
    logging_steps=10,  # log every 10 steps
    eval_strategy="steps",  # evaluate on a step interval ...
    eval_steps=5,  # ... of 5 steps
    # push_to_hub=True,                       # push model to hub
    # report_to="tensorboard",                # report metrics to tensorboard
)

# Sequence-length limit handed to the trainer.
max_seq_length = 3072

# Supervised fine-tuning trainer: quantized base model plus LoRA adapters,
# trained on the chat-formatted split and evaluated with compute_metrics.
trainer = SFTTrainer(
    # model side
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    # data side
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    max_seq_length=max_seq_length,
    # packing=True,
    dataset_kwargs=dict(
        add_special_tokens=False,  # the chat template already supplies special tokens
        append_concat_token=False,  # no extra separator token between examples
    ),
    # run configuration
    args=args,
    compute_metrics=compute_metrics,
)

# Start the fine-tuning run.
trainer.train()

Map: 100%|██████████| 1960/1960 [00:00<00:00, 2021.51 examples/s]
Map: 100%|██████████| 490/490 [00:00<00:00, 2037.77 examples/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
ERROR:wandb.jupyter:F

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss


: 

In [None]:
# Write the trained model held by the trainer to a local directory.
model_output_dir = "model/gatherer_sage_model/"
trainer.model.save_pretrained(model_output_dir)