In [1]:
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from transformers import AutoTokenizer
from gatherer_sage.rag import RAG
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("Loading RAG model")
rag = RAG()

Loading RAG model




In [None]:
PROMPT_TEMPLATE = [
    {
        "role": "system",
        "content": """Using the information contained in the context,
give a comprehensive and concise answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the rule when relevant.
If the answer cannot be deduced from the context, do not give an answer.
The questions are related with Magic The Gathering card game.""",
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
    },
]

READER_MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"


class RedditDataset(Dataset):
    def __init__(
        self,
        reddit_data: pd.DataFrame,
        rag,
        model_path: str = "meta-llama/Meta-Llama-3-8B-Instruct",
    ):
        self.rag = rag
        self.data = reddit_data
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        question = row["question"]
        context = self.rag.retrieve_context(question)

        prompt = [
            {
                "role": "system",
                "content": """Using the information contained in the context,
give a comprehensive and concise answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the rule when relevant.
If the answer cannot be deduced from the context, do not give an answer.
The questions are related with Magic The Gathering card game.""",
            },
            {
                "role": "user",
                "content": f"""Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
            },
            {"role": "assistant", "content": f"Answer: {row['answer']}"},
        ]

        return self.tokenizer.apply_chat_template(
            prompt,
            tokenize=False,
            add_generation_prompt=True,
            return_dict=True,
        )


print("Loading Reddit dataset")
reddit_df = pd.read_csv("data/reddit/reddit_qa_dataset.csv")
train, test = train_test_split(reddit_df, test_size=0.2)

print("Creating datasets")
train_dataset = RedditDataset(train, rag)
test_dataset = RedditDataset(test, rag)

Loading Reddit dataset
Creating datasets


AttributeError: can't set attribute

In [11]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    READER_MODEL_NAME, quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

model_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=500,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.30it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
from transformers import (
    TrainingArguments,
)
from sentence_transformers import SentenceTransformer

from peft import LoraConfig
from trl import SFTTrainer
import trl


def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example["instruction"])):
        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
        output_texts.append(text)
    return output_texts


max_seq_length = SentenceTransformer(READER_MODEL_NAME).max_seq_length

# LoRA Config
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)


# Training Params
train_params = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
)

# Trainer
fine_tuning = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=train_params,
    formatting_prompts_func=formatting_prompts_func,
)

# Training
fine_tuning.train()

No sentence-transformers model found with name meta-llama/Meta-Llama-3-8B-Instruct. Creating a new one with MEAN pooling.
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.18it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 1/1 [00:00<00:00, 19.51it/s]


AttributeError: 

In [None]:
# Save Model
fine_tuning.model.save_pretrained("model/gatherer_sage_model/")

In [9]:
from datasets import load_dataset

dataset = load_dataset("philschmid/dolly-15k-oai-style", split="train")
dataset[0]

{'messages': [{'content': "When did Virgin Australia start operating?\nVirgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.",
   'role': 'user'},
  {'content': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.',
   'role': 'assistant'}]}

In [None]:
train_dataset[0]

100%|██████████| 1/1 [00:00<00:00, 19.11it/s]


'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nUsing the information contained in the context,\ngive a comprehensive and concise answer to the question.\nRespond only to the question asked, response should be concise and relevant to the question.\nProvide the number of the rule when relevant.\nIf the answer cannot be deduced from the context, do not give an answer.\nThe questions are related with Magic The Gathering card game.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nContext:\n\nExtracted documents:\nDocument 0:::\nBear enter the battlefield from your graveyard at the same time , you can ’t choose to \nexile either of them when applying Sutured Ghoul ’s replacement effect.  \n \n614.13b The same object can’ t be chosen to change zones more than once when applying \nreplacement effects that modify how a single permanent enters the battlefield.  \nExample:  Jund (a plane card) says, “ Whenever a player casts a black, red, or green \ncreature spell, it gains d

In [13]:
data_name = "mlabonne/guanaco-llama2-1k"
training_data = load_dataset(data_name, split="train")
training_data[0]

Downloading readme: 100%|██████████| 1.02k/1.02k [00:00<00:00, 1.70MB/s]
Downloading data: 100%|██████████| 967k/967k [00:00<00:00, 1.19MB/s]
Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 47715.68 examples/s]


{'text': '<s>[INST] Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo? [/INST] Esto vale tanto para médicos como para cualquier otra profesión tras finalizar los estudios aniversarios y mi consejo sería preguntar a cuántas personas haya conocido mejor. En este caso, mi primera opción sería hablar con otros profesionales médicos, echar currículos en hospitales y cualquier centro de salud. En paralelo, trabajaría por mejorar mi marca personal como médico mediante un blog o formas digitales de comunicación como los vídeos. Y, para mejorar las posibilidades de encontrar trabajo, también participaría en congresos y encuentros para conseguir más contactos. Y, además de todo lo anterior, seguiría estudiando para presentarme a las oposiciones y ejercer la medicina en el sector público de mi país. </s>'}