In [1]:
from transformers import pipeline
from unsloth import FastLanguageModel
import pandas as pd
from gatherer_sage.gen_hyperparam_search import evaluate_model
from requests.exceptions import HTTPError

# Model locations. Each value is passed as `model_name` to
# FastLanguageModel.from_pretrained in one of the evaluation cells below.
base_model_path = "mistralai/Mistral-7B-v0.3"
pretrained_model_path = (
    "../model/mistral-gatherer-sage-v1/full_train_instruct/best_model"
)
rg_no_context_model_path = (
    "../model/mistral-gatherer-sage-v1/rules_guru_no_context_full/best_model"
)
rg_context_model_path = (
    "../model/mistral-gatherer-sage-v1/rules_guru_context_full/best_model"
)
# Passed as `max_seq_length` to every FastLanguageModel.from_pretrained call.
# NOTE(review): 20248 is an unusual value (possibly a typo for 2048 or 20480) -- confirm.
max_seq_length = 20248

# Evaluation set: a 10% sample of the RulesGuru QA CSV. The sample is seeded
# (random_state=42), so every run evaluates the same rows.
test_dataset = pd.read_csv(
    "../data/rules_guru/rules_guru_qa_dataset_with_context.csv"
).sample(frac=0.1, random_state=42)

# Accumulates one row of metrics per evaluated model; each evaluation cell
# appends to it with pd.concat.
df_scores = pd.DataFrame(columns=["model"])

# Sampling settings shared by every text-generation pipeline built below.
temperature = 0.31
top_k = 75
top_p = 0.59
repetition_penalty = 1.02

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
import re
from tqdm.auto import tqdm
from huggingface_hub import InferenceClient

# Register tqdm's pandas integration; llm_judge calls `progress_apply`,
# which only exists after this call.
tqdm.pandas()

# Prompt template sent to the judge model. llm_judge fills the `question`,
# `real_answer` and `system_answer` placeholders with str.format.
# The template asks for a "Total rating:" between 1 and 4, which
# extract_judge_score later parses out of the completion, and it ends with
# "Evaluation: " as the last text before the model's completion.
# Note: the literal starts with a newline, so a "\n" precedes the `<s>[INST]` tag.
JUDGE_PROMPT = """
<s>[INST]You will be given a user_question, the real_answer and a system_answer.
Your task is to provide a 'total rating' scoring how well the system_answer answers the user_question, compared with the real_answer.
Give your rating on a scale of 1 to 4, where 1 means that the system_answer is not CORRECT at all, and 4 means that the system_answer is the same as real_answer, and answer the user_question perfectly.

Here is the scale you should use to build your answer:
1: The system_answer is terrible: completely wrong, misleading, or off-topic.
2: The system_answer is mostly not correct: provides some intuition, but is mostly irrelevant or off-topic.
3: The system_answer is mostly correct: relevant, but lacks detail, or is not directly addressing the user_question.
4: The system_answer is excellent: relevant, direct, detailed, and addresses all the concerns raised in the question.

Provide your feedback as follows:

Feedback:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 4)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and answer.

user_question: {question}
real_answer: {real_answer}
system_answer: {system_answer}

Provide your feedback:[/INST]
Feedback:::
Evaluation: """


def extract_judge_score(answer: str, split_str: str = "Total rating: ") -> "float | None":
    """Parse the numeric rating out of an LLM-judge completion.

    The text following the rating marker is searched for the first integer or
    decimal number. The marker is tried verbatim first and then with its
    surrounding whitespace stripped, so "Total rating:4" is handled the same
    way as "Total rating: 4" instead of falling back to whatever number
    happens to appear first in the rationale. If the marker is absent
    altogether, the whole answer is searched.

    Args:
        answer: Raw text returned by the judge model.
        split_str: Marker that precedes the rating in the judge's answer.

    Returns:
        The rating as a float, or None when `answer` is not a string or
        contains no number in the searched text.
    """
    # Missing completions (None / NaN) cannot be parsed.
    if not isinstance(answer, str):
        return None

    rating_text = answer
    for marker in (split_str, split_str.strip()):
        if marker and marker in answer:
            # Keep everything after the first occurrence of the marker.
            rating_text = answer.split(marker, 1)[1]
            break

    match = re.search(r"\d+(?:\.\d+)?", rating_text)
    if match is None:
        return None
    return float(match.group())


def llm_judge(
    preds_df,
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    timeout=120,
    max_new_tokens=500,
):
    """Rate every generated answer with a remote LLM judge.

    For each row, JUDGE_PROMPT is filled with the row's "question",
    "real_answer" and "generated_answer" values and sent to the judge model
    through `InferenceClient.text_generation`. The input frame is not
    modified; a copy with two extra columns is returned.

    Requires `tqdm.pandas()` to have been called, since `progress_apply` is
    used to show progress.

    Args:
        preds_df: DataFrame with "question", "real_answer" and
            "generated_answer" columns.
        repo_id: Model id handed to `InferenceClient` as the judge.
        timeout: Timeout handed to `InferenceClient`.
        max_new_tokens: Generation budget for each judge completion.

    Returns:
        A copy of `preds_df` with:
          * "llm_judge": the raw judge completion for the row;
          * "llm_judge_rating": the value extract_judge_score parsed from it.

    Raises:
        Any exception raised by the inference client propagates unchanged;
        no row-level error handling is done here.
    """
    judged_df = preds_df.copy()
    llm_client = InferenceClient(
        model=repo_id,
        timeout=timeout,
    )

    def _judge_row(row):
        # One remote call per row: fill the template and request the verdict.
        prompt = JUDGE_PROMPT.format(
            question=row["question"],
            real_answer=row["real_answer"],
            system_answer=row["generated_answer"],
        )
        return llm_client.text_generation(
            prompt=prompt,
            max_new_tokens=max_new_tokens,
        )

    judged_df["llm_judge"] = judged_df.progress_apply(_judge_row, axis=1)
    judged_df["llm_judge_rating"] = judged_df["llm_judge"].apply(extract_judge_score)

    return judged_df

## 1. Base Mistral Model

In [3]:
# Load the base Mistral checkpoint in 4-bit through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_path,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
)
# NOTE(review): use_cache=False is usually a training-time setting; confirm it
# is intended for this inference-only run.
model.config.use_cache = False
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

FastLanguageModel.for_inference(model)

llm_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=temperature,
    repetition_penalty=repetition_penalty,
    top_k=top_k,
    top_p=top_p,
    return_full_text=False,
    max_new_tokens=500,
)

# The "context" column is dropped, so evaluate_model only receives the
# remaining columns of the test set.
df_outputs, scores = evaluate_model(
    llm_pipeline, test_dataset.drop(columns=["context"])
)
try:
    df_outputs = llm_judge(df_outputs)
    scores["llm_judge_rating"] = df_outputs["llm_judge_rating"].mean()
except HTTPError as e:
    # The judge is best-effort: the remaining metrics are still reported.
    print(e)
finally:
    # Always persist the generations, even if the judge fails with an error
    # other than HTTPError; llm_judge works on a copy, so df_outputs is still
    # the un-judged frame in that case.
    df_outputs.to_csv("../results/mistral_preds.csv")

df_aux = pd.DataFrame(
    {
        "model": "mistral",
        **scores,
    },
    index=[0],
)
# Replace any earlier row for this model so re-running the cell does not
# append a duplicate entry.
df_scores = pd.concat(
    [df_scores[df_scores["model"] != "mistral"], df_aux], ignore_index=True
)
df_scores

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


## 2. Full Pretraining 60K QA

In [None]:
# Load the checkpoint stored at pretrained_model_path in 4-bit through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=pretrained_model_path,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
)
# NOTE(review): use_cache=False is usually a training-time setting; confirm it
# is intended for this inference-only run.
model.config.use_cache = False
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

FastLanguageModel.for_inference(model)

llm_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=temperature,
    repetition_penalty=repetition_penalty,
    top_k=top_k,
    top_p=top_p,
    return_full_text=False,
    max_new_tokens=500,
)

# The "context" column is dropped, so evaluate_model only receives the
# remaining columns of the test set.
df_outputs, scores = evaluate_model(
    llm_pipeline, test_dataset.drop(columns=["context"])
)

try:
    df_outputs = llm_judge(df_outputs)
    scores["llm_judge_rating"] = df_outputs["llm_judge_rating"].mean()
except HTTPError as e:
    # The judge is best-effort: the remaining metrics are still reported.
    print(e)
finally:
    # Always persist the generations, even if the judge fails with an error
    # other than HTTPError; llm_judge works on a copy, so df_outputs is still
    # the un-judged frame in that case.
    df_outputs.to_csv("../results/full_train_instruct_preds.csv")

df_aux = pd.DataFrame(
    {
        "model": "full_train_instruct",
        **scores,
    },
    index=[0],
)
# Replace any earlier row for this model so re-running the cell does not
# append a duplicate entry.
df_scores = pd.concat(
    [df_scores[df_scores["model"] != "full_train_instruct"], df_aux],
    ignore_index=True,
)
df_scores

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.10it/s]
TrevorJS/mtg-mistral-7b-instruct-sft-merged does not have a padding token! Will use pad_token = <unk>.
Unsloth 2024.5 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCau

HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3 (Request ID: HN-tUp05ekbom88ArwSmO)

Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate

## 3. Finetuned on RulesGuru (no context)

In [None]:
# Load the checkpoint stored at rg_no_context_model_path in 4-bit through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=rg_no_context_model_path,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
)
# NOTE(review): use_cache=False is usually a training-time setting; confirm it
# is intended for this inference-only run.
model.config.use_cache = False
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

FastLanguageModel.for_inference(model)

llm_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=temperature,
    repetition_penalty=repetition_penalty,
    top_k=top_k,
    top_p=top_p,
    return_full_text=False,
    max_new_tokens=500,
)

# The "context" column is dropped, so evaluate_model only receives the
# remaining columns of the test set.
df_outputs, scores = evaluate_model(
    llm_pipeline, test_dataset.drop(columns=["context"])
)
try:
    df_outputs = llm_judge(df_outputs)
    scores["llm_judge_rating"] = df_outputs["llm_judge_rating"].mean()
except HTTPError as e:
    # The judge is best-effort: the remaining metrics are still reported.
    print(e)
finally:
    # Always persist the generations, even if the judge fails with an error
    # other than HTTPError; llm_judge works on a copy, so df_outputs is still
    # the un-judged frame in that case.
    df_outputs.to_csv("../results/rg_no_context_preds.csv")

df_aux = pd.DataFrame(
    {
        "model": "rg_no_context",
        **scores,
    },
    index=[0],
)
# Replace any earlier row for this model so re-running the cell does not
# append a duplicate entry.
df_scores = pd.concat(
    [df_scores[df_scores["model"] != "rg_no_context"], df_aux],
    ignore_index=True,
)
df_scores

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'Mistral

{'rouge1': 0.11556282064673487, 'rouge2': 0.04037775732522132, 'rougeL': 0.08957306648195612, 'bleu': 0.01731856075861505, 'bertscore_f1': 0.8117924344866243, 'bertscore_precision': 0.7691056075161451, 'bertscore_recall': 0.8601940376301335}


## 4. Finetuned on RulesGuru (no context) + RAG at inference

In [None]:
# Same checkpoint as the previous section (rg_no_context_model_path); the
# difference is that the test set below keeps its "context" column.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=rg_no_context_model_path,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
)
# NOTE(review): use_cache=False is usually a training-time setting; confirm it
# is intended for this inference-only run.
model.config.use_cache = False
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

FastLanguageModel.for_inference(model)

llm_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=temperature,
    repetition_penalty=repetition_penalty,
    top_k=top_k,
    top_p=top_p,
    return_full_text=False,
    max_new_tokens=500,
)

# The full test set, including "context", is passed on.
# Presumably evaluate_model adds the context to the prompt when the column is
# present -- confirm in gatherer_sage.gen_hyperparam_search.
df_outputs, scores = evaluate_model(llm_pipeline, test_dataset)
try:
    df_outputs = llm_judge(df_outputs)
    scores["llm_judge_rating"] = df_outputs["llm_judge_rating"].mean()
except HTTPError as e:
    # The judge is best-effort: the remaining metrics are still reported.
    print(e)
finally:
    # Always persist the generations, even if the judge fails with an error
    # other than HTTPError; llm_judge works on a copy, so df_outputs is still
    # the un-judged frame in that case.
    df_outputs.to_csv("../results/rg_no_context_rag_preds.csv")

df_aux = pd.DataFrame(
    {
        "model": "rg_no_context_rag",
        **scores,
    },
    index=[0],
)
# Replace any earlier row for this model so re-running the cell does not
# append a duplicate entry.
df_scores = pd.concat(
    [df_scores[df_scores["model"] != "rg_no_context_rag"], df_aux],
    ignore_index=True,
)
df_scores

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'Mistral

{'rouge1': 0.11487719685522854, 'rouge2': 0.04005438883652405, 'rougeL': 0.08950434027070313, 'bleu': 0.01656295408767753, 'bertscore_f1': 0.8095278217367929, 'bertscore_precision': 0.7665016569503366, 'bertscore_recall': 0.8584357777686968}


## 5. Finetuned on RulesGuru with context (RAG)

In [None]:
# Load the checkpoint stored at rg_context_model_path in 4-bit through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=rg_context_model_path,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
)
# NOTE(review): use_cache=False is usually a training-time setting; confirm it
# is intended for this inference-only run.
model.config.use_cache = False
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

FastLanguageModel.for_inference(model)

llm_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=temperature,
    repetition_penalty=repetition_penalty,
    top_k=top_k,
    top_p=top_p,
    return_full_text=False,
    max_new_tokens=500,
)

# The full test set, including "context", is passed on.
# Presumably evaluate_model adds the context to the prompt when the column is
# present -- confirm in gatherer_sage.gen_hyperparam_search.
df_outputs, scores = evaluate_model(llm_pipeline, test_dataset)
try:
    df_outputs = llm_judge(df_outputs)
    scores["llm_judge_rating"] = df_outputs["llm_judge_rating"].mean()
except HTTPError as e:
    # The judge is best-effort: the remaining metrics are still reported.
    print(e)
finally:
    # Always persist the generations, even if the judge fails with an error
    # other than HTTPError; llm_judge works on a copy, so df_outputs is still
    # the un-judged frame in that case.
    df_outputs.to_csv("../results/rg_context_rag_preds.csv")

df_aux = pd.DataFrame(
    {
        "model": "rg_context_rag",
        **scores,
    },
    index=[0],
)
# Replace any earlier row for this model so re-running the cell does not
# append a duplicate entry.
df_scores = pd.concat(
    [df_scores[df_scores["model"] != "rg_context_rag"], df_aux],
    ignore_index=True,
)
df_scores

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'Mistral

{'rouge1': 0.3317035367852089, 'rouge2': 0.128929813019674, 'rougeL': 0.2475122078428738, 'bleu': 0.08849212240221818, 'bertscore_f1': 0.8798312622390382, 'bertscore_precision': 0.8860747961148824, 'bertscore_recall': 0.8742796309190254}


In [None]:
# Persist the per-model metrics table accumulated by the evaluation cells above.
df_scores.to_csv("../results/scores.csv")

ValueError: Length of values (5) does not match length of index (3)