# Install dependencies and ready the environment

In [1]:
# Install required packages

!pip install -q transformers
!pip install -q datasets
!pip install -q hazm
!pip install -q clean-text[gpl]
!pip install -q numpy
!pip install accelerate -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 16.1.0 

In [2]:
import torch
from datasets import load_dataset, Dataset
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from transformers import logging as transformers_logging
import numpy as np
from tqdm import tqdm

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
transformers_logging.set_verbosity_error()
torch.random.manual_seed(1111)

<torch._C.Generator at 0x780f3c26bf50>

# Dataset

In [5]:
dataset = load_dataset("persiannlp/parsinlu_query_paraphrasing")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/175k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/133k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/88.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1830 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1916 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/898 [00:00<?, ? examples/s]

In [6]:
dataset = dataset.shuffle(seed=1111)
dataset

DatasetDict({
    train: Dataset({
        features: ['q1', 'q2', 'category', 'label'],
        num_rows: 1830
    })
    test: Dataset({
        features: ['q1', 'q2', 'category', 'label'],
        num_rows: 1916
    })
    validation: Dataset({
        features: ['q1', 'q2', 'category', 'label'],
        num_rows: 898
    })
})

In [7]:
train_pairs = [{'question': (row['q1'], row['q2']), 'label': 'Yes' if row['label']=='1' else 'No'} for row in dataset['train']]
train_pairs[:4]

[{'question': ('بهترین کتابها برای یادگیری C++ کدامند؟',
   'بهترین کتاب برای C++ DEV چیست؟'),
  'label': 'Yes'},
 {'question': ('جواب درس خشم خود را چگونه مهار کنیم تفکر هفتم؟',
   'خلاصه درس خشم خود را چگونه مهار کنیم تفکر هفتم؟'),
  'label': 'No'},
 {'question': ('از کجا می توانم یک پریز برق برای لپ تاپ خود در فرودگاه ملبورن پیدا کنم؟',
   'آیا در صورت ایجاد ارتباط راه آهن پر سرعت بین ملبورن و سیدنی ، فرودگاه دوم در سیدنی استرالیا مورد نیاز خواهد بود؟'),
  'label': 'No'},
 {'question': ('چگونه می توانم PDF را به یک پرونده INDD تبدیل کنم تا در InDesign ویرایش شود؟',
   'چگونه می توانم یک پرونده .ASHX را به .PDF تبدیل کنم؟'),
  'label': 'No'}]

In [8]:
test_pairs = [{'question': (row['q1'], row['q2']), 'label': 'Yes' if row['label']=='1' else 'No'} for row in dataset['test']]
test_pairs[:4]

[{'question': ('چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟',
   'برای جلوگیری از تماشای کامل پورنو باید چه کاری انجام دهم؟'),
  'label': 'Yes'},
 {'question': ('دانلود اهنگ کجا رفته بودی که ازرد تو؟',
   'دانلود اهنگ کجا رفته بودی ک ازرد تو؟'),
  'label': 'Yes'},
 {'question': ('در دوران شیردهی چه میوه ای بخوریم؟',
   'چه میوه هایی در دوران شیردهی نباید بخوریم؟'),
  'label': 'No'},
 {'question': ('چرا ماشین ظرفشویی اب را تخلیه نمیکند؟',
   'چرا ماشین لباسشویی آب را تخلیه نمیکند؟'),
  'label': 'Yes'}]

# Model

## Utils and Global Variables

In [9]:
from collections import defaultdict
zero_shot_results = defaultdict(dict)
one_shot_results = defaultdict(dict)
few_shot_results = defaultdict(dict)

In [10]:
model_names = [
    'NousResearch/Meta-Llama-3-8B-Instruct',
    'microsoft/Phi-3-mini-4k-instruct',
    'universitytehran/PersianMind-v1.0'
]

In [11]:
def generate_messages_style_1(questions, user_add_prompt, need_system_prompt):
    questions.reverse()

    messages = []

    if need_system_prompt:
        if len(questions) == 1:
            messages.append(
                {"role": "system", "content": "Check similarity between pair of questions. You only need to answer with `Yes` for similarity or `No` for dissimilarity."},
            )
        else:
            messages.append(
                {"role": "system", "content": f"Check similarity between pair of questions. You only need to answer with `Yes` for similarity or `No` for dissimilarity. Following {len(questions) - 1} question answering {'is' if len(questions)==2 else 'are'} for showing purposes."},
            )

    while len(questions) > 1:
        question = questions.pop()
        messages.append({
            "role": "user", "content": f"Question 1: {question['question'][0]}\nQuestion 2: {question['question'][1]}"
        })
        messages.append({
            "role": "assistant", "content": f"{question['label']}"
        })
    question = questions.pop()
    messages.append(
        {
            'role': 'user',
            'content': f"{user_add_prompt}Question 1: {question['question'][0]}\nQuestion 2: {question['question'][1]}"
        }
    )
    expected = question['label']

    return messages, expected

In [12]:
def generate_messages_style_2(questions, user_add_prompt, need_system_prompt):
    questions.reverse()

    messages = []

    if need_system_prompt:
        if len(questions) == 1:
            messages.append(
                {"role": "system", "content": "Check similarity between pair of questions. You only need to answer with `Yes` for similarity or `No` for dissimilarity."},
            )
        else:
            messages.append(
                {"role": "system", "content": f"Check similarity between pair of questions. You only need to answer with `Yes` for similarity or `No` for dissimilarity. Following {len(questions) - 1} question answering {'is' if len(questions)==2 else 'are'} for showing purposes."},
            )

    while len(questions) > 1:
        question = questions.pop()
        messages.append({
            "role": "user", "content": f"سوال 1: {question['question'][0]}\سوال 2: {question['question'][1]}"
        })
        messages.append({
            "role": "assistant", "content": f"{question['label']}"
        })
    question = questions.pop()
    messages.append(
        {
            'role': 'user',
            'content': f"{user_add_prompt}Question 1: {question['question'][0]}\nQuestion 2: {question['question'][1]}"
        }
    )
    expected = question['label']

    return messages, expected

In [13]:
def generate_messages_style_3(questions, user_add_prompt, need_system_prompt):
    questions.reverse()

    messages = []

    if need_system_prompt:
        if len(questions) == 1:
            messages.append(
                {"role": "system", "content": "شباهت بین دو سؤال را بررسی کنید. شما فقط باید برای شباهت با `Yes` یا برای عدم تشابه `No` پاسخ دهید."},
            )
        else:
            messages.append(
                {"role": "system", "content": f"شباهت بین دو سؤال را بررسی کنید. شما فقط باید برای شباهت با `Yes` یا برای عدم تشابه `No` پاسخ دهید. موارد زیر برای نشان دادن رویه کار آورده شده است."},
            )

    while len(questions) > 1:
        question = questions.pop()
        messages.append({
            "role": "user", "content": f"سوال 1: {question['question'][0]}\سوال 2: {question['question'][1]}"
        })
        messages.append({
            "role": "assistant", "content": f"{question['label']}"
        })
    question = questions.pop()
    messages.append(
        {
            'role': 'user',
            'content': f"سوال {user_add_prompt}1: {question['question'][0]}\سوال 2: {question['question'][1]}"
        }
    )
    expected = question['label']

    return messages, expected

In [14]:
def generate_messages_style_4(questions, user_add_prompt, need_system_prompt):
    questions.reverse()

    messages = []

    if need_system_prompt:
        if len(questions) == 1:
            messages.append(
                {"role": "system", "content": "Output `Yes` for similarity of question and `No` for dissimilarity."},
            )
        else:
            messages.append(
                {"role": "system", "content": f"Output `Yes` for similarity of question and `No` for dissimilarity. Following {len(questions) - 1} question answering {'is' if len(questions)==2 else 'are'} for showing the procedure."},
            )

    while len(questions) > 1:
        question = questions.pop()
        messages.append({
            "role": "user", "content": f"Question 1: {question['question'][0]}\nQuestion 2: {question['question'][1]}"
        })
        messages.append({
            "role": "assistant", "content": f"{question['label']}"
        })
    question = questions.pop()
    messages.append(
        {
            'role': 'user',
            'content': f"{user_add_prompt}Question 1: {question['question'][0]}\nQuestion 2: {question['question'][1]}"
        }
    )
    expected = question['label']

    return messages, expected

In [15]:
def generate_messages_style_5(questions, user_add_prompt, need_system_prompt):
    questions.reverse()

    messages = []

    if need_system_prompt:
        if len(questions) == 1:
            messages.append(
                {"role": "system", "content": "Output `Yes` for similarity of question and `No` for dissimilarity."},
            )
        else:
            messages.append(
                {"role": "system", "content": f"Output `Yes` for similarity of question and `No` for dissimilarity. Following {len(questions) - 1} question answering {'is' if len(questions)==2 else 'are'} for showing the procedure."},
            )

    while len(questions) > 1:
        question = questions.pop()
        messages.append({
            "role": "user", "content": f"Question 1: {question['question'][0]} - Question 2: {question['question'][1]}"
        })
        messages.append({
            "role": "assistant", "content": f"{question['label']}"
        })
    question = questions.pop()
    messages.append(
        {
            'role': 'user',
            'content': f"{user_add_prompt}Question 1: {question['question'][0]} - Question 2: {question['question'][1]}"
        }
    )
    expected = question['label']

    return messages, expected

In [16]:
message_generators = [
    generate_messages_style_1,
    generate_messages_style_2,
    generate_messages_style_3,
    generate_messages_style_4,
    generate_messages_style_5
]

In [17]:
def evaluate_model(output_generator, message_generator, inital_questions, pairs, check_count, user_add_prompt, need_system_prompt):
    correct = 0
    check_pairs = pairs[:check_count]
    for row in tqdm(check_pairs):
        messages, expected = message_generator(list(inital_questions) + [row], user_add_prompt, need_system_prompt)
        prediction = output_generator(messages)
        if expected == prediction:
            correct += 1
    return correct / check_count

In [18]:
def zero_shot_evaluation(output_generator, message_generator, check_count, user_add_prompt, need_system_prompt):
    inital_questions = []
    accuracy = evaluate_model(output_generator, message_generator, inital_questions, train_pairs, check_count, user_add_prompt, need_system_prompt)
    return accuracy


def one_shot_evaluation(output_generator, message_generator, check_count, user_add_prompt, need_system_prompt):
    inital_questions = np.random.choice(test_pairs, size=1, replace=False)
    accuracy = evaluate_model(output_generator, message_generator, inital_questions, train_pairs, check_count, user_add_prompt, need_system_prompt)
    return accuracy


def few_shot_evaluation(output_generator, message_generator, check_count, user_add_prompt, need_system_prompt):
    inital_questions = np.random.choice(test_pairs, size=5, replace=False)
    accuracy = evaluate_model(output_generator, message_generator, inital_questions, train_pairs, check_count, user_add_prompt, need_system_prompt)
    return accuracy

In [19]:
def evaluate_model_on_message_generators(output_generator, model_id, check_count=20, user_add_prompt='', need_system_prompt=True):
    print(f"Evaluating {model_id}")

    for i, mg in enumerate(message_generators):
        print(f"Checking Style {i + 1}")
        zero_shot_results[model_id][i + 1] = zero_shot_evaluation(output_generator, mg, check_count, user_add_prompt, need_system_prompt)
        one_shot_results[model_id][i + 1] = one_shot_evaluation(output_generator, mg, check_count, user_add_prompt, need_system_prompt)
        few_shot_results[model_id][i + 1] = few_shot_evaluation(output_generator, mg, check_count, user_add_prompt, need_system_prompt)

    print()
    print(f"Result for {model_id}:")
    print("\tZero-shot results:", zero_shot_results[model_id])
    print("\tOne-shot results:", one_shot_results[model_id])
    print("\tFew-shot results:", few_shot_results[model_id])

## NousResearch/Meta-Llama-3-8B-Instruct

In [20]:
pipe0 = pipeline(
    "text-generation",
    model=model_names[0],
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [21]:
def generate_llama_output(messages):
    prompt = pipe0.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    terminators = [
        pipe0.tokenizer.eos_token_id,
        pipe0.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipe0(
        prompt,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )

    return outputs[0]["generated_text"][len(prompt):]

In [22]:
evaluate_model_on_message_generators(generate_llama_output, model_names[0])

Evaluating NousResearch/Meta-Llama-3-8B-Instruct
Checking Style 1


100%|██████████| 20/20 [00:59<00:00,  3.00s/it]
100%|██████████| 20/20 [00:48<00:00,  2.43s/it]
100%|██████████| 20/20 [01:19<00:00,  3.99s/it]


Checking Style 2


100%|██████████| 20/20 [00:46<00:00,  2.32s/it]
100%|██████████| 20/20 [00:51<00:00,  2.59s/it]
100%|██████████| 20/20 [01:21<00:00,  4.08s/it]


Checking Style 3


100%|██████████| 20/20 [00:46<00:00,  2.34s/it]
100%|██████████| 20/20 [00:54<00:00,  2.75s/it]
100%|██████████| 20/20 [01:19<00:00,  3.95s/it]


Checking Style 4


100%|██████████| 20/20 [00:43<00:00,  2.19s/it]
100%|██████████| 20/20 [00:48<00:00,  2.41s/it]
100%|██████████| 20/20 [01:11<00:00,  3.56s/it]


Checking Style 5


100%|██████████| 20/20 [00:44<00:00,  2.22s/it]
100%|██████████| 20/20 [00:48<00:00,  2.41s/it]
100%|██████████| 20/20 [01:12<00:00,  3.64s/it]


Result for NousResearch/Meta-Llama-3-8B-Instruct:
	Zero-shot results: {1: 0.55, 2: 0.55, 3: 0.55, 4: 0.5, 5: 0.5}
	One-shot results: {1: 0.6, 2: 0.65, 3: 0.6, 4: 0.75, 5: 0.55}
	Few-shot results: {1: 0.65, 2: 0.6, 3: 0.75, 4: 0.65, 5: 0.6}





## microsoft/Phi-3-mini-4k-instruct

In [None]:
tokenizer1 = AutoTokenizer.from_pretrained(model_names[1])
model1 = AutoModelForCausalLM.from_pretrained(
    model_names[1],
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
# tokenizer1.use_default_system_prompt = True
# tokenizer1.chat_template = "{{ bos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + '<|end|>' }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + '<|end|>' }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + '<|end|>' }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
pipe1 = pipeline(
    "text-generation",
    model=model1,
    tokenizer=tokenizer1
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def generate_phi_output(messages):
    prompt = pipe1.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        trust_remote_code=True
    )

    terminators = [
        pipe1.tokenizer.eos_token_id,
        pipe1.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipe1(
        prompt,
        max_new_tokens=1, # Should set to 1, otherwise model will continue generating text.
        eos_token_id=terminators,
        do_sample=False, # Yeild different answers with setting it. better trun it off.
        # top_p=0.9,
    )

    return outputs[0]["generated_text"][len(prompt):].strip()

In [None]:
user_add_prompt = 'Just Answer with `Yes` or `No`. No further explanation. '

In [None]:
evaluate_model_on_message_generators(generate_phi_output, model_names[1], check_count=20, user_add_prompt=user_add_prompt, need_system_prompt=False)

Evaluating microsoft/Phi-3-mini-4k-instruct
Checking Style 1


100%|██████████| 20/20 [00:08<00:00,  2.28it/s]
100%|██████████| 20/20 [00:14<00:00,  1.34it/s]
100%|██████████| 20/20 [00:37<00:00,  1.89s/it]


Checking Style 2


100%|██████████| 20/20 [00:08<00:00,  2.27it/s]
100%|██████████| 20/20 [00:13<00:00,  1.51it/s]
100%|██████████| 20/20 [00:40<00:00,  2.01s/it]


Checking Style 3


100%|██████████| 20/20 [00:09<00:00,  2.22it/s]
100%|██████████| 20/20 [00:16<00:00,  1.24it/s]
100%|██████████| 20/20 [00:46<00:00,  2.31s/it]


Checking Style 4


100%|██████████| 20/20 [00:08<00:00,  2.27it/s]
100%|██████████| 20/20 [00:15<00:00,  1.30it/s]
100%|██████████| 20/20 [00:35<00:00,  1.79s/it]


Checking Style 5


100%|██████████| 20/20 [00:08<00:00,  2.29it/s]
100%|██████████| 20/20 [00:13<00:00,  1.50it/s]
100%|██████████| 20/20 [00:33<00:00,  1.69s/it]


Result for microsoft/Phi-3-mini-4k-instruct:
	Zero-shot results: {1: 0.65, 2: 0.65, 3: 0.0, 4: 0.65, 5: 0.5}
	One-shot results: {1: 0.45, 2: 0.35, 3: 0.25, 4: 0.5, 5: 0.5}
	Few-shot results: {1: 0.45, 2: 0.55, 3: 0.4, 4: 0.5, 5: 0.7}





## universitytehran/PersianMind-v1.0

In [None]:
tokenizer2 = AutoTokenizer.from_pretrained(model_names[2])
model2 = AutoModelForCausalLM.from_pretrained(
    model_names[2],
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map={"": device},
)
# tokenizer2.use_default_system_prompt = True
# tokenizer2.chat_template = "{{ bos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + '<|end|>' }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + '<|end|>' }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + '<|end|>' }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
pipe2 = pipeline(
    "text-generation",
    model=model2,
    tokenizer=tokenizer2
)

tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/688k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.50M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/549 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/199 [00:00<?, ?B/s]

In [None]:
TEMPLATE = "{context}\nUser: {prompt}\nPersianMind: "
CONTEXT = "Check similarity of each pair of questions. Answer with `Yes` or `No` and with no further explanation."

In [None]:
def generate_pm_output(messages):
    prompt = pipe2.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        # trust_remote_code=True
    )

    terminators = [
        pipe2.tokenizer.eos_token_id,
        pipe2.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipe2(
        prompt,
        max_new_tokens=2, # Should set to 1, otherwise model will continue generating text.
        eos_token_id=terminators,
        # temperature=None,
        do_sample=True, # Yeild different answers with setting it. better trun it off.
        top_p=0.9,
    )

    return outputs[0]["generated_text"][len(prompt):].strip()

In [None]:
user_add_prompt = 'Just Answer with `Yes` or `No`. No further explanation. '
evaluate_model_on_message_generators(generate_pm_output, model_names[2], check_count=20, user_add_prompt=user_add_prompt, need_system_prompt=True)

Evaluating universitytehran/PersianMind-v1.0
Checking Style 1


100%|██████████| 20/20 [00:16<00:00,  1.22it/s]
100%|██████████| 20/20 [00:24<00:00,  1.23s/it]
100%|██████████| 20/20 [00:39<00:00,  1.99s/it]


Checking Style 2


100%|██████████| 20/20 [00:15<00:00,  1.26it/s]
100%|██████████| 20/20 [00:23<00:00,  1.15s/it]
100%|██████████| 20/20 [00:46<00:00,  2.31s/it]


Checking Style 3


100%|██████████| 20/20 [00:15<00:00,  1.26it/s]
100%|██████████| 20/20 [00:23<00:00,  1.17s/it]
100%|██████████| 20/20 [00:43<00:00,  2.17s/it]


Checking Style 4


100%|██████████| 20/20 [00:15<00:00,  1.28it/s]
100%|██████████| 20/20 [00:22<00:00,  1.14s/it]
100%|██████████| 20/20 [00:44<00:00,  2.22s/it]


Checking Style 5


100%|██████████| 20/20 [00:15<00:00,  1.28it/s]
100%|██████████| 20/20 [00:22<00:00,  1.13s/it]
100%|██████████| 20/20 [00:38<00:00,  1.92s/it]


Result for universitytehran/PersianMind-v1.0:
	Zero-shot results: {1: 0.4, 2: 0.4, 3: 0.0, 4: 0.4, 5: 0.4}
	One-shot results: {1: 0.35, 2: 0.4, 3: 0.35, 4: 0.4, 5: 0.4}
	Few-shot results: {1: 0.4, 2: 0.4, 3: 0.4, 4: 0.35, 5: 0.35}



