In [1]:
# Download Ollama's install script and pipe it straight into `sh`.
# NOTE(review): this executes a remote script without inspecting it first.
!curl -fsSL https://ollama.com/install.sh | sh 

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%   12.4%################                                    53.2%                59.6%
>>> Creating ollama user...
>>> Adding ollama user to render group...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [2]:
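## Before running the cells below, start the Ollama server in one terminal: `ollama serve`
## In a second terminal, pull and start the generation model: `ollama run llama3.1:8b-instruct-fp16`
## Then pull and start the scoring model: `ollama run gemma2:9b-instruct-fp16`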

In [3]:
!pip install OpenAI datasets rich

Collecting OpenAI
  Downloading openai-1.54.3-py3-none-any.whl.metadata (24 kB)
Collecting rich
  Downloading rich-13.9.4-py3-none-any.whl.metadata (18 kB)
Collecting httpx<1,>=0.23.0 (from OpenAI)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from OpenAI)
  Downloading jiter-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting typing-extensions<5,>=4.11 (from OpenAI)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting markdown-it-py>=2.2.0 (from rich)
  Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->OpenAI)
  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->OpenAI)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich)
  Downloading mdurl-0.1.2-py3-none-any.whl.m

In [4]:
from rich import print
import os
from openai import OpenAI
from datasets import Dataset, DatasetDict, load_dataset
import json

In [5]:
# Ollama model tag passed as `model=` by the generation functions below.
modelName = "llama3.1:8b-instruct-fp16"

In [15]:
# Run configuration: seed topic, raw-output file name, and how much data to request.
topic = "Science"
# The f-string already interpolates `topic`. Chaining .format() onto it would
# re-parse the finished string and fail (or substitute wrongly) whenever the
# topic itself contains '{' or '}'.
fileName = f'synthetic_data-{topic}.jsonl'
print(fileName)
n_subtopics = 14  # number of subtopics requested for the topic
n_questions = 2   # number of questions requested per subtopic

# OpenAI client pointed at the local Ollama endpoint instead of the hosted API.
client = OpenAI(
    api_key='ollama',  # required, but unused
    base_url='http://localhost:11434/v1',  # using ollama local endpoint
)

In [16]:
# 1. Subtopics Generation

# Prompt for step 1. Placeholders: {n_subtopics} (how many to list) and {topic}.
# The reply is split on newlines by the calling code, hence the request for one
# bare subtopic per line.
TOPIC_GENERATION_PROMPT_TEMPLATE = """\
Given a topic, generate a list of {n_subtopics} simple subtopics in Arabic that are formally related to the topic.

The topic is: {topic}

Only provide the list of subtopics, without numbers, descriptions, or any additional text. Separate each subtopic with a newline.
"""

def generate_subtopics(client, topic, n_subtopics):
    """Ask the chat model for `n_subtopics` subtopics of `topic`.

    Fills TOPIC_GENERATION_PROMPT_TEMPLATE and sends it as a single user
    message to the model named by the module-level `modelName`.

    Args:
        client: object exposing `chat.completions.create(...)`.
        topic: the topic to break down.
        n_subtopics: how many subtopics to request.

    Returns:
        The chat-completion response object exactly as returned by the client;
        callers read `choices[0].message.content` from it.
    """
    user_message = {
        "role": "user",
        "content": TOPIC_GENERATION_PROMPT_TEMPLATE.format(
            n_subtopics=n_subtopics, topic=topic
        ),
    }
    return client.chat.completions.create(
        model=modelName,
        messages=[user_message],
        max_tokens=1024,
        temperature=0.2,
        top_p=0.7,
    )

# Request the subtopics, then show the reply split into lines along with the line count.
responses = generate_subtopics(client, n_subtopics=n_subtopics, topic=topic)
_reply_lines = responses.choices[0].message.content.split('\n')
print(_reply_lines, len(_reply_lines))

In [18]:

# 2. Questions Generation

# Prompt for step 2. Placeholders: {n_questions} (how many to ask for) and {sub_topic}.
# The reply is split on newlines by the calling code, hence one bare question per line.
QUESTION_PROMPT_TEMPLATE = """
Given a topic, generate {n_questions} in-depth questions in Arabic that could be asked about that topic.

The topic is: {sub_topic}

Only provide the list of questions, formatted with each question on a new line. Exclude numbering, bullet points, and any additional text.
"""
# One subtopic per non-blank line of the model's reply. Blank or whitespace-only
# lines (trailing newline, spacing between items) are dropped; otherwise each of
# them would trigger a question-generation call for an empty topic.
subtopic_list = [
    line.strip()
    for line in responses.choices[0].message.content.split("\n")
    if line.strip()
]
def generate_questions(client, sub_topic, n_questions):
    """Ask the chat model for `n_questions` questions about `sub_topic`.

    Fills QUESTION_PROMPT_TEMPLATE, sends it as a single user message to the
    model named by the module-level `modelName`, prints the reply and returns it.

    Args:
        client: object exposing `chat.completions.create(...)`.
        sub_topic: the subtopic the questions should be about.
        n_questions: how many questions to request.

    Returns:
        The text content of the first choice in the reply.
    """
    user_prompt = QUESTION_PROMPT_TEMPLATE.format(
        n_questions=n_questions, sub_topic=sub_topic
    )
    request = dict(
        model=modelName,
        messages=[{"role": "user", "content": user_prompt}],
        temperature=0.2,
        top_p=0.7,
        max_tokens=8048,
    )
    completion = client.chat.completions.create(**request)
    questions_text = completion.choices[0].message.content
    print(questions_text)
    return questions_text

def question_generator(client, subtopic_list, n_question):
    """Generate questions for every subtopic, one model call per subtopic, in order.

    Args:
        client: passed through to `generate_questions`.
        subtopic_list: iterable of subtopics.
        n_question: number of questions to request for each subtopic.

    Returns:
        A list with one entry per subtopic, each entry being whatever
        `generate_questions` returned for it. The list is also printed.
    """
    question_list = [
        generate_questions(client, subtopic, n_question)
        for subtopic in subtopic_list
    ]
    print(question_list)
    return question_list

question_list = question_generator(client, subtopic_list, n_questions)

# Flatten the per-subtopic replies into one list of questions, one per line.
# Each line is stripped BEFORE the emptiness check, so whitespace-only lines
# do not end up as empty questions that would later be sent to the model.
question_list_formatted = [
    question.strip()
    for question_set in question_list
    for question in question_set.split("\n")
    if question.strip()
]

In [19]:
# 3. Responses Generation
# Prompt for step 3. Placeholder: {question}.
# The "RESPONSE A:" / "RESPONSE B:" markers requested here are the delimiters
# that the parsing code further down splits the model's reply on.
# NOTE(review): "The questions is" is a typo; the literal is left untouched
# here because it is sent to the model verbatim.
RESPONSE_PROMPT_TEMPLATE = """
Given a question, generate a formal positive and loosely related, irrelevant negative response in Arabic.

The questions is: {question}

RESPONSE A should be a formal, positive response that directly answers the question.
RESPONSE B should be a formal, negative response that is loosely related to the question but does not answer it.
Your response should follow this format without any numbers, bullet points, additional text, or the question itself:

RESPONSE A: [Positive response text here]
RESPONSE B: [Negative response text here]
"""
def generate_responses(client, question):
    """Ask the chat model for a positive and a negative answer to `question`.

    Fills RESPONSE_PROMPT_TEMPLATE, prints the prompt, sends it as a single
    user message to the model named by the module-level `modelName`, then
    prints and returns the reply.

    Args:
        client: object exposing `chat.completions.create(...)`.
        question: the question text to answer.

    Returns:
        The text content of the first choice in the reply.
    """
    prompt = RESPONSE_PROMPT_TEMPLATE.format(question=question)
    print('Prompt: ', prompt)
    request = dict(
        model=modelName,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1024,
        temperature=0.2,
        top_p=0.7,
    )
    answer_text = client.chat.completions.create(**request).choices[0].message.content
    print(answer_text)
    return answer_text

def response_generator(client, question_list):
    """Collect the model's reply for every question, in input order.

    Args:
        client: passed through to `generate_responses`.
        question_list: iterable of question strings.

    Returns:
        A list with one `generate_responses` result per question.
    """
    response_list = []
    for question in question_list:
        response_list.append(generate_responses(client, question))
    return response_list

def _parse_response_pair(response_set):
    """Split one model reply into its positive and negative answers.

    Args:
        response_set: raw reply text, expected to contain "RESPONSE A: ..."
            followed by "RESPONSE B: ...".

    Returns:
        A `(positive, negative)` tuple of stripped strings, or None when the
        "RESPONSE B:" marker is missing or either part is empty.
    """
    parts = response_set.split("RESPONSE B:")
    if len(parts) < 2:
        # Without the marker, the whole reply would be stored as BOTH the
        # positive and the negative answer.
        return None
    positive = parts[0].replace("RESPONSE A:", "").strip()
    # Only the text up to the first blank line after the marker is kept.
    negative = parts[-1].split("\n\n")[0].strip()
    if not positive or not negative:
        return None
    return positive, negative


question_response_list = response_generator(client, question_list_formatted)
question_response_pair_list = []
for question, response_set in zip(question_list_formatted, question_response_list):
    parsed = _parse_response_pair(response_set)
    if parsed is None:
        # Drop replies that did not follow the requested format rather than
        # writing a corrupt record.
        print('Skipping malformed response for question: ', question)
        continue
    positive, negative = parsed
    question_response_pair_list.append(
        {
            "question": question,
            "positive": positive,
            "negative": negative,
        }
    )

# One JSON object per line. json.dumps escapes non-ASCII characters by default,
# so the file content is the same under any ASCII-compatible encoding; the
# encoding is spelled out so it does not depend on the platform default.
with open(fileName, 'w', encoding='utf-8') as f:
    for item in question_response_pair_list:
        f.write(json.dumps(item))
        f.write('\n')


In [20]:
# Ollama model tag of the model intended for the scoring step (the second
# model listed in the setup notes at the top of the notebook).
scoringModelName = "gemma2:9b-instruct-fp16"

In [21]:
import json
# Reload the generated pairs from disk: one JSON object per line.
# Blank lines are skipped so a stray empty line does not crash json.loads.
with open(fileName, encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Show a sample record. An empty file yields an empty list, so do not index blindly.
print(data[0] if data else 'No records found in ' + fileName)

In [22]:
# Prompt for the scoring step. Placeholders: {question} and {positive}.
# It has its own name so that it does not overwrite RESPONSE_PROMPT_TEMPLATE,
# which generate_responses() reads at call time; reusing that name made
# re-running the response-generation step fail with KeyError: 'positive'.
EVALUATION_PROMPT_TEMPLATE = """
Acting as an Arabic language expert and a subject matter expert on the provided question, evaluate the quality of the following positive answer based on its accuracy, relevance, and clarity in addressing the question. Rate the answer on a confidence scale from 1 to 10, where 1 represents the lowest confidence in the answer's validity and 10 represents the highest. If the question or answer is missing, incomplete, appears incorrect, or contains any words that are neither English nor Arabic, respond with 0. Only respond with the evaluation number.

Question: {question}
Positive Answer: {positive}
"""
def generate_evaluation(client, item):
    """Score the positive answer of one record with the scoring model.

    Fills EVALUATION_PROMPT_TEMPLATE from `item`, prints the prompt, sends it
    as a single user message to the model named by the module-level
    `scoringModelName`, prints the reply and extracts the score from it.

    Args:
        client: object exposing `chat.completions.create(...)`.
        item: mapping with at least the keys 'question' and 'positive'.

    Returns:
        The first integer found in the reply if it lies in 0..10, otherwise 0.
        0 is the value the prompt itself reserves for "invalid", so a reply
        that cannot be parsed is treated the same way instead of raising.

    Raises:
        KeyError: if `item` lacks 'question' or 'positive'.
    """
    import re  # local import: only this function needs it

    question = item['question']
    positive = item['positive']
    prompt = EVALUATION_PROMPT_TEMPLATE.format(question=question, positive=positive)
    print('Prompt: ', prompt)
    response = client.chat.completions.create(
        # The scoring model, not the generation model, judges the answers.
        model=scoringModelName,
        messages=[
            {"role": "user",
             "content": prompt}
        ],
        temperature=0.2,
        top_p=0.7,
        max_tokens=1024,
    )
    reply = response.choices[0].message.content or ""
    print(reply)
    # Models do not always answer with a bare number ("8/10", "Score: 8", ...),
    # so take the first integer in the reply instead of int() on the whole text.
    match = re.search(r"-?\d+", reply)
    if match is None:
        return 0
    score = int(match.group())
    return score if 0 <= score <= 10 else 0
# Keep only the records whose evaluation score is strictly greater than 7.
finalItems = []

for candidate in data:
    score = generate_evaluation(client, candidate)
    if score > 7:
        finalItems.append(candidate)

In [23]:
# Write the accepted records as JSON Lines, tagging each with the topic.
# The f-string already substitutes `topic`; chaining .format() onto it would
# re-parse the finished name and break on topics containing '{' or '}'.
# json.dumps escapes non-ASCII characters by default, so the explicit encoding
# only removes the dependency on the platform default.
with open(f"final-{topic}.jsonl", 'w', encoding='utf-8') as f:
    for item in finalItems:
        item['topic'] = topic
        f.write(json.dumps(item))
        f.write('\n')