## Load Dataset

In [None]:
import json

def load_and_format_gotquestions(file_path):
    """
    Load the gotquestions JSON file and format it into conversation pairs.

    The file is read as a list of category objects. Each category's
    'articles' list supplies one pair per article: the article's 'name' is
    used as the question and its 'answer' (with surrounding whitespace
    removed) as the reply.

    Args:
        file_path (str): Path to the JSON file

    Returns:
        list: One conversation pair per article, in file order. Each pair is
            a two-item list of dicts with 'role' and 'content' keys: the
            'user' message first, then the 'assistant' message.
    """
    # Load the JSON file
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    conversations = []

    for category in data:
        # `or` (rather than a .get default) also covers keys that are present
        # but hold JSON null, which would otherwise break iteration/.strip().
        for article in category.get('articles') or []:
            question = article.get('name') or ''
            answer = (article.get('answer') or '').strip()

            conversations.append([
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            ])

    return conversations

def load_and_format_qa_messages_jsonl(file_path):
    """
    Load the Arabic Final qa_messages.jsonl file and format it into conversation pairs.

    Each non-blank line is parsed as a JSON object with a 'messages' list.
    Messages whose role is neither 'user' nor 'assistant' (e.g. 'system')
    are dropped. A record is kept only if exactly one 'user' message
    followed by one 'assistant' message remains, so that index 0 of every
    returned pair is always the question.

    Args:
        file_path (str): Path to the JSONL file

    Returns:
        list: List of conversation pairs (each pair is a list of dicts with 'role' and 'content')

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a user/assistant message has no 'content' key.
    """
    expected_roles = ["user", "assistant"]
    conversations = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Blank lines (commonly a trailing one) are not records; feeding
            # them to json.loads would raise.
            if not line:
                continue

            data = json.loads(line)
            # Only keep user/assistant messages (ignore system if present);
            # .get tolerates messages that carry no 'role' at all.
            pair = [
                {"role": msg["role"], "content": msg["content"]}
                for msg in data.get("messages") or []
                if msg.get("role") in expected_roles
            ]
            if [msg["role"] for msg in pair] == expected_roles:
                conversations.append(pair)
    return conversations

# Load both source datasets; each loader returns a list of conversation pairs
file_path = "data/gotquestions_ar.json"
formatted_data = load_and_format_gotquestions(file_path)

file_path_jsonl = "data/Arabic Final qa_messages.jsonl"
formatted_data_jsonl = load_and_format_qa_messages_jsonl(file_path_jsonl)

# Combine both datasets (gotquestions pairs first, then the JSONL pairs)
combined_data = formatted_data + formatted_data_jsonl

# NOTE(review): shuffling is disabled and `random` is not imported in this
# cell; enabling the line below needs `import random` (and a fixed seed if
# the item order must be reproducible across runs).
# random.shuffle(combined_data)
# The number printed is the count of conversation pairs, not individual messages.
print(f"Total messages: {len(combined_data)}")

In [None]:
from parrot_ai import ParrotAI, parrot_chain

# Create the ParrotAI instance used by the generation cell and ask it to load
# the model identified by "google/gemma-3-12b-it".
# NOTE(review): presumably this downloads/loads weights and may be slow or
# memory-heavy — confirm against the parrot_ai documentation.
parrot = ParrotAI()
parrot.load_model("google/gemma-3-12b-it")

# Print the model info only when the wrapper reports a loaded model; nothing
# is printed (and no error is raised here) when is_loaded() is falsy.
if parrot.is_loaded():
    print(parrot.get_model_info())

In [None]:
from tqdm import tqdm
import parrot_ai.prompts as parrot_prompts
import os
import json

# Output file for the training dataset
output_file = "data/training_dataset.jsonl"

# Resume bookkeeping.
# Items that fail write no line, so the number of lines already in the output
# file is NOT the index of the next unprocessed item. Instead, tally the user
# questions that are already saved and skip that many matching source items.
# This avoids appending duplicates on resume and retries items that failed in
# an earlier run. It also works with files written by earlier runs, because
# the saved user message is the source question verbatim.
pending_skips = {}          # question text -> saved copies not yet matched
existing_count = 0          # readable entries already in the output file
ends_with_newline = True    # False if a crash left a truncated last line
if os.path.exists(output_file):
    with open(output_file, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            ends_with_newline = line.endswith('\n')
            if not line.strip():
                continue
            try:
                saved = json.loads(line)
                saved_question = next(
                    msg["content"]
                    for msg in saved["messages"]
                    if msg["role"] == "user"
                )
                pending_skips[saved_question] = pending_skips.get(saved_question, 0) + 1
            except (ValueError, KeyError, TypeError, StopIteration):
                # json.JSONDecodeError is a ValueError. An unreadable entry is
                # not counted, so its source item is generated again.
                print(f"Ignoring unreadable line {line_number} in {output_file}")
                continue
            existing_count += 1
    print(f"Found existing file with {existing_count} entries. Resuming from there.")
else:
    print("Creating new training dataset file.")

total_data = len(combined_data)

print(f"Processing {max(total_data - existing_count, 0)} remaining entries...")

failed_count = 0

# Open file in append mode for incremental saving
with open(output_file, 'a', encoding='utf-8') as f:
    if not ends_with_newline:
        # Keep the next record from being glued onto a truncated last line.
        f.write('\n')

    for i, data in enumerate(tqdm(combined_data, desc="Generating training data")):
        question = None
        try:
            question = data[0]["content"]  # User question

            # Already saved by a previous run: consume one tally and move on.
            if pending_skips.get(question, 0) > 0:
                pending_skips[question] -= 1
                continue

            # Generate response using parrot_chain
            response = parrot_chain(data, parrot)

            # Create training example in the format expected for fine-tuning
            training_example = {
                "messages": [
                    {
                        "role": "system",
                        "content": parrot_prompts.MAIN_SYSTEM_PROMPT
                    },
                    {
                        "role": "user",
                        "content": question
                    },
                    {
                        "role": "assistant",
                        "content": response["final_answer"]  # Final answer from chain
                    }
                ]
            }

            # Write the training example as a JSON line
            f.write(json.dumps(training_example, ensure_ascii=False) + '\n')
            f.flush()  # Ensure data is written immediately

        except Exception as e:
            # Best-effort batch job: report the failure and keep going. The
            # handler must not index into `data` again, since a malformed
            # item may be the very reason we are here.
            failed_count += 1
            print(f"\nError processing item {i}: {e}")
            print(f"Question: {str(question)[:100]}...")

print("\nTraining dataset creation completed!")
print(f"Output saved to: {output_file}")
if failed_count:
    print(f"{failed_count} items failed; re-run this cell to retry them.")

# Count final entries
with open(output_file, 'r', encoding='utf-8') as f:
    final_count = sum(1 for line in f if line.strip())
print(f"Total training examples: {final_count}")