## 1. Load the old dataset

In [None]:
import json

# Read the original conversations. The dataset holds Arabic (Darija) text, so
# decode it explicitly as UTF-8 instead of relying on the platform's default
# encoding, which is not UTF-8 on every operating system.
with open("../dataset.json", encoding="utf-8") as f:
    data = json.load(f)

# Number of top-level items loaded (shown as the cell output).
len(data)

## 2. Add the conversation field and conversation ID

In [None]:
# Wrap every original conversation in a record that also carries a sequential
# integer id (its position in the loaded list).
new_data = [
    {"id": position, "conversation": turns}
    for position, turns in enumerate(data)
]

In [None]:
# Show the fields present on the first record after step 2.
new_data[0].keys()

In [None]:
# Preview the first record in full.
new_data[0]

## 3. Add number of tokens

In [None]:
import sys
# Add the parent directory to the import search path; presumably so that the
# local `minbpe` package imported in the next cell resolves (confirm layout).
sys.path.append('../')

In [None]:
from minbpe import RegexTokenizer

# Create the tokenizer and load its saved model file; it is used below to
# count the tokens of every conversation.
tokenizer = RegexTokenizer()
tokenizer.load("../tokenizer/darija_tokenizer.model")

In [None]:
# Store on each record the total number of tokens across all of its turns.
# Turns whose content is empty (falsy) are skipped and contribute nothing.
for record in new_data:
    record["token_count"] = sum(
        len(tokenizer.encode(turn["content"]))
        for turn in record["conversation"]
        if turn["content"]
    )

In [None]:
# Check that "token_count" now appears among the first record's fields.
new_data[0].keys()

In [None]:
# Preview the first record, including its token count.
new_data[0]

## 4. Add number of turns

In [None]:
# Record how many turns (messages) each conversation contains.
for record in new_data:
    turns = record["conversation"]
    record["turns_count"] = len(turns)

In [None]:
# Check that "turns_count" now appears among the first record's fields.
new_data[0].keys()

In [None]:
# Preview the second record in full.
new_data[1]

## 5. Turn conversation to markdown

In [None]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment; the API key
# is read from the environment when the client is created below.
load_dotenv()

In [None]:
import os

from google import genai

# Create the Gemini API client with the key taken from the GOOGLE_API_KEY
# environment variable (os.getenv returns None when the variable is unset).
client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

In [None]:
# Print the name and token limits of every available model whose name
# contains "gemini-2".
# The loop variable is deliberately NOT called `model`: that name holds the
# model id string passed to generate_content() elsewhere in this notebook, and
# re-running this cell would otherwise overwrite it with a model object.
for available_model in client.models.list():
    if "gemini-2" in available_model.name.lower():
        print(available_model.name)
        print(available_model.input_token_limit)
        print(available_model.output_token_limit)
        print()

In [None]:
# Model id passed as `model=` to every generate_content() call in this notebook.
model = "gemini-2.0-flash"
system_instruction = """You are a helpful assistant. You will be given a conversation between a user and an assistant. Your task is to convert the conversation from plain text into markdown format if possible. 

The dataset was generated manually without any markdown formatting. The conversation is in Moroccan Arabic (Darija). Here are some examples of how to format the conversation in markdown:

# Example 1:

## Before:

```json
[
    {
        "role": "user",
        "content": "السلام لباس؟"
    },
    {
        "role": "assistant",
        "content": "لاباس الحمد لله، كاين شي حاجا بغيتي نعاونك فيها؟"
    },
    {
        "role": "user",
        "content": "اه عافاك بغيت نسافر فالمغرب فالصيف ولكن معرفتش فين نمشي. ممكن تعاوني؟"
    },
    {
        "role": "assistant",
        "content": "بلان كاين بزاف ديال البلايص اللي تقد تمشي ليهم فالمغرب، انا كنقترح عليك هدو:\n\n- شفشاون: هدي مدينة فالجبل، الديور ديالها زرقين او الجو فالمدينة كيجيب الراحة.\n- الصويرة: هاد المدينة فيها البحر الا فيك ميعوم. البحر ديالها زوين او فيها المدينة القديمة.\n- الداخلة: الداخلة هي مدينة فالصحرا ديال المغرب، حتاهيا فيها البحر. الناس كيجيو ليه من العالم كامل باش يلعبوا السبور.\n- مراكش: هاد المدينة عزيزة على السياح لكيجيو من برا. فيها جامع الفنا، المدينة القديمة ولكن فالصيف دايرة بحال الفران.\n- شلالات أوزود: هاد الشلالات كاينين فالجبل دالأطلس، هادوا اشهر الشلالات فالمغرب سير تمنضر فيهوم معا راسك راه ايعجبوك.\n\nكاين بزاف ديال البلايس اخرين فالمغرب ولكن غولي بعدا واش هدوا عجبوك."
    },
    {
        "role": "user",
        "content": "شكرا بزاف اصاحبي"
    },
    {
        "role": "assistant",
        "content": "مرحبا، إلى بغيتي شت حاجة أخرى غولهالي انا هنا باش نعاونك."
    },
    {
        "role": "user",
        "content": "لالا صافي مبغيت حتى شي حاجة"
    },
    {
        "role": "assistant",
        "content": "اوكي اوا نمشي نرتاح شوية على هاد الحساب. ستمتع معا راسك بالتسافيرة."
    }
]
```

## After:

```json
[
    {
      "role": "user",
      "content": "السلام لباس؟"
    },
    {
      "role": "assistant",
      "content": "لاباس الحمد لله، كاين شي حاجا بغيتي نعاونك فيها؟"
    },
    {
      "role": "user",
      "content": "اه عافاك بغيت نسافر فالمغرب فالصيف ولكن معرفتش فين نمشي. ممكن تعاوني؟"
    },
    {
      "role": "assistant",
      "content": "بلان كاين بزاف ديال البلايص اللي تقد تمشي ليهم فالمغرب، انا كنقترح عليك هدو:\n\n- **شفشاون**: هدي مدينة فالجبل، الديور ديالها زرقين او الجو فالمدينة كيجيب الراحة.\n- **الصويرة**: هاد المدينة فيها البحر الا فيك ميعوم. البحر ديالها زوين او فيها المدينة القديمة.\n- **الداخلة**: الداخلة هي مدينة فالصحرا ديال المغرب، حتاهيا فيها البحر. الناس كيجيو ليه من العالم كامل باش يلعبوا السبور.\n- **مراكش**: هاد المدينة عزيزة على السياح لكيجيو من برا. فيها جامع الفنا، المدينة القديمة ولكن فالصيف دايرة بحال الفران.\n- **شلالات أوزود**: هاد الشلالات كاينين فالجبل دالأطلس، هادوا اشهر الشلالات فالمغرب سير تمنضر فيهوم معا راسك راه ايعجبوك.\n\nكاين بزاف ديال البلايس اخرين فالمغرب ولكن غولي بعدا واش هدوا عجبوك."
    },
    {
      "role": "user",
      "content": "شكرا بزاف اصاحبي"
    },
    {
      "role": "assistant",
      "content": "مرحبا، إلى بغيتي شت حاجة أخرى غولهالي انا هنا باش نعاونك."
    },
    {
      "role": "user",
      "content": "لالا صافي مبغيت حتى شي حاجة"
    },
    {
      "role": "assistant",
      "content": "اوكي اوا نمشي نرتاح شوية على هاد الحساب. ستمتع معا راسك بالتسافيرة."
    }
]
```

# Example 2:

## Before:

```json
[
    {
        "role": "user",
        "content": "اهلا شنو سميتك؟"
    },
    {
        "role": "assistant",
        "content": "انا سميتي بودماغ، ساوبني عماد الصاديق باش نساعدك فاي حاجة اللي نقد نعاون فيها."
    }
]
```

## After:

```json
[
    {
      "role": "user",
      "content": "اهلا شنو سميتك؟"
    },
    {
      "role": "assistant",
      "content": "انا سميتي **بودماغ**، ساوبني **عماد الصاديق** باش نساعدك فاي حاجة اللي نقد نعاون فيها."
    }
]
```

# Example 3:

## Before:

```json
[
    {
        "role": "user",
        "content": "شحال خاصني نصلي فالنهار؟"
    },
    {
        "role": "assistant",
        "content": "خاصك تصلي 5 دالمرات فالنهار."
    },
    {
        "role": "user",
        "content": "اوكي شكرا"
    },
    {
        "role": "assistant",
        "content": "مرحبا على الراس او العين"
    }
]
```

## After:

```json
[
    {
      "role": "user",
      "content": "شحال خاصني نصلي فالنهار؟"
    },
    {
      "role": "assistant",
      "content": "خاصك تصلي **5 دالمرات** فالنهار."
    },
    {
      "role": "user",
      "content": "اوكي شكرا"
    },
    {
      "role": "assistant",
      "content": "مرحبا على الراس او العين."
    }
]
```

From these examples, you should be able to understand how to format the conversation in markdown. 

Notse:
- In some conversations, markdown formatting may not be possible. In that case, you can just return the conversation as it is without any formatting.
- The output format shoudl be a valid JSON array.
"""

In [None]:
import time

from tqdm import tqdm
from pydantic import BaseModel
from google.genai.types import GenerateContentConfig, GenerateContentResponse


# Schema of a single conversation message inside the structured response.
# Documented with `#` comments on purpose: a class docstring would become part
# of the JSON schema that pydantic generates for this model.
class ConversationTurn(BaseModel):
    role: str  # speaker of the turn; the prompt examples use "user" / "assistant"
    content: str  # text of the turn


# Top-level shape requested from the model through `response_schema`: an
# object with a single `conversation` field holding the turns.
class Conversation(BaseModel):
    conversation: list[ConversationTurn]  # all turns of the conversation


def get_user_prompt(conversation: list[dict]) -> str:
    """Build the user message asking the model to markdown-format a conversation.

    The conversation is embedded as pretty-printed JSON (4-space indent,
    non-ASCII text left unescaped) rather than as a Python ``repr``. The
    system instruction demonstrates the task with JSON arrays, whereas
    ``repr`` produces single-quoted pseudo-JSON whose quoting style changes
    with the content of each string.

    Args:
        conversation: Turns of the conversation; must be JSON-serializable
            (it originates from ``json.load`` in this notebook).

    Returns:
        The prompt text: a fixed instruction sentence, a blank line, the JSON
        array, and a trailing newline.
    """
    serialized = json.dumps(conversation, ensure_ascii=False, indent=4)
    return f"""Please format the following conversation in markdown format. If you cannot format it, return it as is without any formatting.

{serialized}
"""


def get_formatted_conversation(response: "GenerateContentResponse") -> list[dict]:
    """Convert a structured model response into a list of plain turn dicts.

    Args:
        response: Result of ``generate_content`` called with
            ``response_schema=Conversation``. Its ``parsed`` attribute is
            expected to expose a ``conversation`` sequence of objects with
            ``role`` and ``content`` attributes.

    Returns:
        One ``{"role": ..., "content": ...}`` dict per turn, in the order
        returned. An empty list when there is no response, or when the
        response carries no parsed payload (``parsed`` missing or ``None``),
        so a single unusable generation does not abort the whole batch with
        an ``AttributeError``.
    """
    parsed = getattr(response, "parsed", None) if response else None
    if parsed is None:
        return []

    return [
        {"role": turn.role, "content": turn.content}
        for turn in parsed.conversation
    ]


# Ask the model for a markdown-formatted copy of every conversation and store
# it under "markdown_conversation" (an empty list when no response comes back).
# NOTE: `request_per_minute` is assigned but not read by this loop; the pacing
# comes from the fixed sleep at the end of each iteration.
request_per_minute = 30
for record in tqdm(new_data, total=len(new_data)):
    reply = client.models.generate_content(
        model=model,
        contents=get_user_prompt(record["conversation"]),
        config=GenerateContentConfig(
            system_instruction=[system_instruction],
            response_mime_type="application/json",
            response_schema=Conversation,
        ),
    )
    record["markdown_conversation"] = (
        get_formatted_conversation(reply) if reply else []
    )

    # Fixed pause between consecutive API calls.
    time.sleep(3)

## 6. Add the source of data

In [None]:
# Tag every record with the provenance of its data.
for record in new_data:
    record.update({'source': 'Manually generated'})

In [None]:
# Check that "source" now appears among the first record's fields.
new_data[0].keys()

## 7. Add the topic of the conversation

In [None]:
# System prompt for the topic-classification pass.
# NOTE: this rebinds the notebook-wide `system_instruction`, replacing the
# markdown-formatting prompt assigned earlier; any cell that reads
# `system_instruction` uses whichever assignment ran last.
system_instruction = """You are a Topic Classification AI. Your task is to analyze the provided conversation transcript and determine its single, primary topic.

You MUST prioritize selecting ONE topic from the following predefined list if the conversation's main subject is adequately covered by one of them:

[
  "Geography",
  "Science",
  "Technology",
  "History",
  "Culture",
  "Religion",
  "Education",
  "Health",
  "Sports",
  "AI Persona",
  "Chit-chat/Games/Humor",
  "Safety",
  "Mathematics",
  "Ethics",
  "Politics",
  "Language",
  "Food",
  "Animals",
  "Arts",
  "Career",
  "Travel",
  "Task Assistance"
]

Focus on the *dominant theme* or *central subject* of the entire conversation, not just isolated sentences or brief tangents.

If, after careful consideration, you find that *none* of the provided topics accurately or sufficiently capture the main subject of the conversation, you MUST identify and output a *new*, concise, and appropriate topic label that best describes it (e.g., "Relationship Advice", "Car Maintenance", "Event Planning").

Your response MUST consist *only* of the single chosen or newly identified topic label. Do not include any explanations, introductions, justifications, or surrounding text. Just output the topic name.
"""

In [None]:
import time

from tqdm import tqdm
from google.genai.types import GenerateContentConfig


def get_user_prompt(conversation: list[dict]) -> str:
    """Return the user message for the topic-classification request.

    The text is a one-line request, a blank line, the conversation rendered
    with its default string formatting, and a trailing newline.
    """
    request = "Classify the topic of the following conversation:"
    return "\n".join([request, "", f"{conversation}", ""])


# Classify the primary topic of every conversation and store it, stripped,
# under "topic".
# NOTE: `request_per_minute` is assigned but not read by this loop; the pacing
# comes from the fixed sleep at the end of each iteration.
request_per_minute = 30
for entry in tqdm(new_data, total=len(new_data)):
    conversation = entry["conversation"]
    user_prompt = get_user_prompt(conversation)
    response = client.models.generate_content(
        model=model,
        contents=user_prompt,
        config=GenerateContentConfig(
            system_instruction=[
                system_instruction
            ]
        )
    )
    # Always store a string. A missing response used to be stored as `[]`, and
    # `response.text` can be None when the model returns no text; either value
    # breaks the later `.strip()` / `value_counts()` steps. An empty string
    # marks "no topic obtained".
    raw_topic = response.text if response else None
    entry["topic"] = (raw_topic or "").strip()

    # Fixed pause between consecutive API calls.
    time.sleep(5)

## 8. Add the safety flags column

In [None]:
# System prompt for the safety-flag classification pass.
# NOTE: this rebinds the notebook-wide `system_instruction` once more; any
# cell that reads `system_instruction` uses whichever assignment ran last.
system_instruction = """You are a Safety Classification AI. Your task is to analyze the provided conversation transcript and identify any applicable safety flags based on the content.

Review the entire conversation for instances matching any of the following categories:

*   **`harmful_content_request`**: User asks for instructions or information on performing illegal or dangerous activities (e.g., making weapons, promoting violence, illegal substance use).
*   **`unethical_request`**: User asks for assistance with academically dishonest or clearly unethical actions (e.g., cheating, plagiarism, impersonation).
*   **`unsafe_activity_inquiry`**: User asks about performing physically dangerous actions without a clear safety or educational context (e.g., self-harm, dangerous stunts, misuse of objects).
*   **`sexual_content_suggestion`**: User inputs sexually explicit content, requests, or makes inappropriate sexual suggestions.
*   **`inappropriate_medical_advice_request`**: User asks for medical diagnosis, treatment plans, or specific medication advice that should only come from a healthcare professional.
*   **`insults`**: User directs personal insults or attacks towards the AI or another party.
*   **`vulgar_language`**: User employs excessive or highly offensive profanity/vulgarity.
*   **`threat`**: User expresses a direct or indirect threat of harm towards the AI or others.
*   **`animal_cruelty_suggestion`**: User suggests or describes actions that constitute cruelty or significant harm to animals.
*   **`refusal`**: The AI explicitly refuses to fulfill a user request due to safety, ethical, or capability limitations. (Include this if the AI's refusal action is relevant).

Consider the context of the conversation. A flag should only be applied if the content clearly fits the category description. Multiple flags can apply to a single conversation.

Your response MUST be a valid JSON list containing the applicable flag strings. If no flags apply and the conversation is deemed safe, output an empty JSON list `[]`. Do not include any explanations, introductions, justifications, or surrounding text. Just output the JSON list.

If you think a new flag is needed, use it instead of the predefined ones. The new flag should be a single string that describes the issue in a concise manner. For example, if the user is asking for help with a dangerous activity, you can use "dangerous_activity_request" as a new flag.
"""

In [None]:
import time

from tqdm import tqdm
from google.genai.types import GenerateContentConfig


def get_user_prompt(conversation: list[dict]) -> str:
    """Return the user message for the safety-flag classification request.

    The text is a one-line request, a blank line, the conversation rendered
    with its default string formatting, and a trailing newline.
    """
    request = "Classify the safety flags for the following conversation:"
    rendered = f"{conversation}"
    return request + "\n\n" + rendered + "\n"


# Classify the safety flags of every conversation and store the model's
# answer under "safety_flag" as a JSON *string* (decoded later with
# json.loads).
# NOTE: `request_per_minute` is assigned but not read by this loop; the pacing
# comes from the fixed sleep at the end of each iteration.
request_per_minute = 30
for entry in tqdm(new_data, total=len(new_data)):
    conversation = entry["conversation"]
    user_prompt = get_user_prompt(conversation)
    response = client.models.generate_content(
        model=model,
        contents=user_prompt,
        config=GenerateContentConfig(
            system_instruction=[
                system_instruction
            ],
            # Request raw JSON so the answer is not wrapped in markdown
            # fences or prose that the later json.loads step cannot decode.
            response_mime_type="application/json",
        )
    )
    # Always store a JSON string. A missing response used to be stored as the
    # list `[]`, which json.loads rejects, and `response.text` can be None when
    # the model returns no text, which made `.strip()` raise. Both cases now
    # fall back to "[]", i.e. "no flags obtained" (keeping the original
    # fallback's meaning).
    raw_flags = response.text if response else None
    entry["safety_flag"] = (raw_flags or "").strip() or "[]"

    # Fixed pause between consecutive API calls.
    time.sleep(5)

## 9. Save the data

In [None]:
# Normalise the topic labels by trimming surrounding whitespace.
# `or ""` covers entries whose topic is not a usable string: the
# classification loop stores `[]` when it gets no response, and the stored
# text may be None; calling `.strip()` on either would raise AttributeError.
for entry in new_data:
    entry["topic"] = (entry["topic"] or "").strip()

In [None]:
import pandas as pd

# Build a DataFrame from the records and preview its first rows.
df = pd.DataFrame(new_data)
df.head()

In [None]:
# Summary statistics of the per-conversation token counts.
df["token_count"].describe()

In [None]:
# Summary statistics of the per-conversation turn counts.
df["turns_count"].describe()

In [None]:
# Number of conversations per topic label.
df["topic"].value_counts()

In [None]:
# Rename the column in place to the plural name used from here on.
df.rename(columns={"safety_flag": "safety_flags"}, inplace=True)

In [None]:
def _parse_safety_flags(raw):
    """Decode one stored safety-flag value into a Python list.

    Args:
        raw: The value stored for one conversation. Normally the model's
            JSON text, but it can also be a list (the classification loop's
            no-response fallback, or a value already decoded by a previous
            run of this cell) or a non-string placeholder such as None.

    Returns:
        The decoded JSON value (a list of flag strings for well-formed
        answers). Lists are returned unchanged; non-string values and blank
        text yield an empty list.

    Raises:
        json.JSONDecodeError: If the text is non-empty but is not valid JSON
            even after removing a surrounding markdown code fence. Malformed
            answers are surfaced rather than silently treated as "safe".
    """
    if isinstance(raw, list):
        return raw
    if not isinstance(raw, str):
        return []

    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line ("```" or "```json") and the closing
        # fence that models often wrap around JSON answers.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0].strip()
    if not text:
        return []
    return json.loads(text)


# Replace the raw JSON strings with real Python lists, then display the column.
df["safety_flags"] = df["safety_flags"].apply(_parse_safety_flags)
df["safety_flags"]

In [None]:
# Write the enriched dataset as a JSON array with one object per row,
# indented by 4 spaces and with non-ASCII characters left unescaped.
output_path = "../dataset_with_metadata.json"
df.to_json(output_path, orient="records", force_ascii=False, indent=4)