# Fine tuning

In [None]:
import json
from typing import cast

import gel
from pydantic import BaseModel, TypeAdapter
from settings import NANAPI_CLIENT_ID

In [None]:
# Gel client shared by the cells below. `with_globals` is called with
# `client_id=NANAPI_CLIENT_ID`, so the client used for queries here has that
# global set. `cast` only tells the type checker the result is a `gel.Client`;
# it does nothing at runtime.
client = cast(
    gel.Client,
    gel.create_client().with_globals(client_id=NANAPI_CLIENT_ID),  # pyright: ignore[reportUnknownMemberType]
)

## Dataset


In [None]:
class MessageData(BaseModel):
    """One row of the query issued by `fetch_messages`: a message and the message it replies to."""

    # `content` of the selected `discord::Message`.
    content: str
    # Content of the replied-to message, selected in the query as
    # `.data['referenced_message']['content']`.
    referenced_content: str

In [None]:
def fetch_messages(user_id: str):
    """Return the replies written by ``user_id`` together with the text they reply to.

    Selects every ``discord::Message`` whose ``author_id`` equals ``user_id``
    and whose ``data`` JSON has a ``referenced_message.content`` path, ordered
    by ``timestamp`` descending (newest first).

    Args:
        user_id: Value compared against ``discord::Message.author_id``.

    Returns:
        A ``list[MessageData]`` parsed from the JSON the query returns, using
        non-strict (``strict=False``) pydantic validation.
    """
    query = r"""
        select discord::Message {
            content,
            referenced_content := .data['referenced_message']['content'],
        }
        filter .author_id = <str>$user_id
        and exists json_get(.data, 'referenced_message', 'content')
        order by .timestamp desc
        """
    raw_json = client.query_json(query, user_id=user_id)  # pyright: ignore[reportUnknownMemberType]

    # The query result is one JSON array; validate it in a single pass.
    adapter = TypeAdapter(list[MessageData])
    return adapter.validate_json(raw_json, strict=False)


# `author_id` (string form) of the Discord user whose replies feed the dataset.
author_id = '168753518386216960'
messages = fetch_messages(author_id)

In [None]:
# Build one chat-format training example per reply: the replied-to text is
# the 'user' turn and the author's own message is the 'assistant' turn.
chatml_data = []
for msg in messages:
    # Drop pairs where either side is empty; they carry no training signal.
    if not (msg.referenced_content and msg.content):
        continue
    user_turn = {'role': 'user', 'content': msg.referenced_content}
    assistant_turn = {'role': 'assistant', 'content': msg.content}
    chatml_data.append({'messages': [user_turn, assistant_turn]})

print(len(chatml_data))

# JSONL output: one JSON object per line.
with open('chatml_dataset.jsonl', 'w') as out_file:
    out_file.writelines(json.dumps(example) + '\n' for example in chatml_data)