# Fine tuning

In [None]:
from typing import cast

import gel
import orjson
from pydantic import BaseModel
from tqdm import tqdm

from settings import NANAPI_CLIENT_ID

In [None]:
# Create the async Gel client, then derive a client that carries the
# `client_id` global set to NANAPI_CLIENT_ID for the queries below.
_scoped_client = gel.create_async_client().with_globals(client_id=NANAPI_CLIENT_ID)
# `cast` only narrows the static type; nothing is converted at runtime.
client = cast(gel.AsyncIOClient, _scoped_client)

## Dataset


In [None]:
class AuthorData(BaseModel):
    """Author portion of a stored message payload."""

    # Author identifier; compared against USER_ID when building the dataset.
    id: str


class EmbedField(BaseModel):
    """One name/value entry of an embed's ``fields`` list."""

    name: str
    value: str


class EmbedFooter(BaseModel):
    """Footer of an embed; only its text is modelled."""

    text: str


class EmbedData(BaseModel):
    """Embed attached to a message.

    Every attribute is optional and defaults to ``None`` when the payload
    does not provide it.
    """

    title: str | None = None
    url: str | None = None
    description: str | None = None
    fields: list[EmbedField] | None = None
    footer: EmbedFooter | None = None


class BaseMessageData(BaseModel):
    """Fields shared by a message and by the message it references.

    All three fields are required when validating a payload.
    """

    author: AuthorData
    # Text of the message; this is what ends up in the dataset turns.
    content: str
    embeds: list[EmbedData]


class MessageData(BaseMessageData):
    """A stored message payload, optionally carrying the message it references.

    ``author``, ``content`` and ``embeds`` are inherited unchanged from
    ``BaseMessageData``; only the additional field is declared here so the
    shared fields keep a single definition.
    """

    # The referenced message, parsed with the base model (so it carries no
    # nested reference of its own); ``None`` when the payload has none.
    referenced_message: BaseMessageData | None = None

In [None]:
# EdgeQL: distinct `channel_id` values over the `discord::Message` objects
# that belong to the current `global client`, were written by `$author_id`,
# and have no `noindex` value set.
CHANNELS_QUERY = r"""
with
  author_id := <str>$author_id,
  messages := (
    select discord::Message
    filter .client = global client
    and .author_id = author_id
    and not exists .noindex
  )
select distinct messages.channel_id
"""


async def get_channels(author_id: str):
    """Run CHANNELS_QUERY for ``author_id`` and return the query result.

    The result holds the distinct channel ids in which that author has at
    least one message without a ``noindex`` value.
    """
    channel_ids = await client.query(CHANNELS_QUERY, author_id=author_id)
    return channel_ids


# EdgeQL: `data` and `noindex` of every `discord::Message` of the current
# `global client` in channel `$channel_id`, ordered by ascending timestamp.
# Unlike CHANNELS_QUERY this does not filter on `noindex`; callers receive
# the value alongside each message instead.
ALL_MESSAGES_QUERY = r"""
with
  channel_id := <str>$channel_id,
select discord::Message { data, noindex }
filter .client = global client
and .channel_id = channel_id
order by .timestamp asc
"""


async def get_messages(channel_id: str):
    """Run ALL_MESSAGES_QUERY for ``channel_id`` and return the query result.

    Each returned object exposes ``data`` and ``noindex``; the query orders
    them by ascending timestamp.
    """
    rows = await client.query(ALL_MESSAGES_QUERY, channel_id=channel_id)
    return rows


async def yield_messages(channel_id: str):
    """Yield ``(message, noindex)`` pairs for every message of a channel.

    All rows are fetched up front with ``get_messages``; each row's ``data``
    JSON is then validated into a ``MessageData`` as it is yielded. A
    transient tqdm bar (``leave=False``) tracks progress through the rows.
    """
    rows = await get_messages(channel_id)
    progress = tqdm(rows, leave=False)
    for row in progress:
        parsed = MessageData.model_validate_json(row.data)
        yield parsed, row.noindex



# EdgeQL: `data` of the `discord::Message` of the current `global client`
# whose `message_id` equals `$message_id`.
MESSAGE_QUERY = r"""
with
  message_id := <str>$message_id,
select discord::Message { data }
filter .client = global client
and .message_id = message_id
"""


async def get_message(message_id: str):
    """Fetch the message with ``message_id`` and parse it into ``MessageData``.

    Uses ``query_required_single``, so exactly one matching message is
    expected; its ``data`` JSON is validated with the pydantic model.
    """
    row = await client.query_required_single(MESSAGE_QUERY, message_id=message_id)
    payload = row.data
    return MessageData.model_validate_json(payload)

In [None]:
USER_ID = '168753518386216960'

channels = await get_channels(USER_ID)

conversations = []
questions: list[MessageData] = []
answers: list[MessageData] = []

for channel in tqdm(channels):
    async for message, noindex in yield_messages(channel):
        if message.author.id == USER_ID and not noindex:
            answers.append(message)
        else:
            if questions and answers:
                conversation = [
                    {
                        'from': 'human',
                        'value': '\n'.join(m.content for m in questions),
                    }
                ]
                gpt_msgs: list[MessageData] = []
                for m in answers:
                    if m.referenced_message:
                        if gpt_msgs:
                            conversation.append(
                                {
                                    'from': 'gpt',
                                    'value': '\n'.join(msg.content for msg in gpt_msgs),
                                }
                            )
                            conversations.append(conversation)
                        conversation = [
                            {
                                'from': 'human',
                                'value': m.referenced_message.content,
                            }
                        ]
                        gpt_msgs = []
                    gpt_msgs.append(m)
                if gpt_msgs:
                    conversation.append(
                        {
                            'from': 'gpt',
                            'value': '\n'.join(msg.content for msg in gpt_msgs),
                        }
                    )
                    conversations.append(conversation)
                questions = []
                answers = []
            if questions and message.author.id == questions[-1].author.id:
                questions.append(message)
            else:
                questions = [message]

In [None]:
# Notebook cell output: how many conversations were collected above.
len(conversations)

In [None]:
# Write the dataset next to the notebook as `<USER_ID>.json`; orjson returns
# bytes, hence the binary mode. OPT_INDENT_2 pretty-prints the output.
with open(f'{USER_ID}.json', 'wb') as dataset_file:
    payload = orjson.dumps(
        {'conversations': conversations},
        option=orjson.OPT_INDENT_2,
    )
    dataset_file.write(payload)