In [1]:
%cd /src/notebooks

/src/notebooks


In [None]:
!pip install dfs~=2.20 huggingface_hub~=0.24 pyarrow~=17.0
 

In [2]:
import dotenv
dotenv.load_dotenv()

True

In [14]:
from libs.plugin_orchestrator.implementation_bare import OrchestratorBare
OrchestratorBare._prepare_prompt_function_description([])

AssertionError: 

In [44]:
import pandas as pd
import json
from typing import Tuple, Literal, List, Dict, Callable
from pydantic import BaseModel
from libs.utils.logger import get_logger
import pandas as pd
import os
import random

from libs.utils.connector_llm import ChatCompletionMessage
from libs.dataset_generator.exporter import DatasetExporterHuggingFace, DatasetExporterLocal
from libs.utils.prompt_manipulation import DefinitionOpenaiTool
from libs.plugins.plugin_capital import PluginCapital
import libs.plugin_converter.semantic_kernel_v0_to_openai_function as semantic_kernel_v0_to_openai_function
import libs.plugin_converter.openai_function_to_tool as openai_function_to_tool


logger = get_logger('libs.dataset_generator')


#########
# Plugin specific code
from libs.plugins.plugin_capital import country_to_capital

answer_function_name = 'answer'

def generate_system_message(tool_definitions: List[DefinitionOpenaiTool]) -> ChatCompletionMessage:
    return ChatCompletionMessage(role='system', content=(
        f"You are allowed to call the following function:\n"
        f"{OrchestratorBare._prepare_prompt_function_description(tool_definitions)}\n"
    ))

def generate_chitchat_trajectory() -> List[ChatCompletionMessage]:
    user_message = random.choice([
        'Hi',
    ])
    assistant_thought = 'I can answer the user.'
    assistant_answer = 'Hello, how are you doing?'
    return [
        ChatCompletionMessage(role='user', content=user_message),
        ChatCompletionMessage(role='assistant', content='{"thought": "' + assistant_thought + '", "action_name": "' + answer_function_name + '", "args": {"text": "' + assistant_answer + '"}}'),
    ]


def generate_trajectory_sorry_PluginCapital_get_capital() -> List[ChatCompletionMessage]:
    # TODO: can I decrease the code duplication between this function and generate_trajectory_call_PluginCapital_get_capital?
    country = random.choice(list(country_to_capital.keys()))
    country_requested = random.choice([country, country.lower(), country.upper()])
    user_message = random.choice([
        f"What is the capital of {country_requested}?'"
        f"Can you tell me the capital of {country_requested}?'"
        f"I need help with a geography exam about {country_requested}, can you tell me what is its capital?'"
    ])
    assistant_thought = random.choice([
        f"I don't have the necessary tools to answer that'"
        f"I have no tools to get the necessary information'"
    ])
    assistant_answer = "Sorry, I don't know the answer to that question."
    return [
        ChatCompletionMessage(role='user', content=user_message),
        ChatCompletionMessage(role='assistant', content='{"thought": "' + assistant_thought + '", "action_name": "' + answer_function_name + '", "args": {"text": "' + assistant_answer + '"}}'),
    ]

def generate_trajectory_call_PluginCapital_get_capital() -> List[ChatCompletionMessage]:
    '''
    Language: english
    Function calling syntax: capivara_orchestrator_bare
    Approximate number of unique trajectories:
    196 counties * 3 ways to ask the question * 4 ways to answer = 2352
    '''
    country = random.choice(list(country_to_capital.keys()))
    country_requested = random.choice([country, country.lower(), country.upper()])
    user_message = random.choice([
        f'What is the capital of {country_requested}?',
        f'Can you tell me the capital of {country_requested}?',
        f'I need help with a geography exam about {country_requested}, can you tell me what is its capital?',
    ])
    assistant_thought = random.choice([
        f'I need to retrieve the capital of {country}.',
        f'I need to know the capital of {country}.',
        f'This action will help me get the capital of {country}.',
        f'The capital of {country} will be returned by PluginCapital_get_capital.',
    ])
    return [
        ChatCompletionMessage(role='user', content=user_message),
        ChatCompletionMessage(role='assistant', content='{"thought": "' + assistant_thought + '", "action_name": "PluginCapital_get_capital", "args": {"country": "' + country + '"}}'),
    ]

# end of plugin specific code
#########

class DatasetGenerator(BaseModel):
    tool_definitions: List[DefinitionOpenaiTool]

    # This should return a system message that explains the functions that are available
    # The way the function are explained should coincide with the function calling syntax
    # TODO: some caller may want to use this modules to generate a dataset were the LLM does not receive a list of possible functions, instead if learn it... that case is not supported
    generate_system_message: Callable[[List[DefinitionOpenaiTool]], ChatCompletionMessage]

    # To be used in case 1a and 1b
    generate_chitchat_trajectory: Callable[[], List[ChatCompletionMessage]]

    # To be used in case 2a
    # dict maps toolsName -> function that generates a trajectory
    generate_sorry_trajectories: Dict[str, Callable[[], List[ChatCompletionMessage]]]

    # To be used in case 2b
    # This should not contain the first system message (where the functions are explained); that is provided in `generate_system_message`
    # dict maps toolsName -> function that generates a trajectory
    generate_tool_trajectories: Dict[str, Callable[[], List[ChatCompletionMessage]]]

    answer_function_name: str = 'answer'  # We suppose that this function is always available

    function_calling_syntax_source: Literal['capivara_orchestrator_bare', 'llm_compiler'] = 'capivara_orchestrator_bare'
    function_calling_syntax_target: Literal['capivara_orchestrator_bare', 'llm_compiler'] = 'capivara_orchestrator_bare'

    proportion_needs_tool: float = 0.9
    proportion_have_tool: float = 0.9
    n: int

    def generate(self) -> List[List[ChatCompletionMessage]]:
        '''
        return a dataset
        Each item in the outer list represents a row
        Each row represents a trajectory

        A trajectory is simply a List[ChatCompletionMessage] containing a user instruction and the assistant responses (which probably calls tools)
        
        It will become a hugingface dataset... but it's not one... so maybe I should use another name

        -----

        The resulting dataset must cover all 4 cases:
        1a. the user does not need a tool (it's a normal conversation) and no tool was provided
        1b. the user does not need a tool but tools were provided
        2a. the user needs a tool but no tool was provided
        2b. the user needs a tool but and a suitable tool was provided

        Cases 1a and 1b are generated using chitchat_trajectory_generator
        Case 2b is generated using tool_trajectory_generators
        Case 2a is currently not supported... TODO!!!
        The proportions in the dataset are controlled by the parameters proportion_needs_tool and proportion_have_tool
        
        TODO: for cases 1b and 2b, currently we pass all tools
        Maybe I should send a random subset of tools (for 2b that has to include the tool that is needed)
        '''
        # TODO: this method should be capable of converting between different tool calling syntaxes
        # The source (generated trajectories) can be in any of the supported syntaxes, whatever the caller prefers (since they are the unfortunate bastards that will have to write the code for it)
        # An alternative implementation should be to create ooone more syntax, a "generic" one, and obly the caller to use it. but nah
        # So conversion seems crumblesome, but it's the most flexible way to do it (and the most convinient for the caller)
        if self.function_calling_syntax_source != 'capivara_orchestrator_bare' and self.function_calling_syntax_target != 'capivara_orchestrator_bare':
            raise NotImplementedError('Only capivara_orchestrator_bare is supported')

        if not all([tool.function.name in self.generate_sorry_trajectories for tool in self.tool_definitions if tool.function.name != self.answer_function_name]):
            raise ValueError('Not all tools have a trajectory generator')        
        if not all([tool.function.name in self.generate_tool_trajectories for tool in self.tool_definitions if tool.function.name != self.answer_function_name]):
            raise ValueError('Not all tools have a trajectory generator')

        dataset = []
        for i in range(self.n):
            trajectory: List[ChatCompletionMessage]
            if random.random() > self.proportion_needs_tool:
                # case 1: the user does not need a tool
                if random.random() > self.proportion_have_tool:
                    # case 1a: no tool was provided
                    _tool_definitions_active = list(filter(lambda d: d.function.name == self.answer_function_name, self.tool_definitions))
                    trajectory = [generate_system_message(_tool_definitions_active)] + generate_chitchat_trajectory()
                    logger.info(f"Generated trajectory {i} for case 1a")
                else:
                    # case 1b: tools were provided
                    # TODO: select random subset of tools, including 'answer'
                    _tool_definitions_active = self.tool_definitions
                    trajectory = [generate_system_message(_tool_definitions_active)] + generate_chitchat_trajectory()
                    logger.info(f"Generated trajectory {i} for case 1b")
            else:
                # case 2: the user needs a tool
                tool_name: str = random.choice([d.function.name for d in self.tool_definitions if d.function.name != self.answer_function_name])
                if random.random() > self.proportion_have_tool:
                    # case 2a: no tool was provided
                    _tool_definitions_active = list(filter(lambda d: d.function.name == self.answer_function_name, self.tool_definitions))
                    trajectory = [generate_system_message(_tool_definitions_active)] + self.generate_sorry_trajectories[tool_name]()
                    logger.info(f"Generated trajectory {i} for case 2a")
                else:
                    # case 2b: a suitable tool was provided
                    # TODO: select random subset of tools, including 'answer' and the one that is needed
                    _tool_definitions_active = self.tool_definitions
                    trajectory = [generate_system_message(_tool_definitions_active)] + self.generate_tool_trajectories[tool_name]()
                    logger.info(f"Generated trajectory {i} for case 2b")
            dataset.append(trajectory)
        assert len(dataset) == self.n
        assert all([type(row) == list for row in dataset])
        assert all([len(row) > 0 for row in dataset]), 'there are empty trajectories'
        assert all([all([type(message) == ChatCompletionMessage for message in row]) for row in dataset]), 'A trajectory should contain only ChatCompletionMessage objects'
        return dataset


generator = DatasetGenerator(
    tool_definitions=openai_function_to_tool.generate_definitions(semantic_kernel_v0_to_openai_function.generate_definitions([
        (PluginCapital(), "PluginCapital"),
    ])),
    generate_system_message=generate_system_message,
    generate_chitchat_trajectory=generate_chitchat_trajectory,
    generate_sorry_trajectories={
        'PluginCapital_get_capital': generate_trajectory_sorry_PluginCapital_get_capital,
    },
    generate_tool_trajectories={
        'PluginCapital_get_capital': generate_trajectory_call_PluginCapital_get_capital,
    },
    answer_function_name=answer_function_name,
    n=50,
)




dataset = generator.generate()
#await DatasetExporterHuggingFace(
#    dataset=dataset,
#    repo_id = "LeonardoBenitez/capivara-plugin-orchestration",
#    dataset_config='plugins_capital_v1.0',
#).export()

#await DatasetExporterLocal(
#    dataset=dataset,
#    path_local_base = '/src/data/plugins_capital_v1.0',
#).export()
dataset

[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 0 for case 2a
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 1 for case 2b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 2 for case 2b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 3 for case 2b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 4 for case 2b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 5 for case 2b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 6 for case 2b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 7 for case 2b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 8 for case 2b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 9 for case 1b


[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 18 for case 2a
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 19 for case 2b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 20 for case 1b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 21 for case 2b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 22 for case 2b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 23 for case 2a
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 24 for case 2b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 25 for case 2b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 26 for case 1b
[2024-08-14 11:58:05 +0000] libs.libs.dataset_generator INFO     Generated trajectory 27 fo

[[ChatCompletionMessage(role='system', content="You are allowed to call the following function:\n1. answer: Send response back to the user. Show all your results here, this is the only thing that the user will see. You won't be able to call any other function after this one. Be sure to return a complete and clear answer, the user will not be able to see any other intermediate messages nor ask for more information. Never mention intermediate messages or results; if you want to mention something, include it here. Call this function only once, with everything you want to show to the user..\n  Arguments: text (final textual response to be send to the user. Should use HTML syntax for formatting. Type: string. Required: yes)\n"),
  ChatCompletionMessage(role='user', content="What is the capital of mexico?'Can you tell me the capital of mexico?'I need help with a geography exam about mexico, can you tell me what is its capital?'"),
  ChatCompletionMessage(role='assistant', content='{"thought"

In [43]:
dataset[42]

[ChatCompletionMessage(role='system', content="You are allowed to call the following function:\n1. answer: Send response back to the user. Show all your results here, this is the only thing that the user will see. You won't be able to call any other function after this one. Be sure to return a complete and clear answer, the user will not be able to see any other intermediate messages nor ask for more information. Never mention intermediate messages or results; if you want to mention something, include it here. Call this function only once, with everything you want to show to the user..\n  Arguments: text (final textual response to be send to the user. Should use HTML syntax for formatting. Type: string. Required: yes)\n"),
 ChatCompletionMessage(role='user', content='What is the capital of morocco?'),
 ChatCompletionMessage(role='assistant', content='{"thought": "I have no tools to get the necessary information", "action_name": "answer", "args": {"text": "Sorry, I don\'t know the answe

In [7]:
# test
from datasets import load_dataset

ds = load_dataset("LeonardoBenitez/capivara-plugin-orchestration", 'plugins_capital_v1.0')

In [8]:
ds['train'].to_pandas()

Unnamed: 0,text
0,"[{""role"": ""user"", ""content"": ""What is the capi..."
1,"[{""role"": ""user"", ""content"": ""Hello""}, {""role""..."
2,"[{""role"": ""user"", ""content"": ""Hello""}, {""role""..."
3,"[{""role"": ""user"", ""content"": ""Hello""}, {""role""..."
4,"[{""role"": ""user"", ""content"": ""What is the capi..."
5,"[{""role"": ""user"", ""content"": ""What is the capi..."
6,"[{""role"": ""user"", ""content"": ""Hello""}, {""role""..."
7,"[{""role"": ""user"", ""content"": ""What is the capi..."
8,"[{""role"": ""user"", ""content"": ""Hello""}, {""role""..."
9,"[{""role"": ""user"", ""content"": ""Hello""}, {""role""..."


In [11]:
print(ds['train'].to_pandas().iloc[0]['text'])

[{"role": "user", "content": "What is the capital of france?"}, {"role": "assistant", "content": "{\"thought\": \"I need to retrieve the capital of France.\", \"action_name\": \"PluginCapital_get_capital\", \"args\": {\"country\": \"France\"}}"}]


In [None]:
ds