In [1]:
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import json
import matplotlib.pyplot as plt
import seaborn as sns
import re
from tqdm import tqdm
import networkx as nx
from typing import Dict, List
import uuid

In [2]:
d = load_dataset("Team-ACE/ToolACE")

In [3]:
d['train']

Dataset({
    features: ['system', 'conversations'],
    num_rows: 11300
})

In [6]:
d['train'][0:2]

{'system': ['You are an expert in composing functions. You are given a question and a set of possible functions. \nBased on the question, you will need to make one or more function/tool calls to achieve the purpose. \nIf none of the function can be used, point it out. If the given question lacks the parameters required by the function,\nalso point it out. You should only return the function call in tools call sections.\nHere is a list of functions in JSON format that you can invoke:\n[{"name": "newAddress", "description": "Generates a new Ethereum address that can be used to send or receive funds. Do not lose the password! We can\'t restore access to an address if you lose it.", "parameters": {"type": "dict", "properties": {"password": {"description": "The password for the new Ethereum address", "type": "string"}}, "required": ["password"]}, "required": null}, {"name": "Market Trends API", "description": "Get the latest market trends and relevant news for a specified country and langua

In [12]:
d['train'][123].keys()

dict_keys(['system', 'conversations'])

In [13]:
d['train'][0]

{'system': 'You are an expert in composing functions. You are given a question and a set of possible functions. \nBased on the question, you will need to make one or more function/tool calls to achieve the purpose. \nIf none of the function can be used, point it out. If the given question lacks the parameters required by the function,\nalso point it out. You should only return the function call in tools call sections.\nHere is a list of functions in JSON format that you can invoke:\n[{"name": "newAddress", "description": "Generates a new Ethereum address that can be used to send or receive funds. Do not lose the password! We can\'t restore access to an address if you lose it.", "parameters": {"type": "dict", "properties": {"password": {"description": "The password for the new Ethereum address", "type": "string"}}, "required": ["password"]}, "required": null}, {"name": "Market Trends API", "description": "Get the latest market trends and relevant news for a specified country and languag

In [14]:
def generate_unique_id():
    """Generate a unique ID using UUID4"""
    return str(uuid.uuid4())

In [15]:
def transform_dataset():
    # Load the dataset
    dataset = load_dataset("Team-ACE/ToolACE")
    
    transformed_dataset = {}
    for split in dataset.keys():
        # First create list of all examples with their new structure
        new_examples: Dict[str, Any] = {
            'id': [],
            'conversations': []
        }
        
        # Transform each example
        for idx, example in enumerate(dataset[split]):
            # Generate unique ID
            unique_id = str(uuid.uuid4())
            
            # Create new conversations list with system message first
            new_conversations = [
                {
                    'from': 'system',
                    'value': example['system']
                }
            ] + example['conversations']
            
            # Add to our lists
            new_examples['id'].append(unique_id)
            new_examples['conversations'].append(new_conversations)
        
        # Create new dataset with only id and conversations columns
        transformed_dataset[split] = Dataset.from_dict(new_examples)
    
    # Preview first example
    print("\nFirst example in transformed dataset:")
    print(json.dumps({
        'id': transformed_dataset['train'][0]['id'],
        'conversations': transformed_dataset['train'][0]['conversations']
    }, indent=2))
    
    # Save transformed dataset
    transformed_dataset['train'].save_to_disk("transformed_toolace")
    
    return transformed_dataset

In [16]:
transformed = transform_dataset()

# Verification steps
train_data = transformed['train']

print("\nVerification:")
print(f"Number of examples: {len(train_data)}")
print(f"Dataset columns: {train_data.column_names}")  # Should show ['id', 'conversations']


First example in transformed dataset:
{
  "id": "565c7d3b-0f48-4097-bcc2-0602897fc23e",
  "conversations": [
    {
      "from": "system",
      "value": "You are an expert in composing functions. You are given a question and a set of possible functions. \nBased on the question, you will need to make one or more function/tool calls to achieve the purpose. \nIf none of the function can be used, point it out. If the given question lacks the parameters required by the function,\nalso point it out. You should only return the function call in tools call sections.\nHere is a list of functions in JSON format that you can invoke:\n[{\"name\": \"newAddress\", \"description\": \"Generates a new Ethereum address that can be used to send or receive funds. Do not lose the password! We can't restore access to an address if you lose it.\", \"parameters\": {\"type\": \"dict\", \"properties\": {\"password\": {\"description\": \"The password for the new Ethereum address\", \"type\": \"string\"}}, \"req

Saving the dataset (0/1 shards):   0%|          | 0/11300 [00:00<?, ? examples/s]


Verification:
Number of examples: 11300
Dataset columns: ['id', 'conversations']


In [10]:
first_example = train_data[0]
print("\nStructure verification:")
print(f"Has unique ID: {bool(first_example.get('id'))}")
print(f"ID format: {first_example['id']}")
print(f"Number of conversations: {len(first_example['conversations'])}")
print(f"First message is system: {first_example['conversations'][0]['from'] == 'system'}")


Structure verification:
Has unique ID: True
ID format: 643bb58e-cb21-49e9-b9b1-dc026df51028
Number of conversations: 11
First message is system: True


In [11]:
# Verify ID uniqueness
print("\nVerifying ID uniqueness...")
ids = set(train_data['id'])
print(f"Number of unique IDs: {len(ids)}")
print(f"Total number of examples: {len(train_data)}")
print(f"All IDs unique: {len(ids) == len(train_data)}")


Verifying ID uniqueness...
Number of unique IDs: 11300
Total number of examples: 11300
All IDs unique: True


## Real tf

In [17]:
dataset = load_dataset("Team-ACE/ToolACE")

# Transform data
new_data = {
    'id': [],
    'conversations': []
}

# Process each example
for example in dataset['train']:
    # Add system message to conversations and create new structure
    new_data['id'].append(str(uuid.uuid4()))
    new_data['conversations'].append([
        {'from': 'system', 'value': example['system']}
    ] + example['conversations'])

# Create new dataset with just id and conversations
new_dataset = Dataset.from_dict(new_data)

# Save it
new_dataset.save_to_disk("transformed_toolace")


Saving the dataset (0/1 shards):   0%|          | 0/11300 [00:00<?, ? examples/s]

In [20]:
new_dataset[23]['conversations']

[{'from': 'system',
  'value': 'You are an expert in composing functions. You are given a question and a set of possible functions. \nBased on the question, you will need to make one or more function/tool calls to achieve the purpose. \nIf none of the function can be used, point it out. If the given question lacks the parameters required by the function,\nalso point it out. You should only return the function call in tools call sections.\nHere is a list of functions in JSON format that you can invoke:\n[{"name": "Top NFT Sales Today", "description": "Retrieve the top NFT sales for today", "parameters": {"type": "dict", "properties": {"limit": {"description": "The maximum number of top NFT sales to retrieve", "type": "int"}, "offset": {"description": "The starting point for the top NFT sales to retrieve", "type": "int"}, "marketplace": {"description": "The marketplace to filter NFT sales by (e.g., OpenSea, Rarible)", "type": "string"}}, "required": ["limit"]}, "required": null}, {"name"

In [2]:
import transformers
import torch
from datasets import load_from_disk
from tqdm import tqdm

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5"

In [4]:
pipeline = transformers.pipeline(
    "text-generation",
    model="meta-llama/Llama-3.3-70B-Instruct",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Device set to use cuda:0


In [24]:
SYSTEM_PROMPT = """
You are a high 170IQ reasoning super smart AI, your job is to enhance existing conversation examples. Remember return the entire conversation as is BUT

BUT We are add Chain of Thought and planning to "Assistant" messages whenever it returns a tool call. 

Remember ONLY When it does return a tool, we all add thinking and reasoning Traces before it to add logic otherwise we don't touch the conversation history

Remember to return the entire message but only enhance the assitant messages whenever it calls a tool with thoghts

Please keep in mind we are not modifying anything in the example neither are we changing what it does, only add CoT everytime a tool gets called in the conversation

Think out loud and max out your tokens when adding CoT
"""

In [25]:
new_dataset[0]

{'id': '59cc174e-0b81-4ffc-8108-bd0a9350f0f2',
 'conversations': [{'from': 'system',
   'value': 'You are an expert in composing functions. You are given a question and a set of possible functions. \nBased on the question, you will need to make one or more function/tool calls to achieve the purpose. \nIf none of the function can be used, point it out. If the given question lacks the parameters required by the function,\nalso point it out. You should only return the function call in tools call sections.\nHere is a list of functions in JSON format that you can invoke:\n[{"name": "newAddress", "description": "Generates a new Ethereum address that can be used to send or receive funds. Do not lose the password! We can\'t restore access to an address if you lose it.", "parameters": {"type": "dict", "properties": {"password": {"description": "The password for the new Ethereum address", "type": "string"}}, "required": ["password"]}, "required": null}, {"name": "Market Trends API", "description

In [31]:
new_dataset[45]['conversations']

[{'from': 'system',
  'value': 'You are an expert in composing functions. You are given a question and a set of possible functions. \nBased on the question, you will need to make one or more function/tool calls to achieve the purpose. \nIf none of the function can be used, point it out. If the given question lacks the parameters required by the function,\nalso point it out. You should only return the function call in tools call sections.\nHere is a list of functions in JSON format that you can invoke:\n[{"name": "Stalled Pattern Indicator", "description": "Retrieve stalled pattern indicators for a specific cryptocurrency symbol.", "parameters": {"type": "dict", "properties": {"interval": {"description": "Time interval for the data (e.g., 1m, 5m, 15m, etc.)", "type": "string"}, "exchange": {"description": "Exchange to retrieve data from (e.g., binance, kraken, etc.)", "type": "string"}, "symbol": {"description": "Cryptocurrency symbol to retrieve data for (e.g., btcusdt, ethusdt, etc.)"

In [27]:
transformed_examples = []

# Iterate through dataset
for example in tqdm(new_dataset[23]['conversations']):
    messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": f"{example}"},]
    outputs = pipeline(
    messages,
    max_new_tokens=128000)
    
    # Get response using your method
    response = outputs[0]["generated_text"][-1]['content']
    print(response)
    #

  0%|                                                                                                           | 0/9 [00:07<?, ?it/s]


KeyboardInterrupt: 

In [32]:
transformed_examples = []

# Iterate through dataset
#for example in tqdm(new_dataset[23]['conversations']):
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": f"{new_dataset[45]['conversations']}"},]
outputs = pipeline(
messages,
max_new_tokens=128000)

# Get response using your method
response = outputs[0]["generated_text"][-1]['content']
print(response)
    #

[{'from':'system', 'value': 'You are an expert in composing functions. You are given a question and a set of possible functions. \nBased on the question, you will need to make one or more function/tool calls to achieve the purpose. \nIf none of the function can be used, point it out. If the given question lacks the parameters required by the function,\nalso point it out. You should only return the function call in tools call sections.\nHere is a list of functions in JSON format that you can invoke:\n[{"name": "Stalled Pattern Indicator", "description": "Retrieve stalled pattern indicators for a specific cryptocurrency symbol.", "parameters": {"type": "dict", "properties": {"interval": {"description": "Time interval for the data (e.g., 1m, 5m, 15m, etc.)", "type": "string"}, "exchange": {"description": "Exchange to retrieve data from (e.g., binance, kraken, etc.)", "type": "string"}, "symbol": {"description": "Cryptocurrency symbol to retrieve data for (e.g., btcusdt, ethusdt, etc.)", "

In [34]:
new_dataset

Dataset({
    features: ['id', 'conversations'],
    num_rows: 11300
})

In [28]:
from datasets import load_from_disk

In [29]:
d = load_from_disk("transformed_toolace")

In [33]:
d.select(range(5000,7000))

Dataset({
    features: ['id', 'conversations'],
    num_rows: 2000
})

In [27]:
d[5]

{'id': '8dff4c65-d3b6-4a54-ba0d-a63d689b60ef',
 'conversations': [{'from': 'system',
   'value': 'You are an expert in composing functions. You are given a question and a set of possible functions. \nBased on the question, you will need to make one or more function/tool calls to achieve the purpose. \nIf none of the function can be used, point it out. If the given question lacks the parameters required by the function,\nalso point it out. You should only return the function call in tools call sections.\nHere is a list of functions in JSON format that you can invoke:\n[{"name": "Trending Videos", "description": "Retrieves a list of trending videos from YouTube, filtered by locale, country, and type.", "parameters": {"type": "dict", "properties": {"hl": {"description": "Locale/language for the request", "type": "string", "default": "en"}, "gl": {"description": "Country to get trending videos from", "type": "string", "default": "US"}, "type": {"description": "Type of trending videos", "ty