In [None]:
from query_generation import SyntheticQueryGenerationPrompt, QueryTemplateList, QueryTemplateListExtractor, TemplateVariableListGeneratorAndExtractor
import os
import json
import re
from itertools import product
from query_answering import QueryAnswerPrompt
import random



In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
# Place your keys here:
# In this example, we leverage Mirascope's OpenAICall functionality with two different providers, OpenAI and Groq.
# We start with the OpenAI key; the same environment variable is overwritten with the Groq key
# later in the notebook, before the responses are generated.
os.environ["OPENAI_API_KEY"] = "<API-Key>"


In this notebook, we're creating a synthetic dataset of user queries and responses to train an LLM. The topic will be about matplotlib data visualization.

Instead of generating a ton of examples with an LLM directly (and running up a huge bill), we will use a neat trick + the power of structured outputs with Mirascope.

We will create a set of templated questions, where each template contains multiple different template variables, or values that are interchangeable with each other. We'll then generate those potential values as well.

For each question, there will be combinations of template variables to sub in to the queries, generating a very large dataset off of minimal initial tokens. 

We will start off with the templated query generation below.

In [None]:
# Inputs for the template-generation prompt. `library_focus` is reused by later cells.
library_focus = 'matplotlib'
task_type = "data visualization"
generation_strategy = "Query Generation with Task Templates"
generation_strategy_description = "Generate a set of task templates that outline the structure of data visualization queries. These templates need to have a diverse set of templated variables (Denoted like [XXXX]). Do not fill them in, and create more types of templated variables beyond the example(s) given. The more unique templated variables, the better"

# Example templates shown to the model; bracketed names are the template variables.
examples = [
    "Create a [chart_type] using matplotlib to visualize [data_description] from [data_source] with [customization_options].",
    "Plot multiple [chart_type] in a single figure using matplotlib to compare [data_description] across different [category]",
]

# Assemble the prompt from the settings above and run it once.
prompt = SyntheticQueryGenerationPrompt(
    library_focus=library_focus,
    task_type=task_type,
    generation_strategy=generation_strategy,
    generation_strategy_description=generation_strategy_description,
    examples=examples,
    num_results=5,
)
query_generation_response = prompt.call()


Here we should see a series of queries, with specific parts of them in brackets. These are our template variables.

In [None]:
# Show the raw text content of the template-generation response.
print(query_generation_response.content)

We will turn this raw text into a list of queries, and extract the set of templated variables from them.
We could theoretically do this all in one call, but our focus is on quality of those examples, so we will generate the list first, then extract the list of queries + variables in a separate call.

In [None]:
# Run the extractor over the raw response text to get the queries and their template variables.
query_list_extractor = QueryTemplateListExtractor(
    unextracted_query_list=query_generation_response.content
)
query_list = query_list_extractor.extract()
assert isinstance(query_list, QueryTemplateList)

print(query_list.templated_variables)
print(query_list.queries)

template_query_filename = "template_query.jsonl"

# Save the templated queries as JSON Lines: one {"template_query": ...} object per line.
with open(template_query_filename, 'w') as file:
    for template_query in query_list.queries:
        file.write(json.dumps({"template_query": template_query}) + '\n')


Once we have the set of template variables it's time to generate the potential variables to fill them. 

Using Mirascope's Extraction functionality, we can send the model a variable and force its output to be serializable into a Pydantic model containing a list of potential values for that given template variable.

For more information on how we do this, check out `TemplateVariableListGeneratorAndExtractor` within `query_generation.py`.

These calls do sometimes fail, so we track which ones fail and retry them once if they do. This is simple enough to work for our small set of template variables.

In [None]:
# examples_dict maps each template variable to its list of candidate values.
# failed_list collects the variables whose first extraction attempt raised AttributeError.
examples_dict = {}
failed_list = []

# First pass: one extraction call per template variable.
for template_var in query_list.templated_variables:
    print(f"querying a list for: {template_var}")
    try:
        example_list_model = TemplateVariableListGeneratorAndExtractor(
            template_variable=template_var,
            library_focus=library_focus,
        ).extract()
    except AttributeError:
        # Remember the variable and move on; it gets one more attempt below.
        print("Attribute Error, retrying later")
        failed_list.append(template_var)
        continue

    print(example_list_model.options)
    examples_dict[template_var] = example_list_model.options

print(examples_dict)

# Second pass: a single retry per failed variable. Errors here are not caught.
for template_var in failed_list:
    print(f"re-querying a list for: {template_var}")
    example_list_model = TemplateVariableListGeneratorAndExtractor(
        template_variable=template_var,
        library_focus=library_focus,
    ).extract()
    print(example_list_model.options)
    examples_dict[template_var] = example_list_model.options

print(examples_dict)

In [None]:
# Persist the candidate values for later use: one {variable: [values...]} JSON object per line.
template_variable_values_filename = "template_variable_values.jsonl"

with open(template_variable_values_filename, 'w') as file:
    for key, options in examples_dict.items():
        file.write(json.dumps({key: options}) + '\n')

Because our queries are templated, we can create multiple versions of each one for every possible combination of variables. Even with just our original 5 queries, there are multiple template variables in each, allowing our final set to get quite large (~30,000 samples).

Because we can recompute these queries from the original templated queries and their candidate variables, we won't need to save this overall list.

In [None]:
def generate_combinations(example_dict, queries):
    """Expand templated queries into every concrete variant.

    Each ``[name]`` placeholder in a query is replaced by every candidate
    value listed for ``name`` in ``example_dict``; the result holds one
    filled-in query per combination of values (the cartesian product across
    the query's distinct placeholders).

    Args:
        example_dict: Mapping from template variable name to a list of
            string values. Names are looked up after stripping surrounding
            whitespace from the placeholder text.
        queries: Iterable of query strings containing ``[name]`` placeholders.

    Returns:
        A list of filled-in query strings, grouped by input query in input
        order. A query without placeholders is returned once, unchanged.

    Raises:
        KeyError: If a placeholder name has no entry in ``example_dict``.
    """
    placeholder = re.compile(r'\[([^\]]+)\]')
    result = []
    for query in queries:
        # De-duplicate while keeping first-seen order: a placeholder that
        # appears twice must take the same value in both places, and must
        # not be counted twice in the product (that produced duplicates).
        variables = list(dict.fromkeys(placeholder.findall(query)))

        # Candidate values for each distinct placeholder.
        value_lists = [example_dict[var.strip()] for var in variables]

        for combination in product(*value_lists):
            substitutions = dict(zip(variables, combination))
            # One pass over the original query, so a substituted value that
            # itself contains "[...]" text is never rewritten afterwards.
            filled_query = placeholder.sub(
                lambda match, subs=substitutions: subs[match.group(1)],
                query,
            )
            result.append(filled_query)

    return result

In [None]:
# Expand every templated query with the generated values and report how many variants result.
combos = generate_combinations(example_dict=examples_dict, queries=query_list.queries)
print(len(combos))

In [None]:
# For cost reasons, we will use Groq's API to generate the responses. To follow along, paste your Groq API key here.
# This overwrites the OPENAI_API_KEY value set at the start of the notebook.
os.environ["OPENAI_API_KEY"] = "<Groq-Key>"

Because this dataset is so large now, generating the synthetic responses will be quite costly and time consuming. Additionally, there isn't a tremendous amount of variance between each one. So we can randomly sample a subset to generate responses.

We are using Mixtral via the Groq API. Its speed for consecutive requests lets us keep our code clean and running fairly fast. Our set of 2000 queries took roughly an hour and a half to run all the way through.

In [None]:
# Upper bound on how many filled-in queries get a generated response.
num_samples = 2000

# random.sample raises ValueError when asked for more items than the population holds.
# The number of combinations depends on what the model generated earlier, so cap the request.
random_samples = random.sample(combos, min(num_samples, len(combos)))

# Responses are collected in the same order as random_samples so they can be paired up afterwards.
answers = []
for sample_query in random_samples:
    print(f"Evaluating: {sample_query}")
    code_sample = QueryAnswerPrompt(library_focus=library_focus, query=sample_query).call()
    answers.append(code_sample)

print(answers)

After generating the responses, we will save them with their queries to a jsonl file for later.

In [None]:
query_response_filename = "queries_and_responses.jsonl"

# Write one {"query": ..., "response": ...} JSON object per line, pairing each sampled
# query with the response generated for it. The `with` block closes the file on exit,
# so no explicit close() call is needed.
with open(query_response_filename, 'w') as file:
    for query, response in zip(random_samples, answers):
        data = {'query': query, 'response': response.content}
        file.write(json.dumps(data) + '\n')