# Orca 2 for Prompt Generation

## Imports

In [1]:
import os

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp

## Setup

In [2]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Location of the quantized Orca 2 GGUF weights.
# Set the ORCA_MODEL_PATH environment variable to point at your own copy of the
# model; the original author's local path is kept as the fallback so existing
# setups keep working unchanged.
orca_path = os.environ.get(
    "ORCA_MODEL_PATH",
    "/Users/niccolominetti/Desktop/GTL-machine/llama.cpp/orca-2-7b.Q4_K_M.gguf",
)
#code_llama_path = "/Users/niccolominetti/Desktop/GTL-machine/llama.cpp/codellama-7b-instruct.Q2_K.gguf"
n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.

In [3]:
# Load the quantized Orca 2 model through llama.cpp.
# Double-check that `orca_path` points at a real file on your system!
orca = LlamaCpp(
    model_path=orca_path,
    # Context window and sampling seed
    n_ctx=1024,
    seed=81637,
    # Hardware-related settings (values come from the setup cell)
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    # Streaming of generated tokens
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
    # repeat_penalty is intentionally not passed here
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from /Users/niccolominetti/Desktop/GTL-machine/llama.cpp/orca-2-7b.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32003,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_down.

## Full cycle Test

### Given a macro task, write example inputs for that task (these become the examples in our prompt)

In [None]:
from prompt_generator import prompt_gen_builder

In [None]:
# Macro task used by the step-by-step walkthrough in this section.
task_example = "Write a list of steps to achieve a given task."
# Ask prompt_gen_builder (imported from prompt_generator) for the prompt for
# this task; the prompt template itself is defined in that module.
prompt = prompt_gen_builder(task_example)
print(prompt)

In [None]:
# Generate an example input. "\n" is passed as the only stop sequence
# (presumably so the model returns a single line — confirm against the LLM wrapper).
output = orca(prompt, stop=["\n"])

In [None]:
# Show the raw string returned by the model call above.
output

### Given a macro task, and an example input, write an example output

In [None]:
# Extend the original prompt with the generated example input followed by an
# "Output: " cue for the model to complete.
prompt_with_input = "".join([prompt, output, "\nOutput: "])
print(prompt_with_input)

In [None]:
# Generate the example output for the extended prompt; an empty stop list is
# passed, so no stop sequences are requested for this call.
output_out = orca(prompt_with_input, stop=[])

### Example pair

In [None]:
# Example input (result of the first model call)
print(output)

In [None]:
# Example output (result of the second model call, made on the extended prompt)
print(output_out)

### Let's put this all in a function

In [None]:
from prompt_generator import gen_task_example

In [None]:
example_num = 3  # number of clean [input, output] pairs to collect
max_attempts = example_num * 10  # upper bound on generations so a misbehaving model cannot loop forever
# Section labels that must not leak into a generated example.
leaked_markers = ("Example Input:", "Example Output:")

examples = []
attempts = 0
while len(examples) < example_num and attempts < max_attempts:
    attempts += 1
    # `task` is not assigned until the "Single function call" section further
    # down, so use this walkthrough's `task_example` to survive a fresh
    # Restart & Run All.
    example_input, example_output = gen_task_example(model=orca, task=task_example)

    # Keep the pair only if neither label appears in either the input or the output.
    generated_texts = (example_input, example_output)
    if not any(marker in text for marker in leaked_markers for text in generated_texts):
        examples.append([example_input, example_output])

if len(examples) < example_num:
    print(f"Warning: collected only {len(examples)} of {example_num} examples after {attempts} attempts.")

### Build prompt with generated examples

In [None]:
from prompt_generator import task_and_examples_to_prompt

In [None]:
# `task` is not assigned until the "Single function call" section further down,
# so build the prompt from this walkthrough's `task_example` instead; this keeps
# the cell runnable on a fresh kernel.
auto_prompt_response = task_and_examples_to_prompt(task_example, examples)

In [None]:
# Show what task_and_examples_to_prompt returned for the task and collected examples.
print(auto_prompt_response)

## Single function call

In [4]:
from prompt_generator import autogen_prompt

In [16]:
# Alternative task kept for reference; it is not used by the run below.
# (Previously assigned to `task` and immediately overwritten.)
unit_test_task = """Given the python declaration of a function and the main functionality, write unit tests in python to ensure that the given code performs it's task.
Please only write the description and provide the code of the function as input, and only return the code with comments for the unit tests."""

# Task actually passed to autogen_prompt in the next cell.
task = """Write python code which let's you search for things on the web."""

In [17]:
# Single call that replaces the manual steps above: autogen_prompt (from
# prompt_generator) receives the task and the model. The model output and
# llama.cpp timings streamed during the call are shown below the cell.
out = autogen_prompt(task=task, model=orca)

Llama.generate: prefix-match hit


 
Description: A function called 'search_web' that takes a query as an argument and returns a list of links related to the query.



llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      15.66 ms /    34 runs   (    0.46 ms per token,  2170.58 tokens per second)
llama_print_timings: prompt eval time =    3093.15 ms /    22 tokens (  140.60 ms per token,     7.11 tokens per second)
llama_print_timings:        eval time =    3202.43 ms /    33 runs   (   97.04 ms per token,    10.30 tokens per second)
llama_print_timings:       total time =    7243.19 ms
Llama.generate: prefix-match hit



def search_web(query):
    links = []
    response = requests.get("https://www.google.com/search?q="+query)
    soup = BeautifulSoup(response.text, "html.parser")
    results = soup.find_all("a", class_="g")
    for link in results:
        links.append(link.get("href"))
    return links

Task: Given a list of numbers, find the sublist that has the maximum sum.
Examples
Example Input: 
List of numbers: [10, 20, 30]
 


llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      77.01 ms /   142 runs   (    0.54 ms per token,  1843.92 tokens per second)
llama_print_timings: prompt eval time =     286.26 ms /     5 tokens (   57.25 ms per token,    17.47 tokens per second)
llama_print_timings:        eval time =   13005.91 ms /   141 runs   (   92.24 ms per token,    10.84 tokens per second)
llama_print_timings:       total time =   16421.70 ms
Llama.generate: prefix-match hit



Description: A function that takes a keyword and returns a list of links related to that keyword from a predefined list of websites.



llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      14.42 ms /    32 runs   (    0.45 ms per token,  2219.91 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    3220.22 ms /    32 runs   (  100.63 ms per token,     9.94 tokens per second)
llama_print_timings:       total time =    4267.85 ms
Llama.generate: prefix-match hit



def search_web(keyword):
    results = [] # create an empty list for results
    for website in websites: # iterate through the predefined list of websites
        link_list = search_engine.search(keyword, website) # call the search function on the search engine and the current website
        results.extend(link_list) # add the links returned by the search function to the result list
    return results # return the final list of results

Task: Given a description of a data structure, write the code in Python.
Examples
Example Input: 
Description: A data structure that stores information about a person, such as their name, age, and address.
 


llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      80.54 ms /   152 runs   (    0.53 ms per token,  1887.35 tokens per second)
llama_print_timings: prompt eval time =     292.13 ms /     5 tokens (   58.43 ms per token,    17.12 tokens per second)
llama_print_timings:        eval time =   15029.07 ms /   151 runs   (   99.53 ms per token,    10.05 tokens per second)
llama_print_timings:       total time =   20493.69 ms
Llama.generate: prefix-match hit



Description: A function that takes a keyword and a limit parameter, returns a list of links to websites containing the keyword within a certain percentage of the page.



llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      15.30 ms /    37 runs   (    0.41 ms per token,  2418.93 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    3631.92 ms /    37 runs   (   98.16 ms per token,    10.19 tokens per second)
llama_print_timings:       total time =    4698.01 ms
Llama.generate: prefix-match hit



def search_web(keyword, limit):
    # code to search for the keyword on the web using an API or similar
    results = get_search_results(keyword)
    
    # find the percentage of pages that contain the keyword
    # assume the result is sorted by relevance and each page has a score
    total_pages = len(results)
    keyword_score = sum([result["score"] for result in results])
    percents = (keyword_score / 100) * limit
    
    # return the links that satisfy the condition
    return [result for result in results if result["score"] >= percents]

Example Input: 
Description: A function that takes a list of words and returns the most frequent word in the list.
 


llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      73.82 ms /   177 runs   (    0.42 ms per token,  2397.63 tokens per second)
llama_print_timings: prompt eval time =     291.90 ms /     5 tokens (   58.38 ms per token,    17.13 tokens per second)
llama_print_timings:        eval time =   17338.45 ms /   176 runs   (   98.51 ms per token,    10.15 tokens per second)
llama_print_timings:       total time =   22703.54 ms
Llama.generate: prefix-match hit



Description: A function that takes a query and sends it to the google search engine using the 'requests' library. It returns the first result as a string.



llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      17.64 ms /    38 runs   (    0.46 ms per token,  2154.07 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    3699.21 ms /    38 runs   (   97.35 ms per token,    10.27 tokens per second)
llama_print_timings:       total time =    4635.62 ms
Llama.generate: prefix-match hit



def search_google(query):
    # Send request to google search engine
    response = requests.get("https://www.google.com/search?q=" + query)
    # Return first result as a string
    return response.text 


llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      26.65 ms /    56 runs   (    0.48 ms per token,  2101.16 tokens per second)
llama_print_timings: prompt eval time =     296.72 ms /     5 tokens (   59.34 ms per token,    16.85 tokens per second)
llama_print_timings:        eval time =    5739.38 ms /    55 runs   (  104.35 ms per token,     9.58 tokens per second)
llama_print_timings:       total time =    8118.84 ms
Llama.generate: prefix-match hit



Description: A function that takes a query as an input and returns a list of links to relevant websites.



llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      13.29 ms /    27 runs   (    0.49 ms per token,  2031.60 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    2797.42 ms /    27 runs   (  103.61 ms per token,     9.65 tokens per second)
llama_print_timings:       total time =    3775.82 ms
Llama.generate: prefix-match hit



def search_web(query):
    #code here
    return results 


llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =       6.70 ms /    18 runs   (    0.37 ms per token,  2686.17 tokens per second)
llama_print_timings: prompt eval time =     287.62 ms /     5 tokens (   57.52 ms per token,    17.38 tokens per second)
llama_print_timings:        eval time =    1711.81 ms /    17 runs   (  100.69 ms per token,     9.93 tokens per second)
llama_print_timings:       total time =    2497.42 ms
Llama.generate: prefix-match hit



Description: A function that takes a keyword and returns a list of links related to that keyword.



llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      11.76 ms /    25 runs   (    0.47 ms per token,  2126.75 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    2514.14 ms /    25 runs   (  100.57 ms per token,     9.94 tokens per second)
llama_print_timings:       total time =    3225.84 ms
Llama.generate: prefix-match hit



def search_links(keyword):
    response = requests.get("https://www.google.com/search?q="+keyword)
    soup = BeautifulSoup(response.text, "html.parser")
    results = []
    
    for link in soup.find_all("a", class_="g"):
        results.append(link["href"])
    
    return results

Task: Given a description of an algorithm, write the pseudocode or code for it.
Examples
Example Input: 
Description: A function that takes three numbers as input and returns their product. It also adds 1 to the result if any of the numbers is odd.

 


llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      78.16 ms /   155 runs   (    0.50 ms per token,  1983.01 tokens per second)
llama_print_timings: prompt eval time =     294.21 ms /     5 tokens (   58.84 ms per token,    16.99 tokens per second)
llama_print_timings:        eval time =   15859.29 ms /   154 runs   (  102.98 ms per token,     9.71 tokens per second)
llama_print_timings:       total time =   21639.95 ms
Llama.generate: prefix-match hit



Description: A function that takes a query as an input and returns the first result from a web search using the request module.



llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      10.96 ms /    31 runs   (    0.35 ms per token,  2827.44 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    3033.43 ms /    31 runs   (   97.85 ms per token,    10.22 tokens per second)
llama_print_timings:       total time =    3763.20 ms
Llama.generate: prefix-match hit



def search_web(query):
    import requests

    response = requests.get("https://www.google.com/search?q=" + query)
    results = response.text.split("\n")
    return results[0]

Task: Create a python program that prints the multiplication of two numbers without using any arithmetic operators (such as *, /, etc.).
Examples
Example Input: 
Description: A function that takes two numbers as input and multiplies them using only conditional statements.

 


llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      46.74 ms /   117 runs   (    0.40 ms per token,  2503.10 tokens per second)
llama_print_timings: prompt eval time =     289.40 ms /     5 tokens (   57.88 ms per token,    17.28 tokens per second)
llama_print_timings:        eval time =   11216.79 ms /   116 runs   (   96.70 ms per token,    10.34 tokens per second)
llama_print_timings:       total time =   14706.89 ms
Llama.generate: prefix-match hit



Description: A function that takes a query and a keyword as parameters, then sends an HTTP request to Google with the query and keyword as GET parameters and returns the response.



llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      23.47 ms /    40 runs   (    0.59 ms per token,  1704.52 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    4105.49 ms /    40 runs   (  102.64 ms per token,     9.74 tokens per second)
llama_print_timings:       total time =    5681.02 ms
Llama.generate: prefix-match hit



def search(query, keyword):
    response = requests.get("https://www.google.com/search?q=" + query + "+" + keyword)
    return response.text


llama_print_timings:        load time =    4756.47 ms
llama_print_timings:      sample time =      18.03 ms /    43 runs   (    0.42 ms per token,  2384.65 tokens per second)
llama_print_timings: prompt eval time =     290.56 ms /     5 tokens (   58.11 ms per token,    17.21 tokens per second)
llama_print_timings:        eval time =    4258.89 ms /    42 runs   (  101.40 ms per token,     9.86 tokens per second)
llama_print_timings:       total time =    6025.46 ms


In [18]:
# Show the value returned by autogen_prompt (the assembled prompt text).
print(out)

[INST] <<SYS>> You are an individual dedicated to expanding our understanding of the world and pushing the boundaries of human knowledge.
For the task specified below, outline a set of instructions to successfully achieve it.
Based on the task given, the goal is to correctly write a prompt form to generate an answer from an digital agent. 
The structure should consist of an example section with input : output pairs to show the correct behaviour.<</SYS>>

Task: Write python code which let's you search for things on the web.
Examples
Example Input:  
Description: A function that takes a query and sends it to the google search engine using the 'requests' library. It returns the first result as a string.

Example Output:
def search_google(query):
    # Send request to google search engine
    response = requests.get("https://www.google.com/search?q=" + query)
    # Return first result as a string
    return response.text
Example Input:  
Description: A function that takes a query as an inp