In [None]:
# Libraries
import pandas as pd
import random
import json
from datasets import load_dataset


## Models

### Flan-t5-base

In [None]:
# Load the google/flan-t5-base tokenizer and seq2seq model via `from_pretrained`.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# NOTE: `tokenizer` is a notebook-global name that the Llama and MetaMath cells
# below rebind; once either of them has run, `tokenizer` no longer matches `t5_model`.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
t5_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

### Llama

In [None]:
# Load the meta-llama/Llama-2-7b-chat-hf tokenizer and causal-LM model via `from_pretrained`.
from transformers import AutoTokenizer, AutoModelForCausalLM

# NOTE: this rebinds the notebook-global `tokenizer` and binds `model`. The
# MetaMath cell below assigns the same two names, so whichever of the two cells
# ran last decides which model the later cells use.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

### MetaMath

In [None]:
# Load the meta-math/MetaMath-Llemma-7B tokenizer and causal-LM model via `from_pretrained`.
from transformers import AutoTokenizer, AutoModelForCausalLM

# NOTE: this rebinds the notebook-global `tokenizer` and `model`, replacing
# whatever the Llama cell above assigned to those names.
tokenizer = AutoTokenizer.from_pretrained("meta-math/MetaMath-Llemma-7B")
model = AutoModelForCausalLM.from_pretrained("meta-math/MetaMath-Llemma-7B")

## Connecting to dataset

In [None]:
# Load the "main" configuration of the gsm8k dataset; the sampling helper below
# reads its "test" split.
dataset = load_dataset("gsm8k", "main")
# Show the test example at index 5 as the cell output (quick look at one record).
dataset["test"][5]


## Functions

Function to get a random question and its answer from the test split

In [None]:
def get_random_test_object(dataset):
    """Pick one random example from the ``"test"`` split and return its text fields.

    Args:
        dataset: Mapping-like object that may contain a ``"test"`` split. The
            split must support ``len()`` and integer indexing, and each example
            must provide ``.get()``.

    Returns:
        A ``(question, answer)`` tuple. A key missing from the chosen example is
        replaced by ``"No question found"`` / ``"No answer found"``. When there
        is no ``"test"`` split, or the split is empty, ``(None, None)`` is
        returned instead.
    """
    # Guard clause: nothing to sample from.
    if "test" not in dataset or len(dataset["test"]) == 0:
        return None, None

    test_split = dataset["test"]
    last_position = len(test_split) - 1
    # randint is inclusive on both ends, hence the "- 1" above.
    chosen = test_split[random.randint(0, last_position)]

    return (
        chosen.get("question", "No question found"),
        chosen.get("answer", "No answer found"),
    )

Functions to get a random mutation prompt from the dataset files
- The CSV file data is not consistent, so the mutation prompt is read from the txt file instead

In [None]:
# Read mutation prompts from the CSV file
def get_random_mutation(csv_file_path):
    """Return one randomly chosen entry from column 1 of a '.'-delimited file.

    The file is parsed without a header row, using ``'.'`` as the field
    delimiter. UTF-8 is tried first; if decoding fails, the file is re-read as
    ISO-8859-1.

    Args:
        csv_file_path: Path handed to ``pandas.read_csv``.

    Returns:
        A random element of the second parsed column (column label ``1``).
    """
    parse_options = {"header": None, "delimiter": "."}

    try:
        frame = pd.read_csv(csv_file_path, encoding='utf-8', **parse_options)
    except UnicodeDecodeError:
        # Not valid UTF-8: retry with the ISO-8859-1 encoding.
        frame = pd.read_csv(csv_file_path, encoding='ISO-8859-1', **parse_options)

    candidates = frame[1].tolist()
    return random.choice(candidates)


In [None]:
# Read mutation prompts from the txt file
def get_random_mutation_txt(txt_file_path):
    """Return one random non-blank line from a UTF-8 text file.

    Each line is stripped of surrounding whitespace; lines that are empty after
    stripping are ignored.

    Args:
        txt_file_path: Path of the text file holding one prompt per line.

    Returns:
        A randomly chosen stripped line, or the string
        ``"No mutation prompts found."`` when the file has no non-blank lines.
    """
    with open(txt_file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        prompts = [text for text in stripped_lines if text]

    if not prompts:
        return "No mutation prompts found."
    return random.choice(prompts)

Function to generate instructions, which takes the following parameters:
- question: Obtained from get_random_test_object()
- task_description: Description of the task we want the LLM to do
- max_tokens: Maximum number of new tokens to generate; adjustable because different models have different limits
- model: The model to generate with, so that different models can be tested

In [None]:
def generate_intructions(question, task_description, max_tokens, model):
    """Generate text from ``model`` for a task description followed by a question.

    The prompt is ``task_description``, one space, then ``question``. Encoding
    and decoding use the notebook-global ``tokenizer`` (it is not a parameter),
    so that global must match the ``model`` passed in.

    Args:
        question: Question text appended after the task description.
        task_description: Text placed at the start of the prompt.
        max_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.
        model: Object exposing ``generate(input_ids, max_new_tokens=...)``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    prompt = f"{task_description} {question}"
    encoded = tokenizer(prompt, return_tensors="pt")
    sequences = model.generate(encoded.input_ids, max_new_tokens=max_tokens)
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

In [None]:
def generate_instructions_MM(question, model, tokenizer, max_tokens=500):
    """Generate a response to ``question`` using an instruction/response template.

    The question is inserted into an "### Instruction / ### Response" prompt
    whose response section starts with "Let's think step by step.".

    Args:
        question: Text substituted into the instruction slot of the template.
        model: Object exposing ``generate(input_ids, max_new_tokens=...)``.
        tokenizer: Object exposing ``encode(text, return_tensors=...)`` and
            ``decode(ids, skip_special_tokens=...)``.
        max_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # The three template parts, concatenated before the question is filled in.
    preamble = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
    )
    instruction_section = "### Instruction:\n{instruction}\n\n"
    response_cue = "### Response: Let's think step by step."

    prompt = (preamble + instruction_section + response_cue).format(instruction=question)

    encoded_prompt = tokenizer.encode(prompt, return_tensors="pt")
    sequences = model.generate(encoded_prompt, max_new_tokens=max_tokens)

    return tokenizer.decode(sequences[0], skip_special_tokens=True)

Function to process a prompt via the LLM
- Takes a prompt and a model

In [None]:
def process_with_llm(prompt, model):
    """Run ``prompt`` through ``model`` and return the decoded output text.

    Encoding and decoding use the notebook-global ``tokenizer`` (it is not a
    parameter), so that global must match the ``model`` passed in.

    Args:
        prompt: Text to encode and feed to the model.
        model: Object exposing ``generate(input_ids, max_length=...)``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded_prompt = tokenizer.encode(prompt, return_tensors='pt')
    # Generation is called with max_length=1000.
    sequences = model.generate(encoded_prompt, max_length=1000)
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

## Testing

In [None]:
# Write the task description here. It is meant for the `task_description`
# parameter of `generate_intructions`.
# NOTE(review): none of the cells visible below pass this variable to anything.
task_description = """Generate an instruction on how to solve the problem, based on the given question """

In [None]:
# Draw one random (question, answer) pair from the "test" split of `dataset`.
question, answer = get_random_test_object(dataset)

In [None]:
# Show the sampled question text.
print(question)

In [None]:
# Show the answer stored with the sampled question.
print(answer)

In [None]:
# Number of instructions to generate for the sampled question.
num_instructions = 1
generated_instructions = []

# Each pass prompts the currently loaded `model` with the same sampled
# `question`, through the instruction/response template helper.
for _ in range(num_instructions):
    instruction = generate_instructions_MM(question, model, tokenizer)
    generated_instructions.append(instruction)

In [None]:
# Show the list of generated instructions.
print(generated_instructions)

In [None]:
# Draw one random mutation prompt from the text file (the helper returns a
# placeholder message if the file has no non-blank lines).
# The path is relative to the notebook's working directory.
mutation_prompt = get_random_mutation_txt("./Prompt-Engineering-OpenDI/mutation_prompts.txt")
# Show which mutation prompt was drawn.
print(mutation_prompt)

In [None]:
def apply_mutation(instruction, mutation_prompt):
    """Prefix ``instruction`` with ``mutation_prompt``, separated by one space.

    Args:
        instruction: Text to be mutated.
        mutation_prompt: Text placed in front of the instruction.

    Returns:
        The string ``"<mutation_prompt> <instruction>"``.
    """
    # Simple prefixing is only an example strategy; customise the mutation logic here.
    return "{} {}".format(mutation_prompt, instruction)

# Apply the same mutation prompt to every generated instruction.
mutated_instructions = [apply_mutation(instruction, mutation_prompt) for instruction in generated_instructions]


In [None]:
# Show the list of mutated instructions.
print(mutated_instructions)

In [None]:
# Run each mutated instruction through the currently loaded `model`
# (`process_with_llm` takes the model as its required second argument).
processed_outputs = [process_with_llm(mutated_instruction, model) for mutated_instruction in mutated_instructions]

In [None]:
# Print each model response in turn.
for response in processed_outputs:
    print(response)

In [None]:
# Ad-hoc check: feed one word problem to the loaded model as-is, without the
# prompt template used by the helper functions.
input_text = "Kylar went to the store to buy glasses for his new apartment. One glass costs $5, but every second glass costs only 60% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Specify the max_new_tokens parameter
outputs = model.generate(input_ids, max_new_tokens=50)  # Adjust the number as needed
# decode() is called without skip_special_tokens here, unlike in the helper functions.
print(tokenizer.decode(outputs[0]))


In [None]:
# Follow-up prompt: only this text is tokenized and passed to generate(); nothing
# from the previous cell's prompt or output is included in `input_ids`.
input_text = "What was the answer for the previus question I asked?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Specify the max_new_tokens parameter
outputs = model.generate(input_ids, max_new_tokens=50)  # Adjust the number as needed
# decode() is called without skip_special_tokens here, unlike in the helper functions.
print(tokenizer.decode(outputs[0]))