In [None]:
import ctypes

from llama_cpp import llama_log_set

def my_log_callback(level, message, user_data):
    """No-op log handler: accepts a log record and discards it.

    Args:
        level: Log level passed by the caller (ignored).
        message: Log message passed by the caller (ignored).
        user_data: Opaque user pointer passed by the caller (ignored).
    """
    return None

# C prototype of the handler: void (*)(int level, const char *message, void *user_data)
log_callback_prototype = ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)

# Keep the wrapped callback in a module-level name so it stays referenced while
# native code may still call it (ctypes does not keep callback objects alive itself).
log_callback = log_callback_prototype(my_log_callback)
llama_log_set(log_callback, ctypes.c_void_p())

In [None]:
from llama_cpp import Llama
# Path to the GGUF model file; later cells also inspect this string (e.g. for "chat")
llm_path = "./llama-2-7b-chat.Q5_K_M.gguf"

# Load the model once for the whole notebook, with verbose output disabled
llm = Llama(
    model_path=llm_path,
    verbose=False,
)

In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import tensorflow_datasets as tfds
# Prompt evaluation
# Emotion vocabulary; a label id from the dataset is used as an index into this list
emotions = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'neutral',
    'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
]

# Directory that receives every artifact written by this cell
base_path = "./data/prompt_search/"

#prompts = get_candidate_prompts(meta_prompt, n=10)

# Hand-written seed prompts; the last two restrict the answer to the known vocabulary
allowed_words = ", ".join(emotions)
prompts = [
    "What is the emotion of the following text?",
    "List the emotions of the following text",
    "Say what the emotions in the following text are",
    "What are the emotions of the following text?",
    "Which are the emotions present in the following text?",
    "Write a list of all the emotions in the following text",
    "Write all the emotions in the following text, include synonyms which are nouns and be repetitive",
    "Write all nouns describing the emotions in the following text",
    f"Write all the emotions present in the following text. You can only use the following words: {allowed_words}",
    f"Write all the emotions present in the following text. Do not write emotions that are not present. You can only use the following words: {allowed_words}",
]

# Running identifier given to each evaluated prompt
prompt_id = 0

# Column-oriented accumulator: one list per column, one entry appended per evaluated prompt
prompt_results = {
    column: []
    for column in (
        "prompt_id",
        "prompt",
        "accuracy",
        "recall",
        "precision",
        "f1_score",
        "resampling_iteration",
        "ds_training_samples",
    )
}

# Search budget
resampling_iterations = 10   # number of evaluate-then-resample rounds
prompts_per_resampling = 1   # new prompts generated after each round
training_samples = 100       # dataset samples each prompt is scored on
def _extract_emotions(answer, labels):
    """Return the entries of `labels` mentioned in `answer`, in `labels` order.

    Matching is a case-insensitive substring search. Longer labels are matched
    first and blanked out of the text, so a label that only occurs as part of a
    longer label ("approval" inside "disapproval") is not reported as its own hit.
    """
    remaining = answer.lower()
    found = set()
    for label in sorted(labels, key=len, reverse=True):
        if label in remaining:
            found.add(label)
            remaining = remaining.replace(label, " ")
    return [label for label in labels if label in found]


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
    return float(numerator / denominator) if denominator else 0.0


# The evaluation slice is the same for every prompt and every iteration: load it once
test_ds = tfds.load('huggingface:go_emotions/simplified', split=f'train[:{training_samples}]')
num_items, num_labels = len(test_ds), len(emotions)

for resampling_iteration in range(resampling_iterations):

    # Score every prompt of the current generation
    for prompt in prompts:
        print(f"Evaluating prompt \"{prompt}\" on {training_samples} training samples of the go emotions dataset")

        y_gt_list = []
        y_pred_list = []

        inference_examples = {
            "prompt_id": [],
            "prompt": [],
            "text": [],
            "output": [],
            "gt_emotions": [],
            "pred_emotions": [],
        }

        ds = test_ds.as_numpy_iterator()
        for j, x in enumerate(tqdm(ds, total=num_items)):
            x_txt = x["text"].decode('utf-8')

            # Compute llm output and predictions
            eval_prompt = "Q: " + prompt + " Text: " + x_txt + " A:"
            output = llm(eval_prompt, max_tokens=128, stop=["Q:", "\n"], echo=False)
            answer = output['choices'][0]['text']
            gt_emotions = [emotions[label_idx] for label_idx in x["labels"]]
            pred_emotions = _extract_emotions(answer, emotions)

            if j < 5:
                # Print LLM outputs and predictions
                print("Prompt:\t", eval_prompt)
                print("Answer:\t", answer)
                print("GT:\t", " ".join(gt_emotions))
                print("Y:\t", " ".join(pred_emotions))

                # Save inference examples
                inference_examples["prompt_id"].append(prompt_id)
                inference_examples["prompt"].append(prompt)
                inference_examples["text"].append(x_txt)
                inference_examples["output"].append(answer)
                inference_examples["gt_emotions"].append(gt_emotions)
                inference_examples["pred_emotions"].append(pred_emotions)

            # Ground truth and predictions as multi-hot vectors.
            # The loop variable must not be named like the outer iteration counter,
            # otherwise the iteration recorded in prompt_results gets overwritten.
            y_gt = np.zeros(num_labels)
            for label_idx in x["labels"]:
                y_gt[label_idx] = 1
            y_gt_list.append(y_gt)

            y_pred = np.array([float(emotion in pred_emotions) for emotion in emotions])
            y_pred_list.append(y_pred)

        y_gt = np.array(y_gt_list)
        y_pred = np.array(y_pred_list)

        # Save prompt predictions and ground truth
        np.save(f"{base_path}/y_gt_{prompt_id}.npy", y_gt)
        np.save(f"{base_path}/y_pred_{prompt_id}.npy", y_pred)

        # Element-wise metrics over all (sample, label) cells. Zero denominators
        # (no predicted or no true positives) score 0 instead of NaN, which would
        # otherwise poison the sampling probabilities below.
        true_positives = np.sum((y_gt == 1) & (y_pred == 1))
        accuracy = _safe_ratio(np.sum(y_gt == y_pred), y_gt.size)
        recall = _safe_ratio(true_positives, np.sum(y_gt == 1))
        precision = _safe_ratio(true_positives, np.sum(y_pred == 1))
        f1 = _safe_ratio(2 * precision * recall, precision + recall)
        print(f"> Accuracy: {accuracy*100:.2f}%")
        print(f"> Recall: {recall*100:.2f}%")
        print(f"> Precision: {precision*100:.2f}%\n")
        print(f"> F1 score: {f1*100:.2f}%\n")

        # Add prompt scores to results
        prompt_results["prompt_id"].append(prompt_id)
        prompt_results["prompt"].append(prompt)
        prompt_results["accuracy"].append(accuracy)
        prompt_results["recall"].append(recall)
        prompt_results["precision"].append(precision)
        prompt_results["f1_score"].append(f1)
        prompt_results["resampling_iteration"].append(resampling_iteration)
        prompt_results["ds_training_samples"].append(training_samples)

        # Save prompt inference examples
        df = pd.DataFrame(inference_examples)
        df.to_csv(f"{base_path}/inference_examples_{prompt_id}.csv", index=False)
        prompt_id += 1

    # Resample prompts: pick parents among ALL prompts scored so far, weighted by F1
    old_prompts = prompt_results["prompt"]
    old_scores = np.array(prompt_results["f1_score"], dtype=float)
    total_score = old_scores.sum()
    if total_score > 0:
        sampling_p = old_scores / total_score
    else:
        # Every prompt scored 0: fall back to a uniform choice instead of dividing by zero
        sampling_p = np.full(len(old_prompts), 1 / len(old_prompts))

    prompts = []
    for _ in range(prompts_per_resampling):
        prompt_to_resample = str(np.random.choice(old_prompts, p=sampling_p))
        # The space before "A:" keeps the answer marker from being glued to the prompt text
        output = llm("Q: Rewrite the following text while preserving its meaning. Text: " + prompt_to_resample + " A: ", max_tokens=128, stop=["Q:", "\n"], echo=False)
        # An empty rewrite would be evaluated as an empty prompt; reuse the parent instead
        new_prompt = output['choices'][0]['text'].strip() or prompt_to_resample
        print(new_prompt)
        prompts.append(new_prompt)
        
# Persist the scores of every evaluated prompt (one row per prompt evaluation)
results_csv_path = f"{base_path}/prompt_results.csv"
df = pd.DataFrame.from_dict(prompt_results)
print(df)
df.to_csv(results_csv_path, index=False)


In [None]:
import pandas as pd
base_path = "./data/prompt_search/"
metric_columns = ["accuracy", "recall", "precision", "f1_score"]
bookkeeping_columns = ["resampling_iteration", "ds_training_samples", "prompt_id"]


def export_ranked_results(ranked_results, tex_name, html_name):
    """Print the top prompt of an already-sorted results frame and export the table.

    Args:
        ranked_results: DataFrame of prompt scores, sorted best-first by the
            metric of interest; must contain at least one row.
        tex_name: File name, inside `base_path`, for the LaTeX table.
        html_name: File name, inside `base_path`, for the styled HTML table.

    Returns:
        tuple: (table, styled) where `table` is the frame without bookkeeping
        columns and with a fresh index, and `styled` is its Styler.
    """
    print(ranked_results["prompt"].values[0])
    table = ranked_results.drop(bookkeeping_columns, axis=1).reset_index(drop=True)

    # LaTeX export with three decimals
    latex = table.round(3).to_latex(index=False, float_format="{:.3f}".format)
    with open(f"{base_path}/{tex_name}", "w") as f:
        f.write(latex)

    # Heat-mapped table, shown inline and exported as HTML
    styled = table.style.background_gradient(cmap='Blues', subset=metric_columns)
    styled.hide(axis='index')
    display(styled)
    with open(f"{base_path}/{html_name}", "w") as f:
        f.write(styled.to_html())

    return table, styled


prompt_results = pd.read_csv(f"{base_path}/prompt_results.csv")

# Ranking by F1 score
prompt_results = prompt_results.sort_values(by="f1_score", ascending=False)
display_prompt_results, results_hm = export_ranked_results(
    prompt_results, "f1_prompts.tex", "results_by_f1.html"
)

# Ranking by recall (sorted starting from the F1-ranked frame)
recall_ranked_results = prompt_results.sort_values(by="recall", ascending=False)
display_prompt_results, results_hm = export_ranked_results(
    recall_ranked_results, "recall_prompts.tex", "results_by_recall.html"
)


# --- Old code

In [None]:
# Deliberate stop: raising here halts a "Run All" at this cell, so the cells
# below (presumably the old code announced above) only run when executed by hand.
raise Exception("Stop here")

In [None]:
import tensorflow_datasets as tfds
import numpy as np
from tqdm.auto import tqdm

# Emotion vocabulary; a label id from the dataset is used as an index into this list
emotions = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'neutral',
    'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
]
# Subset of the vocabulary used when building prompt examples
studied_emotions = [
    'anger', 'annoyance', 'disgust', 'fear', 'grief', 'joy', 'love',
    'neutral', 'optimism', 'pride', 'relief', 'sadness', 'surprise',
]

# The meta prompt is split into a preamble (before the examples) and a closing
# part (after them); the wording depends on whether the model path contains "chat".
if "chat" not in llm_path:
    meta_prompt_0 = (
        "Q: I gave a person an instruction and some inputs. "
        "The person read the instruction and wrote an output for every one of the inputs. "
        "Here are the input-output pairs: \n"
    )
    meta_prompt_1 = "The instruction was A:"
else:
    meta_prompt_0 = "Q: A relationship connects these input-output pairs: \n"
    meta_prompt_1 = (
        "What can i ask a person to give me those outputs when presented with these inputs? "
        "Only state one question. A: "
    )

def get_prompt_examples():
    """Collect a few "IN: <text> OUT: <emotions>" examples from the train split.

    Walks the split in order and keeps a sample only when it carries at least
    one studied emotion that no previously kept sample carried. Collection
    stops as soon as more than five examples have been gathered.

    Returns:
        list[str]: Example strings of the form "IN: <text> OUT: <emotion names>".
    """
    train_ds = tfds.load('huggingface:go_emotions/simplified', split='train')
    num_items = len(train_ds)

    examples = []
    emotions_used = set()
    # Iterate over the numpy view of the dataset, with the split size as progress total
    for sample in tqdm(train_ds.as_numpy_iterator(), total=num_items):
        studied_labels = [label for label in sample["labels"] if emotions[label] in studied_emotions]

        # Nothing new to show (this also covers samples with no studied emotion at all)
        if all(label in emotions_used for label in studied_labels):
            continue

        text = sample["text"].decode('utf-8')
        emotion_names = " ".join(emotions[label] for label in studied_labels)
        emotions_used.update(studied_labels)

        examples.append(f"IN: {text} OUT: {emotion_names}")

        if len(examples) > 5:
            break

    return examples

# Gather the examples and show them together with their count
ex = get_prompt_examples()
print(ex)
print(len(ex))

# Meta prompt layout: preamble, one example per line, newline, closing part
meta_prompt = "".join((meta_prompt_0, "\n".join(ex), "\n", meta_prompt_1))
print(meta_prompt)

# Rough size of the meta prompt: number of pieces when splitting on single spaces
space_separated_pieces = meta_prompt.split(" ")
print(len(space_separated_pieces))

In [None]:
def get_candidate_prompts(prompt, n=500):
    """Query the model `n` times with `prompt` and collect the completions.

    Args:
        prompt: Text placed between the "Q: " prefix and the "A: " suffix.
        n: Number of completions to request.

    Returns:
        list: The completion texts, in the order they were produced. Each one
        is also printed as it arrives.
    """
    candidates = []
    for _ in range(n):
        completion = llm("Q: " + prompt + "A: ", max_tokens=128, stop=["Q:", "\n"], echo=False)
        candidate_text = completion['choices'][0]['text']
        print(candidate_text)
        candidates.append(candidate_text)
    return candidates