# Scope
- Test harness for the example prompts, reusing the app's util functions
- Explore manual annotation of the results

In [1]:
import sys
import os

# Directory of the Gradio app; utils.py lives directly inside it, so the
# directory itself is what has to be on sys.path for `from utils import ...`.
app_path = '../apps/gradio/'
utils_dir = os.path.abspath(app_path)
sys.path.append(utils_dir)

# get functions used by app
from utils import construct_meme_prompt, generate_meme_completion, create_imgflip_meme

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import pickle
import pandas as pd
# Read the pickled evaluation prompts.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('../data/evals/prompts.pkl', 'rb') as prompts_file:
    prompts = pickle.load(prompts_file)

# One row per prompt, held in a single 'input' column.
test_set = pd.DataFrame(prompts, columns=['input'])

# Spot-check five randomly chosen rows.
test_set.sample(5)

Unnamed: 0,input
20,When your girlfriend farts all the time
24,LLM Output vs. Expected Output
10,I am trying to create a funny meme using AI an...
25,LLM Debugging Be Like
17,Trying to do good but end up doing evil


In [10]:
# get gemini ready
import google.generativeai as genai
from dotenv import load_dotenv
load_dotenv()

# Load configuration and context from the app directory
with open(app_path + 'system_prompt.txt', 'r') as prompt_file:
    system_prompt = prompt_file.read()

# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open(app_path + 'memes20250128.pkl', 'rb') as context_file:
    meme_context = pickle.load(context_file)

# Configure Gemini with the key taken from the environment
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))

model_settings = {
    'model_name': 'gemini-1.5-flash',
    'system_instruction': system_prompt,
}
model = genai.GenerativeModel(**model_settings)

In [13]:
# first pass: build one prompt per test input
# todo - silence print statements with a parameter
test_set['prompt'] = test_set['input'].apply(
    lambda user_input: construct_meme_prompt(user_input, meme_context)
)


[PROMPT CONSTRUCTION] Starting with user input: I am trying to make a funny meme, but my sense of humor is holding me back
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: I am trying to make a funny meme using AI, but my sense of humor is  holding me back
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Pharmacy overlord
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Deep seek a Chinese AI model after being asked what happened in Tiananmen square for the millionth time
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Yoda
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Prompt engineering vs fine-tuning
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Please creat

In [24]:
# default config, 5 attempts
def safe_generate_meme(prompt, model):
    """Call generate_meme_completion without letting a failure propagate.

    Args:
        prompt: The prompt to send. A dict containing an "error" key marks a
            failure in an earlier step and is not sent to the model.
        model: Passed through unchanged to generate_meme_completion.

    Returns:
        Whatever generate_meme_completion returns, or ``{"error": <message>}``
        when the prompt carried an upstream error or the call raised.
    """
    upstream_failed = isinstance(prompt, dict) and "error" in prompt
    if upstream_failed:
        reason = prompt['error']
        return {"error": f"Skipped due to previous error: {reason}"}

    try:
        completion = generate_meme_completion(prompt, model)
    except Exception as exc:
        # Report the failure as data so a batch run can continue.
        return {"error": str(exc)}
    return completion



In [None]:
# Generate one completion per prompt; failures come back as {"error": ...} dicts.
test_set['response'] = test_set['prompt'].apply(
    lambda prompt: safe_generate_meme(prompt, model)
)

In [21]:
# this is quite slow, we might need to look into other forms of evals than the finished product
import time

def safe_create_meme(response_data, delay=0.5):
    """Create an imgflip meme from a model response without raising.

    Args:
        response_data: Dict produced by the generation step. A dict that
            contains an "error" key marks an upstream failure and is skipped.
            The dict is never modified by this function.
        delay: Seconds to sleep before the request, to respect API limits.
            Defaults to 0.5, the value that used to be hard-coded.

    Returns:
        The result of create_imgflip_meme, or ``{"error": <message>}`` when
        the upstream step failed or anything in this step raised.
    """
    # Skip if we got an error from the previous step
    if isinstance(response_data, dict) and "error" in response_data:
        return {"error": f"Skipped due to previous error: {response_data['error']}"}

    try:
        # Add delay to respect API limits
        time.sleep(delay)

        # Build the request on a copy. Updating response_data in place would
        # write the imgflip username/password into the caller's dict (i.e. the
        # stored 'response' value), from where they would leak into any later
        # prompt built from that response and into any export of the results.
        payload = dict(response_data)
        payload.update({
            'username': os.getenv('IMGFLIP_USERNAME'),
            'password': os.getenv('IMGFLIP_PASSWORD')
        })
        return create_imgflip_meme(payload)  # todo - silence print statements

    except Exception as e:
        return {"error": str(e)}

# Render one meme per response; failures come back as {"error": ...} dicts.
test_set['meme'] = test_set['response'].apply(safe_create_meme)


[IMGFLIP API] Sending request to imgflip
[IMGFLIP API] Successfully created meme

[IMGFLIP API] Sending request to imgflip
[IMGFLIP API] Successfully created meme

[IMGFLIP API] Sending request to imgflip
[IMGFLIP API] Successfully created meme

[IMGFLIP API] Sending request to imgflip
[IMGFLIP API] Successfully created meme

[IMGFLIP API] Sending request to imgflip
[IMGFLIP API] Successfully created meme

[IMGFLIP API] Sending request to imgflip
[IMGFLIP API] Successfully created meme

[IMGFLIP API] Sending request to imgflip
[IMGFLIP API] Successfully created meme

[IMGFLIP API] Sending request to imgflip
[IMGFLIP API] Successfully created meme

[IMGFLIP API] Sending request to imgflip
[IMGFLIP API] Successfully created meme

[IMGFLIP API] Sending request to imgflip
[IMGFLIP API] Successfully created meme

[IMGFLIP API] Sending request to imgflip
[IMGFLIP API] Successfully created meme

[IMGFLIP API] Sending request to imgflip
[IMGFLIP API] Successfully created meme

[IMGFLIP API] S

In [22]:
def safe_create_prompt(row):
    """Build a follow-up prompt for one row of the test set.

    Args:
        row: Mapping with an 'input' entry and a 'response' entry holding the
            first-pass result for that input.

    Returns:
        ``{"error": <message>}`` when the first response is an error dict;
        otherwise the output of construct_meme_prompt, which receives the
        first response as the sole previous attempt when that response is a
        dict, and no previous attempts when it is not.
    """
    first_response = row['response']

    # Anything that is not a dict cannot serve as a previous attempt.
    if not isinstance(first_response, dict):
        return construct_meme_prompt(row['input'], meme_context)

    if "error" in first_response:
        return {"error": f"Skipped due to previous error: {first_response['error']}"}

    return construct_meme_prompt(row['input'], meme_context, [first_response])

# Row-wise: each follow-up prompt needs both the input and the first response.
test_set['prompt_2'] = test_set.apply(safe_create_prompt, axis='columns')


[PROMPT CONSTRUCTION] Starting with user input: I am trying to make a funny meme, but my sense of humor is holding me back
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: I am trying to make a funny meme using AI, but my sense of humor is  holding me back
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Pharmacy overlord
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Deep seek a Chinese AI model after being asked what happened in Tiananmen square for the millionth time
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Yoda
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Prompt engineering vs fine-tuning
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Please creat

In [25]:
# might be good to start capturing latency - timed out
test_set['response_2'] = test_set['prompt_2'].apply(
    lambda prompt: safe_generate_meme(prompt, model)
)

KeyboardInterrupt: 

In [26]:
# save for evaluation
test_set.to_excel(
    '../data/evals/manual20250206.xlsx',
    sheet_name="test_set",
)

In [23]:
# Show test_set with the columns added in the cells above
test_set

Unnamed: 0,input,prompt,response,meme,prompt_2
0,"I am trying to make a funny meme, but my sense...","USER INPUT: I am trying to make a funny meme, ...","{'template_id': '112126428', 'boxes[0][text]':...","{'success': True, 'image_url': 'https://i.imgf...",PREVIOUS ATTEMPTS (TRY A DIFFERENT TEMPLATE):\...
1,"I am trying to make a funny meme using AI, but...",USER INPUT: I am trying to make a funny meme u...,"{'template_id': '129242436', 'text0': 'AI can ...","{'success': True, 'image_url': 'https://i.imgf...",PREVIOUS ATTEMPTS (TRY A DIFFERENT TEMPLATE):\...
2,Pharmacy overlord,USER INPUT: Pharmacy overlord\n\nAVAILABLE CON...,"{'template_id': '129242436', 'text0': 'Pharmac...","{'success': True, 'image_url': 'https://i.imgf...",PREVIOUS ATTEMPTS (TRY A DIFFERENT TEMPLATE):\...
3,Deep seek a Chinese AI model after being asked...,USER INPUT: Deep seek a Chinese AI model after...,"{'template_id': '112126428', 'boxes[0][text]':...","{'success': True, 'image_url': 'https://i.imgf...",PREVIOUS ATTEMPTS (TRY A DIFFERENT TEMPLATE):\...
4,Yoda,USER INPUT: Yoda\n\nAVAILABLE CONTEXT: [{'id':...,"{'template_id': '129242436', 'text0': 'Yoda is...","{'success': True, 'image_url': 'https://i.imgf...",PREVIOUS ATTEMPTS (TRY A DIFFERENT TEMPLATE):\...
5,Prompt engineering vs fine-tuning,USER INPUT: Prompt engineering vs fine-tuning\...,"{'template_id': '112126428', 'boxes[0][text]':...","{'success': True, 'image_url': 'https://i.imgf...",PREVIOUS ATTEMPTS (TRY A DIFFERENT TEMPLATE):\...
6,Please create me a funny meme about parents an...,USER INPUT: Please create me a funny meme abou...,"{'template_id': '112126428', 'boxes[0][text]':...","{'success': True, 'image_url': 'https://i.imgf...",PREVIOUS ATTEMPTS (TRY A DIFFERENT TEMPLATE):\...
7,Overengineering LLM Apps,USER INPUT: Overengineering LLM Apps\n\nAVAILA...,"{'template_id': '129242436', 'text0': 'Overeng...","{'success': True, 'image_url': 'https://i.imgf...",PREVIOUS ATTEMPTS (TRY A DIFFERENT TEMPLATE):\...
8,"I am trying to make a funny meme, but my sense...","USER INPUT: I am trying to make a funny meme, ...","{'template_id': '112126428', 'boxes[0][text]':...","{'success': True, 'image_url': 'https://i.imgf...",PREVIOUS ATTEMPTS (TRY A DIFFERENT TEMPLATE):\...
9,i want to represent that a builder of an app w...,USER INPUT: i want to represent that a builder...,"{'template_id': '181913649', 'text0': 'App use...","{'success': True, 'image_url': 'https://i.imgf...",PREVIOUS ATTEMPTS (TRY A DIFFERENT TEMPLATE):\...


In [14]:
?generate_meme_completion

[0;31mSignature:[0m
[0mgenerate_meme_completion[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mprompt[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m:[0m [0mAny[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_attempts[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m5[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mconfig[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mAny[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mAny[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Generates a meme completion and validates JSON output, with multiple attempts.
Includes JSON formatting instructions in prompt.
[0;31mFile:[0m      ~/dev/llmeme/apps/gradio/utils.py
[0;31mType:[0m      function

In [12]:
?construct_meme_prompt

[0;31mSignature:[0m
[0mconstruct_meme_prompt[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0muser_input[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmeme_context[0m[0;34m:[0m [0mlist[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprevious_attempts[0m[0;34m:[0m [0mlist[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mstr[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Constructs a prompt for the meme generation model with randomized meme context.
[0;31mFile:[0m      ~/dev/llmeme/apps/gradio/utils.py
[0;31mType:[0m      function

In [None]:
# scratch
# test_set2 = pd.read_excel("../data/evals/created_memes.xlsx",sheet_name="test_set")
# test_set2['prompt'] = test_set2['input'].apply(lambda x: construct_meme_prompt(x,context)) 
# test_set2['response'] = test_set2.prompt.apply(lambda x: test_completion(x))
# test_set2['cleaned_response'] = test_set2.response.apply(lambda x: clean_response(x))
# test_set2['is_valid_json'] = test_set2.cleaned_response.apply(lambda x: is_valid_json(x))
# test_set2['is_drake'] = test_set2.cleaned_response.apply(lambda x: '181913649' in x)