# first look at user data

In [3]:
# setup
import os
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime
from dotenv import load_dotenv

# Load environment variables from both .env and shell
load_dotenv()  # This adds .env variables to os.environ

def create_engine_with_params(db_params: dict) -> create_engine:
    """Create SQLAlchemy engine using database parameters."""
    url = f"postgresql://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['dbname']}"
    return create_engine(url)

db_params = {
    "dbname": "postgres", 
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "host": os.getenv("DB_HOST"),
    "port": "5432"  # Default port for PostgreSQL
}

engine = create_engine_with_params(db_params)

In [4]:
# recent records
df = pd.read_sql_query("SELECT * FROM events", con=engine)
df.sort_values(by="timestamp", ascending=False).head()

Unnamed: 0,event_id,timestamp,session_id,type,data,metadata
71,47c8ae19-e931-4eb1-9975-a0b256ff87da,2025-01-30 07:01:57.673085+00:00,ad740480-2ea6-464f-a8dc-f235fd8771f7,meme_created,{'prompt': 'Toddler is not impressed with this...,"{'environment': 'development', 'timestamp_utc'..."
70,1c61a7ba-7b94-435d-af16-4ec7e548d0ca,2025-01-30 07:01:36.183643+00:00,ad740480-2ea6-464f-a8dc-f235fd8771f7,meme_created,{'prompt': 'Toddler is not impressed with this...,"{'environment': 'development', 'timestamp_utc'..."
69,78b6ee9c-7cc4-4a4d-965e-50d1538bc72d,2025-01-30 07:01:12.527844+00:00,ad740480-2ea6-464f-a8dc-f235fd8771f7,meme_created,{'prompt': 'Toddler is not impressed with this...,"{'environment': 'development', 'timestamp_utc'..."
68,20807149-682b-4ff6-add7-f4f0cc7f039f,2025-01-30 07:01:02.015821+00:00,ad740480-2ea6-464f-a8dc-f235fd8771f7,meme_created,{'prompt': 'Toddler is not impressed with this...,"{'environment': 'development', 'timestamp_utc'..."
67,08a4f448-b127-4588-9e6e-43fd80022d7e,2025-01-30 07:00:56.291250+00:00,ad740480-2ea6-464f-a8dc-f235fd8771f7,meme_created,{'prompt': 'Toddler is not impressed with this...,"{'environment': 'development', 'timestamp_utc'..."


In [6]:
# logs can get better - i.e. differentiate first try from retries; encourage meme liked
df.type.value_counts()

type
meme_created       45
error              14
llm_response        5
meme_liked          3
test_event          2
meme_downloaded     2
startup             1
Name: count, dtype: int64

In [13]:
# we want to fix these logs such that we see what the api payload is - so we quickly know the captions
# might also make sense to add a "reasoning" field to the LLM output
df[df['data'].apply(lambda x: 'prompt' in x)]['data'].iloc[5]

{'prompt': 'Help me express the stress of trying to get a live demo working in very little time.',
 'image_url': 'https://i.imgflip.com/9iewoc.jpg',
 'template_id': '181913649'}

In [15]:
# First filter for rows with prompts, then extract the desired fields
meme_details = df['data'].apply(pd.Series)[['prompt', 'image_url', 'template_id']]

# Some rows might not have image_url or template_id (like error cases)
# so let's filter to only successful meme creations
created_memes = meme_details.dropna()

# Display the results
created_memes.head()

Unnamed: 0,prompt,image_url,template_id
14,Help me express the stress of trying to get a ...,https://i.imgflip.com/9iewoc.jpg,181913649
16,Help me express the stress of trying to get a ...,https://i.imgflip.com/9iewvf.jpg,181913649
17,Help me express the stress of trying to get a ...,https://i.imgflip.com/9iewwc.jpg,181913649
18,Help me express the stress of trying to get a ...,https://i.imgflip.com/9iewwn.jpg,181913649
20,I am trying to create a funny meme using AI an...,https://i.imgflip.com/9iexgl.jpg,181913649


In [18]:
# let's explore and annoted this first set using excel
# goal is to create a set of 20-30 example prompts that we can use to evaluate the model
# we can also annotate and some statistics like "is_drake" - but also off_optic, and with more insight to payload, is_json, is_complete 
created_memes.to_excel("../data/evals/created_memes.xlsx",sheet_name="raw_data")

# Creating tests
- First pass - is_json, is_complete, is_drake, off_optic
- Second pass - is_different

In [19]:
test_set = pd.read_excel("../data/evals/created_memes.xlsx",sheet_name="test_set")
test_set.head()

Unnamed: 0,input
0,Ice cream is too cold and expensive
1,Ice cream is too cold and expensive. Grandpa y...
2,Visit dinosaur museum with Henry expecting him...
3,Chicken lunch for ages
4,"Nvidia releases the 5090, right when deepseek ..."


In [22]:
# test harness - 
import sys
sys.path.append('../apps/gradio')  # Add the utils.py location to Python path
from utils import construct_meme_prompt
# context is better when read from the pickle file due to comments that are lost when saved to json
context = pd.read_pickle("../data/processed/memes20250128.pkl").to_dict(orient='records')

context[0]

{'id': '181913649',
 'name': 'Drake Hotline Bling',
 'url': 'https://i.imgflip.com/30b1gx.jpg',
 'width': 1200,
 'height': 1200,
 'box_count': 2,
 'captions': 1364500,
 'kym_definition': "\nDrakeposting refers to the practice of posting reaction images and other still shots from the music video of the Canadian hip hop artist's 2015 hit single Hotline Bling on the imageboard site 4chan, typically to express one's disdain of the thread topic or something said by another poster in a similar vein to the usage of Costanza.jpg.\n",
 'kym_examples': "[\n    {\n        'text0': 'Waiting for your icecream to thaw',\n        'text1': 'Warming up your spoon in the microwave to scoop it easier'\n    },\n    {\n        'text0': 'Complaining about paywalled journalism',\n        'text1': 'Complaining about paywalled academic articles'\n    },    \n]",
 'imgflip_api_post': "\n{\n    'template_id':'181913649',\n    'username': os.getenv('IMGFLIP_USERNAME'),\n    'password': os.getenv('IMGFLIP_PASSWORD

In [25]:
# saving for later
import pickle
with open('../apps/gradio/memes20250128.pkl', 'wb') as f:  # 'wb' means write binary
    pickle.dump(context, f)

In [24]:
context[3]

{'id': '217743513',
 'name': 'UNO Draw 25 Cards',
 'url': 'https://i.imgflip.com/3lmzyx.jpg',
 'width': 500,
 'height': 494,
 'box_count': 2,
 'captions': 588250,
 'kym_definition': '\nDraw 25 refers to a two-paneled image macro with one image of a customizable Uno wild card next to an image of a man with a large hand of cards. The wild card typically gives the option of performing an action or "draw 25" and the second image infers that a person decided against performing the action. The image macro rose to popularity on Twitter, Reddit and Instagram in January 2020.\n',
 'kym_examples': "[\n    {\n        'text0': 'Acknowledge that your child made a valid point against you', # option other than draw 25\n        'text1': 'Parents', # the person who decided to draw 25\n    },\n    {\n        'text0': 'Date me', # option other than draw 25\n        'text1': 'My crush', # the person who decided to draw 25\n    },    \n]",
 'imgflip_api_post': "\n{\n    'template_id':'217743513',\n    'use

In [26]:
test_set['prompt'] = test_set['input'].apply(lambda x: construct_meme_prompt(x,context)) 
test_set.iloc[0]


[PROMPT CONSTRUCTION] Starting with user input: Ice cream is too cold and expensive
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Ice cream is too cold and expensive. Grandpa yells at cloud.
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Visit dinosaur museum with Henry expecting him to hate it. Instead he loves it and I get sick on the VR experience 
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Chicken lunch for ages
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: Nvidia releases the 5090, right when deepseek arrives 
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONSTRUCTION] Starting with user input: I am trying to create a funny meme using AI and I think AI is holding me back
[PROMPT CONSTRUCTION] Prompt constructed successfully

[PROMPT CONS

input                   Ice cream is too cold and expensive
prompt    USER INPUT:\nIce cream is too cold and expensi...
Name: 0, dtype: object

In [28]:
print(test_set.iloc[0]['prompt'])

USER INPUT:
Ice cream is too cold and expensive
====
AVAILABLE CONTEXT:
[{'id': '181913649', 'name': 'Drake Hotline Bling', 'url': 'https://i.imgflip.com/30b1gx.jpg', 'width': 1200, 'height': 1200, 'box_count': 2, 'captions': 1364500, 'kym_definition': "\nDrakeposting refers to the practice of posting reaction images and other still shots from the music video of the Canadian hip hop artist's 2015 hit single Hotline Bling on the imageboard site 4chan, typically to express one's disdain of the thread topic or something said by another poster in a similar vein to the usage of Costanza.jpg.\n", 'kym_examples': "[\n    {\n        'text0': 'Waiting for your icecream to thaw',\n        'text1': 'Warming up your spoon in the microwave to scoop it easier'\n    },\n    {\n        'text0': 'Complaining about paywalled journalism',\n        'text1': 'Complaining about paywalled academic articles'\n    },    \n]", 'imgflip_api_post': "\n{\n    'template_id':'181913649',\n    'username': os.getenv('

In [30]:
# get the model loaded
from dotenv import load_dotenv
import google.generativeai as genai

# Load environment variables from both .env and shell
load_dotenv()  # This adds .env variables to os.environ

with open('../apps/gradio/system_prompt.txt', 'r') as f:
    system_prompt = f.read()

# Configure Gemini
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
model = genai.GenerativeModel(
    model_name='gemini-1.5-flash',
    system_instruction=system_prompt,
)

In [31]:
# user prompt config - which we may want to change
config = {
    'max_output_tokens': 1000,
    'temperature': 0.1,
}

In [32]:
def test_completion(prompt):
    response = model.generate_content(
        prompt,
        generation_config=genai.GenerationConfig(**config)
    )
    return response.text

test_set['response'] = test_set.prompt.apply(lambda x: test_completion(x))

In [34]:
print(test_set.iloc[0]['response'])
# here is a problem - the response is not a valid json

```json
{
  "template_id": "181913649",
  "text0": "Eating ice cream that's slightly melted",
  "text1": "Paying full price for ice cream that's rock solid"
}
```



In [35]:
import json
def is_valid_json(response):
    try:
        json.loads(response)
        return True
    except json.JSONDecodeError:
        return False
test_set['is_valid_json'] = test_set.response.apply(lambda x: is_valid_json(x))
test_set['is_valid_json'].sum()/test_set.shape[0]

np.float64(0.4666666666666667)

In [36]:
test_set.shape[0]

15

In [37]:
test_set[~test_set['is_valid_json']]['response'].values


array(['```json\n{\n  "template_id": "181913649",\n  "text0": "Eating ice cream that\'s slightly melted",\n  "text1": "Paying full price for ice cream that\'s rock solid"\n}\n```\n',
       '```json\n{\n  "template_id": "181913649",\n  "text0": "Buying the 5090",\n  "text1": "Waiting for Deepseek to be good"\n}\n```\n',
       '```json\n{\n  "template_id": "181913649",\n  "text0": "Trying to make a funny meme with AI",\n  "text1": "Giving up and making a meme about AI hindering my meme-making process"\n}\n```\n',
       '```json\n{\n  "template_id": "181913649",\n  "text0": "Smooth demo",\n  "text1": "App crashes spectacularly"\n}\n```\n',
       '```json\n{\n  "template_id": "124822590",\n  "boxes[0][text]": "Actually giving a thoughtful answer to \\"Tell me about yourself\\"",\n  "boxes[1][text]": "Brain farting and saying \\"uh... I like memes\\"",\n  "boxes[2][text]": "Me"\n}\n```\n',
       '```json\n{\n  "template_id": "181913649",\n  "text0": "Singing Hamilton accurately",\n  "t

In [38]:
# drake test
test_set['is_drake'] = test_set.response.apply(lambda x: '181913649' in x)
test_set['is_drake'].sum()/test_set.shape[0]

np.float64(0.8666666666666667)

In [39]:
test_set.iloc[0]['prompt']

'USER INPUT:\nIce cream is too cold and expensive\n====\nAVAILABLE CONTEXT:\n[{\'id\': \'181913649\', \'name\': \'Drake Hotline Bling\', \'url\': \'https://i.imgflip.com/30b1gx.jpg\', \'width\': 1200, \'height\': 1200, \'box_count\': 2, \'captions\': 1364500, \'kym_definition\': "\\nDrakeposting refers to the practice of posting reaction images and other still shots from the music video of the Canadian hip hop artist\'s 2015 hit single Hotline Bling on the imageboard site 4chan, typically to express one\'s disdain of the thread topic or something said by another poster in a similar vein to the usage of Costanza.jpg.\\n", \'kym_examples\': "[\\n    {\\n        \'text0\': \'Waiting for your icecream to thaw\',\\n        \'text1\': \'Warming up your spoon in the microwave to scoop it easier\'\\n    },\\n    {\\n        \'text0\': \'Complaining about paywalled journalism\',\\n        \'text1\': \'Complaining about paywalled academic articles\'\\n    },    \\n]", \'imgflip_api_post\': "\\n{

In [44]:
# we can potentially fix the json problem through using text processing
def clean_response(response: str) -> str:
    """
    Cleans AI response to ensure valid JSON formatting.
    Handles edge cases with 'json' prefix and smart quote issues.
    """
    # Remove 'json' prefix and newlines at start
    cleaned = response.lstrip('json\n')
    
    # Remove markdown code blocks and backticks
    cleaned = cleaned.strip('`').replace('```json', '').replace('```', '')
    
    # Replace various quote types with standard double quotes
    cleaned = cleaned.replace('"', '"').replace('"', '"').replace("'", '"')
    
    # Handle escaped quotes inside text strings
    cleaned = cleaned.replace('\\"', '"')

    
    # Remove any trailing commas before closing braces
    cleaned = cleaned.replace(',}', '}')
    cleaned = cleaned.replace(',]', ']')
    
    return cleaned.strip()
    
test_set['cleaned_response'] = test_set.response.apply(lambda x: clean_response(x))
test_set['is_valid_json'] = test_set.cleaned_response.apply(lambda x: is_valid_json(x))
test_set['is_valid_json'].sum()/test_set.shape[0]


np.float64(0.3333333333333333)

In [43]:
test_set[~test_set['is_valid_json']]['cleaned_response'].values

array(['json\n{\n  "template_id": "181913649",\n  "text0": "Eating ice cream that"s slightly melted",\n  "text1": "Paying full price for ice cream that"s rock solid"\n}',
       '{\n   "template_id": "181913649",\n   "text0": "Eating ice cream that"s slightly melted",\n   "text1": "Paying full price for ice cream that"s rock solid"\n}',
       'json\n{\n  "template_id": "181913649",\n  "text0": "Buying the 5090",\n  "text1": "Waiting for Deepseek to be good"\n}',
       'json\n{\n  "template_id": "181913649",\n  "text0": "Trying to make a funny meme with AI",\n  "text1": "Giving up and making a meme about AI hindering my meme-making process"\n}',
       'json\n{\n  "template_id": "181913649",\n  "text0": "Smooth demo",\n  "text1": "App crashes spectacularly"\n}',
       'json\n{\n  "template_id": "124822590",\n  "boxes[0][text]": "Actually giving a thoughtful answer to \\"Tell me about yourself\\"",\n  "boxes[1][text]": "Brain farting and saying \\"uh... I like memes\\"",\n  "boxes[2][

##  let's update the system prompt

In [48]:
# claude suggests simplifying the system prompt given the low temperature
system_prompt = """
You are a meme expert who generates valid JSON for the imgflip API. 

The will choose a meme template from the provided context: The context for each meme includes:
- kym_definition: How the meme is typically used 
- kym_examples: Example text placements
- box_count: Number of text boxes needed
- imgflip_api_post: The exact JSON format to follow

Look at each meme's definition and examples to choose one that best fits the user's input in an unexpected way. If you see previous failed attempts, choose a different template and approach.

OUTPUT RULES:
1. Respond with ONLY a JSON object - no prefixes, no backticks, no markdown
2. Use standard double quotes (") for all strings
3. Match your chosen template's imgflip_api_post format exactly, but without the username/password fields

Example outputs for different templates:
{
    "template_id": "87743020",
    "text0": "top text",
    "text1": "bottom text"
}

{
    "template_id": "124822590",
    "boxes[0][text]": "first text",
    "boxes[1][text]": "second text",
    "boxes[2][text]": "third text"
}
"""

In [49]:
model = genai.GenerativeModel(
    model_name='gemini-1.5-flash',
    system_instruction=system_prompt,
)

In [50]:
import random

def construct_meme_prompt(
    user_input: str,
    meme_context: list,
    previous_attempts: list = None
) -> str:
    """
    Constructs a prompt for the meme generation model with randomized meme context.
    """
    if not user_input.strip():
        raise ValueError("user_input must be non-empty")
    if not meme_context:
        raise ValueError("meme_context must be non-empty")
    
    # Shuffle the meme context
    shuffled_context = list(meme_context)  # Create a copy
    random.shuffle(shuffled_context)
    
    sections = []
    
    # Add failed attempts if they exist
    if previous_attempts:
        failed = ["PREVIOUS ATTEMPTS (TRY A DIFFERENT TEMPLATE):", *previous_attempts]
        sections.append("\n".join(failed))
    
    sections.extend([
        f"USER INPUT: {user_input.strip()}",
        f"AVAILABLE CONTEXT: {str(shuffled_context)}"
    ])
    
    return "\n\n".join(sections)

In [51]:
test_set2 = pd.read_excel("../data/evals/created_memes.xlsx",sheet_name="test_set")
test_set2['prompt'] = test_set2['input'].apply(lambda x: construct_meme_prompt(x,context)) 
test_set2['response'] = test_set2.prompt.apply(lambda x: test_completion(x))
test_set2['cleaned_response'] = test_set2.response.apply(lambda x: clean_response(x))
test_set2['is_valid_json'] = test_set2.cleaned_response.apply(lambda x: is_valid_json(x))
test_set2['is_drake'] = test_set2.cleaned_response.apply(lambda x: '181913649' in x)

In [52]:
print(test_set2.is_valid_json.sum()/test_set2.shape[0])

0.9333333333333333


In [53]:
print(test_set2.is_drake.sum()/test_set2.shape[0])

0.26666666666666666


### Decisions
- Update system prompt
- Update prompt construction
- Use pickle not json for context
- Deploy and see!

# Debugging errors
Helped picked up JSON errors

In [15]:

df.sort_values(by="timestamp", ascending=False).head().iloc[0]['data']

{'prompt': 'I am trying to demo this app and it is quite flaky i.e. errors quite often',
 'error_type': 'RuntimeError',
 'error_message': 'Failed after 10 attempts. Last error: Attempt 10 failed: Expecting value: line 1 column 1 (char 0)'}

In [6]:
df.sort_values(by="timestamp", ascending=False).head(10).iloc[5]

event_id                   d17555ad-f8e0-4759-b920-38878f323357
timestamp                      2025-01-30 02:05:21.262870+00:00
session_id                 fca15b75-dca1-4cad-9dba-78dbd15e34e3
type                                                      error
data          {'prompt': 'I want to make a funny meme and am...
metadata      {'environment': 'development', 'timestamp_utc'...
Name: 34, dtype: object

In [7]:
df.sort_values(by="timestamp", ascending=False).head(10).iloc[5]['data']

{'prompt': 'I want to make a funny meme and am using AI as a crutch',
 'error_type': 'RuntimeError',
 'error_message': 'Failed after 5 attempts. Last error: Attempt 5 failed: Expecting value: line 1 column 1 (char 0)'}

In [9]:
df.sort_values(by="timestamp", ascending=False).head(10).iloc[0]['data']

{'prompt': 'Ice cream is too cold and expensive. Grandpa yells at cloud.',
 'image_url': 'https://i.imgflip.com/9ifa2x.jpg',
 'template_id': '181913649'}