# Q8: Offline LLM Prompt Evaluation (Colab Demo)

This Colab notebook demonstrates how to run an offline evaluation pipeline that compares two prompt versions (A vs. B) over a fixed set of queries, using a free LLM endpoint (Google Gemini, accessed through LangChain).

---

In [None]:
# Install the third-party packages this notebook imports: LangChain, its
# Google Gemini integration, python-dotenv, pandas and matplotlib.
# NOTE(review): no versions are pinned, so a fresh install may pull newer,
# incompatible releases — consider pinning (e.g. `pkg==x.y.z`) once a known-good
# set has been confirmed.
%pip install langchain langchain-google-genai python-dotenv pandas matplotlib

In [None]:
import os
import pandas as pd
import time
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

# --- API key -----------------------------------------------------------------
# Resolution order:
#   1. a key pasted over the placeholder literal below;
#   2. GOOGLE_API_KEY from the process environment, which includes values
#      loaded from a local .env file by load_dotenv().
# load_dotenv() does not override variables that are already set, so an
# exported key always wins over a .env entry.
load_dotenv()

API_KEY_PLACEHOLDER = 'YOUR_GEMINI_API_KEY'
GOOGLE_API_KEY = 'YOUR_GEMINI_API_KEY'  # <-- Replace with your key, or leave as-is to use the environment / .env
if GOOGLE_API_KEY == API_KEY_PLACEHOLDER:
    # The placeholder was not edited: fall back to the environment instead of
    # clobbering a real key with the placeholder text.
    GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY', '')
if not GOOGLE_API_KEY or GOOGLE_API_KEY == API_KEY_PLACEHOLDER:
    # Fail fast: without a key every LLM call below would fail and the
    # evaluation would silently report the failure score for all queries.
    raise ValueError(
        'No Gemini API key found. Replace the placeholder in this cell, or set '
        'GOOGLE_API_KEY in the environment or in a .env file.'
    )
os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY

# --- Prompt variants under comparison ------------------------------------------
# Both templates take a single `input` variable (the user query).
# Prompt A: plain, clarity-focused assistant.
PROMPT_A = PromptTemplate(
    input_variables=['input'],
    template="You are a helpful assistant. Answer the following question as clearly as possible.\nQuestion: {input}\nAnswer:"
)
# Prompt B: humorous but still informative assistant.
PROMPT_B = PromptTemplate(
    input_variables=['input'],
    template="You are a witty assistant. Respond to the user's question with a touch of humor, but keep it informative.\nUser: {input}\nResponse:"
)

# --- Model client --------------------------------------------------------------
# One shared chat client is used for every call, so both prompt versions are
# evaluated against the same model.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=GOOGLE_API_KEY)

## Load or define queries
You can edit this list to try your own queries.

In [None]:
# Evaluation queries: each one is sent to the LLM once per prompt version.
# The list mixes factual lookups, explanations, how-to requests, translations
# and one creative request. Order is preserved in the results table.
queries = [
    "What is the capital of France?",
    "Explain quantum computing in simple terms.",
    "How do I bake a chocolate cake?",
    "What are the benefits of meditation?",
    "Summarize the plot of Inception.",
    "Translate Good morning to Spanish.",
    "Who won the FIFA World Cup in 2018?",
    "What is the Pythagorean theorem?",
    "List three uses of artificial intelligence.",
    "How does photosynthesis work?",
    "What is the stock market?",
    "Give me a joke about computers.",
    "What causes rainbows?",
    "Who wrote Pride and Prejudice?",
    "What is the speed of light?",
    "How do vaccines work?",
    "Name a famous painting by Van Gogh.",
    "What is blockchain technology?",
    "How do you say thank you in Japanese?",
    "Describe the process of making tea.",
]

## Run evaluation for both prompts
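This takes a few minutes: each of the 20 queries is sent to the LLM once per prompt version (40 calls in total). Note that the score given to a non-empty response is a random placeholder between 3 and 5, not a real quality judgment; failed or empty responses score 1.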

In [None]:
import random

# Scores are placeholders drawn at random, so use a dedicated, seeded generator
# to make the results table reproducible across runs.
SCORE_SEED = 42
FAILURE_SCORE = 1  # score given to failed calls and empty responses
score_rng = random.Random(SCORE_SEED)


def evaluate_query(prompt, query):
    """Send one query through one prompt template and score the response.

    Args:
        prompt: a template object exposing ``format(input=...)``.
        query: the user question to substitute into the template.

    Returns:
        A dict with keys ``score`` (int), ``latency_ms`` (float, NaN when the
        call failed) and ``error`` (``None`` on success, otherwise a short
        ``"ExceptionType: message"`` string).
    """
    # Format outside the timed/guarded region: latency should measure only the
    # LLM call, and a template bug should surface instead of being swallowed.
    formatted_prompt = prompt.format(input=query)
    start = time.perf_counter()
    try:
        response = llm.invoke(formatted_prompt)
    except Exception as exc:
        # Keep the run going (best effort), but record what went wrong and use
        # NaN rather than a -1 sentinel so the failure cannot skew mean latency.
        return {
            'score': FAILURE_SCORE,
            'latency_ms': float('nan'),
            'error': f'{type(exc).__name__}: {exc}',
        }
    latency_ms = (time.perf_counter() - start) * 1000

    # `content` is normally a string; guard against other types so a
    # successful call is not misreported as a failure by `.strip()` raising.
    content = getattr(response, 'content', None)
    has_text = bool(content.strip()) if isinstance(content, str) else bool(content)
    # Placeholder scoring: any non-empty response gets a random 3-5.
    score = score_rng.randint(3, 5) if has_text else FAILURE_SCORE
    return {'score': score, 'latency_ms': round(latency_ms, 1), 'error': None}


results = []
for prompt_version, prompt in [('A', PROMPT_A), ('B', PROMPT_B)]:
    for query in queries:
        outcome = evaluate_query(prompt, query)
        results.append({
            'query': query,
            'prompt_version': prompt_version,
            'score': outcome['score'],
            'latency_ms': outcome['latency_ms'],
            'error': outcome['error'],
        })
df = pd.DataFrame(results)

# Surface failures instead of hiding them behind the failure score.
n_failed = int(df['error'].notna().sum())
if n_failed:
    print(f'WARNING: {n_failed} of {len(df)} LLM calls failed; see the "error" column.')
df.head()

## Summarize and plot mean score and latency

In [None]:
# Per-prompt-version means of score and latency.
# Failed calls carry a negative (or missing) latency marker rather than a real
# measurement; mask those rows to NaN so the mean, which skips NaN, reflects
# successful calls only. Scores are left untouched.
valid_latency = df['latency_ms'].where(df['latency_ms'] >= 0)
summary = (
    df.assign(latency_ms=valid_latency)
    .groupby('prompt_version')
    .agg({'score': 'mean', 'latency_ms': 'mean'})
    .reset_index()
)
summary

In [None]:
import matplotlib.pyplot as plt

# One colour per prompt version, shared by both charts.
BAR_COLORS = ['#4F8EF7', '#F7B64F']

# One entry per chart: (summary column, title, y-axis label, y-axis limits).
# A limit of None leaves the axis range to matplotlib.
chart_specs = [
    ('score', 'Mean Score by Prompt Version', 'Mean Score (1-5)', (1, 5)),
    ('latency_ms', 'Mean Latency by Prompt Version', 'Mean Latency (ms)', None),
]

for column, title, y_label, y_limits in chart_specs:
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(summary['prompt_version'], summary[column], color=BAR_COLORS)
    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('Prompt Version')
    if y_limits is not None:
        ax.set_ylim(*y_limits)
    plt.show()