# Imports & setup

In [9]:
import pandas as pd
import json
import time
from tqdm import tqdm


# Load dataset (sample ~200 rows)

In [10]:
# Load dataset (download from Kaggle and place locally)
df = pd.read_csv("data\yelp.csv")  
# Expect columns: text, stars

df = df[['text', 'stars']].dropna()
df = df.sample(200, random_state=42).reset_index(drop=True)

df.head()


Unnamed: 0,text,stars
0,We got here around midnight last Friday... the...,4
1,Brought a friend from Louisiana here. She say...,5
2,"Every friday, my dad and I eat here. We order ...",3
3,"My husband and I were really, really disappoin...",1
4,Love this place! Was in phoenix 3 weeks for w...,5


# LLM call (Gemini / OpenRouter placeholder)

In [11]:
def call_llm(prompt: str) -> str:
    """
    Replace this with Gemini or OpenRouter API call.
    Must return raw text response from LLM.
    """
    raise NotImplementedError("Add LLM API call here")


# Prompt versions

In [12]:
PROMPTS = {
    "v1_simple": lambda review: f"""
Read the following review and predict a star rating from 1 to 5.
Return JSON only.

Review:
{review}
""",

    "v2_structured": lambda review: f"""
You are a sentiment analysis assistant.

Rules:
- Output valid JSON only
- predicted_stars must be an integer between 1 and 5
- explanation must be under 20 words

Review:
{review}

Output format:
{{
  "predicted_stars": <int>,
  "explanation": "<reason>"
}}
""",

    "v3_reasoning_guided": lambda review: f"""
Classify the review using these steps internally:
1. Identify sentiment polarity
2. Identify intensity
3. Map to a 1–5 star rating

Do not show steps.
Return JSON only in the format below.

Review:
{review}

Format:
{{
  "predicted_stars": <int>,
  "explanation": "<reason>"
}}
"""
}


# Safe JSON parser

In [13]:
def parse_json(response):
    try:
        data = json.loads(response)
        if "predicted_stars" in data:
            return data, True
    except Exception:
        pass
    return None, False


# Evaluation loop

In [14]:
results = []

for prompt_name, prompt_fn in PROMPTS.items():
    correct = 0
    valid_json = 0

    for _, row in tqdm(df.iterrows(), total=len(df)):
        prompt = prompt_fn(row["text"])

        try:
            response = call_llm(prompt)
        except:
            continue

        parsed, is_valid = parse_json(response)

        if is_valid:
            valid_json += 1
            if int(parsed["predicted_stars"]) == int(row["stars"]):
                correct += 1

        time.sleep(0.5)  # avoid rate limits

    results.append({
        "Prompt": prompt_name,
        "Accuracy": round(correct / len(df), 3),
        "JSON_Validity_Rate": round(valid_json / len(df), 3)
    })


100%|██████████| 200/200 [00:00<00:00, 10910.31it/s]
100%|██████████| 200/200 [00:00<00:00, 19986.20it/s]
100%|██████████| 200/200 [00:00<00:00, 29586.32it/s]


# results table

In [15]:
results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Prompt,Accuracy,JSON_Validity_Rate
0,v1_simple,0.0,0.0
1,v2_structured,0.0,0.0
2,v3_reasoning_guided,0.0,0.0


# Save results

In [16]:
results_df.to_csv("prompt_comparison_results.csv", index=False)
