In [4]:
import pandas as pd
import json
from tqdm import tqdm
import os

# Demonstration fixtures: each record pairs a review text with its star rating,
# so the text and its label stay side by side.
_DUMMY_REVIEWS = [
    ("Great food and service! Highly recommend.", 5),
    ("Disappointed with the slow service and cold food.", 2),
    ("It was okay, nothing special.", 3),
    ("Absolutely loved the ambiance and delicious dishes.", 4),
    ("Terrible experience, rude staff and dirty tables.", 1),
]

# Column-oriented view of the records above ('review_text' and 'stars').
dummy_data = {
    "review_text": [text for text, _ in _DUMMY_REVIEWS],
    "stars": [stars for _, stars in _DUMMY_REVIEWS],
}

# The rest of this cell always runs against this in-memory frame.
df = pd.DataFrame(dummy_data)
print("Using a dummy DataFrame with 'review_text' and 'stars' for demonstration.")

def call_llm(prompt):
    """Stand-in for a model call: ignore *prompt* and return a fixed JSON reply.

    The reply is a JSON-encoded object with a ``predicted_stars`` value of 4
    and a canned ``explanation`` string, regardless of the prompt passed in.
    """
    canned_reply = {
        "predicted_stars": 4,
        "explanation": "Overall positive sentiment with minor issues.",
    }
    return json.dumps(canned_reply)

def prompt_v1(review):
    """Build the basic prompt: a bare instruction plus the JSON output schema.

    The review is interpolated verbatim between double quotes on the last
    content line. The returned text starts and ends with a newline.
    """
    prompt_lines = [
        "",
        "You are given a Yelp review.",
        "Predict a star rating from 1 to 5.",
        "",
        "Return JSON only:",
        "{",
        '  "predicted_stars": number,',
        '  "explanation": "brief reasoning"',
        "}",
        "",
        "Review:",
        f'"{review}"',
        "",  # trailing empty entry yields the final newline
    ]
    return "\n".join(prompt_lines)

def prompt_v2(review):
    """Build the rule-based prompt: a per-star rubric plus the JSON schema.

    The review is interpolated verbatim between double quotes on the last
    content line. The returned text starts and ends with a newline.
    """
    prompt_lines = [
        "",
        "You are a sentiment analysis expert.",
        "",
        "Rules:",
        "1 star = extremely negative",
        "2 stars = mostly negative",
        "3 stars = mixed or neutral",
        "4 stars = mostly positive",
        "5 stars = extremely positive",
        "",
        "Return ONLY valid JSON:",
        "{",
        '  "predicted_stars": number,',
        '  "explanation": "brief reasoning"',
        "}",
        "",
        "Review:",
        f'"{review}"',
        "",  # trailing empty entry yields the final newline
    ]
    return "\n".join(prompt_lines)

def prompt_v3(review):
    """Build the few-shot prompt: two worked examples plus the JSON schema.

    The review is interpolated verbatim between double quotes on the last
    content line. The returned text starts and ends with a newline.
    """
    prompt_lines = [
        "",
        "You are classifying Yelp reviews into star ratings.",
        "",
        "Examples:",
        'Review: "Terrible service, rude staff."',
        'Output: {"predicted_stars":1,"explanation":"Strong dissatisfaction"}',
        "",
        'Review: "Food was good but service was slow."',
        'Output: {"predicted_stars":3,"explanation":"Mixed experience"}',
        "",
        "Now classify the review below.",
        "",
        "Return ONLY valid JSON:",
        "{",
        '  "predicted_stars": number,',
        '  "explanation": "brief reasoning"',
        "}",
        "",
        "Review:",
        f'"{review}"',
        "",  # trailing empty entry yields the final newline
    ]
    return "\n".join(prompt_lines)

def run_prompt(prompt_func):
    """Run one prompt variant over every row of the module-level ``df``.

    For each row, the prompt built by ``prompt_func(row["review_text"])`` is
    sent to ``call_llm`` and the reply is parsed as JSON.

    Args:
        prompt_func: Callable taking a review text and returning a prompt.

    Returns:
        A DataFrame with one row per review and the columns ``actual``
        (the row's ``stars``), ``predicted`` (the reply's
        ``predicted_stars``, or ``None`` when it could not be extracted) and
        ``valid_json`` (whether extraction succeeded).
    """
    results = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        raw = call_llm(prompt_func(row["review_text"]))
        try:
            # Only the parsing step sits inside the try, so unrelated
            # failures are not mistaken for a bad model reply.
            predicted = json.loads(raw)["predicted_stars"]
        except (ValueError, KeyError, TypeError):
            # ValueError: malformed JSON (JSONDecodeError is a subclass).
            # KeyError:   a JSON object without "predicted_stars".
            # TypeError:  a non-text reply, or JSON that is not an object.
            # A bare `except:` would also swallow KeyboardInterrupt and
            # SystemExit, making a long run impossible to cancel.
            predicted = None
            is_valid = False
        else:
            is_valid = True
        results.append({
            "actual": row["stars"],
            "predicted": predicted,
            "valid_json": is_valid,
        })
    # Pin the columns so an empty input still yields the expected columns.
    return pd.DataFrame(results, columns=["actual", "predicted", "valid_json"])

# Score the dataset once per prompt variant, in order: basic, rule-based, few-shot.
df_v1, df_v2, df_v3 = [
    run_prompt(build_prompt)
    for build_prompt in (prompt_v1, prompt_v2, prompt_v3)
]

def evaluate(df_res):
    """Summarise one result frame as ``(accuracy, json_validity)``.

    ``accuracy`` is the fraction of rows whose ``predicted`` equals
    ``actual``; ``json_validity`` is the mean of the ``valid_json`` column.
    Both values are rounded to three decimal places.
    """
    is_correct = df_res["actual"].eq(df_res["predicted"])
    valid_rate = df_res["valid_json"].mean()
    return round(is_correct.mean(), 3), round(valid_rate, 3)

# Reduce each result frame to its (accuracy, JSON-validity) pair.
(acc1, json1), (acc2, json2), (acc3, json3) = map(
    evaluate, (df_v1, df_v2, df_v3)
)

# One summary row per prompt variant.
# NOTE(review): the "Reliability" labels are hard-coded and not derived from
# the measured metrics -- confirm they are meant to be illustrative.
comparison = pd.DataFrame(
    {
        "Prompt Version": ["Basic Prompt", "Rule-Based Prompt", "Few-Shot Prompt"],
        "Accuracy": [acc1, acc2, acc3],
        "JSON Validity Rate": [json1, json2, json3],
        "Reliability": ["Low", "Medium", "High"],
    }
)

# Trailing bare expression: displayed as the cell's output in a notebook.
comparison

Using a dummy DataFrame with 'review_text' and 'stars' for demonstration.


100%|██████████| 5/5 [00:00<00:00, 8122.20it/s]
100%|██████████| 5/5 [00:00<00:00, 11821.60it/s]
100%|██████████| 5/5 [00:00<00:00, 11391.37it/s]


Unnamed: 0,Prompt Version,Accuracy,JSON Validity Rate,Reliability
0,Basic Prompt,0.2,1.0,Low
1,Rule-Based Prompt,0.2,1.0,Medium
2,Few-Shot Prompt,0.2,1.0,High
