In [1]:
import json
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from sklearn.metrics import accuracy_score, mean_absolute_error
from tqdm import tqdm

# Load environment variables from a local .env file so the API key never has
# to be hardcoded in the notebook.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Explicit check instead of `assert`: assertions are stripped under `python -O`,
# and an empty value (e.g. `OPENAI_API_KEY=` in .env) must be rejected as well.
if not OPENAI_API_KEY:
    raise RuntimeError("Please set OPENAI_API_KEY in a .env file.")

# Shared API client used by call_llm.
client = OpenAI(api_key=OPENAI_API_KEY)

# Default model for every prompt evaluation.
MODEL_NAME = "gpt-4o-mini"


In [2]:
TEXT_COL = "text"
STAR_COL = "stars"
DATA_PATH = "data/yelp_reviews.csv"

# Load the reviews, keep only the text and rating columns, drop rows missing
# either value, and retain ratings inside the inclusive 1-5 range.
df = (
    pd.read_csv(DATA_PATH)
    .loc[:, [TEXT_COL, STAR_COL]]
    .dropna()
    .loc[lambda frame: frame[STAR_COL].between(1, 5)]
)

# Sanity checks: size, first rows, and the rating distribution.
print("Dataset shape:", df.shape)
print(df.head())
print(df[STAR_COL].value_counts().sort_index())


Dataset shape: (10000, 2)
                                                text  stars
0  My wife took me here on my birthday for breakf...      5
1  I have no idea why some people give bad review...      5
2  love the gyro plate. Rice is so good and I als...      4
3  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...      5
4  General Manager Scott Petello is a good egg!!!...      5
stars
1     749
2     927
3    1461
4    3526
5    3337
Name: count, dtype: int64


In [3]:
N_SAMPLES = 200

# Draw a fixed-seed random subset so repeated runs evaluate the same reviews,
# then renumber the rows from 0.
df_sample = df.sample(n=N_SAMPLES, random_state=42)
df_sample = df_sample.reset_index(drop=True)
print("Sample size:", len(df_sample))
df_sample.head()

Sample size: 200


Unnamed: 0,text,stars
0,We got here around midnight last Friday... the...,4
1,Brought a friend from Louisiana here. She say...,5
2,"Every friday, my dad and I eat here. We order ...",3
3,"My husband and I were really, really disappoin...",1
4,Love this place! Was in phoenix 3 weeks for w...,5


In [4]:
def call_llm(prompt: str, model: str = MODEL_NAME, max_output_tokens: int = 256) -> str:
    """
    Send one prompt through the shared OpenAI client and return the reply text.

    Args:
        prompt: Full prompt text, passed as the request `input`.
        model: Model identifier; defaults to MODEL_NAME.
        max_output_tokens: Value forwarded as the request's `max_output_tokens`.

    Returns:
        The `output_text` attribute of the response object.
    """
    request_kwargs = {
        "model": model,
        "input": prompt,
        "max_output_tokens": max_output_tokens,
    }
    return client.responses.create(**request_kwargs).output_text


def parse_json_from_text(raw_text: str) -> Optional[Dict[str, Any]]:
    """
    Parse a model reply into a JSON object, tolerating a markdown code fence.

    Surrounding whitespace is stripped. If the reply begins with a code fence,
    the first fenced chunk that looks like a JSON object is used; a language
    tag following the opening fence (such as "json") is skipped.

    Args:
        raw_text: Raw reply text, or None.

    Returns:
        The decoded dict, or None when the input is None, is not valid JSON,
        or decodes to something other than a JSON object (e.g. a list).
    """
    if raw_text is None:
        return None

    text = raw_text.strip()

    if text.startswith("```"):
        for part in text.split("```"):
            part = part.strip()
            # The opening fence is usually followed by a language tag, so the
            # chunk reads 'json\n{...}'. Skip a single alphanumeric word that
            # sits before the first brace; anything longer is left untouched.
            brace_at = part.find("{")
            if brace_at > 0 and part[:brace_at].strip().isalnum():
                part = part[brace_at:]
            if part.startswith("{") and part.endswith("}"):
                text = part
                break

    try:
        obj = json.loads(text)
    except json.JSONDecodeError:
        return None
    # Only JSON objects are useful to callers, which look fields up by key.
    return obj if isinstance(obj, dict) else None


def sanitize_predicted_stars(value: Any) -> Optional[int]:
    """
    Coerce a raw "predicted_stars" value to an integer rating from 1 to 5.

    Numbers and numeric strings are converted with float() and rounded with
    Python's round() (ties go to the even integer).

    Args:
        value: Whatever the model put in the "predicted_stars" field.

    Returns:
        The rating as an int, or None when the value is a bool, is not
        numeric, is NaN or infinite, or falls outside 1-5 after rounding.
    """
    # bool is a subclass of int: without this guard a JSON `true` would be
    # silently scored as a 1-star prediction.
    if isinstance(value, bool):
        return None
    try:
        val = int(round(float(value)))
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric string or NaN. TypeError: None, list, dict.
        # OverflowError: infinity, which json.loads produces for `Infinity`
        # and for oversized literals such as 1e999.
        return None
    return val if 1 <= val <= 5 else None


In [6]:
def build_prompt_v1(review_text: str) -> str:
    """
    Build prompt V1: a zero-shot request with a short rating guide.

    The review is embedded between triple double-quotes, and the model is asked
    to answer with a JSON object holding "predicted_stars" and "explanation".
    """
    prompt_lines = [
        "",
        "You are an assistant that reads Yelp reviews and predicts a star rating from 1 to 5.",
        "",
        "Instructions:",
        "- Consider sentiment, tone, and how satisfied the customer sounds.",
        "- Rating guide:",
        "  1 = extremely negative",
        "  2 = negative",
        "  3 = mixed / neutral",
        "  4 = positive",
        "  5 = very positive / delighted",
        "",
        "Return your answer as valid JSON in the following format:",
        "{",
        '  "predicted_stars": <integer 1-5>,',
        '  "explanation": "<brief reasoning>"',
        "}",
        "",
        "Review:",
        f'"""{review_text}"""',
        "",
        "Now respond with JSON only.",
        "",
    ]
    return "\n".join(prompt_lines)


In [7]:
def build_prompt_v2(review_text: str) -> str:
    """
    Build prompt V2: a per-star rubric plus stricter output guidelines.

    The prompt shows a sample JSON answer as the output format and embeds the
    review between triple double-quotes.
    """
    prompt_lines = [
        "",
        "You are a careful Yelp rating assistant.",
        "",
        "Task:",
        "Given a single Yelp review, assign a star rating from 1 to 5 and briefly explain your reasoning.",
        "",
        "Rating rubric:",
        "- 1 star: Very negative. Strong complaints, no redeeming qualities.",
        "- 2 stars: Negative overall, but with at least one small positive aspect.",
        "- 3 stars: Mixed / neutral. Clear balance of pros and cons, or vague sentiment.",
        "- 4 stars: Positive overall, minor issues only.",
        "- 5 stars: Very positive / enthusiastic, would strongly recommend.",
        "",
        "Guidelines:",
        "- Focus on sentiment about the overall experience (food, service, ambiance, price).",
        "- If the sentiment is unclear or conflicting, choose 3 stars.",
        "- Be consistent with the rubric above.",
        "- Output MUST be valid JSON, with NO extra commentary or text.",
        "",
        "Output format:",
        "{",
        '  "predicted_stars": 4,',
        '  "explanation": "The review is mostly positive with minor complaints."',
        "}",
        "",
        "Review:",
        f'"""{review_text}"""',
        "",
        "Now output the JSON only.",
        "",
    ]
    return "\n".join(prompt_lines)


In [8]:
# (review, stars, explanation) triples rendered into the few-shot section of
# prompt V3: one very negative, one mixed, and one very positive review.
_FEW_SHOT_CASES = [
    (
        "I waited 40 minutes for cold food. The server was rude and never apologized. Worst place I've been.",
        1,
        "The review is very negative about both food and service.",
    ),
    (
        "Food was okay, nothing special. Service was a bit slow but not terrible. Might come back if nearby.",
        3,
        "The review is mixed with both mild positives and negatives.",
    ),
    (
        "Amazing tacos and super friendly staff! Fast service and great atmosphere. Highly recommended.",
        5,
        "The sentiment is very positive and enthusiastic.",
    ),
]

# Each example is rendered as a numbered header, the quoted review, and the
# expected answer as 2-space-indented JSON; examples are separated by a blank
# line and the whole text starts with a newline.
FEW_SHOT_EXAMPLES = "\n" + "\n".join(
    f'Example {number}:\nReview:\n"{review}"\nJSON:\n'
    + json.dumps({"predicted_stars": stars, "explanation": explanation}, indent=2)
    + "\n"
    for number, (review, stars, explanation) in enumerate(_FEW_SHOT_CASES, start=1)
)

def build_prompt_v3(review_text: str) -> str:
    """
    Build prompt V3: rubric, strict JSON-only rules, and few-shot examples.

    The module-level FEW_SHOT_EXAMPLES text is inserted ahead of the review to
    classify, which is embedded between triple double-quotes at the end.

    Args:
        review_text: Review to classify.

    Returns:
        The complete prompt string.
    """
    # The sentence-count range below uses a plain ASCII hyphen. The earlier
    # text carried a mis-decoded en dash, which reached the model as garbage.
    return f"""
You are a Yelp star rating model. Your job is to output ONLY a JSON object with two fields:
- "predicted_stars": an integer from 1 to 5
- "explanation": a short explanation (1-2 sentences)

Rating rubric:
1 = extremely negative, severe problems
2 = negative overall, but with some minor positives
3 = mixed or neutral sentiment
4 = positive overall with minor issues
5 = strongly positive, enthusiastic recommendation

If the sentiment is unclear, choose 3.

You MUST:
- Follow the rubric consistently.
- Output valid JSON with double quotes around keys and string values.
- Do NOT include any extra text before or after the JSON.
- Do NOT wrap the JSON in markdown code fences.

{FEW_SHOT_EXAMPLES}

Now classify this new review and respond with JSON only.

Review:
\"\"\"{review_text}\"\"\"
"""

In [None]:
@dataclass
class EvalResult:
    """Aggregate metrics for one prompt version over an evaluation sample."""

    # Label identifying the prompt variant.
    prompt_version: str
    # Number of reviews in the evaluated subset.
    n_samples: int
    # Exact-match rate between true and predicted stars.
    accuracy: float
    # Mean absolute error between true and predicted stars.
    mae: float
    # Share of model replies that parsed as a JSON object.
    json_valid_rate: float


def evaluate_prompt_version(
    df_subset: pd.DataFrame,
    build_prompt_fn,
    prompt_name: str,
    model: str = MODEL_NAME,
) -> Tuple[EvalResult, pd.DataFrame]:
    """
    Run one prompt variant over every review in `df_subset` and score it.

    For each row the prompt is built, sent through call_llm, parsed with
    parse_json_from_text, and its "predicted_stars" field is sanitized.

    Args:
        df_subset: Reviews to evaluate; must contain TEXT_COL and STAR_COL.
        build_prompt_fn: Callable mapping a review text to a prompt string.
        prompt_name: Label stored in the summary and in the results frame.
        model: Model identifier forwarded to call_llm.

    Returns:
        (summary, results_df): summary is an EvalResult; results_df is a copy
        of `df_subset` with true_stars, predicted_stars, json_valid,
        raw_output and prompt_version columns added.

    Notes:
        accuracy and mae are computed only over reviews that yielded a usable
        1-5 prediction. When there are none, accuracy is 0.0 and mae is NaN
        (undefined), since 0.0 would read as a perfect score.
    """
    y_true: List[int] = []
    y_pred: List[Optional[int]] = []
    json_valid_flags: List[bool] = []
    raw_outputs: List[str] = []

    for _, row in tqdm(df_subset.iterrows(), total=len(df_subset), desc=f"Evaluating {prompt_name}"):
        review = row[TEXT_COL]
        true_stars = int(row[STAR_COL])

        raw_text = call_llm(build_prompt_fn(review), model=model)
        raw_outputs.append(raw_text)

        json_obj = parse_json_from_text(raw_text)
        json_valid_flags.append(json_obj is not None)
        # A reply can be valid JSON and still lack a usable rating; in that
        # case json_valid stays True while the prediction is None.
        if json_obj is None:
            pred_stars = None
        else:
            pred_stars = sanitize_predicted_stars(json_obj.get("predicted_stars"))

        y_true.append(true_stars)
        y_pred.append(pred_stars)

    # Score only the rows that produced a usable prediction.
    scored_pairs = [(t, p) for t, p in zip(y_true, y_pred) if p is not None]
    if scored_pairs:
        y_true_valid = [t for t, _ in scored_pairs]
        y_pred_valid = [p for _, p in scored_pairs]
        accuracy = float(accuracy_score(y_true_valid, y_pred_valid))
        mae = float(mean_absolute_error(y_true_valid, y_pred_valid))
    else:
        accuracy = 0.0
        mae = float("nan")

    # np.mean([]) emits a RuntimeWarning and returns NaN; an empty subset has
    # no valid replies, so report 0.0 directly.
    json_valid_rate = float(np.mean(json_valid_flags)) if json_valid_flags else 0.0

    summary = EvalResult(
        prompt_version=prompt_name,
        n_samples=len(df_subset),
        accuracy=accuracy,
        mae=mae,
        json_valid_rate=json_valid_rate,
    )

    # Per-review detail, kept next to the original columns for inspection.
    results_df = df_subset.copy()
    results_df["true_stars"] = y_true
    results_df["predicted_stars"] = y_pred
    results_df["json_valid"] = json_valid_flags
    results_df["raw_output"] = raw_outputs
    results_df["prompt_version"] = prompt_name

    return summary, results_df

In [10]:
# Prompt builders under comparison, each paired with the label used in results.
prompt_versions = [
    (build_prompt_v1, "V1_simple_zero_shot"),
    (build_prompt_v2, "V2_rubric"),
    (build_prompt_v3, "V3_few_shot_strict_json"),
]

# Create the output folder before spending time on API calls, so a filesystem
# problem surfaces immediately rather than after the evaluation has finished.
os.makedirs("outputs", exist_ok=True)

summaries = []
all_results = []

for builder, name in prompt_versions:
    summary, res_df = evaluate_prompt_version(df_sample, builder, name)
    summaries.append(summary)
    all_results.append(res_df)

    # Checkpoint after every prompt version. The evaluation is slow, and
    # saving only at the very end meant an interruption or API error part-way
    # through left nothing on disk. The final write holds all predictions.
    results_df_all = pd.concat(all_results, ignore_index=True)
    results_df_all.to_csv("outputs/task1_yelp_llm_results.csv", index=False)

Evaluating V1_simple_zero_shot:   1%|          | 2/200 [00:09<15:08,  4.59s/it]


KeyboardInterrupt: 