In [2]:
import time
import pandas as pd
from typing import List
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field

In [3]:
# =========================
# 1. Schema & Parser
# =========================
class ReviewAnalysis(BaseModel):
    """Structured verdict the model is asked to return for a single review."""

    # ge/le put the documented 1-5 range into the schema itself instead of
    # leaving it only in the free-text description.
    predicted_stars: int = Field(
        description="The star rating from 1 to 5",
        ge=1,
        le=5,
    )
    explanation: str = Field(description="Detailed reasoning for the score")

parser = JsonOutputParser(pydantic_object=ReviewAnalysis)

# The instructions embed a JSON schema full of literal braces. Doubling every
# brace keeps the prompt template from reading them as input variables.
_brace_escapes = str.maketrans({"{": "{{", "}": "}}"})
_raw_format_instructions = parser.get_format_instructions()
format_instructions = _raw_format_instructions.translate(_brace_escapes)

In [4]:
# =========================
# 2. Data Preparation
# =========================
# Columns the evaluation loop reads from every row.
required_columns = {"text", "stars"}

try:
    df = pd.read_csv("yelp.csv")
except FileNotFoundError:
    # Synthetic data for immediate testing if CSV isn't present, so the
    # downstream cells still have a `test_df` to work with.
    print("CSV file not found!!!. Using a small synthetic sample instead.")
    df = pd.DataFrame(
        {
            "text": [
                "Terrible. Cold food, rude staff, and we waited an hour.",
                "Not great. The burger was dry and the place was noisy.",
                "It was okay. Nothing special, nothing wrong.",
                "Good food and friendly service, we will come back.",
                "Absolutely incredible! Best dinner I have had in years.",
            ],
            "stars": [1, 2, 3, 4, 5],
        }
    )

# Fail here with a clear message rather than deep inside the evaluation loop.
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(f"Review data is missing required column(s): {sorted(missing_columns)}")

# Cap the sample size: DataFrame.sample raises when n exceeds the row count.
test_df = df.sample(n=min(5, len(df)), random_state=42).reset_index(drop=True)

In [5]:
# =========================
# 3. Initialize LLM
# =========================
# Single shared chat model for every strategy. temperature is pinned to 0,
# presumably so repeated runs stay comparable (confirm against the Ollama docs).
llm = ChatOllama(
    temperature=0,
    model="llama3.2",
)


In [None]:
# =========================
# 4. Detailed & Proper Strategies 
# =========================

# 1. Few-Shot (Boundary Focused)
# Two worked examples contrast a merely positive review (4 stars) with a
# superlative one (5 stars). Only the 4-vs-5 boundary is demonstrated here;
# no low-score (1-vs-2) example is included.
_few_shot_system = f"You are a Yelp calibration expert. Your goal is to distinguish between 'Good' and 'Exceptional'. {format_instructions}"

# Alternating human/ai turns; the ai turns use doubled braces so the template
# renders them as literal JSON.
_few_shot_examples = [
    ("human", "Review: Great food, we liked the pasta. Service was fine."),
    ("ai", '{{"predicted_stars": 4, "explanation": "A positive review without superlatives or mentions of going above-and-beyond is a 4, not a 5."}}'),
    ("human", "Review: This is the best meal I have ever had! Absolutely flawless service."),
    ("ai", '{{"predicted_stars": 5, "explanation": "Uses extreme superlatives and indicates a perfect experience."}}'),
]

prompt_few_shot = ChatPromptTemplate.from_messages(
    [
        ("system", _few_shot_system),
        *_few_shot_examples,
        # The review under evaluation is always the final human turn.
        ("human", "Review: {text}"),
    ]
)

# 2. Chain-of-Thought (The "Deductive" Approach)
# The numbered steps make the model list complaints before it scores, so any
# complaint rules out a 5. The original note cites a 65% positivity bias as
# the motivation; that figure is not derived anywhere in this notebook.
_cot_system = f"You are a critical reviewer. Don't be afraid to give lower scores if they are earned. {format_instructions}"

_cot_human = """Review: {text}

Analyze the review following this logic:
Step 1: Identify any specific 'Points of Friction' (slowness, price, noise, cold food).
Step 2: If there are ANY points of friction, the score cannot be a 5.
Step 3: If the experience was 'fine' but lacked excitement, it is a 3 or 4.
Step 4: Only assign a 5 if the reviewer expresses genuine delight with NO complaints.

Provide your reasoning in the 'explanation' field."""

prompt_cot = ChatPromptTemplate.from_messages(
    [
        ("system", _cot_system),
        ("human", _cot_human),
    ]
)

# 3. Discriminative Emotion (Intensity Scaling)
# Scores by how strong the wording is; the rubric only spells out what
# separates a 4-star tone from a 5-star tone.
_emotion_system = f"You are an expert in sentiment nuance. {format_instructions}"

_emotion_human = """Review: {text}

Compare the intensity of the sentiment:
- 4 Stars (Satisfied): Words like 'good', 'nice', 'standard', 'will come back'.
- 5 Stars (Thrilled): Words like 'obsessed', 'amazing', 'incredible', 'favorite'.

Assign a score based on the 'vibrancy' of the language."""

prompt_emotion = ChatPromptTemplate.from_messages(
    [
        ("system", _emotion_system),
        ("human", _emotion_human),
    ]
)

# Display name -> prompt template, in the order the strategies are evaluated.
strategies = dict(
    [
        ("Few-Shot", prompt_few_shot),
        ("Chain-of-Thought", prompt_cot),
        ("Emotion-Weighted", prompt_emotion),
    ]
)

In [7]:
# =========================
# 5. Evaluation Function
# =========================
def _coerce_star_rating(value):
    """Return ``value`` as an int star rating in 1..5, or None if it is not one."""
    # bool is a subclass of int; True/False are never a legitimate rating.
    if isinstance(value, bool):
        return None
    try:
        as_float = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    # Rejects NaN, infinities and fractional ratings such as 4.5.
    if not as_float.is_integer():
        return None
    stars = int(as_float)
    return stars if 1 <= stars <= 5 else None


def evaluate_strategy(prompt, test_df, llm):
    """Run one prompt strategy over ``test_df`` and summarise how it did.

    Args:
        prompt: Prompt template piped into ``llm`` and the module-level
            ``parser``; it is invoked with a single ``text`` variable.
        test_df: DataFrame with a ``text`` column (review body) and a
            ``stars`` column (ground-truth rating).
        llm: Chat model placed between the prompt and the parser.

    Returns:
        dict with three float entries:
            ``json_validity_pct``  - share of rows whose output parsed and
                carried a usable ``predicted_stars`` (an integer 1-5, also
                accepted as a numeric string such as ``"4"``).
            ``exact_accuracy_pct`` - share of those usable rows whose
                prediction equals the actual rating.
            ``avg_latency_sec``    - mean wall-clock seconds per row.
        All three are 0.0 when ``test_df`` has no rows.
    """
    # The chain now includes the parser at the end
    chain = prompt | llm | parser

    results_list = []

    for _, row in test_df.iterrows():
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        predicted_val = None

        try:
            # We only pass 'text' because it's the only single-brace variable left
            response = chain.invoke({"text": row["text"]})
            # The parsed value may arrive as a string, be missing, or fall
            # outside 1-5; normalise it so it compares correctly with the label.
            predicted_val = _coerce_star_rating(response.get("predicted_stars"))
            if predicted_val is None:
                print(f"Unusable 'predicted_stars' in model output: {response!r}")
        except Exception as e:
            # Deliberately broad: one failing row (model or parsing error)
            # must not abort the whole evaluation; it is scored as invalid.
            print(f"Error during chain execution: {e}")

        latency = time.perf_counter() - start_time

        results_list.append({
            "actual": row["stars"],
            "predicted": predicted_val,
            "json_valid": 1 if predicted_val is not None else 0,
            "latency": latency
        })

    # An empty frame would have no columns to aggregate over.
    if not results_list:
        return {
            "json_validity_pct": 0.0,
            "exact_accuracy_pct": 0.0,
            "avg_latency_sec": 0.0
        }

    eval_results = pd.DataFrame(results_list)

    # Metrics: accuracy is measured only over rows with a usable prediction.
    valid = eval_results[eval_results["json_valid"] == 1]
    exact_acc = (valid["actual"] == valid["predicted"]).mean() if not valid.empty else 0

    return {
        "json_validity_pct": float(eval_results["json_valid"].mean() * 100),
        "exact_accuracy_pct": float(exact_acc * 100),
        "avg_latency_sec": float(eval_results["latency"].mean())
    }


In [None]:
# =========================
# 6. Run
# =========================
# Column order for the printed comparison table.
_report_columns = ["strategy", "json_validity_pct", "exact_accuracy_pct", "avg_latency_sec"]

final_results = []
for name, prompt in strategies.items():
    print(f"Running {name}...")
    # Tag each metrics dict with the strategy it came from.
    final_results.append({**evaluate_strategy(prompt, test_df, llm), "strategy": name})

print("\n=== COMPARISON TABLE ===")
print(pd.DataFrame(final_results).loc[:, _report_columns])