In [None]:
import json
import os
import time
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
from google import genai

# --- Configuration & Initialization ---
# The API key must come from the execution environment, never from source
# control: a key committed to a notebook is effectively public and should be
# revoked. GEMINI_API_KEY is checked first, then GOOGLE_API_KEY.
_GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _GEMINI_API_KEY:
    raise RuntimeError(
        "No Gemini API key found. Set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
        "environment variable before running this notebook."
    )

# Shared client used by every request in this file.
client = genai.Client(api_key=_GEMINI_API_KEY)
# Model identifier passed to every generate_content call.
MODEL_NAME = "gemini-2.5-flash-preview-09-2025"

def call_gemini_with_retry(prompt: str, retries: int = 5) -> str:
    """Send *prompt* to Gemini, retrying failed calls with exponential backoff.

    Args:
        prompt: Full prompt text passed as the request contents.
        retries: Maximum number of attempts. After a failed attempt ``i``
            (0-based) the function sleeps ``2**i`` seconds before retrying.

    Returns:
        The response text (an empty string if the response carries no text),
        ``"ERROR: Max retries exceeded"`` if every attempt raised, or
        ``"ERROR"`` when ``retries`` is zero or negative and no call is made.
    """
    for attempt in range(retries):
        try:
            response = client.models.generate_content(
                model=MODEL_NAME,
                contents=prompt,
            )
            # The text can be None when the response has no text part;
            # normalise so the declared ``str`` return type always holds.
            return response.text or ""
        except Exception as exc:
            if attempt == retries - 1:
                # Surface the root cause instead of dropping it silently;
                # otherwise an invalid key or quota error only shows up as
                # unexplained JSON-parsing failures downstream.
                print(f"Gemini call failed after {retries} attempt(s): {exc!r}")
                return "ERROR: Max retries exceeded"
            time.sleep(2**attempt)
    return "ERROR"

# --- Prompting Strategies ---

def get_prompt_v1(review_text: str) -> str:
    """Build the baseline prompt: a direct instruction plus the JSON schema.

    The review text is appended verbatim on the final ``Review:`` line.
    """
    prompt_lines = [
        "You are a sentiment analysis assistant.",
        "Task: Classify the following Yelp review into a rating of 1 to 5 stars.",
        "Output: Return only a JSON object in this format:",
        "{",
        '"predicted_stars": <int>,',
        '"explanation": "<string>"',
        "}",
        "",
        f"Review: {review_text}",
    ]
    return "\n".join(prompt_lines)

def get_prompt_v2(review_text: str) -> str:
    """Build the rubric prompt: role, per-star rubric, JSON constraint, review.

    Each section is assembled separately and the sections are joined with a
    blank line between them; the review text is appended verbatim.
    """
    rubric_section = "\n".join((
        "Role: Expert Customer Experience Analyst.",
        "Task: Predict the star rating (1-5) for a Yelp review based on this rubric:",
        '- 1 Star: Disastrous, fundamental failures, "never again."',
        "- 2 Stars: Poor experience with a few redeeming qualities.",
        "- 3 Stars: Average/Mediocre; met basic expectations but didn't impress.",
        "- 4 Stars: Great; minor issues but overall very positive.",
        "- 5 Stars: Exceptional; perfect service/product, highly recommended.",
    ))
    format_section = "\n".join((
        "Constraint: Respond ONLY with valid JSON.",
        "JSON Structure:",
        "{",
        '"predicted_stars": <int>,',
        '"explanation": "Briefly describe the specific sentiment triggers found in the text."',
        "}",
    ))
    review_section = f"Review: {review_text}"
    return "\n\n".join((rubric_section, format_section, review_section))

def get_prompt_v3(review_text: str) -> str:
    """Build the chain-of-thought prompt: list pros, list cons, then rate.

    The fixed instructions are concatenated first and the review text is
    appended verbatim on the final ``Review:`` line.
    """
    instructions = (
        "Analyze the provided Yelp review.\n"
        "1. Identify the key Pros mentioned.\n"
        "2. Identify the key Cons mentioned.\n"
        "3. Determine the final star rating (1-5) based on the balance of these factors.\n"
        "\n"
        "Final Output: You must provide your final answer in JSON format only.\n"
        "{\n"
        '"predicted_stars": <int>,\n'
        '"explanation": "A summary of the Pros/Cons balance that led to this rating."\n'
        "}\n"
        "\n"
    )
    return instructions + f"Review: {review_text}"

# --- Evaluation Logic ---

def clean_json_response(raw_text: str) -> Optional[Dict[str, Any]]:
    """Extract the JSON object contained in a model response.

    Markdown code fences are stripped first. If the remaining text is not
    pure JSON (for example the model added reasoning before or after the
    object), the first substring that decodes as a JSON object is used.

    Args:
        raw_text: Raw text returned by the model.

    Returns:
        The decoded JSON object as a dict, or ``None`` when the input is not
        a string, contains no decodable JSON object, or decodes to a
        non-object value such as a list or a number.
    """
    if not isinstance(raw_text, str):
        return None

    # Remove markdown code fences if present.
    cleaned = raw_text.strip().replace("```json", "").replace("```", "").strip()

    try:
        parsed = json.loads(cleaned)
    except (ValueError, RecursionError):
        parsed = None
    else:
        # Callers use dict methods on the result, so any other JSON value is
        # treated as "no usable object" rather than returned as-is.
        return parsed if isinstance(parsed, dict) else None

    # Fallback: try every "{" as a potential start of an embedded object.
    decoder = json.JSONDecoder()
    start = cleaned.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(cleaned, start)
        except (ValueError, RecursionError):
            start = cleaned.find("{", start + 1)
            continue
        if isinstance(candidate, dict):
            return candidate
        start = cleaned.find("{", start + 1)
    return None

def _coerce_star_rating(value: Any) -> Optional[float]:
    """Convert a model-supplied rating to a number, or None if it is unusable."""
    # bool is a subclass of int; True/False are never a meaningful rating.
    if value is None or isinstance(value, bool):
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    if not np.isfinite(number):
        return None
    # Keep whole ratings as ints so "4", 4 and 4.0 all compare equal to 4.
    return int(number) if number.is_integer() else number


def run_evaluation(df: pd.DataFrame, prompt_func, version_name: str):
    """Evaluate one prompting strategy against labelled reviews.

    For every row the prompt is built from the ``text`` column, sent to the
    model, and the parsed ``predicted_stars`` value is compared with the
    ``stars`` column.

    Args:
        df: Reviews with a ``text`` column and a ``stars`` column.
        prompt_func: Callable that maps a review text to a prompt string.
        version_name: Label used in the log output and in the returned dict.

    Returns:
        A dict with keys ``version``, ``accuracy`` (exact match, percent),
        ``within_one`` (prediction within one star, percent) and
        ``json_validity`` (responses that parsed to a JSON object, percent).
        All three metrics are 0.0 when ``df`` has no rows.
    """
    print(f"\n--- Evaluating {version_name} ---")
    total = len(df)
    results = []

    # Count rows explicitly so progress does not depend on the index labels.
    for count, (_, row) in enumerate(df.iterrows(), start=1):
        prompt = prompt_func(row['text'])
        raw_response = call_gemini_with_retry(prompt)
        parsed = clean_json_response(raw_response)

        # Only a JSON object can carry the expected "predicted_stars" key.
        is_valid_json = isinstance(parsed, dict)
        actual = int(row['stars'])
        predicted = (
            _coerce_star_rating(parsed.get("predicted_stars"))
            if is_valid_json
            else None
        )

        has_prediction = predicted is not None
        results.append({
            "actual": actual,
            "predicted": predicted,
            "is_valid_json": is_valid_json,
            "is_correct": has_prediction and predicted == actual,
            "is_within_one": has_prediction and abs(predicted - actual) <= 1,
        })

        # Simple progress log
        if count % 10 == 0:
            print(f"Processed {count}/{total}")

    # Calculate metrics; explicit columns keep an empty run well-formed.
    res_df = pd.DataFrame(
        results,
        columns=["actual", "predicted", "is_valid_json", "is_correct", "is_within_one"],
    )
    if res_df.empty:
        accuracy = within_one = json_validity = 0.0
    else:
        accuracy = float(res_df['is_correct'].mean() * 100)
        within_one = float(res_df['is_within_one'].mean() * 100)
        json_validity = float(res_df['is_valid_json'].mean() * 100)

    print(f"Results for {version_name}:")
    print(f" - Accuracy: {accuracy:.1f}%")
    print(f" - +/- 1 Star Accuracy: {within_one:.1f}%")
    print(f" - JSON Validity: {json_validity:.1f}%")

    return {
        "version": version_name,
        "accuracy": accuracy,
        "within_one": within_one,
        "json_validity": json_validity
    }

# --- Main Execution ---

if __name__ == "__main__":
    # Note: Replace with actual path to your kaggle 'yelp.csv'
    # For this demo, we assume the dataset has 'text' and 'stars' columns
    data_path = "/yelp.csv"
    sample_size = 200

    try:
        data = pd.read_csv(data_path)
    except FileNotFoundError:
        # Report the path that was actually tried so the message matches the code.
        print(f"Error: '{data_path}' not found. Falling back to a small mock dataset.")
        # Mock data for demonstration if file is missing
        sample_df = pd.DataFrame({
            "text": ["The food was great but the service was slow.", "Terrible experience!", "Best pizza in town."],
            "stars": [4, 1, 5]
        })
    else:
        # Sample for efficiency; cap at the dataset size because sampling
        # more rows than exist (without replacement) raises ValueError.
        sample_df = data.sample(
            n=min(sample_size, len(data)),
            random_state=42,
        ).reset_index(drop=True)

    # Run each prompting strategy on the same sample so results are comparable.
    strategies = (
        (get_prompt_v1, "V1: Direct"),
        (get_prompt_v2, "V2: Rubric"),
        (get_prompt_v3, "V3: CoT"),
    )
    evaluations = []
    for prompt_func, label in strategies:
        evaluations.append(run_evaluation(sample_df, prompt_func, label))

    print("\n" + "="*30)
    print("FINAL COMPARISON TABLE")
    print("="*30)
    summary = pd.DataFrame(evaluations)
    print(summary)


--- Evaluating V1: Direct ---
Processed 10/200
Processed 20/200
Processed 30/200
Processed 40/200
Processed 50/200
Processed 60/200
Processed 70/200
Processed 80/200
Processed 90/200
Processed 100/200
Processed 110/200
Processed 120/200
Processed 130/200
Processed 140/200
Processed 150/200
Processed 160/200
Processed 170/200
Processed 180/200
Processed 190/200
Processed 200/200
Results for V1: Direct:
 - Accuracy: 5.5%
 - +/- 1 Star Accuracy: 10.5%
 - JSON Validity: 10.5%

--- Evaluating V2: Rubric ---
Processed 10/200
Processed 20/200
Processed 30/200
Processed 40/200
Processed 50/200
Processed 60/200
Processed 70/200
Processed 80/200
Processed 90/200
Processed 100/200
Processed 110/200
Processed 120/200
Processed 130/200
Processed 140/200
Processed 150/200
Processed 160/200
Processed 170/200
Processed 180/200
Processed 190/200
Processed 200/200
Results for V2: Rubric:
 - Accuracy: 0.0%
 - +/- 1 Star Accuracy: 0.0%
 - JSON Validity: 0.0%

--- Evaluating V3: CoT ---
Processed 10/200
P

In [None]:
# Colab-only cell: google.colab is importable only inside a Google Colab runtime.
from google.colab import drive
# Mount at /content/drive; presumably this exposes the user's Google Drive so
# the dataset can be read from there (verify the CSV path used by the script).
drive.mount('/content/drive')