In [1]:
!pip install -q google-genai

import getpass
import json
import os
import re

import numpy as np
import pandas as pd
from google import genai


In [32]:

# SECURITY: an API key must never be written into the notebook source.
# The key that was previously hardcoded here has been shared with every reader
# of this file and should be treated as compromised and revoked.
# The key is now taken from the GEMINI_API_KEY environment variable; when that
# is missing or empty, the user is prompted for it without echoing the input.
if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass.getpass("Enter your Gemini API key: ")

# Client() is called without arguments, so it relies on the environment
# variable set above for authentication.
client = genai.Client()

# Model identifier passed to every generate_content call in this notebook.
MODEL_NAME = "gemini-2.5-flash"


In [9]:

csv_path = "/kaggle/input/yelp-reviews-dataset/yelp.csv"

# Load the reviews, keep only the two columns used below, and drop rows where
# either of them is missing.
df = pd.read_csv(csv_path).loc[:, ["text", "stars"]].dropna()

# Draw a fixed-seed sample of 200 reviews and renumber it 0..199 so the row
# label can double as a position.
sample_df = df.sample(n=200, random_state=42).reset_index(drop=True)
sample_df.head()


Unnamed: 0,text,stars
0,We got here around midnight last Friday... the...,4
1,Brought a friend from Louisiana here. She say...,5
2,"Every friday, my dad and I eat here. We order ...",3
3,"My husband and I were really, really disappoin...",1
4,Love this place! Was in phoenix 3 weeks for w...,5


In [10]:
def call_gemini(prompt: str) -> str:
    """Send ``prompt`` to the configured Gemini model and return its text.

    Uses the module-level ``client`` and ``MODEL_NAME``. The ``text``
    attribute of the response is returned unchanged.
    """
    request = {"model": MODEL_NAME, "contents": prompt}
    reply = client.models.generate_content(**request)
    return reply.text


def extract_json(text: str):
    """
    Extract a JSON *object* (a dict) from a model response.

    The whole text is tried first. If that is not a JSON object, the text is
    scanned for the first ``{`` from which a complete JSON object can be
    decoded, so replies wrapped in prose or Markdown code fences, or followed
    by stray braces, are still handled.

    Args:
        text: Raw model output. ``None`` and non-text values are rejected.

    Returns:
        ``(data, is_valid_json)``. ``data`` is a ``dict`` when
        ``is_valid_json`` is ``True``; otherwise it is ``(None, False)``.
        JSON scalars and arrays are reported as invalid because callers read
        keys from the result with ``.get``.
    """
    if isinstance(text, (bytes, bytearray)):
        text = text.decode("utf-8", errors="replace")
    if not isinstance(text, str):
        return None, False

    # Fast path: the entire response is the JSON payload.
    try:
        parsed = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        parsed = None
    if isinstance(parsed, dict):
        return parsed, True

    # Slow path: try to decode an object starting at each '{' in turn.
    # raw_decode stops at the end of the first complete value, so trailing
    # text (including unrelated braces) does not break parsing.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except ValueError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate, True
        start = text.find("{", start + 1)

    return None, False


In [11]:
# Preamble for the few-shot strategy: a role instruction followed by five
# worked examples (one review each for 5, 4, 2, 3 and 1 stars) and a closing
# instruction. build_few_shot_prompt inserts this text ahead of the review to
# rate.
# NOTE(review): the prompt says the examples come "from the dataset" - confirm
# they cannot also appear in the evaluation sample, which would leak labels.
FEW_SHOT_EXAMPLES = """
You are an expert Yelp review rater. 
Your job is to assign an integer star rating from 1 to 5 based ONLY on the review text.

Here are some examples from the dataset:

Example 1:
Review:
"My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.

Do yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I've ever had.  I'm pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.

While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I've ever had.

Anyway, I can't wait to go back!"
Stars: 5

Example 2:
Review:
"love the gyro plate. Rice is so good and I also dig their candy selection :)"
Stars: 4

Example 3:
Review:
"Was it worth the 21$ for a salad and small pizza? Absolutely not! Bad service. Maybe the guys grandma died I don't know. I want to tell you what really made me mad about the experience. We order the small pizza and salad and the guys could have cared less and took our $ and we sat down. We were looking around and hmm, there's a sign saying "x large pizza and large salad only 23$". Wow that would have been nice if the guy told us that. I left hungry, mad and unsatisfied. 

To the owner: teach your employees the value of upselling and telling the specials. Something so small can affect a customers experience negatively. 

And your salads are severely overpriced 

Won't go back unless I'm desperate."
Stars: 2

Example 4:
Review:
"We went here on a Saturday afternoon and this place was incredibly empty.  They had brunch specials going on, including $2 bloody mary's and mimosas, but we were more in the mood for lunch.  Except for the bloody mary, I had to try one.  It came out in a high-ball-sized glass.  Boo!  But it was really tasty. Yay!  The hubby remembered a sign outside the restaurant a few weeks back that said they had Arrogant Bastard, and he got a 22 oz bottle for $4.75.  Hey, that's not fair!!

Next up: the wings.  We were a bit hesitant to order them when the waitress informed us that they are "seasoned" but not sauced, so they can't be ordered hot.  We did ask for them crispy though, and the waitress even asked the cooks to throw them back in for a few minutes when they came out not visibly crispy.  These non-traditional wings were actually pretty damn good.  The seasoning was a little spicy and salty with just a hint of sweet.  If I were in the mood for the tang and kick of Frank's Hot Sauce, these wouldn't cut it, but otherwise they were good enough to go back again for.

My entree was the Tilapia salad, and I was a bit disappointed.  The fish was a bit dry and uninspired. And the greens underneath were overdressed and wilted.  I ate the greens around the fish and picked out the almonds and Mandarin oranges, but I had to leave the mush hiding underneath the fish.

It wasn't bad enough to say I wouldn't go back, but I won't be anxiously awaiting my next trip."
Stars: 3

Example 5:
Review:
"U can go there n check the car out. If u wanna buy 1 there? That's wrong move! If u even want a car service from there? U made a biggest mistake of ur life!! I had 1 time asked my girlfriend to take my car there for an oil service, guess what? They ripped my girlfriend off by lying how bad my car is now. If without fixing the problem. Might bring some serious accident. Then she did what they said. 4 brand new tires, timing belt, 4 new brake pads. U know why's the worst? All of those above I had just changed 2 months before!!! What a trashy dealer is that? People, better off go somewhere!"
Stars: 1

Now, based on the patterns above, rate the following review from 1 to 5 stars.
"""

def build_few_shot_prompt(review_text: str) -> str:
    """Assemble the few-shot prompt for one review.

    The prompt is the ``FEW_SHOT_EXAMPLES`` preamble, then the review under a
    "Review to rate:" heading, then the instruction to answer with a single
    JSON object holding ``predicted_stars`` and ``explanation``.
    """
    prompt_lines = [
        "",
        f"{FEW_SHOT_EXAMPLES}",
        "",
        "Review to rate:",
        f"{review_text}",
        "",
        "You MUST respond with a single valid JSON object in exactly this format:",
        "{",
        '  "predicted_stars": <integer from 1 to 5>,',
        '  "explanation": "Brief reasoning for the assigned rating."',
        "}",
        "",
        "Do not include any text outside of this JSON object.",
        "",
    ]
    return "\n".join(prompt_lines)


In [12]:
# Preamble for the rubric strategy: a one-line description of what each star
# level from 1 to 5 means, plus an instruction to judge only the review text.
# build_rubric_prompt inserts this text ahead of the review to rate.
RUBRIC_TEXT = """
You are scoring Yelp reviews using the following rubric:

1 star: Extremely negative. Strong dissatisfaction, serious problems, regret, or 'never coming back'.
2 stars: Mostly negative. Several issues or clear frustration, but maybe a few minor positives.
3 stars: Mixed or neutral. Balanced pros and cons, 'it was okay', or 'not great but not terrible'.
4 stars: Mostly positive. Good experience overall with minor issues or suggestions.
5 stars: Extremely positive. Strong enthusiasm, clear recommendation, 'best ever', 'can't wait to return'.

Always focus ONLY on the content of the review text, not what you think might have happened.
"""

def build_rubric_prompt(review_text: str) -> str:
    """Assemble the rubric-based prompt for one review.

    The prompt is the ``RUBRIC_TEXT`` preamble, then the review under a
    "Review to rate:" heading, then the instruction to apply the rubric and
    answer with a single JSON object holding ``predicted_stars`` and
    ``explanation``.
    """
    prompt_lines = [
        "",
        f"{RUBRIC_TEXT}",
        "",
        "Review to rate:",
        f"{review_text}",
        "",
        "Using the rubric, assign a rating from 1 to 5.",
        "",
        "You MUST respond with a single valid JSON object in exactly this format:",
        "{",
        '  "predicted_stars": <integer from 1 to 5>,',
        '  "explanation": "Brief reasoning for the assigned rating based on the rubric."',
        "}",
        "",
        "Do not include any text outside of this JSON object.",
        "",
    ]
    return "\n".join(prompt_lines)


In [13]:
def build_cot_prompt(review_text: str) -> str:
    """Assemble the chain-of-thought style prompt for one review.

    The model is told to reason privately about sentiment direction, its
    strength, and the mapping to a 1-5 scale, and then to answer only with a
    JSON object holding ``predicted_stars`` and ``explanation``.
    """
    prompt_lines = [
        "",
        "You are an expert sentiment analyst for Yelp reviews.",
        "",
        "First, internally think step-by-step about:",
        "- The main sentiment in the review (positive, negative, mixed)",
        "- The strength of that sentiment (mild, moderate, strong)",
        "- How that maps onto a 1 to 5 star scale.",
        "",
        "Then, after you finish reasoning, respond ONLY with a single valid JSON object.",
        "Do not show your step-by-step reasoning directly, just reflect it in the explanation text.",
        "",
        "Review to rate:",
        f"{review_text}",
        "",
        "Respond ONLY in this JSON format:",
        "{",
        '  "predicted_stars": <integer from 1 to 5>,',
        '  "explanation": "Brief reasoning for the assigned rating."',
        "}",
        "",
    ]
    return "\n".join(prompt_lines)


In [33]:
import time
import pandas as pd
import os 


# Minimum number of seconds between two consecutive model calls.
# NOTE(review): presumably chosen to stay under the API's rate limit - confirm.
MIN_DELAY = 7

# Monotonic timestamp of the most recent call attempt; None until the first call.
last_call_time = None


def call_model_throttled(prompt):
    """Call ``call_gemini`` while keeping at least ``MIN_DELAY`` seconds between calls.

    Args:
        prompt: Prompt text forwarded unchanged to ``call_gemini``.

    Returns:
        Whatever ``call_gemini(prompt)`` returns.

    Raises:
        Any exception raised by ``call_gemini`` propagates unchanged.
    """
    global last_call_time
    if last_call_time is not None:
        # time.monotonic() cannot jump backwards or forwards when the system
        # clock is adjusted, so the measured gap is always a real duration.
        remaining = MIN_DELAY - (time.monotonic() - last_call_time)
        if remaining > 0:
            time.sleep(remaining)
    try:
        return call_gemini(prompt)
    finally:
        # Record the attempt even when it fails, so that a retry after an
        # error is spaced out like any other call.
        last_call_time = time.monotonic()

def safe_int_star(value):
    """Convert a model-supplied rating to an integer star value, or ``None``.

    Accepted: integers and integral floats in 1..5, and strings that ``int``
    can parse (for example ``"4"`` or ``" 4 "``).
    Rejected (``None``): booleans, non-integral or non-finite floats, values
    outside 1..5, and anything ``int`` cannot convert.

    Args:
        value: The raw ``predicted_stars`` value taken from the parsed JSON.

    Returns:
        An ``int`` between 1 and 5 inclusive, or ``None`` when the value is
        not a valid whole-star rating.
    """
    # bool is a subclass of int; without this check True would become 1 star.
    if isinstance(value, bool):
        return None
    # int() truncates, so 4.7 would silently be scored as 4. is_integer() is
    # also False for nan and inf.
    if isinstance(value, float) and not value.is_integer():
        return None
    try:
        stars = int(value)
    except (TypeError, ValueError, OverflowError):
        return None
    return stars if 1 <= stars <= 5 else None

# Checkpoint file holding the rows that have already been evaluated.
RESULTS_FILE = "partial_results.csv"

if not os.path.exists(RESULTS_FILE):
    # No checkpoint on disk: begin with an empty result list.
    results, processed_count = [], 0
    print("STARTING NEW JOB: No previous results found.")
else:
    # Reload the rows written by an earlier run; their count is the number of
    # sample rows to skip.
    existing_df = pd.read_csv(RESULTS_FILE)
    results = existing_df.to_dict(orient="records")
    processed_count = len(results)
    print(f"RESUMING JOB: Found {processed_count} completed rows in {RESULTS_FILE}.")




# Prompting strategies under comparison, as (result-column prefix, prompt builder).
# The prefixes produce the columns few_pred / few_json_valid,
# rubric_pred / rubric_json_valid and cot_pred / cot_json_valid.
PROMPT_STRATEGIES = (
    ("few", build_few_shot_prompt),
    ("rubric", build_rubric_prompt),
    ("cot", build_cot_prompt),
)

# Number of completed rows between two checkpoint writes.
CHECKPOINT_EVERY = 20


def _save_checkpoint():
    """Write every completed row to RESULTS_FILE so a later run can resume."""
    pd.DataFrame(results).to_csv(RESULTS_FILE, index=False)


try:
    for index, row in sample_df.iterrows():
        # Rows below processed_count were restored from the checkpoint file.
        if index < processed_count:
            continue

        review_text = str(row["text"])
        actual_stars = int(row["stars"])

        row_result = {
            "text": review_text,
            "actual_stars": actual_stars,
        }

        for prefix, build_prompt in PROMPT_STRATEGIES:
            raw_reply = call_model_throttled(build_prompt(review_text))
            parsed, is_valid = extract_json(raw_reply)
            # Only a JSON object can carry "predicted_stars"; any other parsed
            # value (list, number, string) is recorded as invalid instead of
            # crashing the run on .get().
            is_valid = bool(is_valid) and isinstance(parsed, dict)
            prediction = safe_int_star(parsed.get("predicted_stars")) if is_valid else None

            row_result[f"{prefix}_pred"] = prediction
            row_result[f"{prefix}_json_valid"] = is_valid

        # A row is appended only after all three strategies have answered, so
        # `results` never contains a partially evaluated review.
        results.append(row_result)

        if len(results) % CHECKPOINT_EVERY == 0:
            _save_checkpoint()
finally:
    # Also persist on errors, interrupts and at the normal end of the loop, so
    # rows completed since the last periodic checkpoint are not lost.
    if results:
        _save_checkpoint()


eval_df = pd.DataFrame(results)
print("Processing complete.")
eval_df.head()

RESUMING JOB: Found 120 completed rows in partial_results.csv.
Processing complete.


Unnamed: 0,text,actual_stars,few_pred,few_json_valid,rubric_pred,rubric_json_valid,cot_pred,cot_json_valid
0,We got here around midnight last Friday... the...,4,4,True,5,True,5.0,True
1,Brought a friend from Louisiana here. She say...,5,5,True,5,True,5.0,True
2,"Every friday, my dad and I eat here. We order ...",3,4,True,4,True,5.0,True
3,"My husband and I were really, really disappoin...",1,1,True,1,True,1.0,True
4,Love this place! Was in phoenix 3 weeks for w...,5,5,True,5,True,5.0,True


In [34]:
# Rebuild the evaluation frame from the accumulated per-review result records.
eval_df = pd.DataFrame(data=results)
def compute_metrics(pred_col, json_valid_col, df=None):
    """Summarise one prompting strategy's predictions.

    Args:
        pred_col: Name of the column holding predicted stars (missing values
            mark reviews without a usable prediction).
        json_valid_col: Name of the boolean column recording whether the
            model's reply parsed as JSON.
        df: Frame to score. It must contain ``pred_col``, ``json_valid_col``
            and ``actual_stars``. Defaults to the notebook-level ``eval_df``
            so existing two-argument calls keep working.

    Returns:
        A dict with:
        - ``n_scored``: number of rows with a non-missing prediction.
        - ``accuracy``: share of scored rows where prediction equals
          ``actual_stars`` (NaN when nothing was scored).
        - ``soft_accuracy``: share of scored rows where the prediction is
          within one star of ``actual_stars`` (NaN when nothing was scored).
        - ``json_valid_rate``: mean of ``json_valid_col`` over ALL rows.

        Accuracy figures cover scored rows only, so rows without a prediction
        lower ``n_scored`` rather than the accuracy.
    """
    frame = eval_df if df is None else df

    # Computed over every row, including those without a usable prediction.
    json_valid_rate = frame[json_valid_col].mean()

    scored = frame[~frame[pred_col].isna()]
    if len(scored) == 0:
        return {
            "n_scored": 0,
            "accuracy": np.nan,
            "soft_accuracy": np.nan,
            "json_valid_rate": json_valid_rate,
        }

    exact = (scored[pred_col] == scored["actual_stars"]).mean()
    soft = (scored[pred_col] - scored["actual_stars"]).abs().le(1).mean()

    return {
        "n_scored": len(scored),
        "accuracy": exact,
        "soft_accuracy": soft,
        "json_valid_rate": json_valid_rate,
    }

# Score each strategy from its prediction column and JSON-validity column.
few_metrics, rubric_metrics, cot_metrics = (
    compute_metrics(f"{prefix}_pred", f"{prefix}_json_valid")
    for prefix in ("few", "rubric", "cot")
)

# One row per strategy, one column per metric.
metrics_by_strategy = {
    "few_shot": few_metrics,
    "rubric_based": rubric_metrics,
    "cot_reasoning": cot_metrics,
}
summary = pd.DataFrame.from_dict(metrics_by_strategy, orient="index")

summary


Unnamed: 0,n_scored,accuracy,soft_accuracy,json_valid_rate
few_shot,200,0.62,0.98,1.0
rubric_based,200,0.545,0.915,1.0
cot_reasoning,199,0.517588,0.934673,0.995
