In [1]:
!pip install pandas numpy tqdm scikit-learn python-dotenv



In [2]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
from sklearn.metrics import accuracy_score

In [3]:
from pathlib import Path

from google.colab import files
import pandas as pd

# Only ask for an upload when the dataset is not already in the runtime.
# Re-running the cell otherwise prompts again, and Colab stores a repeated
# upload under a new name (e.g. "yelp (1).csv") while the read below would
# still pick up the older "yelp.csv".
YELP_CSV = Path("yelp.csv")
uploaded = files.upload() if not YELP_CSV.exists() else {}

df = pd.read_csv(YELP_CSV)

Saving yelp.csv to yelp.csv


In [5]:
# Keep only the review body and its star rating.
df = df.loc[:, ["text", "stars"]]

# Draw a fixed-seed subset of 200 reviews and renumber its rows from 0.
df_sample = (
    df
    .sample(n=200, random_state=42)
    .reset_index(drop=True)
)

df_sample.head()

Unnamed: 0,text,stars
0,We got here around midnight last Friday... the...,4
1,Brought a friend from Louisiana here. She say...,5
2,"Every friday, my dad and I eat here. We order ...",3
3,"My husband and I were really, really disappoin...",1
4,Love this place! Was in phoenix 3 weeks for w...,5


In [6]:
def prompt_v1(review_text):
    """Build the basic prompt: a role line, the quoted review, and a request
    for a JSON object holding ``predicted_stars`` and ``explanation``.

    The returned text starts and ends with a newline.
    """
    lines = [
        "",
        "You are an assistant that classifies Yelp reviews into star ratings.",
        "",
        "Review:",
        f'"{review_text}"',
        "",
        "Return a JSON object with:",
        "- predicted_stars (integer from 1 to 5)",
        "- explanation (one sentence)",
        "",
    ]
    return "\n".join(lines)

In [7]:
def prompt_v2(review_text):
    """Build the criteria-based prompt: a per-star rubric, the quoted review,
    and an instruction to answer with JSON only in a fixed two-key format.

    The returned text starts and ends with a newline.
    """
    rubric = (
        "- 1 star: Very negative, strong complaints",
        "- 2 stars: Mostly negative with minor positives",
        "- 3 stars: Mixed or neutral experience",
        "- 4 stars: Mostly positive with small issues",
        "- 5 stars: Extremely positive, enthusiastic praise",
    )
    pieces = [
        "\nYou are an expert sentiment analyst.\n",
        "Classify the following Yelp review into a star rating using these rules:",
        *rubric,
        "",
        "Review:",
        '"' + format(review_text) + '"',
        "",
        "Return ONLY valid JSON in this format:",
        "{",
        '  "predicted_stars": <1-5>,',
        '  "explanation": "<brief reason>"',
        "}",
        "",
    ]
    return "\n".join(pieces)

In [8]:
def prompt_v3(review_text):
    """Build the strict JSON-only prompt: the model is told to reason
    internally and emit nothing but the final two-key JSON object.

    The returned text starts and ends with a newline.
    """
    # Doubled braces are literal braces once the template is formatted.
    template = (
        "\n"
        "You are a strict JSON-only classifier.\n"
        "\n"
        "Analyze sentiment, opinion strength, and complaints internally.\n"
        "\n"
        "Review:\n"
        '"{review}"\n'
        "\n"
        "Output ONLY the final JSON:\n"
        "{{\n"
        '  "predicted_stars": <integer between 1 and 5>,\n'
        '  "explanation": "<one concise sentence>"\n'
        "}}\n"
    )
    return template.format(review=review_text)

In [9]:
import requests
import os

# Never store the key in the notebook: read it from the environment and fall
# back to an interactive prompt whose input is neither echoed nor saved.
# NOTE(review): a literal key was previously committed on this line; it should
# be treated as leaked and revoked in the OpenRouter dashboard.
import getpass

OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY") or getpass.getpass(
    "OpenRouter API key: "
)

def call_llm(prompt, timeout=60):
    """Send ``prompt`` as a single user message to OpenRouter and return the reply text.

    The request targets the ``mistralai/mistral-7b-instruct`` model with
    ``temperature`` 0 and authenticates with the module-level
    ``OPENROUTER_API_KEY``.

    Parameters
    ----------
    prompt : str
        Text sent as the only message of the conversation.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 60).
        Without a timeout a stalled connection would hang the notebook.

    Returns
    -------
    str
        The ``content`` of the first choice in the response.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, a timeout, or a non-2xx HTTP status.
    RuntimeError
        If the decoded response carries no completion (for example an error
        payload), with the payload included in the message.
    """
    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "model": "mistralai/mistral-7b-instruct",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0,
        },
        timeout=timeout,
    )
    # Surface HTTP-level failures (401, 429, 5xx, ...) as such instead of
    # letting them show up later as a missing "choices" key.
    response.raise_for_status()

    payload = response.json()
    try:
        return payload["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(
            f"OpenRouter response has no completion: {payload!r}"
        ) from exc


In [10]:
def get_prediction(prompt_func, review_text):
    """Ask the LLM to rate one review and parse its JSON answer.

    Parameters
    ----------
    prompt_func : callable
        Maps the review text to the prompt string sent to the model.
    review_text : str
        The review to classify.

    Returns
    -------
    tuple
        ``(predicted_stars, True)`` when the reply parses as JSON and has a
        ``"predicted_stars"`` key; ``(None, False)`` on any failure
        (request error, invalid JSON, missing key).
    """
    try:
        response = call_llm(prompt_func(review_text))
        parsed = json.loads(response)
        return parsed["predicted_stars"], True
    except Exception:
        # `Exception`, not a bare `except`: Ctrl-C / kernel interrupt must
        # still be able to stop a long evaluation loop.
        return None, False

In [11]:
def evaluate_prompt(prompt_func, df):
    """Score one prompt variant against labelled reviews.

    Parameters
    ----------
    prompt_func : callable
        Prompt builder passed through to ``get_prediction``.
    df : pandas.DataFrame
        Must provide a ``"text"`` column (review body) and a ``"stars"``
        column (true rating).

    Returns
    -------
    dict
        ``"accuracy"``: exact-match accuracy of predicted vs. true stars.
        ``"json_validity_rate"``: share of rows whose reply was flagged
        valid by ``get_prediction``.

    Raises
    ------
    ValueError
        If ``df`` has no rows (both metrics would be undefined).
    """
    if len(df) == 0:
        raise ValueError("evaluate_prompt needs at least one row to score")

    predictions = []
    actuals = []
    json_valid = 0

    # Iterate the two columns directly; cheaper than building a Series per row.
    for text, stars in tqdm(zip(df["text"], df["stars"]), total=len(df)):
        pred, valid = get_prediction(prompt_func, text)
        actuals.append(int(stars))

        # Missing or non-numeric predictions are scored as 0, a value that is
        # not a star rating in the 1-5 scheme the prompts ask for.
        try:
            predictions.append(int(pred))
        except (TypeError, ValueError):
            predictions.append(0)

        json_valid += int(valid)

    return {
        "accuracy": accuracy_score(actuals, predictions),
        "json_validity_rate": json_valid / len(df),
    }

In [12]:

# Fixed-seed draw of 200 rows from `df`, re-indexed from 0 (same expression
# as the earlier sampling cell).
df_sample = (
    df
    .sample(n=200, random_state=42)
    .reset_index(drop=True)
)


In [13]:
# In-memory memo of completions, keyed by the exact prompt string.
llm_cache = {}

def call_llm_cached(prompt):
    """Return the completion for ``prompt``, calling ``call_llm`` only the
    first time a given prompt is seen.

    Results are stored in the module-level ``llm_cache``; a call that raises
    stores nothing.
    """
    try:
        return llm_cache[prompt]
    except KeyError:
        pass

    completion = call_llm(prompt)
    llm_cache[prompt] = completion
    return completion


In [14]:
def get_prediction(prompt_func, review_text):
    """Ask the (cached) LLM to rate one review and parse its JSON answer.

    Parameters
    ----------
    prompt_func : callable
        Maps the review text to the prompt string sent to the model.
    review_text : str
        The review to classify.

    Returns
    -------
    tuple
        ``(stars, True)`` with ``stars`` an ``int`` in 1..5 when the reply is
        JSON with a usable ``"predicted_stars"`` value; ``(None, False)``
        otherwise.

    Notes
    -----
    A failure to obtain a reply at all (network error, HTTP error, ...) also
    yields ``(None, False)``, but emits a ``RuntimeWarning`` so that transport
    problems are not silently mistaken for invalid JSON.
    """
    import warnings

    try:
        response = call_llm_cached(prompt_func(review_text))
    except Exception as exc:
        warnings.warn(
            f"LLM call failed; counting review as unparsed: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None, False

    try:
        # Coerce here so callers always receive an int (models sometimes
        # answer with "4" instead of 4).
        stars = int(json.loads(response)["predicted_stars"])
    except (ValueError, KeyError, TypeError):
        # Not JSON, not an object, key missing, or value not numeric.
        return None, False

    if not 1 <= stars <= 5:
        # Outside the rating scale the prompts ask for: not a usable answer.
        return None, False
    return stars, True

In [15]:
def evaluate_prompt(prompt_func, df):
    """Score one prompt variant against labelled reviews.

    Parameters
    ----------
    prompt_func : callable
        Prompt builder passed through to ``get_prediction``.
    df : pandas.DataFrame
        Must provide a ``"text"`` column (review body) and a ``"stars"``
        column (true rating, convertible to ``int``).

    Returns
    -------
    dict
        ``"accuracy"``: exact-match accuracy of predicted vs. true stars.
        ``"json_validity_rate"``: share of rows whose reply was flagged
        valid by ``get_prediction``.

    Raises
    ------
    ValueError
        If ``df`` has no rows (both metrics would be undefined).
    """
    if len(df) == 0:
        raise ValueError("evaluate_prompt needs at least one row to score")

    predictions = []
    actuals = []
    json_valid = 0

    # Iterate the two columns directly; cheaper than building a Series per row.
    for text, stars in tqdm(zip(df["text"], df["stars"]), total=len(df)):
        pred, valid = get_prediction(prompt_func, text)

        actuals.append(int(stars))

        # A missing or non-numeric prediction (None, "five", a list, ...) is
        # scored as 0 instead of raising and aborting the whole run.
        try:
            predictions.append(int(pred))
        except (TypeError, ValueError):
            predictions.append(0)

        json_valid += int(valid)

    accuracy = accuracy_score(actuals, predictions)

    return {
        "accuracy": accuracy,
        "json_validity_rate": json_valid / len(df),
    }


In [16]:
# Prompt variants to compare: (label used in the results table, builder).
prompt_variants = (
    ("Prompt V1 - Basic", prompt_v1),
    ("Prompt V2 - Criteria", prompt_v2),
    ("Prompt V3 - Strict JSON", prompt_v3),
)

# One summary row per variant, metrics rounded to three decimals.
results = []
for name, prompt_func in prompt_variants:
    metrics = evaluate_prompt(prompt_func, df_sample)
    row = {"Prompt Version": name}
    row["Accuracy"] = round(metrics["accuracy"], 3)
    row["JSON Validity Rate"] = round(metrics["json_validity_rate"], 3)
    results.append(row)


100%|██████████| 200/200 [02:28<00:00,  1.35it/s]
100%|██████████| 200/200 [02:53<00:00,  1.15it/s]
100%|██████████| 200/200 [01:21<00:00,  2.46it/s]


In [17]:
# Tabulate the per-prompt metrics, save them without the index column,
# and leave the frame as the cell's displayed value.
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf="task1_prompt_results.csv", index=False)
results_df

Unnamed: 0,Prompt Version,Accuracy,JSON Validity Rate
0,Prompt V1 - Basic,0.02,0.02
1,Prompt V2 - Criteria,0.115,0.18
2,Prompt V3 - Strict JSON,0.05,0.08



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

