In [1]:
!pip install -q openai


In [2]:
import json
import random
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
import os
import time
from sklearn.metrics import accuracy_score, mean_absolute_error

In [3]:
# for m in genai.list_models():
#     if "generateContent" in m.supported_generation_methods:
#         print(m.name)


In [4]:
#OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"]


In [5]:
OPENROUTER_API_KEY = "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"

In [8]:
client = OpenAI(
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1"
)

In [9]:
MODEL_NAME = "mistralai/mistral-7b-instruct"


In [10]:
df = pd.read_csv("/content/drive/MyDrive/Fynd/yelp.csv")

df = df[["text", "stars"]]

df = df[df["text"].str.strip().astype(bool)]

df_sample = df.sample(n=200, random_state=42).reset_index(drop=True)

df_sample.head()


Unnamed: 0,text,stars
0,We got here around midnight last Friday... the...,4
1,Brought a friend from Louisiana here. She say...,5
2,"Every friday, my dad and I eat here. We order ...",3
3,"My husband and I were really, really disappoin...",1
4,Love this place! Was in phoenix 3 weeks for w...,5


In [11]:
df_sample['text'][1]

"Brought a friend from Louisiana here.  She says that the crawfish etouffee here is the best she's had outside of Louisiana!"

In [12]:
PROMPT_V1 = """
You are an expert sentiment analysis system for Yelp reviews.
Analyze the following review and predict the star rating (1-5 stars).
RATING GUIDELINES:
- 1 star: Extremely negative experience, major issues
- 2 stars: Mostly negative but with some redeeming qualities
- 3 stars: Mixed experience, equal good and bad points
- 4 stars: Good experience overall, minor issues
- 5 stars: Excellent experience, highly positive
RULES:
- predicted_stars must be an integer between 1-5
- Keep explanation brief (1 sentence)

Return ONLY valid JSON:
{{
  "predicted_stars": number,
  "explanation": "short reasoning"
}}

Review:
{text}
"""

In [13]:
PROMPT_V2 = """
You are a Yelp rating expert. Analyze the review and assign a star rating (1-5) based on these criteria:
RATING RUBRIC:
★★★★★ (5) - Exceptional experience, exceeds expectations
★★★★☆ (4) - Very good with minor flaws
★★★☆☆ (3) - Average experience, meets basic expectations
★★☆☆☆ (2) - Below average, several issues
★☆☆☆☆ (1) - Terrible experience, strongly not recommended
CONSIDER:
- Food quality (if mentioned)
- Service experience
- Ambience (if mentioned)
- Value for money
- Any specific complaints or praises
REQUIREMENTS:
- predicted_stars must be 1, 2, 3, 4, or 5
- Keep explanation under 15 words

Return ONLY valid JSON:
{{
  "predicted_stars": integer between 1 and 5,
  "explanation": "one concise sentence"
}}

Review:
{text}
"""

In [14]:
PROMPT_V3 = """
You are a meticulous rating analyst. Follow these steps to evaluate the review:
1. SENTIMENT ANALYSIS:
   - Identify positive aspects (e.g., good food, great service)
   - Note negative aspects (e.g., slow service, poor quality)
   - Assess overall sentiment (very negative to very positive)
2. RATING DETERMINATION:
   - 1★: Overwhelmingly negative
   - 2★: Mostly negative with few positives
   - 3★: Balanced mix of positive and negative
   - 4★: Mostly positive with minor issues
   - 5★: Overwhelmingly positive
3. VALIDATION:
   - Ensure rating matches sentiment analysis
   - Check for any rating constraints mentioned
CRITICAL INSTRUCTIONS:
- No additional text outside the JSON
- predicted_stars must be an integer 1-5
- Keep explanation under 20 words

Return ONLY valid JSON:
Output format:
{{
  "predicted_stars": integer,
  "explanation": "short justification"
}}

Review:
{text}
"""




In [15]:
from openai import OpenAI

client = OpenAI(
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1",
    default_headers={
        "HTTP-Referer": "http://localhost",
        "X-Title": "Yelp Rating Prediction Task"
    }
)


In [16]:
def get_prediction(prompt_template, text):
    prompt = prompt_template.format(text=text)

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=256
        )

        raw = response.choices[0].message.content

        if raw is None or raw.strip() == "":
            return None, None, False

        parsed = extract_json(raw)

        if parsed is None:
            return None, None, False

        return (
            parsed.get("predicted_stars"),
            parsed.get("explanation"),
            True
        )

    except Exception as e:
        print("OpenRouter error:", e)
        return None, None, False


In [17]:
def process_dataset(df, prompt_template, sleep_time=0.5):
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        pred, exp, valid = get_prediction(prompt_template, row["text"])

        results.append({
            "review": row["text"],
            "actual_stars": row["stars"],
            "predicted_stars": pred,
            "explanation": exp,
            "valid_json": valid
        })

        time.sleep(sleep_time)  # polite throttling

    return pd.DataFrame(results)


In [18]:
import re

def extract_json(text):
    """
    Extracts the first JSON object from text.
    Returns dict or None.
    """
    try:
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if not match:
            return None
        return json.loads(match.group())
    except:
        return None


In [19]:
results_v1 = process_dataset(df_sample, PROMPT_V1, sleep_time=6)
results_v2 = process_dataset(df_sample, PROMPT_V2, sleep_time=6)
results_v3 = process_dataset(df_sample, PROMPT_V3, sleep_time=6)


100%|██████████| 200/200 [25:23<00:00,  7.62s/it]
100%|██████████| 200/200 [23:39<00:00,  7.10s/it]
100%|██████████| 200/200 [23:26<00:00,  7.03s/it]


In [21]:
correct_predictions_v1 = (results_v1['predicted_stars'] == results_v1['actual_stars']).sum()
print(f"Number of correct predictions in results_v1: {correct_predictions_v1}")

Number of correct predictions in results_v1: 52


In [22]:
correct_predictions_v2 = (results_v2['predicted_stars'] == results_v2['actual_stars']).sum()
print(f"Number of correct predictions in results_v2: {correct_predictions_v2}")

Number of correct predictions in results_v2: 23


In [23]:
correct_predictions_v3 = (results_v3['predicted_stars'] == results_v3['actual_stars']).sum()
print(f"Number of correct predictions in results_v3: {correct_predictions_v3}")

Number of correct predictions in results_v3: 30


In [24]:
results_v1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   review           200 non-null    object 
 1   actual_stars     200 non-null    int64  
 2   predicted_stars  91 non-null     float64
 3   explanation      91 non-null     object 
 4   valid_json       200 non-null    bool   
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 6.6+ KB


In [27]:
results_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   review           200 non-null    object
 1   actual_stars     200 non-null    int64 
 2   predicted_stars  54 non-null     object
 3   explanation      54 non-null     object
 4   valid_json       200 non-null    bool  
dtypes: bool(1), int64(1), object(3)
memory usage: 6.6+ KB


In [28]:
results_v3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   review           200 non-null    object 
 1   actual_stars     200 non-null    int64  
 2   predicted_stars  55 non-null     float64
 3   explanation      55 non-null     object 
 4   valid_json       200 non-null    bool   
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 6.6+ KB


In [25]:
def compute_metrics(df):
    valid_df = df[df["valid_json"] == True]

    accuracy = (valid_df["predicted_stars"] == valid_df["actual_stars"]).mean()
    json_validity = df["valid_json"].mean()

    return {
        "accuracy": round(accuracy * 100, 2),
        "json_validity_rate": round(json_validity * 100, 2)
    }


In [26]:
comparison = pd.DataFrame([
    {"Prompt": "P1", **compute_metrics(results_v1)},
    {"Prompt": "P2", **compute_metrics(results_v2)},
    {"Prompt": "P3", **compute_metrics(results_v3)}
])

comparison


Unnamed: 0,Prompt,accuracy,json_validity_rate
0,P1,57.14,45.5
1,P2,42.59,27.0
2,P3,54.55,27.5
