# Labelling reviews with OpenAI GPT-5 nano

## Install packages

In [24]:
%pip install openai python-dotenv pandas

Note: you may need to restart the kernel to use updated packages.


## Import packages

In [212]:
from openai import OpenAI
import os
from dotenv import load_dotenv
import pandas as pd
import json
import time
import math

## Log in to OpenAI

In [111]:
# Pull variables from a local .env file into the process environment, then
# build the OpenAI client from the OPENAI_API_KEY entry.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(
    api_key=openai_api_key,
)

## Load data (Kaggle)

In [26]:
# Read the raw reviews table and collect every non-null review text as a
# plain Python list (rows with a missing "text" value are left out of `reviews`).
df = pd.read_csv("./data/reviews.csv")
text_column = df["text"]
reviews = text_column[text_column.notna()].tolist()

# Quick sanity check: a preview of the frame, then row count vs. usable-review count.
print(df.head())
print(df.shape[0])
print(len(reviews))

                     business_name    author_name  \
0  Haci'nin Yeri - Yigit Lokantasi    Gulsum Akar   
1  Haci'nin Yeri - Yigit Lokantasi  Oguzhan Cetin   
2  Haci'nin Yeri - Yigit Lokantasi     Yasin Kuyu   
3  Haci'nin Yeri - Yigit Lokantasi     Orhan Kapu   
4  Haci'nin Yeri - Yigit Lokantasi     Ozgur Sati   

                                                text  \
0  We went to Marmaris with my wife for a holiday...   
1  During my holiday in Marmaris we ate here to f...   
2  Prices are very affordable. The menu in the ph...   
3  Turkey's cheapest artisan restaurant and its f...   
4  I don't know what you will look for in terms o...   

                                               photo  rating  \
0         dataset/taste/hacinin_yeri_gulsum_akar.png       5   
1        dataset/menu/hacinin_yeri_oguzhan_cetin.png       4   
2  dataset/outdoor_atmosphere/hacinin_yeri_yasin_...       3   
3  dataset/indoor_atmosphere/hacinin_yeri_orhan_k...       5   
4           dataset/menu

## Creating prompt

In [217]:
# System message: defines the four moderation classes and the tie-break order.
SYSTEM_PROMPT_STRING = """You are a strict moderation judge for location reviews. Classify each review into exactly ONE of:
advertisement (self-promo, discount codes, contact links),
irrelevant (off-topic, questions/chat unrelated to a real visit),
rant_without_visit (angry/defamatory without evidence of an actual visit),
clean (a normal on-topic review—positive or negative—from a real/likely visit).
If multiple seem plausible, choose the most severe (advertisement > irrelevant > rant_without_visit > clean).
Ignore emojis, casing, punctuation spam, and translation artifacts.
Do not guess facts."""

# JSON schema for a single-field object whose "label" must be one of the four classes.
label_property = {
    "type": "string",
    "enum": ["advertisement", "irrelevant", "rant_without_visit", "clean"],
}
schema = {
    "name": "google_reviews_label",
    "schema": dict(
        type="object",
        properties={"label": label_property},
        required=["label"],
        additionalProperties=False,
    ),
}

# One user prompt per review, with the review text wrapped in <review> tags.
prompts = []
for review_text in reviews:
    prompts.append(f"Classify this review:\n<review>{review_text}</review>")

# Show the first prompt as a spot check.
print(prompts[0])

Classify this review:
<review>We went to Marmaris with my wife for a holiday. We chose this restaurant as a place for dinner based on the reviews and because we wanted juicy food. When we first went there was a serious queue. You proceed by taking the food you want in the form of an open buffet. Both vegetable dishes and meat dishes were plentiful. There was also dessert for those who wanted it. After you get what you want you pay at the cashier. They don't go through cards they work in cash. There was a lot of food variety. And the food prices were unbelievably cheap. We paid only 84 TL for all the meals here. It included buttermilk and bread. But unfortunately I can't say it's too clean as a place..</review>


## Labelling

In [218]:
# Label every prompt through the OpenAI Batch API: the prompts are split into
# chunks of BATCH_SIZE, each chunk is uploaded as a JSONL request file and run
# as one batch job, and each job's raw JSONL output is appended to
# `responses_file`.
#
# NOTE(review): the `schema` dict built in the prompt cell is not attached to
# the request body here, so the model answers in free-form text.
BATCH_SIZE = 750  # requests per batch job
POLL_INTERVAL_SECONDS = 5  # delay between batch status checks
FAILED_BATCH_STATUSES = {"failed", "expired", "cancelled"}  # terminal, non-success

jsonl_file = "./data/requests.jsonl"
responses_file = "./data/responses.jsonl"

# Results are appended chunk by chunk, so leftovers from a previous run must be
# cleared first; otherwise re-running this cell duplicates every response line.
for stale_file in (jsonl_file, responses_file):
    if os.path.exists(stale_file):
        os.remove(stale_file)

for i in range(math.ceil(len(prompts) / BATCH_SIZE)):
    chunk = prompts[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]

    # Write this chunk's requests. custom_id is the 1-based position of the
    # prompt in `prompts`, which lets results be matched back to their review.
    with open(jsonl_file, "w", encoding="utf-8") as f:
        for j, prompt in enumerate(chunk, start=1):
            request_obj = {
                "custom_id": f"{i * BATCH_SIZE + j}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "gpt-5-nano",
                    "messages": [
                        {"role": "system", "content": SYSTEM_PROMPT_STRING},
                        {"role": "user", "content": prompt},
                    ],
                },
            }
            f.write(json.dumps(request_obj) + "\n")

    # Upload inside a context manager so the file handle is always closed.
    with open(jsonl_file, "rb") as request_file:
        batch_input_file = client.files.create(
            file=request_file,
            purpose="batch",
        )
    batch_input_file_id = batch_input_file.id

    batch = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "nightly eval job"
        },
    )
    batch_id = batch.id

    # Poll until the job completes, but stop on a terminal failure status
    # instead of sleeping forever.
    batch = client.batches.retrieve(batch_id)
    while batch.status != "completed":
        if batch.status in FAILED_BATCH_STATUSES:
            raise RuntimeError(
                f"Batch {batch_id} ended with status '{batch.status}'"
            )
        time.sleep(POLL_INTERVAL_SECONDS)
        batch = client.batches.retrieve(batch_id)

    if batch.output_file_id is None:
        raise RuntimeError(
            f"Batch {batch_id} completed without an output file "
            f"(error file: {batch.error_file_id})"
        )

    file_response = client.files.content(batch.output_file_id)
    output_text = file_response.text
    # Guarantee a trailing newline so the next chunk's first record cannot be
    # glued onto this chunk's last line.
    if output_text and not output_text.endswith("\n"):
        output_text += "\n"
    with open(responses_file, "a", encoding="utf-8") as f:
        f.write(output_text)

## Save to CSV file

In [219]:
# Join the model's answers back onto the reviews table and save it.
with open("./data/responses.jsonl", "r", encoding="utf-8") as f:
    lines = [line for line in f.read().splitlines() if line.strip()]

# Batch output lines are not guaranteed to come back in request order, so
# index every answer by its custom_id instead of relying on line position.
label_by_custom_id = {}
for line in lines:
    record = json.loads(line)
    response = record.get("response") or {}  # null when the request failed
    choices = (response.get("body") or {}).get("choices") or []
    label_by_custom_id[record["custom_id"]] = (
        choices[0]["message"]["content"] if choices else None
    )

# The prompts were built from df["text"].dropna(), and custom_id is the 1-based
# position within that list, so map positions back through the non-null index.
# Rows whose text is missing (and requests with no answer) end up with no label.
reviewed_index = df["text"].dropna().index
responses = [
    label_by_custom_id.get(str(position))
    for position in range(1, len(reviewed_index) + 1)
]
unanswered = sum(label is None for label in responses)
if unanswered:
    print(f"Warning: {unanswered} review(s) have no model response")

df["label"] = pd.Series(responses, index=reviewed_index, dtype="object")
df.to_csv("./data/labeled.csv", index=False)

## Convert labels to binary (0 = clean, 1 = any other label)

In [220]:
# Collapse the moderation labels into a binary target and save text + label only:
# 0 when the label is "clean", 1 for every other value (including missing labels).
df = pd.read_csv("./data/labeled.csv")

# The labels are free-form model output, so trim surrounding whitespace and
# ignore case before comparing; otherwise "clean\n" or " Clean" would count as 1.
normalized_label = df["label"].astype(str).str.strip().str.lower()
df["label"] = normalized_label.ne("clean").astype(int)

df = df[["text", "label"]]
df.to_csv("./data/reviews_binary.csv", index=False)
print(df.head())

                                                text  label
0  We went to Marmaris with my wife for a holiday...      0
1  During my holiday in Marmaris we ate here to f...      0
2  Prices are very affordable. The menu in the ph...      0
3  Turkey's cheapest artisan restaurant and its f...      1
4  I don't know what you will look for in terms o...      0
