# Labelling with OpenAI GPT-5 nano

## Install packages

In [265]:
# Install the packages this notebook imports. `%pip` (rather than `!pip`) installs
# into the environment of the running kernel.
%pip install openai python-dotenv pandas tiktoken

Collecting tiktoken
  Downloading tiktoken-0.11.0-cp313-cp313-win_amd64.whl.metadata (6.9 kB)
Downloading tiktoken-0.11.0-cp313-cp313-win_amd64.whl (883 kB)
   ---------------------------------------- 0.0/883.9 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/883.9 kB ? eta -:--:--
   ---------------------------------------- 883.9/883.9 kB 2.1 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.11.0
Note: you may need to restart the kernel to use updated packages.


## Import packages

In [20]:
import gzip
import json
import math
import os
import time

import pandas as pd
import tiktoken
from dotenv import load_dotenv
from openai import APIConnectionError, OpenAI

## Login to OpenAI

In [2]:
# Run python-dotenv's loader first, then read OPENAI_API_KEY from the process
# environment and hand it to the OpenAI client.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

## Load data

### Kaggle

In [3]:
# Read the reviews CSV, discard rows that have no review text, and renumber
# the remaining rows from zero.
df = (
    pd.read_csv("./data/reviews.csv")
    .dropna(subset=["text"])
    .reset_index(drop=True)
)
print(len(df))
print(df.head())

1100
                     business_name    author_name  \
0  Haci'nin Yeri - Yigit Lokantasi    Gulsum Akar   
1  Haci'nin Yeri - Yigit Lokantasi  Oguzhan Cetin   
2  Haci'nin Yeri - Yigit Lokantasi     Yasin Kuyu   
3  Haci'nin Yeri - Yigit Lokantasi     Orhan Kapu   
4  Haci'nin Yeri - Yigit Lokantasi     Ozgur Sati   

                                                text  \
0  We went to Marmaris with my wife for a holiday...   
1  During my holiday in Marmaris we ate here to f...   
2  Prices are very affordable. The menu in the ph...   
3  Turkey's cheapest artisan restaurant and its f...   
4  I don't know what you will look for in terms o...   

                                               photo  rating  \
0         dataset/taste/hacinin_yeri_gulsum_akar.png       5   
1        dataset/menu/hacinin_yeri_oguzhan_cetin.png       4   
2  dataset/outdoor_atmosphere/hacinin_yeri_yasin_...       3   
3  dataset/indoor_atmosphere/hacinin_yeri_orhan_k...       5   
4           dataset

### UCSD

In [21]:
def parse(path):
    """Yield one decoded JSON object per non-blank line of a gzipped JSON Lines file.

    Args:
        path: Location of the gzip-compressed file; it is opened in text mode
            and decoded as UTF-8.

    Yields:
        The value returned by ``json.loads`` for each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for line in g:
            # A stray or trailing blank line is not a record; json.loads would
            # raise on it, so skip it instead of aborting the whole load.
            if not line.strip():
                continue
            yield json.loads(line)
# Load every record into memory, then keep only the rows that have review text
# and renumber them from zero.
data = list(parse("./data/review-Vermont_10.json.gz"))
df = pd.DataFrame(data)
df = df.dropna(subset=["text"]).reset_index(drop=True)

# Rows [CHUNK_START, CHUNK_STOP) are the chunk labelled in this run.
CHUNK_START, CHUNK_STOP = 50000, 100000
# `.copy()` detaches the chunk from the full frame, so adding columns to `df`
# later cannot trigger pandas' SettingWithCopyWarning. The chunk keeps its
# original index labels (CHUNK_START, CHUNK_START + 1, ...).
df = df.iloc[CHUNK_START:CHUNK_STOP].copy()
print(len(df))
print(df.head())

50000
                     user_id               name           time  rating  \
50000  103234900551010368888     Anthony Notaro  1621938846599       5   
50001  105721221296569160265  Pete Chilimindris  1617837636215       5   
50002  117952308343374371773    Collette Arnold  1617152719092       5   
50003  107070260610385191338      Debbie Conger  1617751005979       1   
50004  117533832737257915923        Adam Walker  1620700864307       4   

                                                    text  \
50000  New menu  and I gotta say, it's pretty amazing...   
50001  Amazing food and great atmosphere. First time ...   
50002  They vamped up their menu recently & their coc...   
50003  Waitress was great. Very friendly and service ...   
50004  Drinks were on point, service was sluggish, fo...   

                                                    pics  \
50000  [{'url': ['https://lh5.googleusercontent.com/p...   
50001                                               None   
50002  [

## Creating prompt

In [None]:
# System message shared by every request: the label set, the tie-break order,
# and what the judge should ignore.
SYSTEM_PROMPT_STRING = """You are a strict moderation judge for location reviews. Classify each review into exactly ONE of:
advertisement (self-promo, discount codes, contact links),
irrelevant (off-topic, questions/chat unrelated to a real visit),
rant_without_visit (angry/defamatory without evidence of an actual visit),
clean (a normal on-topic review—positive or negative—from a real/likely visit).
If multiple seem plausible, choose the most severe (advertisement > irrelevant > rant_without_visit > clean).
Ignore emojis, casing, punctuation spam, and translation artifacts.
Do not guess facts."""


def _as_prompt(review):
    """Wrap one review's text in the user message sent alongside the system prompt."""
    return f"Classify this review:\n<review>{review}</review>"


# One user message per row, stored next to the review it was built from.
df["prompt"] = df["text"].map(_as_prompt)
print(df["prompt"].head())

50000    Classify this review:\n<review>New menu  and I...
50001    Classify this review:\n<review>Amazing food an...
50002    Classify this review:\n<review>They vamped up ...
50003    Classify this review:\n<review>Waitress was gr...
50004    Classify this review:\n<review>Drinks were on ...
Name: prompt, dtype: object


## Create batches

In [59]:
enc = tiktoken.encoding_for_model("gpt-5-nano")

def count_tokens(text):
    """Return how many tokens `enc` produces for `text`.

    Reviews are user-written and may contain strings that look like special
    tokens (for example "<|endoftext|>"). With tiktoken's default settings that
    raises ValueError; `disallowed_special=()` encodes them as ordinary text.
    """
    return len(enc.encode(text, disallowed_special=()))

# Upper bound on the estimated tokens packed into one batch file.
MAX_TOKENS = 200000
# Flat allowance added to every request's estimate.
# NOTE(review): presumably headroom for message framing - confirm the value.
REQUEST_OVERHEAD_TOKENS = 100
# The system prompt is identical for every request, so count it once.
system_prompt_tokens = count_tokens(SYSTEM_PROMPT_STRING)

batches = []
current_batch = []
current_tokens = 0

# idx is the 1-based position of the prompt in df; it travels with the prompt
# so each request can be identified later.
for idx, prompt in enumerate(df["prompt"], start=1):
    prompt_tokens = count_tokens(prompt) + system_prompt_tokens + REQUEST_OVERHEAD_TOKENS
    # Only roll over when the current batch holds something; otherwise a single
    # prompt larger than MAX_TOKENS would emit an empty batch.
    if current_batch and current_tokens + prompt_tokens > MAX_TOKENS:
        batches.append(current_batch)
        current_batch = []
        current_tokens = 0
    current_batch.append((idx, prompt))
    current_tokens += prompt_tokens

if current_batch:
    batches.append(current_batch)

print(len(batches))
# Show batch sizes rather than the prompts themselves, which would flood the output.
print([len(batch) for batch in batches[:5]])

64


## Labelling

In [None]:
jsonl_file = "./data/requests.jsonl"
responses_file = "./data/responses.jsonl"

# Seconds to wait between status checks of a running batch.
POLL_SECONDS = 5
# Consecutive connection/timeout errors tolerated while polling before giving up.
MAX_POLL_ERRORS = 10
# Batch statuses from which a batch will never reach "completed".
FAILED_STATUSES = {"failed", "expired", "cancelling", "cancelled"}

# Start from a clean slate: the responses file is appended to below, so a
# leftover file from an earlier run would mix old and new results.
for stale_path in (jsonl_file, responses_file):
    if os.path.exists(stale_path):
        os.remove(stale_path)

for i, batch_prompts in enumerate(batches):
    if not batch_prompts:
        # Nothing to send; skip rather than upload an empty request file.
        continue

    # Write this batch's requests, one JSON object per line.
    with open(jsonl_file, "w", encoding="utf-8") as f:
        for idx, prompt in batch_prompts:
            request_obj = {
                "custom_id": f"{idx}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "gpt-5-nano",
                    "messages": [
                        {"role": "system", "content": SYSTEM_PROMPT_STRING},
                        {"role": "user", "content": prompt}
                    ]
                }
            }
            f.write(json.dumps(request_obj) + "\n")

    # Upload inside a `with` so the handle is closed before the next iteration
    # rewrites the same path.
    with open(jsonl_file, "rb") as upload:
        batch_input_file = client.files.create(
            file=upload,
            purpose="batch"
        )
    batch_input_file_id = batch_input_file.id

    batch = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "nightly eval job"
        }
    )
    batch_id = batch.id

    # Poll until the batch completes. Any other terminal status is an error;
    # without this check the loop would spin forever on e.g. "expired".
    poll_errors = 0
    while batch.status != "completed":
        if batch.status in FAILED_STATUSES:
            raise Exception(f"Batch failed: {batch_id} ended with status {batch.status!r}")
        time.sleep(POLL_SECONDS)
        try:
            batch = client.batches.retrieve(batch_id)
            poll_errors = 0
        except APIConnectionError:
            # Covers timeouts as well. A transient network error while polling
            # should not abandon a batch that is still running server-side.
            poll_errors += 1
            if poll_errors >= MAX_POLL_ERRORS:
                raise

    if batch.output_file_id is None:
        # A batch can complete without an output file when no request succeeded.
        raise Exception(
            f"Batch failed: {batch_id} completed without an output file "
            f"(error_file_id={batch.error_file_id})"
        )

    file_response = client.files.content(batch.output_file_id)
    response_text = file_response.text
    if response_text:
        with open(responses_file, "a", encoding="utf-8") as f:
            # Make sure each chunk ends with a newline so the next batch's first
            # record does not get glued onto this batch's last line.
            f.write(response_text if response_text.endswith("\n") else response_text + "\n")
    print(f"batch {i + 1}/{len(batches)} done")

APITimeoutError: Request timed out.

## Save to csv file

In [252]:
df_responses = pd.read_json(responses_file, lines=True)


def _label_from_response(response):
    """Return the model's reply text, or None when the entry holds no usable completion."""
    try:
        return response['body']['choices'][0]['message']['content']
    except (TypeError, KeyError, IndexError):
        # e.g. a request that errored has no response body / choices.
        return None


# Each request's custom_id is the 1-based position of its prompt in df, so
# custom_id - 1 is the row position the label belongs to. Join on that instead
# of on the responses' own row order: batch output order is not guaranteed to
# match input order, and df's index labels need not start at zero.
row_positions = df_responses["custom_id"].astype(int) - 1
labels = pd.Series(
    df_responses["response"].map(_label_from_response).to_numpy(),
    index=row_positions.to_numpy(),
)
# If a custom_id appears more than once, keep the most recent entry.
labels = labels[~labels.index.duplicated(keep="last")]
# Rows without a response get NaN rather than silently shifting other labels.
df["label"] = labels.reindex(range(len(df))).to_numpy()
print(f"{df['label'].isna().sum()} of {len(df)} reviews have no label")

df.to_csv("./data/review-Vermont_10-labeled.csv", index=False)
print(df.head())

                 user_id             name           time  rating  \
0  118026874392842649478    rebecca kerns  1620085852324       5   
1  101532740754036204131    Peter DeForge  1580309946474       5   
2  115404122636203550540    Chad Goulette  1605195974445       5   
3  104789336434407408181  Mark LaFountain  1593005848256       5   
4  108980665975608069965           Jeff R  1582059996120       5   

                                                text  pics  \
0      Always done right from wood stove to screens!  None   
1  A great company to work with.  Their sales and...  None   
2  Great place to do business with staff was grea...  None   
3  Awesome Customer service, quick response, and ...  None   
4  If you need a top quality job, by a group of p...  None   

                                                resp  \
0  {'time': 1620087641504, 'text': 'Good Evening,...   
1  {'time': 1580320228721, 'text': 'Good Afternoo...   
2  {'time': 1605195166792, 'text': 'Hi Chad!

Tha.