# Labelling with OpenAI GPT 5 Nano

## Install packages

In [None]:
%pip install openai python-dotenv pandas tiktoken

## Import packages

In [None]:
from openai import OpenAI
import os
from dotenv import load_dotenv
import pandas as pd
import json
import time
import gzip
import tiktoken
import numpy as np

## Log in to OpenAI

In [None]:
# Load variables from a local .env file, then build the OpenAI client from
# the OPENAI_API_KEY environment variable.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

## Load data

### Kaggle

In [None]:
# Read the reviews CSV, drop rows whose "text" is missing, and renumber the
# remaining rows from 0.
df = (
    pd.read_csv("./data/reviews.csv")
    .dropna(subset=["text"])
    .reset_index(drop=True)
)
print(len(df), df.head(), sep="\n")

### UCSD

In [None]:
def parse(path):
    """Yield one decoded JSON value per line of a gzip-compressed JSON Lines file.

    Args:
        path: Path of the gzip file; it is opened in text mode as UTF-8.

    Yields:
        The result of ``json.loads`` for every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with gzip.open(path, "rt", encoding="utf-8") as g:
        for line in g:
            # Blank lines (for example a trailing empty line) carry no record
            # and would make json.loads raise, so skip them.
            if not line.strip():
                continue
            yield json.loads(line)
# Read every record of the Vermont review dump into memory, build a
# DataFrame from it, and keep only the rows that have review text.
data = list(parse("./data/review-Vermont_10.json.gz"))
df = pd.DataFrame(data).dropna(subset=["text"]).reset_index(drop=True)
print(len(df), df.head(), sep="\n")

## Create prompts

In [None]:
# Instruction text for the classifier: it names the four allowed labels and
# the severity order to apply when more than one label seems plausible.
SYSTEM_PROMPT_STRING = """You are a strict moderation judge for location reviews. Classify each review into exactly ONE of:
advertisement (self-promo, discount codes, contact links),
irrelevant (off-topic, questions/chat unrelated to a real visit),
rant_without_visit (angry/defamatory without evidence of an actual visit),
clean (a normal on-topic review—positive or negative—from a real/likely visit).
If multiple seem plausible, choose the most severe (advertisement > irrelevant > rant_without_visit > clean).
Ignore emojis, casing, punctuation spam, and translation artifacts.
Do not guess facts."""

def _build_prompt(review):
    """Wrap one review in <review> tags behind the classification instruction."""
    return f"Classify this review:\n<review>{review}</review>"


# One prompt string per row, derived from the "text" column.
df["prompt"] = df["text"].map(_build_prompt)
print(df["prompt"].head())

## Create batches

In [None]:
# Tokenizer used to estimate how many tokens each request will consume.
try:
    enc = tiktoken.encoding_for_model("gpt-5-nano")
except KeyError:
    # tiktoken raises KeyError for model names it cannot map to an encoding
    # (e.g. a release older than the model). Fall back to o200k_base, the
    # encoding recent tiktoken releases use for current OpenAI chat models.
    enc = tiktoken.get_encoding("o200k_base")


def count_tokens(text):
    """Return the number of tokens ``enc`` produces for ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        The token count as an int.
    """
    # Reviews are arbitrary user text. With the default settings, encode()
    # raises ValueError when the text contains a special-token string such as
    # "<|endoftext|>"; disallowed_special=() encodes it as ordinary text.
    return len(enc.encode(text, disallowed_special=()))

# Upper bound on the estimated tokens packed into one batch.
MAX_TOKENS = 900000

# Every request carries the same system prompt, so tokenize it once rather
# than once per row. The flat 100 is the per-request margin of the original
# estimate (presumably headroom for chat-format overhead — TODO confirm).
REQUEST_OVERHEAD_TOKENS = count_tokens(SYSTEM_PROMPT_STRING) + 100

batches = []
current_batch = []
current_tokens = 0

# idx is the 1-based row position; it travels with the prompt so a result can
# be traced back to its row.
for idx, prompt in enumerate(df["prompt"], start=1):
    prompt_tokens = count_tokens(prompt) + REQUEST_OVERHEAD_TOKENS
    # Close the running batch when this request would overflow it. The
    # `current_batch` guard prevents emitting an empty batch when a single
    # request on its own already exceeds MAX_TOKENS.
    if current_batch and current_tokens + prompt_tokens > MAX_TOKENS:
        batches.append(current_batch)
        current_batch = []
        current_tokens = 0
    current_batch.append((idx, prompt))
    current_tokens += prompt_tokens

# Flush whatever is left after the last row.
if current_batch:
    batches.append(current_batch)

print(len(batches))
print(batches[:5])

## Labelling

In [None]:
jsonl_file = "./data/requests.jsonl"
responses_file = "./data/responses.jsonl"

# Start clean: results are appended to responses_file batch by batch, so
# output left over from a previous run would be mixed into this one.
for stale_path in (jsonl_file, responses_file):
    if os.path.exists(stale_path):
        os.remove(stale_path)

# Batch states from which "completed" can no longer be reached; polling must
# stop on any of them, not only on "failed".
FAILED_BATCH_STATUSES = {"failed", "expired", "cancelling", "cancelled"}

for batch_prompts in batches:
    # Write one request per line; custom_id is the 1-based row position.
    with open(jsonl_file, "w", encoding="utf-8") as f:
        for idx, prompt in batch_prompts:
            request_obj = {
                "custom_id": f"{idx}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    # Same model the tokenizer and the notebook title target.
                    "model": "gpt-5-nano",
                    "messages": [
                        {"role": "system", "content": SYSTEM_PROMPT_STRING},
                        {"role": "user", "content": prompt},
                    ],
                },
            }
            f.write(json.dumps(request_obj) + "\n")

    # Upload inside a context manager so the file handle is always closed.
    with open(jsonl_file, "rb") as upload:
        batch_input_file = client.files.create(file=upload, purpose="batch")
    batch_input_file_id = batch_input_file.id

    batch = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "nightly eval job"
        },
    )
    batch_id = batch.id

    # Poll until the batch completes or ends in a state it cannot recover from.
    batch = client.batches.retrieve(batch_id)
    while batch.status != "completed":
        if batch.status in FAILED_BATCH_STATUSES:
            raise RuntimeError(
                f"Batch failed (id={batch_id}, status={batch.status})"
            )
        time.sleep(5)
        batch = client.batches.retrieve(batch_id)

    # A completed batch has no output file when no request succeeded.
    if batch.output_file_id is None:
        raise RuntimeError(
            f"Batch failed (id={batch_id}): completed without an output file"
        )

    file_response = client.files.content(batch.output_file_id)
    output_text = file_response.text
    with open(responses_file, "a", encoding="utf-8") as f:
        f.write(output_text)
        # Keep the file valid JSON Lines: the next batch must start on its
        # own line even if this output lacks a trailing newline.
        if output_text and not output_text.endswith("\n"):
            f.write("\n")

## Save to CSV file

In [None]:
# Read the batch output as JSON Lines, skipping blank lines.
with open(responses_file, encoding="utf-8") as f:
    response_records = [json.loads(line) for line in f if line.strip()]
df_responses = pd.DataFrame(response_records)

# Map each successful response back to its row through custom_id. The order
# of output lines is not guaranteed to match the order of the requests, so
# assigning labels by position can attach them to the wrong reviews.
labels_by_position = {}
for record in response_records:
    response = record.get("response")
    # Failed requests carry an error and/or no usable response body; leave
    # those rows unlabeled instead of crashing.
    if record.get("error") or not response or response.get("status_code") != 200:
        continue
    content = response["body"]["choices"][0]["message"]["content"]
    # custom_id is the 1-based row position written into the request file.
    labels_by_position[int(record["custom_id"]) - 1] = content
labels = pd.Series(labels_by_position, dtype="object")

# Build the column in one step (NaN where no label came back) rather than
# writing strings into a float column.
df["label"] = [labels_by_position.get(pos, np.nan) for pos in range(len(df))]
df.to_csv("./data/review-Vermont_10-labeled.csv", index=False)
print(df.head())