In [102]:
import pathlib

import duckdb
import openai
import pandas as pd
import json
from sklearn.metrics import classification_report, f1_score, balanced_accuracy_score
import tiktoken

from tqdm import tqdm

In [63]:
con = duckdb.connect(":memory:")

# Full dataset

In [64]:
con.execute("SELECT COUNT(*) FROM '../data/wildchat.parquet'").fetchone()

(3920520,)

In [65]:
con.execute("SELECT COUNT(DISTINCT conversation_hash) FROM '../data/wildchat.parquet'").fetchone()

(826406,)

# Filtered sample

In [66]:
con.execute("SELECT COUNT(*) FROM '../data/wildchat.parquet' WHERE country = 'United States' AND role = 'user' AND language = 'English'").fetchone()

(265419,)

In [67]:
con.execute("SELECT COUNT(DISTINCT conversation_hash) FROM '../data/wildchat.parquet' WHERE country = 'United States' AND role = 'user' AND language = 'English'").fetchdf()

Unnamed: 0,count(DISTINCT conversation_hash)
0,146164


# Random annotation sample

In [5]:
con.execute("SELECT classification, COUNT(*) FROM '../data/sample_for_annotation_annotated.csv' GROUP BY classification").fetchdf()

Unnamed: 0,classification,count_star()
0,0,992
1,1,8


# Targeted search sample

In [7]:
with open("../data/searched_news.txt", "r") as f:
    records = f.read()

print(len(records.split("---")) - 1)

58


# LLM performance check

In [101]:
llm = openai.OpenAI()

In [11]:
with open("./prompts/classification.txt", "r") as f:
    prompt = f.read()

## Prepare human annotations for LLM performance check

In [14]:
annotations = con.execute("SELECT content, classification FROM '../data/sample_for_annotation_annotated.csv'").fetchdf()

In [23]:
searched = pd.DataFrame([r.strip() for r in records.split("---")[:-1]], columns=["content"])
searched["classification"] = 1

In [24]:
annotations = pd.concat([annotations, searched])

In [26]:
annotations.classification.value_counts()

classification
0    992
1     66
Name: count, dtype: int64

## Run LLM evaluation

In [27]:
outputs_mini = []
outputs_o = []
for _, s in tqdm(annotations.iterrows(), total=len(annotations)):
    resp = llm.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"[MESSAGE]{s.content}[\\MESSAGE]"},
        ]
    )
    outputs_mini.append(resp.choices[0].message.content)
    resp = llm.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"[MESSAGE]{s.content}[\\MESSAGE]"},
        ]
    )
    outputs_o.append(resp.choices[0].message.content)

100%|██████████| 1058/1058 [28:41<00:00,  1.63s/it]


In [28]:
annotations["gpt-4o-mini"] = outputs_mini
annotations["gpt-4o"] = outputs_o
annotations.to_csv("../data/sample_for_annotation_annotated_llm.csv", index=False)

In [30]:
def extract_classification(output):
    clf = output.split("\n")[0]
    try:
        return int(clf)
    except ValueError:
        return None

In [31]:
annotations["clf_gpt-4o-mini"] = annotations["gpt-4o-mini"].apply(extract_classification)
annotations["clf_gpt-4o"] = annotations["gpt-4o"].apply(extract_classification)

In [35]:
# set to 0 if not found (1 record)
annotations.loc[annotations["clf_gpt-4o"].isna(), "clf_gpt-4o"] = 0

In [36]:
annotations["clf_gpt-4o-mini"].value_counts()

clf_gpt-4o-mini
0    974
1     84
Name: count, dtype: int64

In [37]:
annotations["clf_gpt-4o"].value_counts()

clf_gpt-4o
0.0    940
1.0    118
Name: count, dtype: int64

In [39]:
print(classification_report(annotations["classification"], annotations["clf_gpt-4o-mini"]))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       992
           1       0.71      0.91      0.80        66

    accuracy                           0.97      1058
   macro avg       0.85      0.94      0.89      1058
weighted avg       0.98      0.97      0.97      1058



In [41]:
print(classification_report(annotations["classification"], annotations["clf_gpt-4o"]))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97       992
           1       0.54      0.97      0.70        66

    accuracy                           0.95      1058
   macro avg       0.77      0.96      0.83      1058
weighted avg       0.97      0.95      0.95      1058



In [42]:
print(f1_score(annotations["classification"], annotations["clf_gpt-4o-mini"], average="weighted"))
print(f1_score(annotations["classification"], annotations["clf_gpt-4o"], average="weighted"))
print(balanced_accuracy_score(annotations["classification"], annotations["clf_gpt-4o-mini"]))
print(balanced_accuracy_score(annotations["classification"], annotations["clf_gpt-4o"]))

0.9732161297828683
0.9538368811813375
0.9424486803519061
0.957630742913001


# Cost estimate for entire sample

In [45]:
enc = tiktoken.encoding_for_model("gpt-4o-mini")

In [68]:
# Input
all_user_messages = con.execute("SELECT content FROM '../data/wildchat.parquet' WHERE country = 'United States' AND role = 'user' AND language = 'English'").fetchdf()
all_tokens = sum([len(enc.encode(m, disallowed_special=())) for m in all_user_messages.content])
print(f"Total tokens: {all_tokens}")
print(f"Cost: ${all_tokens * 0.15 / 1_000_000:.2f}")

Total tokens: 66777553
Cost: $10.02


In [55]:
# Output
all_outputs = con.execute("""SELECT "gpt-4o-mini" FROM '../data/sample_for_annotation_annotated_llm.csv'""").fetchdf()
all_tokens = sum([len(enc.encode(m, disallowed_special=())) for m in all_outputs["gpt-4o-mini"]])
print(f"Total tokens: {all_tokens}")
cost_per_record = all_tokens * 0.6 / 1_000_000 / len(all_outputs)
print(f"Cost per record: ${cost_per_record:.2f}")
print(f"Total cost: ${cost_per_record * len(all_user_messages):.2f}")

Total tokens: 32158
Cost per record: $0.00
Total cost: $5.02


# LLM annotate entire sample

In [112]:
full_sample = con.execute("""SELECT conversation_hash, content FROM '../data/wildchat.parquet' 
            WHERE country = 'United States' AND role = 'user' AND language = 'English'
            """).fetch_df()


In [120]:
# Calculate the number of chunks needed
num_chunks = len(full_sample) // 40000 + (1 if len(full_sample) % 40000 != 0 else 0)

# Create a list to store the chunks
chunks = []

# Split the dataframe into chunks
for i in range(num_chunks):
    start_idx = i * 40000
    end_idx = min((i + 1) * 40000, len(full_sample))
    chunk = full_sample.iloc[start_idx:end_idx].copy()
    chunks.append(chunk)

print(f"Number of chunks created: {len(chunks)}")
print(f"Rows in first chunk: {len(chunks[0])}")
print(f"Rows in last chunk: {len(chunks[-1])}")

Number of chunks created: 7
Rows in first chunk: 40000
Rows in last chunk: 25419


In [121]:
def format_for_batch_submission(row):
    obj = {
        "custom_id": str(row.name),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
        "model": "gpt-4o-mini",
        "messages": [
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"[MESSAGE]{row.content}[\\MESSAGE]"},
            ]
        }
    }

    return obj

In [122]:
# apply formatting to each chunk
chunks = [chunk.apply(format_for_batch_submission, axis=1) for chunk in chunks]

In [123]:
# save each chunk as a .jsonl file
for i, chunk in enumerate(chunks):
    with open(f"../data/batches/batch_{i}.jsonl", "w") as f:
        for _, s in chunk.items():
            f.write(json.dumps(s) + "\n")


In [124]:
# list all .jsonl files in the batches folder
batch_files = list(pathlib.Path("../data/batches").glob("*.jsonl"))
print(f"Number of batch files: {len(batch_files)}")

batch_input_refs = []

for batch_file in batch_files:
    batch_input_file = llm.files.create(
        file=open(batch_file, "rb"),
        purpose="batch"
    )
    batch_input_refs.append(batch_input_file.id)


Number of batch files: 7


In [125]:
batch_input_refs

['file-noBpGJ1Y3yNuE7VaICcICs64',
 'file-cBJRySmzxfntYyg2DH0b41hb',
 'file-PzohLzh5th966r5WWmpGjYth',
 'file-YlEkC5TAg8LNOD7OaVs9zSES',
 'file-ECcElQVgppt7WNgq5tGwPQPI',
 'file-CXt4tZjV1s05gZVlGkZWy9vv',
 'file-fC9w6JcSA7ULiVq3L0H2WFrF']

In [127]:
for b in batch_input_refs[1:]:
    llm.batches.create(
        input_file_id=b,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": "Batch annotation of user messages in WildChat"}
    )