In [21]:
import duckdb
import openai
import pandas as pd

from tqdm import tqdm

In [2]:
# In-memory DuckDB connection used by the SQL query cells below.
con = duckdb.connect(database=":memory:")

# Filtered sample

In [3]:
# Row count of the WildChat parquet restricted to rows whose country is 'United States'.
us_rows_sql = "SELECT COUNT(*) FROM '../data/wildchat.parquet' WHERE country = 'United States'"
con.execute(us_rows_sql).fetchone()

(275084,)

In [4]:
# Number of distinct conversation hashes among the 'United States' rows.
us_conversations_sql = "SELECT COUNT(DISTINCT conversation_hash) FROM '../data/wildchat.parquet' WHERE country = 'United States'"
con.execute(us_conversations_sql).fetchdf()

Unnamed: 0,count(DISTINCT conversation_hash)
0,145041


# Random annotation sample

In [5]:
# Label distribution of the annotated sample: one row per classification value.
label_counts_sql = "SELECT classification, COUNT(*) FROM '../data/sample_for_annotation_annotated.csv' GROUP BY classification"
con.execute(label_counts_sql).fetchdf()

Unnamed: 0,classification,count_star()
0,0,992
1,1,8


# Targeted search sample

In [7]:
# Load the targeted-search records as one string. The encoding is pinned so the
# read does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open("../data/searched_news.txt", "r", encoding="utf-8") as f:
    records = f.read()

# Records are delimited by "---"; the final split segment is not counted
# (presumably the text after a trailing delimiter — verify against the file).
print(len(records.split("---")) - 1)

58


# LLM performance check

In [8]:
# OpenAI API client used for the chat-completion requests later in the notebook.
# NOTE(review): no credentials are passed here; presumably they are picked up
# from the environment — confirm before running on a new machine.
llm = openai.OpenAI()

In [11]:
# Read the classification prompt, which is sent as the system message of every
# request. The encoding is pinned so the read does not depend on the platform's
# default locale encoding.
# NOTE(review): assumes the prompt file is UTF-8 encoded — confirm.
with open("./prompts/classification.txt", "r", encoding="utf-8") as f:
    prompt = f.read()

## Prepare human annotations for LLM performance check

In [14]:
# Annotated sample as a DataFrame: message text plus its classification label.
annotation_sql = "SELECT content, classification FROM '../data/sample_for_annotation_annotated.csv'"
annotations = con.execute(annotation_sql).fetchdf()

In [23]:
# Turn every "---"-delimited record (the last split segment is dropped) into a
# row of stripped text, and label all of them as class 1.
searched_texts = [chunk.strip() for chunk in records.split("---")[:-1]]
searched = pd.DataFrame(searched_texts, columns=["content"]).assign(classification=1)

In [24]:
# Append the targeted-search rows (all labelled 1) to the annotated sample.
# ignore_index=True renumbers the combined rows 0..n-1; without it each frame
# keeps its own row labels and the combined index contains duplicates, which
# makes label-based lookups ambiguous.
# NOTE(review): this rebinds `annotations`, so re-running this cell alone
# appends `searched` a second time — re-run the loading cell first.
annotations = pd.concat([annotations, searched], ignore_index=True)

In [26]:
# Class balance of the combined evaluation set.
annotations["classification"].value_counts()

classification
0    992
1     66
Name: count, dtype: int64

In [None]:
def classify_message(model, message):
    """Send one message to `model` with the classification prompt and return the reply.

    Args:
        model: Name of the chat-completion model to query.
        message: Text to classify; it is wrapped in the [MESSAGE] markers
            before being sent as the user message.

    Returns:
        The content of the first choice in the response.
    """
    resp = llm.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"[MESSAGE]{message}[\\MESSAGE]"},
        ],
    )
    return resp.choices[0].message.content


# Both models are queried for every message, in row order, so outputs_mini[i]
# and outputs_o[i] correspond to the i-th row of `annotations`.
# NOTE(review): no sampling parameters are set, so repeated runs may produce
# different outputs — consider pinning them if reproducibility matters.
outputs_mini = []
outputs_o = []
for message in tqdm(annotations["content"], total=len(annotations)):
    outputs_mini.append(classify_message("gpt-4o-mini", message))
    outputs_o.append(classify_message("gpt-4o", message))