In [1]:
import duckdb
import openai
import pandas as pd
from sklearn.metrics import f1_score, balanced_accuracy_score

from tqdm import tqdm

In [3]:
con = duckdb.connect(':memory:')

In [4]:
df = con.execute("""SELECT DISTINCT conversation_hash, content FROM '../data/wildchat.parquet' JOIN '../data/nomic_filter_broad.csv' USING(conversation_hash) WHERE country = 'United States' AND role = 'user';""").fetch_df()

In [10]:
sample = df.sample(1000)

In [12]:
sample["classification"] = ""

In [13]:
sample.to_csv("../data/sample_for_annotation.csv", index=False)

In [17]:
con.execute("SELECT COUNT(*) FROM '../data/sample_for_annotation_annotated.csv' WHERE classification = 1").fetch_df()

Unnamed: 0,count_star()
0,8


In [31]:
search_terms = [
    "news",
    "new york times",
    "bbc",
    "ukraine",
    "breaking",
    "cnn"
]

In [33]:
# search for search terms in lowercased content
targeted = df[df["content"].str.lower().str.contains("|".join(search_terms))]
targeted["classification"] = ""
targeted.to_csv("../data/targeted_for_annotation.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targeted["classification"] = ""


In [138]:
annotations = pd.read_csv("../data/targeted_for_annotation.csv")

In [139]:
annotations = pd.concat([annotations, pd.read_csv("../data/sample_for_annotation_annotated.csv")])

In [140]:
with open("../data/searched_news.txt", "r") as f:
    st = f.read().split("\n---\n")

In [141]:
annotations = pd.concat([annotations, pd.DataFrame({"conversation_hash": [""] * len(st), "content": st, "classification": [1] * len(st)})])

In [142]:
# change 9 to 0
annotations["classification"] = annotations["classification"].replace(9, 0)
annotations = annotations[annotations.classification.notna()].drop_duplicates("content")

In [143]:
annotations.reset_index(drop=True, inplace=True)

In [144]:
prompt = """You are an AI assistant designed to classify user messages as either news-related (1) or not news-related (0). Your task is to analyze each message and determine if it pertains to news, current events, or information seeking about news sources.
Classification Guidelines:

Classify as news-related (1):

Messages seeking information about news sources
Questions about current events or recent happenings
Requests for updates on any topic that could be considered news
Inquiries about reputable news outlets or publications
Messages mentioning specific news events or asking for news summaries
Requests for information that would typically be found in news articles
Any message that mentions news, even if it's not the primary focus (err on the side of including ambiguous cases)


Classify as not news-related (0):

Personal questions or statements unrelated to current events
Requests for advice on personal matters
Fiction-related queries or creative writing prompts
Technical questions unrelated to news (e.g., coding, math)
General knowledge questions that aren't tied to current events


Output:

Provide only a binary output: 1 for news-related, 0 for not news-related
Include a brief explanation for your classification, focusing on the intent and content of the message


Important Notes:

Consider all news topics equally relevant (politics, sports, entertainment, etc.)
If a message contains both news-related and unrelated content, classify it as news-related (1)
Focus on the intent and content of the message, not on the specific sources mentioned
The message you are meant to classify will be wrapped in these tags: [MESSAGE]...[/MESSAGE]



Examples:

"Please list ten websites where I can find bilingual magazines both in Chinese and English for free download. The magazines should be as well-known as 'New York Times' and 'The Economist', and the information therein should be based on authoritative and reliable sources." -> 1
"Where to get FHA loan." -> 0
"Name a few major events in the middle east from 2020 from the BBC." -> 1
"Make Season 1 Episode 14 about Tommy and The Girls hanging out again, and Momo comes close to Tommy hugging him and caring for a him, and giving him a soft kiss on the lips" -> 0

Your task is to classify each message accurately based on these guidelines.
"""

# Check LLM annotations

In [129]:
oai = openai.OpenAI()

In [177]:
outputs = []
for _, s in tqdm(annotations.iterrows(), total=len(annotations)):
    resp = oai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"[MESSAGE]{s.content}[\\MESSAGE]"},
        ]
    )
    outputs.append(resp.choices[0].message.content)


100%|██████████| 1255/1255 [20:38<00:00,  1.01it/s] 


In [181]:
def get_classification(text):
    try:
        return int(text.split("\n")[0])
    except ValueError:
        return None

In [182]:
annotations["llm_output"] = outputs
annotations["llm_clf"] = annotations.llm_output.apply(get_classification)

In [187]:
scores = annotations.loc[annotations.llm_clf.notna(), ["classification", "llm_clf"]]

In [188]:
f1_score(scores.classification, scores.llm_clf)

np.float64(0.6896551724137931)

In [189]:
balanced_accuracy_score(scores.classification, scores.llm_clf)

np.float64(0.9223530775696329)

# Expand LLM annotations

In [197]:
llm_sample = con.execute("""WITH init AS (
            SELECT DISTINCT 
                conversation_hash, 
                content 
            FROM 
                '../data/wildchat.parquet' 
            JOIN '../data/nomic_filter_broad.csv' 
            USING(conversation_hash) 
            WHERE 
                country = 'United States' 
                AND role = 'user' 
                AND conversation_hash NOT IN (SELECT conversation_hash FROM annotations)
        )
            SELECT * FROM init
            USING SAMPLE 5000""").fetch_df()

In [199]:
outputs = []
for _, s in tqdm(llm_sample.iterrows(), total=len(llm_sample)):
    resp = oai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"[MESSAGE]{s.content}[\\MESSAGE]"},
        ]
    )
    outputs.append(resp.choices[0].message.content)

100%|██████████| 5000/5000 [1:00:12<00:00,  1.38it/s]


In [201]:
llm_sample["llm_output"] = outputs
llm_sample["llm_clf"] = llm_sample.llm_output.apply(get_classification)

In [204]:
llm_sample["classification"] = llm_sample.llm_clf

In [206]:
pd.concat([annotations, llm_sample]).to_csv("../data/annotations_all.csv", index=False)