In [3]:
import json
import re

import duckdb
from openai import OpenAI
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
con = duckdb.connect()

In [5]:
q = "SELECT url FROM 'hf://datasets/nhagar/falcon_urls/data/train-00024-of-00170.parquet' LIMIT 100"

In [6]:
samp = con.execute(q).fetchdf()

In [7]:
samp["is_news"] = 0

In [8]:
samp.to_csv("../data/falcon_urls_sample.csv", index=False)

In [9]:
samp_labeled = con.execute("SELECT * FROM '../data/falcon_urls_sample.csv'").fetchdf()

In [10]:
with open('prompt.txt', 'r') as file:
    prompt = file.read()

In [11]:
llm = OpenAI(base_url="http://127.0.0.1:1234/v1", api_key="lm-studio")

In [12]:
def classify_url(url, model):
    resp = llm.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": url},
        ],
    )
    txt = resp.choices[0].message.content

    json_extract_pattern = re.compile(r"```json\n(.*?)\n```", re.DOTALL)
    json_extract = json_extract_pattern.search(txt).group(1)

    return json.loads(json_extract)

In [19]:
models = [
    #"llama-3.2-3b-instruct-4bit",
    #"qwen2.5-7b-instruct-1m",
    "gemma-2-9b-it-GGUF",
    #"qwen2.5-14b-instruct-1m"
]

In [20]:
results = {}
for model in models:
    samp_labeled[f"{model}_label"] = samp_labeled["url"].apply(classify_url, model=model)
    samp_labeled[f"{model}_is_news"] = samp_labeled[f"{model}_label"].apply(lambda x: x["is_news"])
    samp_labeled[f"{model}_reason"] = samp_labeled[f"{model}_label"].apply(lambda x: x["reason"])
    results[model] = {
        "accuracy": accuracy_score(samp_labeled["is_news"], samp_labeled[f"{model}_is_news"]),
        "precision": precision_score(samp_labeled["is_news"], samp_labeled[f"{model}_is_news"]),
        "recall": recall_score(samp_labeled["is_news"], samp_labeled[f"{model}_is_news"]),
        "f1": f1_score(samp_labeled["is_news"], samp_labeled[f"{model}_is_news"]),
    }

    with open(f"../data/model_results_{model}.json", "w") as f:
        json.dump(results, f, indent=2)

  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
samp_labeled.to_csv('../data/sample_gemma_labeled.csv',index=False)

In [24]:
# test
# Start with all domains from Common Crawl as input

# Two maps needed at start
# Domain → datasets mapping: Associate each domain with a set of datasets that use it
# Domain → URL sample mapping: For each domain, select 3 URL samples

#For each sampled URL:
    # Classify each URL using the ModernBERT classifier
    # classifier analyzes slug to determine if it represents news content
    # Labels sampled URLs
# Domains now have news_domain label (True if 4/7 or more URLs were labeled is_news)

# grab raw datasets for C4 and other large datasets (don't use hf library)
# build off dataset pipeline logic (loop of batches)
# need a deduplicated index, 3 URLs