In [1]:
import json
import re

import duckdb
from openai import OpenAI
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Throwaway in-memory DuckDB connection used for every query in this notebook.
con = duckdb.connect(database=":memory:")

In [3]:
# Sampling query against one parquet shard of the falcon_urls dataset.
# NOTE(review): LIMIT without ORDER BY does not pin which 100 rows come back.
q = (
    "SELECT url "
    "FROM 'hf://datasets/nhagar/falcon_urls/data/train-00024-of-00170.parquet' "
    "LIMIT 100"
)

In [4]:
# Run the sampling query and materialize the result as a pandas DataFrame.
samp = con.execute(q).df()

In [5]:
# Add the ground-truth column, defaulting every sampled URL to 0.
samp = samp.assign(is_news=0)

In [6]:
# Write the blank labeling template only when the CSV does not exist yet.
# The next cell reads this same file back as the labeled sample (presumably
# after `is_news` has been filled in by hand -- TODO confirm), so rewriting it
# unconditionally on every run would reset all labels to 0.
try:
    # Mode "x" creates the file and fails if it is already there; newline=""
    # lets pandas control line endings when handed an open text handle.
    with open("../data/falcon_urls_sample.csv", "x", newline="", encoding="utf-8") as csv_file:
        samp.to_csv(csv_file, index=False)
except FileExistsError:
    print("../data/falcon_urls_sample.csv already exists; keeping it as-is")

In [7]:
# Load the sample CSV back through DuckDB; this frame carries the `is_news`
# column that the model predictions are scored against further down.
samp_labeled = con.execute(
    "SELECT * FROM '../data/falcon_urls_sample.csv'"
).df()

In [8]:
# Load the system prompt sent with every classification request.
# The encoding is pinned so the text decodes the same way on every platform
# instead of depending on the machine's locale default.
with open('prompt.txt', 'r', encoding='utf-8') as file:
    prompt = file.read()

In [9]:
# OpenAI-compatible client pointed at a server on localhost:1234.
llm = OpenAI(
    api_key="lm-studio",
    base_url="http://127.0.0.1:1234/v1",
)

In [10]:
def classify_url(url, model):
    """Ask `model` to classify `url` and return the JSON object it replies with.

    The module-level `prompt` is sent as the system message and `url` as the
    user message through the module-level `llm` client. The reply is expected
    to contain a JSON object, normally inside a ```json fenced block; a bare
    ``` fence or an unfenced object is accepted as a fallback.

    Args:
        url: The URL string to classify.
        model: Name of the model to request from the server.

    Returns:
        The decoded JSON payload (whatever `json.loads` produces for it).

    Raises:
        ValueError: If the reply is empty or contains no JSON object, or if
            the extracted text is not valid JSON (`json.JSONDecodeError` is a
            `ValueError` subclass).
    """
    resp = llm.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": url},
        ],
    )
    # `content` can be None (e.g. an empty completion); normalize to a string.
    txt = resp.choices[0].message.content or ""

    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", txt, re.DOTALL)
    if fenced:
        payload = fenced.group(1)
    else:
        # No code fence: fall back to the outermost {...} span in the reply.
        start, end = txt.find("{"), txt.rfind("}")
        if start == -1 or end < start:
            raise ValueError(
                f"No JSON object found in response from {model!r} for {url!r}: {txt!r}"
            )
        payload = txt[start:end + 1]

    return json.loads(payload)

In [11]:
# Model identifiers to evaluate, in the order they are run.
models = [
    "llama-3.2-3b-instruct",
    "qwen2.5-7b-instruct-1m",
    "gemma-2-9b-it",
    "qwen2.5-14b-instruct-1m",
]

In [None]:
# Score every model against the `is_news` column of the labeled sample.
results = {}
# Metric name (as stored in `results`) -> scorer called as scorer(y_true, y_pred).
scorers = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1": f1_score,
}
for model in models:
    label_col = f"{model}_label"
    pred_col = f"{model}_is_news"

    # One classify_url call per URL; the parsed reply is kept in full, then
    # its "is_news" and "reason" fields are split into their own columns.
    samp_labeled[label_col] = samp_labeled["url"].apply(classify_url, model=model)
    samp_labeled[pred_col] = samp_labeled[label_col].apply(lambda parsed: parsed["is_news"])
    samp_labeled[f"{model}_reason"] = samp_labeled[label_col].apply(lambda parsed: parsed["reason"])

    y_true = samp_labeled["is_news"]
    y_pred = samp_labeled[pred_col]
    results[model] = {name: score(y_true, y_pred) for name, score in scorers.items()}

In [None]:
# Persist the per-model metrics as pretty-printed JSON.
with open("../data/model_results_qwen14.json", "w") as out_file:
    out_file.write(json.dumps(results, indent=2))