In [70]:
import json
import re
from pathlib import Path

import duckdb
from openai import OpenAI
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [58]:
# Open a DuckDB connection (no database path is passed); `con` runs the SQL
# queries in the cells below.
con = duckdb.connect()

In [59]:
# Sampling query: the first 100 `url` values from one parquet shard of the
# nhagar/falcon_urls dataset, addressed through an hf:// path.
q = (
    "SELECT url "
    "FROM 'hf://datasets/nhagar/falcon_urls/data/train-00024-of-00170.parquet' "
    "LIMIT 100"
)

In [60]:
# Run the sampling query and fetch the result as a DataFrame.
# NOTE(review): the query uses LIMIT without ORDER BY, so the rows returned are
# presumably not guaranteed to be identical on every run — confirm.
samp = con.execute(q).fetchdf()

In [61]:
# Add the label column, initialised to the placeholder value 0 for every row.
samp["is_news"] = 0

In [12]:
# Write the placeholder-labeled sample only when the file does not exist yet.
# The same file is read back as `samp_labeled` and its `is_news` column is used
# as ground truth, presumably after being labeled by hand (TODO confirm).
# Writing unconditionally would reset every label to 0 on a fresh
# "Restart & Run All".
sample_csv = Path("../data/falcon_urls_sample.csv")
if not sample_csv.exists():
    samp.to_csv(sample_csv, index=False)

In [62]:
# Load the sample CSV back through DuckDB into `samp_labeled`. Its `is_news`
# column is what the model predictions are scored against further down.
samp_labeled = con.execute("SELECT * FROM '../data/falcon_urls_sample.csv'").fetchdf()

In [63]:
# System prompt sent with every classification request (see classify_url).
# It instructs the model to answer with a fenced ```json block holding the
# keys "url", "is_news" and "reason", which classify_url then parses.
prompt = """Classify whether a given URL points to a news website from a journalistic publisher or not.

Carefully consider if the website is recognized for publishing news:
- National sources: e.g., The New York Times.
- Local sources: e.g., NBC Connecticut.
- International sources: e.g., The Guardian.

Websites that should not be classified as news include:
- Personal blogs
- Real estate listings
- Food blogs
- Hobbyist forums

These may appear news-like but are not categorized as news sites.

# Steps

1. **Identify the Publisher**: Extract and identify the main domain of the URL to understand the source.
2. **Knowledge Check**: Cross-reference the main domain with known lists or databases of news sources.
3. **Evaluate Content Type**: If the publisher is not readily identified or ambiguous, evaluate the nature of the content in the URL, to infer whether it is a news story.
4. **Conclusion**: Determine if the website is a news site from a journalistic publisher or not.

# Output Format

Provide the classification in the form of a JSON:
```json
{
  "url": "[URL]",
  "is_news": [true/false],
  "reason": "[Brief explanation of the decision]"
}
```

# Examples

**Example 1**  
- **Input**: `https://www.nytimes.com`
- **Output**: 
```json
{
  "url": "https://www.nytimes.com",
  "is_news": true,
  "reason": "The URL is for The New York Times, a well-known national news publisher."
}
```

**Example 2**  
- **Input**: `https://www.joesblogaboutgardening.com`
- **Output**:
```json
{
  "url": "https://www.joesblogaboutgardening.com",
  "is_news": false,
  "reason": "The URL is for a personal blog about gardening, not a reputable news publisher."
}
```

# Notes

- Pay close attention to subdomains which might reflect news sections of larger non-news websites.
- Stay mindful of masquerading sites, which attempt to resemble reputable news publishers.
"""

In [64]:
# OpenAI client pointed at an HTTP endpoint on localhost port 1234 rather than
# the default API host. NOTE(review): the api_key literal looks like a
# placeholder for a local LM Studio server, not a real secret — confirm.
llm = OpenAI(base_url="http://127.0.0.1:1234/v1", api_key="lm-studio")

In [76]:
# Matches any fenced code block (``` or ```json), tolerating CRLF line endings
# and missing/extra whitespace around the payload. Compiled once, not per call.
_FENCED_BLOCK = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE)


def _first_json_object(text):
    """Return the first JSON object (dict) embedded in ``text``, or None."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first valid JSON value, so
            # trailing chatter after the object does not break parsing.
            obj, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            obj = None
        if isinstance(obj, dict):
            return obj
        start = text.find("{", start + 1)
    return None


def classify_url(url, model):
    """Ask ``model`` whether ``url`` is a news site and return its parsed reply.

    The global ``prompt`` is sent as the system message and ``url`` as the user
    message through the global ``llm`` client.

    Args:
        url: The URL to classify.
        model: Model identifier passed to the chat-completions endpoint.

    Returns:
        dict: The JSON object found in the model's reply. The prompt asks for
        the keys "url", "is_news" and "reason", but they are not validated here.

    Raises:
        ValueError: If the reply contains no parseable JSON object.
    """
    resp = llm.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": url},
        ],
    )
    # content may be None (e.g. an empty completion); treat it as empty text.
    txt = resp.choices[0].message.content or ""

    # Prefer fenced blocks, in order; fall back to the raw reply for models
    # that skip the fence or get cut off before closing it.
    candidates = [match.group(1) for match in _FENCED_BLOCK.finditer(txt)]
    candidates.append(txt)
    for candidate in candidates:
        parsed = _first_json_object(candidate)
        if parsed is not None:
            return parsed

    raise ValueError(
        f"No JSON object found in {model!r} response for {url!r}: {txt[:200]!r}"
    )

In [77]:
# Model identifiers to evaluate; each is passed as `model=` to classify_url.
models = [
    "llama-3.2-3b-instruct",
    "qwen2.5-7b-instruct-1m",
    "gemma-2-9b-it",
]

In [78]:
def _coerce_is_news(value):
    """Normalize a model's ``is_news`` value to a bool.

    JSON booleans and numbers go through ``bool()``. Strings are compared
    against "true"/"1"/"yes" (case-insensitive) because ``bool("false")`` would
    be True. A missing value (None) counts as False, i.e. "not news".
    """
    if isinstance(value, str):
        return value.strip().lower() in {"true", "1", "yes"}
    return bool(value)


# Ground-truth labels, cast once to int so truth and predictions share the
# same 0/1 encoding when handed to the sklearn metrics.
y_true = samp_labeled["is_news"].astype(int)

results = {}
for model in models:
    # One LLM call per URL; each element is the parsed JSON reply (a dict).
    replies = samp_labeled["url"].apply(classify_url, model=model)
    samp_labeled[f"{model}_label"] = replies
    # .get() keeps a reply with a missing key from aborting the whole run.
    samp_labeled[f"{model}_is_news"] = replies.apply(
        lambda reply: _coerce_is_news(reply.get("is_news"))
    )
    samp_labeled[f"{model}_reason"] = replies.apply(
        lambda reply: reply.get("reason", "")
    )

    y_pred = samp_labeled[f"{model}_is_news"].astype(int)
    results[model] = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
    }

In [81]:
# Persist the per-model metrics dict as indented JSON.
with open("../data/model_results.json", "w") as results_file:
    json.dump(results, results_file, indent=2)