In [1]:
# load 
import dask.dataframe as dd

# Parquet shards of the C4 (en) URL list on the Hugging Face hub.
# total urls: 365,233,500
C4_URLS_GLOB = "hf://datasets/nhagar/c4_en_urls/data/train-*.parquet"

# `df` spans every shard; `df100` holds just the first 100 rows for quick trials.
df = dd.read_parquet(C4_URLS_GLOB)
df100 = df.head(n=100)

In [2]:
import pandas as pd
# URL -> domain-category mapping (the displayed sample shows `URL` and `Domains`
# columns); five random rows are shown as a quick sanity check.
LABELED_DOMAINS_CSV = '../data/url_domain_1to1_mappings.csv'
labeled_domains = pd.read_csv(LABELED_DOMAINS_CSV)
labeled_domains.sample(n=5)

Unnamed: 0,URL,Domains
4037,www.sustaincase.com,News
1334,www.epdf.tips,"Science, Academia, & Technology"
41,www.jotform.com,Business & E-Commerce
512,www.marketplace.org,News
4094,www.biogeoamb.grupos.uniovi.es,General Information & Education


In [3]:
from openai import OpenAI
import json
import re

# System prompts for the two classification tasks. The encoding is pinned to
# UTF-8 so the prompt text read here does not depend on the platform's default
# locale encoding.
with open('prompt_is_news.txt', 'r', encoding='utf-8') as file:
    prompt_binary_label = file.read()

with open('prompt_service_label.txt', 'r', encoding='utf-8') as file:
    prompt_service_label = file.read()

# OpenAI-compatible client pointed at a server on localhost port 1234.
# NOTE(review): "lm-studio" is presumably a placeholder key for a local
# LM Studio server rather than a real credential — confirm.
llm = OpenAI(base_url="http://127.0.0.1:1234/v1", api_key="lm-studio")

In [4]:
# Matches a ```json fenced block. Whitespace around the payload is optional so
# that fences without the exact newline placement are still recognized.
# Compiled once instead of on every call.
_JSON_FENCE_RE = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL)


def classify_url(url, model, prompt):
    """Classify a URL with a chat model and return the parsed JSON reply.

    Args:
        url: Sent verbatim as the user message.
        model: Model identifier passed to the chat-completions endpoint.
        prompt: System prompt describing the classification task.

    Returns:
        Whatever ``json.loads`` yields for the JSON payload in the reply.

    Raises:
        ValueError: If the reply carries no text content, or (as its subclass
            ``json.JSONDecodeError``) if the payload is not valid JSON.
    """
    resp = llm.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": url},
        ],
    )
    txt = resp.choices[0].message.content
    if txt is None:
        raise ValueError(
            f"Model {model!r} returned no text content for URL {url!r}"
        )

    # Prefer the fenced block when there is one; otherwise try the whole reply,
    # so a model that answers with bare JSON no longer triggers an opaque
    # AttributeError on a failed regex match.
    fenced = _JSON_FENCE_RE.search(txt)
    payload = fenced.group(1) if fenced else txt.strip()

    return json.loads(payload)

In [11]:
# NOTE: despite the name, frac=0.0001 of the 365,233,500 URLs is roughly 36.5k rows.
# random_state pins the draw so a kernel restart reproduces the same sample.
# .compute() materializes the result as a new in-memory frame, so the extra
# .copy() was dropped.
sample_1M = df.sample(frac=0.0001, random_state=42).compute()

In [None]:
import pandas as pd
import json
import time
from datetime import datetime

# List of models to evaluate
models = [
    #"llama-3.2-3b-instruct-4bit",
    #"qwen2.5-7b-instruct-1m",
    "gemma-2-9b-it-GGUF",
    #"qwen2.5-14b-instruct-1m"
]

# Fixed seed so the sampled URL sets are the same after a kernel restart.
SAMPLE_SEED = 42

# Evaluation sets, smallest first. The "sample_1M" entry stays a lazy dask
# sample here and is only materialized when its turn comes in the loop below,
# so the small runs are not delayed by a scan of the full dataset.
# NOTE(review): frac=0.001 of the ~365M URLs is ~365k rows, not 1M; the key is
# kept unchanged because it is part of the output file names.
dataframes = {
    "sample_10": df100.sample(10, random_state=SAMPLE_SEED).copy(),
    "sample_100": df100.copy(),
    "sample_1M": df.sample(frac=0.001, random_state=SAMPLE_SEED),
}


def run_timed_classification(urls, model, prompt, task_name):
    """Classify every URL in `urls` with `classify_url` and time the pass.

    Args:
        urls: pandas Series of URLs (must be in memory so `.apply` runs eagerly).
        model: Model identifier forwarded to `classify_url`.
        prompt: System prompt forwarded to `classify_url`.
        task_name: Short task label used only in the progress messages.

    Returns:
        Tuple `(labels, elapsed_seconds)`: the Series of parsed replies and
        the wall-clock duration of the pass.
    """
    print(f"Starting {task_name} classification for {model}...")
    started = time.perf_counter()
    labels = urls.apply(classify_url, model=model, prompt=prompt)
    elapsed = time.perf_counter() - started
    print(f"  Completed {task_name} classification in {elapsed:.2f}s ({elapsed / len(urls):.4f}s per URL)")
    return labels, elapsed


results = {}

# Outer loop for each dataframe. The loop variables are deliberately NOT named
# `df`: rebinding `df` here would replace the dask frame loaded at the top of
# the notebook and break any re-run of this cell.
for df_name, source_df in dataframes.items():
    print(f"\n===== Processing dataframe: {df_name} =====")
    results[df_name] = {}

    # dask collections are lazy; turn them into a pandas frame first so that
    # `.apply` really calls the model and the timings measure real work.
    if hasattr(source_df, "compute"):
        sample_df = source_df.compute()
        dataframes[df_name] = sample_df  # keep the materialized frame inspectable
    else:
        sample_df = source_df
    url_count = len(sample_df)

    # Placeholder columns (constant 0 / empty string); they are written to the
    # CSV alongside the per-model columns.
    sample_df["is_news"] = 0
    sample_df["label"] = ""

    for model in models:
        print(f"\nProcessing model: {model} on dataframe: {df_name}")

        run_stats = {
            "binary_classification_time": 0,
            "service_classification_time": 0,
            "total_time": 0,
            "start_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "url_count": url_count,
        }
        results[df_name][model] = run_stats

        # Time the binary classification
        binary_labels, binary_time = run_timed_classification(
            sample_df["url"], model, prompt_binary_label, "binary"
        )
        sample_df[f"{model}_binary_label"] = binary_labels
        run_stats["binary_classification_time"] = binary_time

        # Time the service classification
        service_labels, service_time = run_timed_classification(
            sample_df["url"], model, prompt_service_label, "service"
        )
        sample_df[f"{model}_service_label"] = service_labels
        run_stats["service_classification_time"] = service_time

        # Calculate derived fields: pull the scalar answers out of the parsed replies
        sample_df[f"{model}_is_news"] = sample_df[f"{model}_binary_label"].apply(
            lambda reply: reply["is_news"]
        )
        sample_df[f"{model}_label"] = sample_df[f"{model}_service_label"].apply(
            lambda reply: reply["label"]
        )

        # Record total time and end time
        total_time = binary_time + service_time
        run_stats["total_time"] = total_time
        run_stats["end_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        run_stats["avg_time_per_url"] = total_time / url_count

        print(f"Model {model} completed on {df_name}")
        print(f"  Total time: {total_time:.2f}s ({total_time / url_count:.4f}s per URL)")

        # Save intermediate results after each model
        with open(f"../data/results_{df_name}_{model}.json", "w") as f:
            json.dump(run_stats, f, indent=2)

    # Save the dataframe with all results for this dataset
    output_filename = f"../data/{df_name}_all_models.csv"
    sample_df.to_csv(output_filename, index=False)
    print(f"Saved results for {df_name} to {output_filename}")

# Save final consolidated results
with open("../data/all_experiments_results.json", "w") as f:
    json.dump(results, f, indent=2)

print("\nAll experiments completed successfully!")
print("Final results saved to ../data/all_experiments_results.json")


===== Processing dataframe: sample_10 =====

Processing model: gemma-2-9b-it-GGUF on dataframe: sample_10
Starting binary classification for gemma-2-9b-it-GGUF...
  Completed binary classification in 51.11s (5.1111s per URL)
Starting service classification for gemma-2-9b-it-GGUF...
  Completed service classification in 54.27s (5.4271s per URL)
Model gemma-2-9b-it-GGUF completed on sample_10
  Total time: 105.38s (10.5382s per URL)
Saved results for sample_10 to ../data/sample_10_all_models.csv

===== Processing dataframe: sample_100 =====

Processing model: gemma-2-9b-it-GGUF on dataframe: sample_100
Starting binary classification for gemma-2-9b-it-GGUF...
  Completed binary classification in 529.32s (5.2932s per URL)
Starting service classification for gemma-2-9b-it-GGUF...
  Completed service classification in 518.91s (5.1891s per URL)
Model gemma-2-9b-it-GGUF completed on sample_100
  Total time: 1048.23s (10.4823s per URL)
Saved results for sample_100 to ../data/sample_100_all_mod

In [None]:
    # Binary "is_news" metrics for one model, stored under the model's name.
    # Each scorer receives the samp_labeled["is_news"] column first and the
    # model's f"{model}_is_news" column second.
    # NOTE(review): accuracy_score / precision_score / recall_score / f1_score
    # are presumably the sklearn.metrics functions, and this body presumably
    # sits inside a `for model in models:` loop — neither the import nor the
    # loop header is visible in this cell; confirm.
    # NOTE(review): `results` is also the name of the timing dict keyed by
    # dataframe name in the experiment cell; unless it is re-initialized in
    # code not shown here, these model-keyed entries land in that same dict.
    results[model] = {
        "accuracy": accuracy_score(samp_labeled["is_news"], samp_labeled[f"{model}_is_news"]),
        "precision": precision_score(samp_labeled["is_news"], samp_labeled[f"{model}_is_news"]),
        "recall": recall_score(samp_labeled["is_news"], samp_labeled[f"{model}_is_news"]),
        "f1": f1_score(samp_labeled["is_news"], samp_labeled[f"{model}_is_news"]),
    }