In [1]:
from huggingface_hub import hf_hub_download, list_repo_files
from tqdm import tqdm
import dask.dataframe as dd
import glob
import os
import pandas as pd

labels = pd.read_csv('../data/combined_domain_labels_16k_splits.csv')
labels.sample(10)

Unnamed: 0,domain,label,label_source,set
10167,tricountysentry.com,News,northeastern_domain_demo,train
11414,redherring.com,News,northeastern_domain_demo,train
11855,thesyrinx.com,News,northeastern_domain_demo,train
7348,herald-publishing.com,News,northeastern_domain_demo,test
11651,surpriseindependent.com,News,northeastern_domain_demo,train
8046,laprensanwa.com,News,northeastern_domain_demo,train
3443,best-car-lease-deals.co.uk,Entertainment & Culture,data_provenance_init,train
571,benzinga.com,News,northeastern_domain_demo,train
1967,assetverification.com,Business & E-Commerce,data_provenance_init,train
11367,patriotpost.us,News,northeastern_domain_demo,train


In [2]:
# Load datasets
datasets = {
    #"zyda_main": "hf://datasets/nhagar/zyda_urls/**/*.parquet",
    "zyda_fwe3": "hf://datasets/nhagar/zyda-2_urls_fwe3/**/*.parquet",
    "zyda_dclm_crossdeduped": "hf://datasets/nhagar/zyda-2_urls_dclm_crossdeduped/**/*.parquet",
    "dclm_baseline_batch4": "hf://datasets/nhagar/dclm-baseline-1.0-parquet_urls/batch_4/train-*.parquet",
    "dclm_dedup": "hf://datasets/nhagar/dclm-dedup_urls/**/*.parquet",
    "falcon_refinedweb": "hf://datasets/nhagar/falcon-refinedweb_urls/batch*/train-*.parquet",
    "falcon_main": "hf://datasets/nhagar/falcon_urls/data/train-*.parquet",
    "c4_en": "hf://datasets/nhagar/c4_en_urls/data/train-*.parquet",
    "cultura": "hf://datasets/nhagar/cultura_urls/data/train-*.parquet"
}

In [3]:
DATASETS = ["nhagar/zyda-2_urls_zyda_crossdeduped-filtered",
            #"nhagar/falcon_urls"]
           ]

           
for dataset in tqdm(DATASETS):
    try:
        # Get files list from repo
        files = [f for f in list_repo_files(dataset, repo_type="dataset") 
                if f.endswith('.parquet')]
        
        if not files:
            print(f"No parquet files found for {dataset}, skipping")
            continue
            
        print(f"Found {len(files)} parquet files for {dataset}")
        print(f"Downloading all parquet files...")

        downloaded_files = []
        
        for file in files:
            download_path = hf_hub_download(
                repo_id=dataset,
                filename=file,
                repo_type="dataset",
                cache_dir="hf_cache"
            )
            downloaded_files.append(download_path)
        
        print(f"Downloaded {len(downloaded_files)} files")
        print(f"First file path: {downloaded_files[0]}")
        
        # Use the actual downloaded paths directly
        print("Processing...")
        df = dd.read_parquet(downloaded_files).compute()
        
    except Exception as e:
        print(f"Error with {dataset}: {str(e)}")
        # Save progress on error
        dataset_name = dataset.replace('nhagar/','')

  0%|                                                     | 0/1 [00:00<?, ?it/s]

Found 1 parquet files for nhagar/zyda-2_urls_zyda_crossdeduped-filtered
Downloading all parquet files...
Downloaded 1 files
First file path: hf_cache/datasets--nhagar--zyda-2_urls_zyda_crossdeduped-filtered/snapshots/695209cf7133a596fc999304fa623e802439281f/batch_1.parquet
Processing...


100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.97s/it]


In [4]:
# Filter to labeled domains
df.head(5)

Unnamed: 0,url,domain
0,https://www.hennsnoxlaw.com/faqs,hennsnoxlaw.com
1,https://store.basscentral.com/dingwall/dingwal...,basscentral.com
2,http://theplayfullife.polarnopyretusa.com/name...,polarnopyretusa.com
3,http://www.katephillipsevents.com/contact,katephillipsevents.com
4,https://www.littleroomunderthestairs.com/2015/...,littleroomunderthestairs.com


In [5]:
import time
from tqdm.notebook import tqdm

def filter_with_progress(df, domain_set, batch_size=100000):
    start_time = time.time()
    total_rows = len(df)
    filtered_rows = []
    
    for i in tqdm(range(0, total_rows, batch_size), desc="Filtering domains"):
        batch = df.iloc[i:min(i+batch_size, total_rows)]
        filtered_batch = batch[batch['domain'].isin(domain_set)]
        filtered_rows.append(filtered_batch)
        
        # Show additional progress info
        if (i + batch_size) % (batch_size * 10) == 0 or (i + batch_size) >= total_rows:
            elapsed = time.time() - start_time
            #print(f"Processed {min(i+batch_size, total_rows)}/{total_rows} rows ({(min(i+batch_size, total_rows)/total_rows)*100:.1f}%) in {elapsed:.1f}s")
    
    return pd.concat(filtered_rows, ignore_index=True)

# Use the function
domain_set = set(labels[labels.set=='train']['domain'])
filtered_df = filter_with_progress(df, domain_set)

Filtering domains:   0%|          | 0/1912 [00:00<?, ?it/s]

In [6]:
# Get 3 urls from each domain

In [37]:
import pandas as pd

result_data = []
processed_domains = set()

# Group and process in one pass
for domain, group in filtered_df.groupby('domain'):
    if domain not in domain_set:
        continue
        
    urls = group['url'].head(3).tolist()
    result_data.extend([(domain, url) for url in urls])
    processed_domains.add(domain)
    
    if len(processed_domains) % 1000 == 0:
        print(f"Processed {len(processed_domains)}/{len(domain_set)} domains")
    
    # Stop if we have all domains
    if len(processed_domains) == len(domain_set):
        break

url_df = pd.DataFrame(result_data, columns=['domain', 'url'])

Processed 1000/14256 domains
Processed 2000/14256 domains
Processed 3000/14256 domains
Processed 4000/14256 domains
Processed 5000/14256 domains
Processed 6000/14256 domains
Processed 7000/14256 domains
Processed 8000/14256 domains
Processed 9000/14256 domains
Processed 10000/14256 domains


In [38]:
# Extract features
url_df=url_df.merge(right=labels, left_on="domain", right_on="domain")
url_df.sample(10)

Unnamed: 0,domain,url,label,label_source,set
10594,heal.com,https://heal.com/healthrecords/,"Science, Academia, & Technology",data_provenance_init,train
13068,kiowacountypress.net,https://kiowacountypress.net/content/cdot-hold...,News,northeastern_domain_demo,train
8378,fbnewsleader.com,https://www.fbnewsleader.com/news/coyote-sight...,News,northeastern_domain_demo,train
29809,westword.com,https://www.westword.com/best-of/2019/shopping...,News,northeastern_domain_demo,train
24644,tecumsehchieftain.com,https://www.tecumsehchieftain.com/articles/new...,News,northeastern_domain_demo,train
22272,sanpedronewspilot.com,http://sanpedronewspilot.com/profiles/blogs/20...,News,northeastern_domain_demo,train
30142,wicd15.com,http://wicd15.com/template/cgi-bin/archived.pl...,News,northeastern_domain_demo,train
9408,gasconadecountyrepublican.com,http://gasconadecountyrepublican.com/content/g...,News,northeastern_domain_demo,train
6145,dailynexus.com,http://dailynexus.com/2019-04-12/rowan-blasts-...,News,northeastern_domain_demo,train
26116,themonitor.net,https://www.themonitor.net/article/traffic-sto...,News,northeastern_domain_demo,train


# Binary Classification

In [84]:
import pandas as pd
import numpy as np
import re
import time
from urllib.parse import urlparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, fbeta_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from openai import OpenAI
import json
import re
import time
from datetime import datetime

def extract_url_features(url):
    parsed = urlparse(url)
    domain = parsed.netloc
    path = parsed.path
    query = parsed.query

    features = {
        'url_length': len(url),
        'domain_length': len(domain),
        'path_length': len(path),
        'num_slashes': url.count('/'),
        'num_dots': url.count('.'),
        'num_equal': url.count('='),
        'num_params': len(query.split('&')) if query else 0,
        'has_https': int(url.startswith('https')),
        'has_www': int('www.' in domain),
        'num_digits': sum(c.isdigit() for c in url),
        'num_path_tokens': len([p for p in path.split('/') if p]),
        'has_news_in_domain': int('news' in domain.lower()),
        'has_news_in_path': int('news' in path.lower()),
        'has_article_in_path': int('article' in path.lower()),
        'has_content_in_path': int('content' in path.lower()),
        'has_story_in_path': int('story' in path.lower()),
        'has_date_pattern': int(bool(re.search(r'/(19|20)\d{2}[-/](0[1-9]|1[0-2])[-/](0[1-9]|[12][0-9]|3[01])/', url))),
        'has_blog_in_url': int('blog' in url.lower()),
        'num_hyphens': url.count('-'),
        'num_underscores': url.count('_'),
        'has_query_string': int(bool(query)),
        'has_ip_address': int(bool(re.match(r'(\d{1,3}\.){3}\d{1,3}', domain))),
        #'tld': domain.split('.')[-1] if '.' in domain else '',
        'tld_length': len(domain.split('.')[-1]) if '.' in domain else 0,
        'subdomain_depth': domain.count('.') - 1,
        'is_shortened_url': int(domain in ['bit.ly', 'tinyurl.com', 'goo.gl', 'ow.ly', 't.co']),
        'has_port': int(bool(parsed.port)),
        'port_number': parsed.port if parsed.port else 0,
        'has_fragment': int(bool(parsed.fragment)),
        'fragment_length': len(parsed.fragment),
        'path_digit_ratio': sum(c.isdigit() for c in path) / len(path) if len(path) > 0 else 0,
        'domain_digit_ratio': sum(c.isdigit() for c in domain) / len(domain) if len(domain) > 0 else 0,
        'query_length': len(query),
        'num_semicolons': url.count(';'),
        'num_at_symbols': url.count('@'),
        'num_percent': url.count('%'),
    }

    return pd.Series(features)

# Function to evaluate a model
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    # Training
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    # Prediction
    start_time = time.time()
    y_pred = model.predict(X_test)
    inference_time = time.time() - start_time
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    f2 = fbeta_score(y_test, y_pred, beta=2, average='weighted')
    
    print(f"\n{name} Results:")
    print(f"Training time: {train_time:.4f} seconds")
    print(f"Inference time: {inference_time:.4f} seconds")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"F2 Score: {f2:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    return {
        'Model': name,
        'Train Time (s)': train_time,
        'Inference Time (s)': inference_time,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'F2 Score': f2
    }


def classify_url_with_llm(url, model, prompt):
    llm = OpenAI(base_url="http://127.0.0.1:1234/v1", api_key="lm-studio")
    
    resp = llm.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": url},
        ],
    )
    txt = resp.choices[0].message.content

    # Extract JSON output from the response
    json_extract_pattern = re.compile(r"```json\n(.*?)\n```", re.DOTALL)
    json_extract = json_extract_pattern.search(txt).group(1)

    return json.loads(json_extract)



def classify_urls(url_df):
    print("Starting URL Classification for News Detection...")
    
    # Convert multi-class labels to binary (news or not_news)
    url_df["binary_label"] = url_df["label"].apply(lambda x: "news" if x == "News" else "not_news")
    
    # Check data
    print(f"Total samples: {len(url_df)}")
    print(f"News samples: {sum(url_df['binary_label'] == 'news')}")
    print(f"Non-news samples: {sum(url_df['binary_label'] == 'not_news')}")
    
    # Split data if no test set defined
    if 'set' not in url_df.columns or url_df['set'].isna().any():
        print("Creating train/test split...")
        train_df, test_df = train_test_split(url_df, test_size=0.2, stratify=url_df['binary_label'], random_state=42)
    else:
        train_df = url_df[url_df['set'] == 'train']
        test_df = url_df[url_df['set'] == 'test']
        if len(test_df) == 0:  # If no test set exists
            print("No test set found, creating from train set...")
            train_df, test_df = train_test_split(train_df, test_size=0.2, stratify=train_df['binary_label'], random_state=42)
    
    print(f"Training samples: {len(train_df)}")
    print(f"Testing samples: {len(test_df)}")
    
    # Prepare labels
    y_train = train_df['binary_label']
    y_test = test_df['binary_label']
    
    # Store results
    results = []
    
    # Approach 1: Lexical Features
    print("\n=== Approach 1: Using Lexical Features ===")
    X_train_lex = train_df['url'].apply(extract_url_features)
    X_test_lex = test_df['url'].apply(extract_url_features)
    
    # Scale features
    scaler = StandardScaler()
    X_train_lex_scaled = scaler.fit_transform(X_train_lex)
    X_test_lex_scaled = scaler.transform(X_test_lex)
    
    # Models
    dt_lex = DecisionTreeClassifier(max_depth=10, random_state=42)
    knn_lex = KNeighborsClassifier(n_neighbors=5)
    rf_lex = RandomForestClassifier(n_estimators=100, random_state=42)
    
    # Evaluate
    results.append(evaluate_model("Decision Tree (Lexical)", dt_lex, X_train_lex_scaled, X_test_lex_scaled, y_train, y_test))
    results.append(evaluate_model("KNN (Lexical)", knn_lex, X_train_lex_scaled, X_test_lex_scaled, y_train, y_test))
    results.append(evaluate_model("Random Forest (Lexical)", rf_lex, X_train_lex_scaled, X_test_lex_scaled, y_train, y_test))

    
    # Approach 2: Character N-grams
    print("\n=== Approach 2: Using Character N-grams ===")
    char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=1000)
    X_train_char = char_vectorizer.fit_transform(train_df['url'])
    X_test_char = char_vectorizer.transform(test_df['url'])
    
    # Models
    dt_char = DecisionTreeClassifier(max_depth=20, random_state=42)
    knn_char = KNeighborsClassifier(n_neighbors=5)
    rf_char = RandomForestClassifier(n_estimators=100, random_state=42)
    
    # Evaluate
    results.append(evaluate_model("Decision Tree (Char N-grams)", dt_char, X_train_char, X_test_char, y_train, y_test))
    results.append(evaluate_model("KNN (Char N-grams)", knn_char, X_train_char, X_test_char, y_train, y_test))
    results.append(evaluate_model("Random Forest (Char N-grams)", rf_char, X_train_char, X_test_char, y_train, y_test))
    
    # Approach 3: Domain Features
    print("\n=== Approach 3: Using Domain-Specific Features ===")
    # Extract URL parts
    train_df['domain'] = train_df['url'].apply(lambda x: urlparse(x).netloc)
    test_df['domain'] = test_df['url'].apply(lambda x: urlparse(x).netloc)
    train_df['path'] = train_df['url'].apply(lambda x: urlparse(x).path)
    test_df['path'] = test_df['url'].apply(lambda x: urlparse(x).path)
    
    # Create TF-IDF features for domains
    domain_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 5), max_features=500)
    X_train_domain = domain_vectorizer.fit_transform(train_df['domain'])
    X_test_domain = domain_vectorizer.transform(test_df['domain'])
    
    # Create TF-IDF features for paths
    path_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), max_features=500, token_pattern=r'[a-zA-Z0-9]+')
    X_train_path = path_vectorizer.fit_transform(train_df['path'])
    X_test_path = path_vectorizer.transform(test_df['path'])
    
    # Combine sparse matrices
    from scipy.sparse import hstack
    X_train_combined = hstack([X_train_domain, X_train_path])
    X_test_combined = hstack([X_test_domain, X_test_path])
    
    # Models
    dt_combined = DecisionTreeClassifier(max_depth=100, random_state=42)
    knn_combined = KNeighborsClassifier(n_neighbors=5)
    rf_combined = RandomForestClassifier(n_estimators=100, random_state=42)
    
    # Evaluate
    results.append(evaluate_model("Decision Tree (Domain+Path)", dt_combined, X_train_combined, X_test_combined, y_train, y_test))
    results.append(evaluate_model("KNN (Domain+Path)", knn_combined, X_train_combined, X_test_combined, y_train, y_test))
    results.append(evaluate_model("Random Forest (Domain+Path)", rf_combined, X_train_combined, X_test_combined, y_train, y_test))
    
    # Approach 4: Using LLM Classification
    print("\n=== Approach 4: Using LLM Classification ===")
    
    # Load prompts
    with open('prompt_is_news.txt', 'r') as file:
        prompt_binary_label = file.read()

    
    # Sample a small subset of test data for LLM evaluation (LLMs are slower)
    llm_test_size = min(1000, len(test_df))
    llm_test_df = test_df.sample(llm_test_size, random_state=42)
    
    # List of models to evaluate
    llm_models = [
        "gemma-2-9b-it-GGUF",
    ]
    
    for model in llm_models:
        print(f"\nProcessing LLM model: {model}")
        
        # Time the binary classification
        print(f"Starting binary classification for {model}...")
        start_time = time.time()
        llm_test_df[f"{model}_binary_label"] = llm_test_df["url"].apply(
            classify_url_with_llm, model=model, prompt=prompt_binary_label
        )
        binary_time = time.time() - start_time
        print(f"  Completed binary classification in {binary_time:.2f}s ({binary_time/len(llm_test_df):.4f}s per URL)")
        
        # Extract binary predictions
        llm_test_df[f"{model}_is_news"] = llm_test_df[f"{model}_binary_label"].apply(
            lambda x: x["is_news"]
        )
        
        # Convert to correct format for evaluation (0/1 -> news/not_news)
        llm_test_df[f"{model}_pred"] = llm_test_df[f"{model}_is_news"].apply(
            lambda x: "news" if x == 1 else "not_news"
        )
        
        # Calculate metrics
        accuracy = accuracy_score(llm_test_df["binary_label"], llm_test_df[f"{model}_pred"])
        f1 = f1_score(llm_test_df["binary_label"], llm_test_df[f"{model}_pred"], pos_label="news", average="binary")
        f2 = fbeta_score(llm_test_df["binary_label"], llm_test_df[f"{model}_pred"], beta=2, pos_label="news", average="binary")
        
        # Add to results
        results.append({
            'Model': f"{model} (LLM)",
            'Train Time (s)': binary_time,  # Total time spent on inference
            'Inference Time (s)': binary_time / len(llm_test_df),  # Average time per URL
            'Accuracy': accuracy,
            'F1 Score': f1,
            'F2 Score': f2
        })
        
        print(f"\n{model} (LLM) Results:")
        print(f"Processing time: {binary_time:.4f} seconds")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"F2 Score: {f2:.4f}")
        print("\nClassification Report:")
        print(classification_report(llm_test_df["binary_label"], llm_test_df[f"{model}_pred"]))
    
    # Summary
    print("\n=== Summary of Results ===")
    results_df = pd.DataFrame(results)
    print(results_df.sort_values('Accuracy', ascending=False))
    
    return results_df


In [56]:
# Run benchmarks
url_df = url_df[["url","domain","label","set"]]
url_df["binary_label"] = url_df["label"].apply(lambda x: "news" if x == "News" else "not_news")
results_df = classify_urls(url_df)
#results_df = run_benchmarks(df)

# Display results
print("\n=== Final Comparison ===")
print(results_df.sort_values('Accuracy', ascending=False))

Starting URL Classification for News Detection...
Total samples: 31598
News samples: 26899
Non-news samples: 4699
No test set found, creating from train set...
Training samples: 25278
Testing samples: 6320

=== Approach 1: Using Lexical Features ===

Decision Tree (Lexical) Results:
Training time: 0.0535 seconds
Inference time: 0.0005 seconds
Accuracy: 0.8634
F1 Score: 0.8377
F2 Score: 0.8514

Classification Report:
              precision    recall  f1-score   support

        news       0.88      0.97      0.92      5380
    not_news       0.60      0.24      0.34       940

    accuracy                           0.86      6320
   macro avg       0.74      0.61      0.63      6320
weighted avg       0.84      0.86      0.84      6320


KNN (Lexical) Results:
Training time: 0.0098 seconds
Inference time: 0.1469 seconds
Accuracy: 0.8646
F1 Score: 0.8542
F2 Score: 0.8599

Classification Report:
              precision    recall  f1-score   support

        news       0.90      0.95     

In [57]:
results_df

Unnamed: 0,Model,Train Time (s),Inference Time (s),Accuracy,F1 Score,F2 Score
0,Decision Tree (Lexical),0.053504,0.000481,0.863449,0.837657,0.851373
1,KNN (Lexical),0.009754,0.146912,0.864557,0.85416,0.859859
2,Random Forest (Lexical),1.326974,0.07277,0.874842,0.861726,0.868674
3,Decision Tree (Char N-grams),3.410615,0.004884,0.856329,0.842262,0.84997
4,KNN (Char N-grams),0.011227,53.640871,0.885601,0.884276,0.885051
5,Random Forest (Char N-grams),16.590193,0.106071,0.887025,0.870851,0.87901
6,Decision Tree (Domain+Path),2.769504,0.003268,0.946835,0.947246,0.946987
7,KNN (Domain+Path),0.010121,12.495374,0.86962,0.87255,0.870703
8,Random Forest (Domain+Path),7.657918,0.0984,0.976424,0.975695,0.975946
9,gemma-2-9b-it-GGUF (LLM),6385.115657,6.385116,0.847,0.901608,0.85634


# Multi-label Classification

In [85]:
MAX_TREE_DEPTH = 100
MAX_NGRAM_FEATURES = 1000
MAX_ESTIMATORS_FOREST = 20
N_NEIGHBORS = 3
NGRAM_RANGE_MIN = 2
NGRAM_RANGE_MAX = 6

def classify_urls_multilabel(url_df):
    print("Starting URL Classification with Multi-class Labels...")
    
    # Check data
    print(f"Total samples: {len(url_df)}")
    print("Label distribution:")
    label_counts = url_df['label'].value_counts()
    for label, count in label_counts.items():
        print(f"  {label}: {count} samples")
    
    # Store the original multi-class label
    url_df["original_label"] = url_df["label"]
    
    # Handle NaN labels if any exist
    if url_df["label"].isna().any():
        print(f"Warning: Found {url_df['label'].isna().sum()} NaN labels. Replacing with 'Other'.")
        url_df["label"] = url_df["label"].fillna("Other")
    
    # Split data if no test set defined
    if 'set' not in url_df.columns or url_df['set'].isna().any():
        print("Creating train/test split...")
        train_df, test_df = train_test_split(url_df, test_size=0.2, stratify=url_df['label'], random_state=42)
    else:
        train_df = url_df[url_df['set'] == 'train']
        test_df = url_df[url_df['set'] == 'test']
        if len(test_df) == 0:  # If no test set exists
            print("No test set found, creating from train set...")
            train_df, test_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=42)
    
    print(f"Training samples: {len(train_df)}")
    print(f"Testing samples: {len(test_df)}")
    
    # Prepare labels (using full multi-class labels)
    y_train = train_df['label']
    y_test = test_df['label']
    
    # Store results
    results = []
    
    # Approach 1: Lexical Features
    print("\n=== Approach 1: Using Lexical Features ===")
    X_train_lex = train_df['url'].apply(extract_url_features)
    X_test_lex = test_df['url'].apply(extract_url_features)
    
    # Scale features
    scaler = StandardScaler()
    X_train_lex_scaled = scaler.fit_transform(X_train_lex)
    X_test_lex_scaled = scaler.transform(X_test_lex)
    
    # Models
    dt_lex = DecisionTreeClassifier(max_depth=MAX_TREE_DEPTH, random_state=42)
    knn_lex = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
    rf_lex = RandomForestClassifier(n_estimators=MAX_ESTIMATORS_FOREST, random_state=42)
    
    # Evaluate
    results.append(evaluate_model("Decision Tree (Lexical)", dt_lex, X_train_lex_scaled, X_test_lex_scaled, y_train, y_test))
    results.append(evaluate_model("KNN (Lexical)", knn_lex, X_train_lex_scaled, X_test_lex_scaled, y_train, y_test))
    results.append(evaluate_model("Random Forest (Lexical)", rf_lex, X_train_lex_scaled, X_test_lex_scaled, y_train, y_test))

    # Print important features for Random Forest (Lexical)
    print("\n=== Most Important Lexical Features ===")
    rf_lex.fit(X_train_lex_scaled, y_train)
    # Assuming X_train_lex has named columns or we can get feature names
    feature_names_lex = X_train_lex.columns if hasattr(X_train_lex, 'columns') else [f"lexical_feature_{i}" for i in range(X_train_lex_scaled.shape[1])]
    print_feature_importance(rf_lex, feature_names_lex, "Random Forest - Lexical Features")
    
    # Approach 2: Character N-grams
    print("\n=== Approach 2: Using Character N-grams ===")
    char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(NGRAM_RANGE_MIN, NGRAM_RANGE_MAX), max_features=MAX_NGRAM_FEATURES)
    X_train_char = char_vectorizer.fit_transform(train_df['url'])
    X_test_char = char_vectorizer.transform(test_df['url'])
    
    # Models
    dt_char = DecisionTreeClassifier(max_depth=MAX_TREE_DEPTH, random_state=42)
    knn_char = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
    rf_char = RandomForestClassifier(n_estimators=MAX_ESTIMATORS_FOREST, random_state=42)
    
    # Evaluate
    results.append(evaluate_model("Decision Tree (Char N-grams)", dt_char, X_train_char, X_test_char, y_train, y_test))
    results.append(evaluate_model("KNN (Char N-grams)", knn_char, X_train_char, X_test_char, y_train, y_test))
    results.append(evaluate_model("Random Forest (Char N-grams)", rf_char, X_train_char, X_test_char, y_train, y_test))
    
    # Print important features for Random Forest (Char N-grams)
    print("\n=== Most Important Character N-gram Features ===")
    rf_char.fit(X_train_char, y_train)
    feature_names_char = char_vectorizer.get_feature_names_out()
    print_feature_importance(rf_char, feature_names_char, "Random Forest - Character N-gram Features")
    
    # Approach 3: Domain Features
    print("\n=== Approach 3: Using Domain-Specific Features ===")
    # Extract URL parts
    train_df['domain'] = train_df['url'].apply(lambda x: urlparse(x).netloc)
    test_df['domain'] = test_df['url'].apply(lambda x: urlparse(x).netloc)
    train_df['path'] = train_df['url'].apply(lambda x: urlparse(x).path)
    test_df['path'] = test_df['url'].apply(lambda x: urlparse(x).path)
    
    # Create TF-IDF features for domains
    domain_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(NGRAM_RANGE_MIN, NGRAM_RANGE_MAX), max_features=MAX_NGRAM_FEATURES)
    X_train_domain = domain_vectorizer.fit_transform(train_df['domain'])
    X_test_domain = domain_vectorizer.transform(test_df['domain'])
    
    # Create TF-IDF features for paths
    path_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(NGRAM_RANGE_MIN, NGRAM_RANGE_MAX), max_features=MAX_NGRAM_FEATURES, token_pattern=r'[a-zA-Z0-9]+')
    X_train_path = path_vectorizer.fit_transform(train_df['path'])
    X_test_path = path_vectorizer.transform(test_df['path'])
    
    # Combine sparse matrices
    from scipy.sparse import hstack
    X_train_combined = hstack([X_train_domain, X_train_path])
    X_test_combined = hstack([X_test_domain, X_test_path])
    
    # Models
    dt_combined = DecisionTreeClassifier(max_depth=MAX_TREE_DEPTH, random_state=42)
    knn_combined = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
    rf_combined = RandomForestClassifier(n_estimators=MAX_ESTIMATORS_FOREST, random_state=42)
    
    # Evaluate
    results.append(evaluate_model("Decision Tree (Domain+Path)", dt_combined, X_train_combined, X_test_combined, y_train, y_test))
    results.append(evaluate_model("KNN (Domain+Path)", knn_combined, X_train_combined, X_test_combined, y_train, y_test))
    results.append(evaluate_model("Random Forest (Domain+Path)", rf_combined, X_train_combined, X_test_combined, y_train, y_test))
    
    # Print important features for Random Forest (Domain+Path)
    print("\n=== Most Important Domain+Path Features ===")
    rf_combined.fit(X_train_combined, y_train)
    # For combined features, we need to concatenate the feature names
    feature_names_domain = domain_vectorizer.get_feature_names_out()
    feature_names_path = path_vectorizer.get_feature_names_out()
    feature_names_combined = np.concatenate([
        [f"domain_{name}" for name in feature_names_domain],
        [f"path_{name}" for name in feature_names_path]
    ])
    print_feature_importance(rf_combined, feature_names_combined, "Random Forest - Domain+Path Features")
    
    results_df = pd.DataFrame(results)
    results_df.sort_values('Accuracy', ascending=False, inplace=True)
    return results_df

def print_feature_importance(model, feature_names, title, top_n=20):
    # Get feature importances
    importances = model.feature_importances_
    
    # Sort feature importances in descending order
    indices = np.argsort(importances)[::-1]
    
    # Select top N features
    top_indices = indices[:top_n]
    top_importances = importances[top_indices]
    top_feature_names = [feature_names[i] for i in top_indices]
    
    # Print feature ranking
    print(f"\nTop {top_n} features for {title}:")
    for i, idx in enumerate(top_indices):
        if i < top_n:
            print(f"{i+1}. {feature_names[idx]} ({importances[idx]:.4f})")

In [86]:
# Run benchmarks
url_df = url_df[["url","domain","label","set"]]
results_df = classify_urls_multilabel(url_df)
#results_df = run_benchmarks(df)

# Display results
print("\n=== Final Comparison ===")
print(results_df.sort_values('Accuracy', ascending=False))

Starting URL Classification with Multi-class Labels...
Total samples: 31598
Label distribution:
  News: 26899 samples
  Business & E-Commerce: 1344 samples
  Entertainment & Culture: 1000 samples
  Science, Academia, & Technology: 711 samples
  General Information & Education: 627 samples
  Social Media/Forums: 294 samples
  Legal & Policy: 237 samples
  Other: 192 samples
  Blogs: 184 samples
  Books: 110 samples
No test set found, creating from train set...
Training samples: 25278
Testing samples: 6320

=== Approach 1: Using Lexical Features ===

Decision Tree (Lexical) Results:
Training time: 0.1178 seconds
Inference time: 0.0008 seconds
Accuracy: 0.8119
F1 Score: 0.8154
F2 Score: 0.8132

Classification Report:
                                 precision    recall  f1-score   support

                          Blogs       0.17      0.24      0.20        37
                          Books       0.31      0.36      0.33        22
          Business & E-Commerce       0.26      0.29    

In [87]:
results_df

Unnamed: 0,Model,Train Time (s),Inference Time (s),Accuracy,F1 Score,F2 Score
8,Random Forest (Domain+Path),1.851177,0.029116,0.975475,0.974585,0.974904
6,Decision Tree (Domain+Path),4.058303,0.003392,0.950633,0.948755,0.949649
4,KNN (Char N-grams),0.012188,162.406031,0.899051,0.891622,0.895526
7,KNN (Domain+Path),0.01035,14.788654,0.887184,0.887006,0.887026
2,Random Forest (Lexical),0.287884,0.016178,0.868513,0.838972,0.854986
5,Random Forest (Char N-grams),6.529289,0.033837,0.866614,0.821311,0.845972
1,KNN (Lexical),0.00993,0.180885,0.841614,0.831255,0.837046
0,Decision Tree (Lexical),0.117758,0.000784,0.811867,0.815352,0.813195
3,Decision Tree (Char N-grams),17.585677,0.006134,0.811551,0.808816,0.810431
