In [1]:
import time
import pandas as pd
import dask.dataframe as dd
from scipy.sparse import hstack


from huggingface_hub import hf_hub_download, list_repo_files
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, fbeta_score, classification_report

# Modeling libraries
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Hugging Face dataset repositories that supply the URL corpus
DATASETS = [
    "nhagar/zyda-2_urls_zyda_crossdeduped-filtered",
]

# Load the ground-truth domain labels and preview five random rows
ground_truth_labels = pd.read_csv('../data/combined_domain_labels_16k_splits.csv')
ground_truth_labels.sample(5)

Unnamed: 0,domain,label,label_source,set
14101,thecoastnews.com,Local News,northeastern_domain_demo,val
8385,miamiherald.com,Local News,northeastern_domain_demo,train
5838,fairfaxconnection.com,Local News,northeastern_domain_demo,train
10096,redeyechicago.com,Local News,northeastern_domain_demo,train
12339,unionnews-exchange.com,Local News,northeastern_domain_demo,train


In [2]:
# Download every parquet file of each dataset in DATASETS and load it into pandas.
# Each dataset is handled best-effort: a failure is printed and recorded, and the
# loop moves on to the next dataset. Frames are accumulated so that a later
# dataset no longer overwrites an earlier one.
loaded_frames = []
failed_datasets = []

for dataset in tqdm(DATASETS):
    try:
        # Get files list from repo
        files = [f for f in list_repo_files(dataset, repo_type="dataset")
                 if f.endswith('.parquet')]
        if not files:
            print(f"No parquet files found for {dataset}, skipping")
            continue

        print(f"Found {len(files)} parquet files for {dataset}")
        print("Downloading all parquet files...")
        downloaded_files = [
            hf_hub_download(
                repo_id=dataset,
                filename=file,
                repo_type="dataset",
                cache_dir="hf_cache"
            )
            for file in files
        ]

        print(f"Downloaded {len(downloaded_files)} files")
        print(f"First file path: {downloaded_files[0]}")
        print("Processing...")
        loaded_frames.append(dd.read_parquet(downloaded_files).compute())
    except Exception as e:
        failed_datasets.append(dataset)
        print(f"Error with {dataset}: {str(e)}")

# Fail here, next to the cause, instead of with a NameError on `df` in a later cell
if not loaded_frames:
    raise RuntimeError(
        f"No dataset could be loaded (failed: {failed_datasets}); see the messages above"
    )

# With a single dataset reuse its frame as-is to avoid copying a very large table
if len(loaded_frames) == 1:
    df = loaded_frames[0]
else:
    df = pd.concat(loaded_frames, ignore_index=True)
del loaded_frames  # drop the extra references so the per-dataset frames can be freed

  0%|                                                     | 0/1 [00:00<?, ?it/s]

Found 1 parquet files for nhagar/zyda-2_urls_zyda_crossdeduped-filtered
Downloading all parquet files...
Downloaded 1 files
First file path: hf_cache/datasets--nhagar--zyda-2_urls_zyda_crossdeduped-filtered/snapshots/695209cf7133a596fc999304fa623e802439281f/batch_1.parquet
Processing...


100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.20s/it]


In [3]:
# filter function to get URLs from domains with a ground truth label

def filter_with_progress(df, domain_set, batch_size=100000):
    """
    Keep only the rows of ``df`` whose 'domain' value is in ``domain_set``.

    The frame is scanned in consecutive positional slices of ``batch_size`` rows
    so that a tqdm progress bar can be shown while filtering.

    Args:
        df: DataFrame with a 'domain' column
        domain_set: Collection of domain values to keep
        batch_size: Number of rows examined per progress step

    Returns:
        DataFrame of the matching rows in their original order, with a fresh
        0..n-1 index. An empty ``df`` yields an empty frame with the same columns.
    """
    total_rows = len(df)
    if total_rows == 0:
        # pd.concat raises ValueError on an empty list, so short-circuit here
        return df.iloc[0:0].reset_index(drop=True)

    filtered_rows = []
    for start in tqdm(range(0, total_rows, batch_size), desc="Filtering domains"):
        # iloc clips the stop position at the end of the frame
        batch = df.iloc[start:start + batch_size]
        filtered_rows.append(batch[batch['domain'].isin(domain_set)])

    return pd.concat(filtered_rows, ignore_index=True)

In [4]:
# Collect the labelled domains into a set, then keep only the URLs of those domains
domain_set = set(ground_truth_labels['domain'])
filtered_df = filter_with_progress(df, domain_set)

Filtering domains: 100%|████████████████████| 1912/1912 [01:37<00:00, 19.70it/s]


In [5]:
# Number of URL rows that survived the domain filter
len(filtered_df)

42189780

In [6]:
# Preview ten randomly chosen rows of the filtered URLs
filtered_df.sample(10)

Unnamed: 0,url,domain
21147334,http://www.hindustantimes.com/india/wife-of-ma...,hindustantimes.com
15195719,http://www.kfor1240.com/pages/15150062.php,kfor1240.com
21733231,http://www.kvia.com/news/man-crossing-street-s...,kvia.com
38129972,http://www.harpersbazaar.com/culture/film-tv/i...,harpersbazaar.com
4006794,https://www.stgeorgeutah.com/news/archive/2015...,stgeorgeutah.com
19866719,http://www.ktvu.com/news/congresswoman-speier-...,ktvu.com
24707185,http://ftalphaville.ft.com/blog/2008/10/15/170...,ft.com
42018735,https://www.foxnews.com/us/jury-selection-to-b...,foxnews.com
18092596,https://www.healthcentral.com/article/4-risk-f...,healthcentral.com
9876384,http://activate.metroactive.com/2014/02/morris...,metroactive.com


In [7]:
# Binary target: 1 when the label text contains 'News', otherwise 0.
# str.contains(..., na=False) treats a missing label as non-news instead of
# raising TypeError the way `'News' in x` does on NaN.
ground_truth_labels['is_news'] = (
    ground_truth_labels['label'].str.contains('News', regex=False, na=False).astype(int)
)
domain_news_map = ground_truth_labels[['domain', 'is_news']].copy()

# Drop an is_news column left behind by a previous run of this cell, so that
# re-running does not produce is_news_x / is_news_y.
if 'is_news' in filtered_df.columns:
    filtered_df = filtered_df.drop(columns='is_news')

# validate='many_to_one' raises MergeError if a domain appears more than once in the
# label table, rather than silently duplicating URL rows.
filtered_df = filtered_df.merge(
    domain_news_map, on='domain', how='left', validate='many_to_one'
)

# Prepare training and testing data
train_domains = ground_truth_labels[ground_truth_labels['set'].str.contains('train')]['domain']
test_domains = ground_truth_labels[ground_truth_labels['set'].str.contains('test')]['domain']

In [8]:
# Class balance of the binary is_news label across the filtered URLs
filtered_df.is_news.value_counts()

is_news
1    38574202
0     3615578
Name: count, dtype: int64

In [9]:
# sample for testing
# A fixed seed keeps the subsample, and every metric computed from it, reproducible.
# Capping n at the frame length avoids ValueError when fewer than 100000 rows are
# available (for example when this cell is re-run on the already-sampled frame).
filtered_df = filtered_df.sample(n=min(100000, len(filtered_df)), random_state=42)

# Separate training and testing data
X_train = filtered_df[filtered_df['domain'].isin(train_domains)]
X_test = filtered_df[filtered_df['domain'].isin(test_domains)]

# Prepare labels
y_train = X_train['is_news']
y_test = X_test['is_news']

In [10]:
# experiment parameters

# Helper Functions for the Experiments

In [11]:
def create_vectorization_strategies():
    """
    Build the named TF-IDF vectorizers compared in the experiments.

    Returns:
        Dictionary mapping a strategy name to a fresh, unfitted TfidfVectorizer.
        The 'Combined (Domain Char + Path Word)' entry is a None placeholder.
    """
    def char_ngrams():
        # Character 2-5 grams, capped at 500 features
        return TfidfVectorizer(
            analyzer='char',
            ngram_range=(2, 5),
            max_features=500
        )

    def word_ngrams():
        # Alphanumeric word 1-5 grams, capped at 500 features
        return TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 5),
            max_features=500,
            token_pattern=r'[a-zA-Z0-9]+'
        )

    strategies = {}

    # Domain Vectorization Strategies
    strategies['Domain (Char 2-5 grams)'] = char_ngrams()
    strategies['Domain (Word 1-5 grams)'] = word_ngrams()

    # Path Vectorization Strategies
    strategies['Path (Char 2-5 grams)'] = char_ngrams()
    strategies['Path (Word 1-5 grams)'] = word_ngrams()

    # Combined Feature Vectorizers: placeholder only, no vectorizer instance
    strategies['Combined (Domain Char + Path Word)'] = None

    return strategies

def prepare_features(train_df, test_df, vectorization_strategies):
    """
    Prepare features using different vectorization strategies

    Each vectorizer is fitted on the training frame and then applied to the test
    frame. The input column is chosen from the strategy name's prefix, so custom
    strategies are honoured as long as their name starts with 'Domain' or 'Path'.
    Entries whose vectorizer is None are skipped, and names with an unrecognised
    prefix are skipped with a warning instead of being dropped silently.

    The 'Combined (Domain Char + Path Word)' feature set is always added, built
    from its own pair of vectorizers.

    Args:
        train_df: Training dataframe with 'domain' and 'url' columns
        test_df: Testing dataframe with 'domain' and 'url' columns
        vectorization_strategies: Dictionary of vectorization strategies

    Returns:
        Dictionary mapping strategy name to {'X_train': ..., 'X_test': ...}
    """
    import warnings

    # Strategy-name prefix -> dataframe column the vectorizer is applied to.
    # NOTE(review): "Path" strategies are fed the full 'url' column, which also
    # contains the domain. Confirm whether only the URL path was intended.
    column_for_prefix = {'Domain': 'domain', 'Path': 'url'}

    features = {}

    # Individual Vectorization Strategies
    for strategy_name, vectorizer in vectorization_strategies.items():
        if vectorizer is None:
            # Placeholder entry (e.g. the combined strategy handled below)
            continue

        column = next(
            (col for prefix, col in column_for_prefix.items()
             if str(strategy_name).startswith(prefix)),
            None
        )
        if column is None:
            warnings.warn(
                f"Skipping vectorization strategy {strategy_name!r}: "
                f"name must start with one of {list(column_for_prefix)}"
            )
            continue

        features[strategy_name] = {
            'X_train': vectorizer.fit_transform(train_df[column]),
            'X_test': vectorizer.transform(test_df[column])
        }

    # Combined Feature Strategy
    domain_char_vec = TfidfVectorizer(analyzer='char', ngram_range=(2, 5), max_features=500)
    path_word_vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), max_features=500, token_pattern=r'[a-zA-Z0-9]+')

    # format='csr' returns the stacked matrix directly in CSR form rather than
    # hstack's default COO
    features['Combined (Domain Char + Path Word)'] = {
        'X_train': hstack([
            domain_char_vec.fit_transform(train_df['domain']),
            path_word_vec.fit_transform(train_df['url'])
        ], format='csr'),
        'X_test': hstack([
            domain_char_vec.transform(test_df['domain']),
            path_word_vec.transform(test_df['url'])
        ], format='csr')
    }

    return features

def create_evaluation_pipeline(model):
    """
    Wrap an estimator in a single-step scikit-learn Pipeline.

    Args:
        model: Sklearn model that becomes the 'classifier' step

    Returns:
        sklearn Pipeline whose only step is ('classifier', model)
    """
    steps = [('classifier', model)]
    return Pipeline(steps)

def evaluate_model(name, X_train, X_test, y_train, y_test, model_factory=LogisticRegression):
    """
    Evaluate a machine learning model

    Builds a model from ``model_factory``, fits it on the training data, predicts
    on the test data, prints timing, accuracy, weighted F1/F2 and a
    classification report, and returns the headline numbers.

    Args:
        name: Name of the model (used in the printed header and the result row)
        X_train, X_test: Training and testing features
        y_train, y_test: Training and testing labels
        model_factory: Callable that creates the model instance. It is called
            with ``random_state=42`` when its signature accepts that keyword
            (directly or through ``**kwargs``), and with no arguments otherwise,
            so estimators without a ``random_state`` parameter can be used too.

    Returns:
        Dictionary with model performance metrics
    """
    import inspect

    # Only pass the seed to factories that can take it; calling e.g. a factory
    # without a random_state parameter with that keyword raises TypeError.
    try:
        factory_params = inspect.signature(model_factory).parameters.values()
    except (TypeError, ValueError):
        # Signature cannot be introspected: fall back to always passing the seed
        accepts_seed = True
    else:
        accepts_seed = any(
            param.name == 'random_state' or param.kind is inspect.Parameter.VAR_KEYWORD
            for param in factory_params
        )

    # Create model
    estimator = model_factory(random_state=42) if accepts_seed else model_factory()
    model = create_evaluation_pipeline(estimator)

    # Training (perf_counter is monotonic, so intervals are not affected by clock changes)
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    # Prediction
    start_time = time.perf_counter()
    y_pred = model.predict(X_test)
    inference_time = time.perf_counter() - start_time

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    f2 = fbeta_score(y_test, y_pred, beta=2, average='weighted')

    print(f"\n{name} Results:")
    print(f"Training time: {train_time:.4f} seconds")
    print(f"Inference time: {inference_time:.4f} seconds")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"F2 Score: {f2:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return {
        'Vectorization Strategy': name,
        'Train Time (s)': train_time,
        'Inference Time (s)': inference_time,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'F2 Score': f2
    }

def compare_vectorization_strategies(train_df, test_df, y_train, y_test, 
                                     models=None, 
                                     vectorization_strategies=None):
    """
    Compare different vectorization strategies and models

    Every feature set produced by prepare_features is evaluated with every
    model, and the per-run metrics are collected into one table.

    Args:
        train_df: Training dataframe
        test_df: Testing dataframe
        y_train, y_test: Training and testing labels
        models: Iterable of (name, factory) pairs to evaluate
            (default: Logistic Regression, Random Forest and SVM)
        vectorization_strategies: Dictionary of vectorization strategies
            (default: create_vectorization_strategies())

    Returns:
        DataFrame with performance metrics sorted by 'Accuracy', best first.
        When nothing was evaluated, an empty DataFrame is returned.
    """
    # Default models if not provided
    if models is None:
        models = [
            ('Logistic Regression', LogisticRegression),
            ('Random Forest', RandomForestClassifier),
            ('SVM', SVC)
        ]
    # Materialise once: a one-shot iterable would otherwise be exhausted after
    # the first strategy in the nested loop below
    models = list(models)

    # Default vectorization strategies if not provided
    if vectorization_strategies is None:
        vectorization_strategies = create_vectorization_strategies()

    # Prepare features
    feature_sets = prepare_features(train_df, test_df, vectorization_strategies)

    # Results storage
    all_results = []

    # Evaluate each vectorization strategy with each model
    for strategy_name, feature_set in feature_sets.items():
        for model_name, model_factory in models:
            full_name = f"{strategy_name} - {model_name}"

            # Evaluate model
            result = evaluate_model(
                full_name, 
                feature_set['X_train'], 
                feature_set['X_test'], 
                y_train, 
                y_test, 
                model_factory
            )
            all_results.append(result)

    # Convert to DataFrame
    results_df = pd.DataFrame(all_results)
    if results_df.empty:
        # An empty frame has no 'Accuracy' column, so sort_values would raise KeyError
        print("\nNo results to compare: no strategy/model combination was evaluated.")
        return results_df

    results_df = results_df.sort_values('Accuracy', ascending=False)

    print("\nVectorization and Model Comparison Summary:")
    print(results_df)

    return results_df

# Actual Experiments

In [None]:
# Run the comparison: every vectorization strategy against every model listed below
results = compare_vectorization_strategies(
    X_train, X_test, y_train, y_test,
    # Optional: customize models
    models=[
        ('Logistic Regression', LogisticRegression),
        ('Random Forest', RandomForestClassifier),
        ('SVM', SVC),
    ],
    # Optional: customize vectorization strategies
    vectorization_strategies=create_vectorization_strategies(),
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Domain (Char 2-5 grams) - Logistic Regression Results:
Training time: 0.4472 seconds
Inference time: 0.0004 seconds
Accuracy: 0.9181
F1 Score: 0.9016
F2 Score: 0.9113

Classification Report:
              precision    recall  f1-score   support

           0       0.14      0.05      0.08       586
           1       0.94      0.98      0.96      8686

    accuracy                           0.92      9272
   macro avg       0.54      0.52      0.52      9272
weighted avg       0.89      0.92      0.90      9272


Domain (Char 2-5 grams) - Random Forest Results:
Training time: 19.2749 seconds
Inference time: 0.1363 seconds
Accuracy: 0.9363
F1 Score: 0.9280
F2 Score: 0.9326

Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.28      0.36       586
           1       0.95      0.98      0.97      8686

    accuracy                           0.94      9272
   macro avg       0.72      0.63      0.66      9272
weighted avg       0.9

In [None]:
# Display the comparison table returned by compare_vectorization_strategies
results