In [1]:
import time
import pandas as pd
import dask.dataframe as dd

from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from huggingface_hub import hf_hub_download, list_repo_files
from tqdm import tqdm

import os
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


DATASETS = [
            "nhagar/c4_urls_en.noclean"
           ]

# read in ground truth labels
ground_truth_labels = pd.read_csv('../data/combined_domain_labels_16k_splits.csv')
ground_truth_labels.sample(5)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,domain,label,label_source,set
3203,vicksburgpost.com,Local News,northeastern_domain_demo,test
10739,southjerseylocalnews.com,Local News,northeastern_domain_demo,train
2061,fws.gov,Legal & Policy,data_provenance_init,train
15211,ranker.com,News,data_provenance_init,val
11218,thebellevuegazette.com,Local News,northeastern_domain_demo,train


In [2]:
for dataset in tqdm(DATASETS):
    try:
        # Get files list from repo
        files = [f for f in list_repo_files(dataset, repo_type="dataset") 
                if f.endswith('.parquet')]
        if not files:
            print(f"No parquet files found for {dataset}, skipping")
            continue
            
        print(f"Found {len(files)} parquet files for {dataset}")
        print(f"Downloading all parquet files...")
        downloaded_files = []
        
        for file in files:
            download_path = hf_hub_download(
                repo_id=dataset,
                filename=file,
                repo_type="dataset",
                cache_dir="hf_cache"
            )
            downloaded_files.append(download_path)
        
        print(f"Downloaded {len(downloaded_files)} files")
        print(f"First file path: {downloaded_files[0]}")
        print("Processing...")
        df = dd.read_parquet(downloaded_files).compute()
    except Exception as e:
        print(f"Error with {dataset}: {str(e)}")

  0%|                                                     | 0/1 [00:00<?, ?it/s]

Found 95 parquet files for nhagar/c4_urls_en.noclean
Downloading all parquet files...
Downloaded 95 files
First file path: hf_cache/datasets--nhagar--c4_urls_en.noclean/snapshots/d0df683760e65bda672bf230c2673d7b2e07bfe2/batch_0/train-00000-of-00001.parquet
Processing...


100%|█████████████████████████████████████████████| 1/1 [00:45<00:00, 45.88s/it]


In [3]:
# filter function to get URLs from domains with a ground truth label

non_news_tlds = ('.edu', '.ru', '.mil', '.gov',
                '.int','.museum','.travel','.shop',
                 '.post','.jobs','.pro','.tel','.xyz')


def filter_with_progress(df, domain_set, exclude_tlds=None, batch_size=100000):
    start_time = time.time()
    total_rows = len(df)
    filtered_rows = []

    for i in tqdm(range(0, total_rows, batch_size), desc="Filtering domains"):
        batch = df.iloc[i:min(i + batch_size, total_rows)]

        # Filter by domains in the provided set
        filtered_batch = batch[batch['domain'].isin(domain_set)]

        # Include domains explicitly marked as non-news by their TLD
        non_news_batch = batch[batch['domain'].str.endswith(tuple(non_news_tlds))]

        # Combine both
        combined_batch = pd.concat([filtered_batch, non_news_batch]).drop_duplicates()
        filtered_rows.append(combined_batch)

    return pd.concat(filtered_rows, ignore_index=True)

In [4]:
def get_n_urls_per_domain(df, n=7, batch_size=100_000, random_state=42):
    domain_count, results = {}, []

    for i in tqdm(range(0, len(df), batch_size), desc="Processing batches"):
        batch = df.iloc[i:i + batch_size]
        eligible = {d for d, c in domain_count.items() if c < n}
        filtered = batch if not eligible else batch[batch['domain'].isin(eligible)]

        def sample_n(g):
            domain = g.name if 'domain' not in g.columns else g['domain'].iloc[0]
            s = g.sample(n=min(n, len(g)), random_state=random_state)
            s['domain'] = domain
            return s

        sampled = filtered.groupby('domain', group_keys=False).apply(sample_n, include_groups=False)

        try:
            counts = sampled['domain'].value_counts()
        except KeyError:
            sampled = sampled.reset_index()  # recover if domain dropped
            counts = sampled['domain'].value_counts()

        for d, c in counts.items():
            domain_count[d] = domain_count.get(d, 0) + c

        results.append(sampled)

        if domain_count and all(v >= n for v in domain_count.values()):
            break

    return pd.concat(results, ignore_index=True)

In [5]:
domain_set = set(ground_truth_labels['domain'])
filtered_df = filter_with_progress(df, domain_set)

Filtering domains: 100%|████████████████████| 1894/1894 [06:04<00:00,  5.20it/s]


In [6]:
len(filtered_df)

20173318

In [7]:
test_df = get_n_urls_per_domain(filtered_df)
len(test_df)

Processing batches: 100%|█████████████████████| 202/202 [00:36<00:00,  5.51it/s]


82955

In [8]:
test_df.sample(10)

Unnamed: 0,url,domain
23400,http://abf-downloads.rosalinux.ru/jpisini_pers...,rosalinux.ru
73633,https://www.usagschweinfurt.jobs/jobs/healthca...,usagschweinfurt.jobs
619,https://www.airuniversity.af.edu/AFNC/Articles/,af.edu
18408,https://www.nba.com/wizards/video/teams/wizard...,nba.com
17798,http://sportsnetwork.msnbc.com/nba/boxscore.as...,msnbc.com
81961,https://www.betteryou.xyz/2018/09/10/how-to-le...,betteryou.xyz
52898,http://domydom.ru/map36.html,domydom.ru
23662,https://posting.sacurrent.com/sanantonio/Tools...,sacurrent.com
47311,https://www.consumer.ftc.gov/blog/2018/03/watc...,ftc.gov
65054,https://www.rosbank.ru/en/press_service/news/y...,rosbank.ru


In [9]:
start = time.time()
print("Starting domain news mapping...")

# Label 'is_news' based on presence of 'News' in 'label'
ground_truth_labels['is_news'] = ground_truth_labels['label'].apply(lambda x: 1 if 'News' in x else 0)

# Create mapping of domain to is_news
domain_news_map = ground_truth_labels[['domain', 'is_news']].copy()

# Merge with test_df
test_df = test_df.merge(domain_news_map, on='domain', how='left')

# Set is_news to 0 for known non-news TLDs
test_df.loc[test_df['domain'].str.endswith(non_news_tlds), 'is_news'] = 0

print(f"Domain news mapping completed in {time.time() - start:.2f} seconds.")

Starting domain news mapping...
Domain news mapping completed in 0.13 seconds.


In [10]:
test_df.is_news.value_counts()

is_news
0.0    50092
1.0    32863
Name: count, dtype: int64

In [11]:
len(test_df)

82955

In [12]:
test_df.url.str.len().describe()

count      82955.0
mean     76.298306
std      44.470288
min           16.0
25%           50.0
50%           68.0
75%           92.0
max         1896.0
Name: url, dtype: Float64

# Helper Functions for the Experiments

In [13]:
import re
from urllib.parse import urlparse
from sklearn.base import BaseEstimator, TransformerMixin

# Custom transformer for manual feature extraction
class URLFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X):
        feature_list = []
        for url in X:
            parsed = urlparse(url)
            domain = parsed.netloc
            path = parsed.path
            query = parsed.query
            
            features = {
                # --- Basic structural features ---
                'url_length': len(url),
                'domain_length': len(domain),
                'path_length': len(path),
                'num_slashes': url.count('/'),
                'num_dots': url.count('.'),
                'num_equal': url.count('='),
                'num_params': len(query.split('&')) if query else 0,
                'query_length': len(query),
                'has_query_string': int(bool(query)),
                'has_fragment': int(bool(parsed.fragment)),
                'fragment_length': len(parsed.fragment),
                'has_port': int(bool(parsed.port)),
            
                # --- Character-based stats ---
                'num_digits': sum(c.isdigit() for c in url),
                'digit_ratio': sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
                'num_dashes': url.count('-'),
                'dash_ratio': url.count('-') / len(url) if len(url) > 0 else 0,
                'domain_dash_ratio': domain.count('-') / len(domain) if len(domain) > 0 else 0,
                'path_dash_ratio': path.count('-') / len(path) if len(path) > 0 else 0,
                'num_hyphens': url.count('-'),  # Duplicate of num_dashes, included for clarity
                'num_underscores': url.count('_'),
                'num_semicolons': url.count(';'),
                'num_at_symbols': url.count('@'),
                'num_percent': url.count('%'),
                'uppercase_ratio': sum(1 for c in url if c.isupper()) / len(url) if len(url) > 0 else 0,
                'non_alnum_path_ratio': sum(not c.isalnum() for c in path) / len(path) if len(path) > 0 else 0,
            
                # --- Domain and TLD ---
                'tld_length': len(domain.split('.')[-1]) if '.' in domain else 0,
                'subdomain_depth': domain.count('.') - 1,
                'domain_digit_ratio': sum(c.isdigit() for c in domain) / len(domain) if len(domain) > 0 else 0,
                'has_ip_address': int(bool(re.match(r'(\d{1,3}\.){3}\d{1,3}', domain))),
                'is_very_short_domain': int(len(domain) < 5),
            
                # --- Path and segment stats ---
                'num_path_segments': len([p for p in path.split('/') if p]),
                'path_to_url_ratio': len(path) / len(url) if len(url) > 0 else 0,
                'domain_to_url_ratio': len(domain) / len(url) if len(url) > 0 else 0,
                'avg_segment_length': sum(len(p) for p in path.split('/') if p) / len([p for p in path.split('/') if p]) if path.strip('/') else 0,
                'max_segment_length': max([len(p) for p in path.split('/') if p], default=0),
                'std_segment_length': np.std([len(p) for p in path.split('/') if p]) if path.strip('/') else 0,
                'numeric_segment_ratio': sum(1 for p in path.split('/') if p.isdigit()) / len([p for p in path.split('/') if p]) if path.strip('/') else 0,
                'mostly_numeric_path': int(sum(1 for c in path if c.isdigit()) / len(path) > 0.5 if len(path) > 0 else 0),
                'has_long_segment': int(any(len(p) > 20 for p in path.split('/') if p)),
                'has_repeated_chars': int(bool(re.search(r'(.)\1{2,}', path))),
                'starts_ends_with_slash': int(path.startswith('/') and path.endswith('/')),
                'path_digit_ratio': sum(c.isdigit() for c in path) / len(path) if len(path) > 0 else 0,
                'char_type_diversity': len(set([
                    'digit' if c.isdigit() else 'alpha' if c.isalpha() else 'other'
                    for c in path
                ])),
                'path_entropy': -sum(
                    (path.count(c) / len(path)) * np.log2(path.count(c) / len(path))
                    for c in set(path)
                ) if len(path) > 0 else 0,
                'first_segment_length': len(path.split('/')[1]) if len(path.split('/')) > 1 else 0,
                'last_segment_all_digits': int(path.split('/')[-1].isdigit() if path.split('/') else 0),
                'mixed_segment_ratio': sum(
                    bool(re.search(r'[a-zA-Z]', p)) and bool(re.search(r'\d', p))
                    for p in path.split('/') if p
                ) / len([p for p in path.split('/') if p]) if path.strip('/') else 0,
                'num_internal_slashes': path[1:].count('/') if path.startswith('/') else path.count('/'),
            
                # --- Keyword-derived (optional if canonicalized) ---
                'has_news_in_domain': int('news' in domain.lower()),
                'has_news_in_path': int('news' in path.lower()),
                'has_article_in_path': int('article' in path.lower()),
                'has_content_in_path': int('content' in path.lower()),
                'has_story_in_path': int('story' in path.lower()),
                'has_blog_in_url': int('blog' in url.lower()),
                'has_date_pattern': int(bool(re.search(r'/(19|20)\d{2}[-/](0[1-9]|1[0-2])[-/](0[1-9]|[12][0-9]|3[01])/', url))),
            }
            feature_list.append(features)
        return pd.DataFrame(feature_list).fillna(0)


In [14]:
N_FEATURES = 1000
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import accuracy_score, f1_score, fbeta_score, classification_report
from scipy.sparse import hstack
import pandas as pd
import inspect
import time

def create_vectorizers():
    return {
        'Full Path (Word)': TfidfVectorizer(analyzer='word', ngram_range=(1, 5), max_features=N_FEATURES, token_pattern=r'[a-zA-Z0-9]+'),
        'Full Path (Char)': TfidfVectorizer(analyzer='char', ngram_range=(2, 5), max_features=N_FEATURES),
        'Full Path (Custom Token)': TfidfVectorizer(analyzer='word',tokenizer=lambda x: x.split('/'),ngram_range=(1, 3),max_features=500),
        'Full Path (Hash)': HashingVectorizer(analyzer='char', ngram_range=(2, 5), n_features=N_FEATURES, alternate_sign=False),
        'Path Only (Char)': TfidfVectorizer(analyzer='char', ngram_range=(2, 5), max_features=N_FEATURES),
        'Path Only (Word)': TfidfVectorizer(analyzer='word', ngram_range=(1, 5), max_features=N_FEATURES, token_pattern=r'[a-zA-Z0-9]+'),
        'Manual Features': URLFeatureExtractor()
    }

def prepare_features(train_df, test_df, vectorizers):
    features = {}
    for name, vec in vectorizers.items():
        if 'Full Path' in name:
            col = 'url'
        elif 'Path Only' in name:
            col = 'url'  # Use full URL but remove domain
            train_df = train_df.copy()
            test_df = test_df.copy()
            train_df[col] = train_df['url'].apply(lambda x: urlparse(x).path)
            test_df[col] = test_df['url'].apply(lambda x: urlparse(x).path)
        elif 'Manual' in name:
            col = 'url'
        else:
            continue  # For now

        features[name] = {
            'X_train': vec.fit_transform(train_df[col]),
            'X_test': vec.transform(test_df[col])
        }
    return features

def get_class_metric(report, label, metric):
    for key in [label, str(label), float(label), str(float(label))]:
        if key in report and metric in report[key]:
            return report[key][metric]
    return None

def evaluate(name, X_train, X_test, y_train, y_test, model_cls):
    if inspect.isclass(model_cls) or isinstance(model_cls, type):
        sig = inspect.signature(model_cls)
        if 'random_state' in sig.parameters:
            model = Pipeline([('clf', model_cls(random_state=42))])
        else:
            model = Pipeline([('clf', model_cls())])
    else:
        # It's a lambda or factory function
        model = Pipeline([('clf', model_cls())])
    
    t0 = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - t0

    t0 = time.time()
    y_pred = model.predict(X_test)
    infer_time = time.time() - t0

    report = classification_report(y_test, y_pred, output_dict=True)

    print(f"\n{name}\nTrain: {train_time:.2f}s | Inference: {infer_time:.2f}s")
    print(classification_report(y_test, y_pred))

    results = {
        'Strategy': name,
        'Train Time (s)': train_time,
        'Inference Time (s)': infer_time,
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 (Class 0)': get_class_metric(report, 0, 'f1-score'),
        'Precision (Class 0)': get_class_metric(report, 0, 'precision'),
        'Recall (Class 0)': get_class_metric(report, 0, 'recall'),
        'F1 (Class 1)': get_class_metric(report, 1, 'f1-score'),
        'Precision (Class 1)': get_class_metric(report, 1, 'precision'),
        'Recall (Class 1)': get_class_metric(report, 1, 'recall'),
        'F2 (Class 0)': fbeta_score(y_test, y_pred, beta=2, pos_label=0),
        'F2 (Class 1)': fbeta_score(y_test, y_pred, beta=2, pos_label=1),
        'F1 (Weighted)': f1_score(y_test, y_pred, average='weighted'),
        'F2 (Weighted)': fbeta_score(y_test, y_pred, beta=2, average='weighted')
    }

    return results


# Actual Experiments

In [15]:
import numpy as np

domain_df = test_df.groupby('domain')['is_news'].agg(lambda x: x.mode()[0]).reset_index()

train_domains, test_domains = train_test_split(
    domain_df['domain'], 
    test_size=0.2, 
    stratify=domain_df['is_news'], 
    random_state=42
)

train_df = test_df[test_df['domain'].isin(train_domains)]
test_df = test_df[test_df['domain'].isin(test_domains)]

X_train = train_df[['url']]
y_train = train_df['is_news']
X_test = test_df[['url']]
y_test = test_df['is_news']


# Create vectorizers
vecs = create_vectorizers()

# Prepare features using vectorizers
features = prepare_features(X_train, X_test, vecs)

# Prepare combined features separately
combined_features = prepare_features(X_train, X_test, {'Combined': None})
features.update(combined_features)

# Define models
models = [
    ('LogReg', LogisticRegression),
    ('RandomForest', RandomForestClassifier),
    ('KNN (k=3)', lambda: KNeighborsClassifier(n_neighbors=3)),
    ('KNN (k=1)', lambda: KNeighborsClassifier(n_neighbors=1)),
    ('DecisionTree', DecisionTreeClassifier),
    #('MLP', lambda: MLPClassifier(hidden_layer_sizes=(64,), max_iter=200)),
    ('NaiveBayes', lambda: MultinomialNB())
]

# Create output directory
os.makedirs('saved_models', exist_ok=True)

# Evaluate and save models
results = []
for strat, feat in features.items():
    for name, cls in models:
        model_label = f"{strat} + {name}"
        print(f"Training: {model_label}")
        try:
            result = evaluate(model_label, feat['X_train'], feat['X_test'], y_train, y_test, cls)    
            model = cls() if not callable(cls) else cls()        
            model.fit(feat['X_train'], y_train)
        except Exception as e:
            print(f"Error fitting model '{name}': {e}")
        model_filename = f"saved_models/{strat}_{name.replace(' ', '_')}.joblib"
        joblib.dump(model, model_filename)

        results.append(result)



Training: Full Path (Word) + LogReg

Full Path (Word) + LogReg
Train: 0.27s | Inference: 0.00s
              precision    recall  f1-score   support

         0.0       0.95      0.91      0.93     10089
         1.0       0.87      0.93      0.90      6552

    accuracy                           0.92     16641
   macro avg       0.91      0.92      0.91     16641
weighted avg       0.92      0.92      0.92     16641

Training: Full Path (Word) + RandomForest

Full Path (Word) + RandomForest
Train: 40.05s | Inference: 0.33s
              precision    recall  f1-score   support

         0.0       0.96      0.90      0.93     10089
         1.0       0.86      0.94      0.90      6552

    accuracy                           0.91     16641
   macro avg       0.91      0.92      0.91     16641
weighted avg       0.92      0.91      0.92     16641

Training: Full Path (Word) + KNN (k=3)

Full Path (Word) + KNN (k=3)
Train: 0.01s | Inference: 34.72s
              precision    recall  f1-sco

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Manual Features + LogReg
Train: 0.97s | Inference: 0.02s
              precision    recall  f1-score   support

         0.0       0.73      0.87      0.80     10089
         1.0       0.72      0.51      0.60      6552

    accuracy                           0.73     16641
   macro avg       0.73      0.69      0.70     16641
weighted avg       0.73      0.73      0.72     16641



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training: Manual Features + RandomForest

Manual Features + RandomForest
Train: 9.59s | Inference: 0.31s
              precision    recall  f1-score   support

         0.0       0.80      0.83      0.82     10089
         1.0       0.73      0.68      0.70      6552

    accuracy                           0.77     16641
   macro avg       0.76      0.76      0.76     16641
weighted avg       0.77      0.77      0.77     16641

Training: Manual Features + KNN (k=3)

Manual Features + KNN (k=3)
Train: 0.02s | Inference: 1.90s
              precision    recall  f1-score   support

         0.0       0.78      0.77      0.78     10089
         1.0       0.66      0.67      0.66      6552

    accuracy                           0.73     16641
   macro avg       0.72      0.72      0.72     16641
weighted avg       0.73      0.73      0.73     16641

Training: Manual Features + KNN (k=1)

Manual Features + KNN (k=1)
Train: 0.05s | Inference: 1.77s
              precision    recall  f1-score

In [16]:
# Display results
results = pd.DataFrame(results).sort_values('Accuracy', ascending=False)
#print("\nSummary:\n", results)

results.sort_values('Accuracy',ascending=False,inplace=True)
#results.sort_values('F1 (Weighted)',ascending=False)
#results.sort_values('F2 (Weighted)',ascending=False)
results

Unnamed: 0,Strategy,Train Time (s),Inference Time (s),Accuracy,F1 (Class 0),Precision (Class 0),Recall (Class 0),F1 (Class 1),Precision (Class 1),Recall (Class 1),F2 (Class 0),F2 (Class 1),F1 (Weighted),F2 (Weighted)
6,Full Path (Char) + LogReg,0.783373,0.004522,0.919176,0.932156,0.949055,0.915849,0.900052,0.877046,0.924298,0.922303,0.914444,0.919516,0.919209
0,Full Path (Word) + LogReg,0.273773,0.000756,0.917673,0.930457,0.953595,0.908415,0.899131,0.868563,0.931929,0.917105,0.918527,0.918123,0.917665
1,Full Path (Word) + RandomForest,40.046579,0.334814,0.914488,0.927357,0.956105,0.900287,0.896078,0.859123,0.936355,0.910923,0.919818,0.915042,0.914425
7,Full Path (Char) + RandomForest,150.761759,0.35764,0.913767,0.926202,0.962484,0.892556,0.896293,0.851201,0.946429,0.905717,0.925716,0.914426,0.913591
18,Full Path (Hash) + LogReg,1.230849,0.059881,0.900367,0.91665,0.930021,0.903657,0.876176,0.857853,0.895299,0.90881,0.887551,0.900714,0.90044
4,Full Path (Word) + DecisionTree,7.515523,0.007131,0.891833,0.910215,0.916156,0.904351,0.863987,0.855582,0.872558,0.906688,0.869109,0.892014,0.891892
8,Full Path (Char) + KNN (k=3),0.028875,116.119892,0.891773,0.910429,0.913655,0.907226,0.863302,0.858674,0.867979,0.908504,0.866102,0.891874,0.89181
5,Full Path (Word) + NaiveBayes,0.020721,0.002521,0.890031,0.910373,0.899797,0.921201,0.857743,0.874049,0.842033,0.916839,0.848247,0.889651,0.889833
19,Full Path (Hash) + RandomForest,308.501812,0.665472,0.889189,0.906661,0.926451,0.887699,0.86367,0.837539,0.891484,0.895188,0.880146,0.889734,0.889266
10,Full Path (Char) + DecisionTree,47.994331,0.015811,0.888769,0.907704,0.913305,0.902171,0.860059,0.852135,0.868132,0.904376,0.864885,0.888945,0.888827


In [17]:
# latex table with formatting
def percentage_format(x):
    return f"{x*100:.1f}\\%"

latex_table = results.head(8)[['Strategy', 'Train Time (s)', 'Inference Time (s)', 'Accuracy', 'F1 (Weighted)', 'F2 (Weighted)']].to_latex(
    index=False,
    float_format=percentage_format
)

print(latex_table)

\begin{tabular}{lrrrrr}
\toprule
Strategy & Train Time (s) & Inference Time (s) & Accuracy & F1 (Weighted) & F2 (Weighted) \\
\midrule
Full Path (Char) + LogReg & 78.3\% & 0.5\% & 91.9\% & 92.0\% & 91.9\% \\
Full Path (Word) + LogReg & 27.4\% & 0.1\% & 91.8\% & 91.8\% & 91.8\% \\
Full Path (Word) + RandomForest & 4004.7\% & 33.5\% & 91.4\% & 91.5\% & 91.4\% \\
Full Path (Char) + RandomForest & 15076.2\% & 35.8\% & 91.4\% & 91.4\% & 91.4\% \\
Full Path (Hash) + LogReg & 123.1\% & 6.0\% & 90.0\% & 90.1\% & 90.0\% \\
Full Path (Word) + DecisionTree & 751.6\% & 0.7\% & 89.2\% & 89.2\% & 89.2\% \\
Full Path (Char) + KNN (k=3) & 2.9\% & 11612.0\% & 89.2\% & 89.2\% & 89.2\% \\
Full Path (Word) + NaiveBayes & 2.1\% & 0.3\% & 89.0\% & 89.0\% & 89.0\% \\
\bottomrule
\end{tabular}



In [20]:
# Create output directory for vectorizers
os.makedirs('saved_vectorizers', exist_ok=True)

# Save each vectorizer
for name, vectorizer in vecs.items():
    print(f"Saving {name}")
    if "Custom" not in name:
        vectorizer_filename = f"saved_vectorizers/{name}.joblib"
        joblib.dump(vectorizer, vectorizer_filename)

Saving Full Path (Word)
Saving Full Path (Char)
Saving Full Path (Custom Token)
Saving Full Path (Hash)
Saving Path Only (Char)
Saving Path Only (Word)
Saving Manual Features


In [21]:
test_df.sample(5)

Unnamed: 0,url,domain,is_news
11052,https://www.governing.com/topics/health-human-...,governing.com,1.0
70550,https://www.sundancetimes.com/story/2019/04/04...,sundancetimes.com,1.0
8349,http://seniorcenter.ellington-ct.gov/Jobs.aspx,ellington-ct.gov,0.0
60128,http://www.fox26houston.com/news/bill-would-pr...,fox26houston.com,1.0
27140,http://www.thecatholicconnection.org/?p=2081,thecatholicconnection.org,1.0


In [22]:
results.to_csv(f'saved_models/results_summary_{N_FEATURES}_features.csv',index=False)

In [23]:
len(results)

42