In [1]:
import time
import pandas as pd

start = time.time()
service_labels = pd.read_csv('../data/url_domain_1to1_mappings.csv')
news_labels = pd.read_csv('https://raw.githubusercontent.com/LazerLab/DomainDemo/refs/heads/main/data/existing_labels/news_local_natioanl_classification.csv')
print(f"Loaded datasets in {(time.time() - start):2f} seconds")

Loaded datasets in 0.335065 seconds


In [2]:
service_labels.columns = ['domain','label']
service_labels['domain'] = service_labels.domain.str.replace('www.','')
service_labels.sample(5)

Unnamed: 0,domain,label
1653,xcelfinancialgroup.com,Business & E-Commerce
1309,irv2.com,Social Media/Forums
334,gizmodo.com.au,News
2610,khinsider.com,News
3001,californiaspecialedlaw.com,Legal & Policy


In [3]:
news_labels.sample(5)

Unnamed: 0,domain,classification
6657,wfsu.org,local
2544,ctvn.org,local
4652,nephitimesnews.com,local
4498,my13palmsprings.com,local
11804,etcatholic.org,local


In [4]:
news_labels.classification.value_counts()

classification
local           12102
national          763
INCONSISTENT       40
Name: count, dtype: int64

In [5]:
# start with service labels
combined_labels = service_labels.copy()
combined_labels['label_source'] = 'data_provenance_init'

# separate local / general news from northeastern
all_northeastern_domains = set(news_labels['domain'])
local_news_domains = set(news_labels[news_labels.classification=='local']['domain'])
news_domains = set(news_labels[news_labels.classification!='local']['domain'])

# update labels
combined_labels.loc[combined_labels['domain'].isin(news_domains), 'label'] = "News"
combined_labels.loc[combined_labels['domain'].isin(local_news_domains), 'label'] = "Local News"
combined_labels.loc[combined_labels['domain'].isin(all_northeastern_domains), 'label_source'] = "northeastern_domain_demo"

# missing domains (in northeastern news_labels but not in combined_labels yet)
missing_domains = news_labels[~news_labels['domain'].isin(combined_labels['domain'])]

domains_to_add = missing_domains[['domain', 'classification']].copy()
domains_to_add['label'] = domains_to_add['classification'].map(
    {'local': 'Local News'}).fillna('News')
domains_to_add['label_source'] = 'northeastern_domain_demo'
domains_to_add = domains_to_add.drop('classification', axis=1)

# Append missing domains to combined_labels
combined_labels = pd.concat([combined_labels, domains_to_add], ignore_index=True)

# Verify no duplicates
assert len(combined_labels['domain'].unique()) == len(combined_labels)

In [6]:
combined_labels.sample(10)

Unnamed: 0,domain,label,label_source
115,dailyherald.com,Local News,northeastern_domain_demo
7133,freemansd.com,Local News,northeastern_domain_demo
666,tech.slashdot.org,Social Media/Forums,data_provenance_init
3832,katriinatalaslahti.com,News,data_provenance_init
522,triphobo.com,Entertainment & Culture,data_provenance_init
8313,mercyhurst.edu,Local News,northeastern_domain_demo
1673,elevate.com.au,Business & E-Commerce,data_provenance_init
13506,kcap.com,Local News,northeastern_domain_demo
13530,northernplainsindependent.com,Local News,northeastern_domain_demo
6776,dixonpilot.com,Local News,northeastern_domain_demo


In [7]:
combined_labels.label.value_counts()

label
Local News                         12102
News                                1791
Business & E-Commerce                780
Entertainment & Culture              597
Science, Academia, & Technology      471
General Information & Education      426
Social Media/Forums                  151
Legal & Policy                       136
Blogs                                135
Other                                109
Books                                 58
Name: count, dtype: int64

In [8]:
import numpy as np

combined_labels['set'] = 'train'

for label in combined_labels['label'].unique():
    label_indices = combined_labels[combined_labels['label'] == label].index.tolist()
    np.random.shuffle(label_indices)
    
    test_size = int(len(label_indices) * 0.10)
    val_size = int(len(label_indices) * 0.05)
    
    test_indices = label_indices[:test_size]
    val_indices = label_indices[test_size:test_size + val_size]
    
    combined_labels.loc[test_indices, 'set'] = 'test'
    combined_labels.loc[val_indices, 'set'] = 'val'

In [9]:
# clean and sort

combined_labels = combined_labels.dropna(subset=['label'])

combined_labels = combined_labels.sort_values(
    by=['label', 'label_source', 'set', 'domain'],
    ascending=[True, True, True, True]
).reset_index(drop=True)

In [10]:
combined_labels.to_csv('../data/combined_domain_labels_16k_splits.csv',index=False)

In [11]:
combined_labels.set.value_counts(normalize=True)

set
train    0.850621
test     0.099725
val      0.049654
Name: proportion, dtype: float64

In [12]:
combined_labels.sample(10)

Unnamed: 0,domain,label,label_source,set
14330,3riversepiscopal.blogspot.com,News,data_provenance_init,train
502,freewayprojects.com,Business & E-Commerce,data_provenance_init,train
74,myfamilynutrition.com,Blogs,data_provenance_init,train
5517,dreamhosters.com,Local News,northeastern_domain_demo,train
224,gozocabs.com,Business & E-Commerce,data_provenance_init,test
8833,mysugarhousejournal.com,Local News,northeastern_domain_demo,train
13625,zorak.monmouth.edu,Local News,northeastern_domain_demo,train
9003,newportnow.online,Local News,northeastern_domain_demo,train
15058,thedailystar.net,News,data_provenance_init,train
1098,boreaseindhoven.nl,Entertainment & Culture,data_provenance_init,train


# Stats for Paper

In [13]:
# how many unique domains?
len(combined_labels)

16756

In [14]:
# how many unique domains? (validation
len(combined_labels.domain.unique())

16756

In [15]:
service_labels.label.value_counts()

label
News                               1482
Business & E-Commerce               787
Entertainment & Culture             611
Science, Academia, & Technology     478
General Information & Education     429
Social Media/Forums                 156
Legal & Policy                      142
Blogs                               137
Other                               111
Books                                58
Name: count, dtype: int64

In [16]:
# how many non-news websites?
len(service_labels[service_labels.label!='News'])

2912