# List of CC Datasets

In [None]:
# Hugging Face dataset repositories to aggregate, one per Common Crawl
# snapshot. Every repository id has the form "nhagar/<crawl id>_urls", so only
# the crawl ids are spelled out and the full ids are expanded from them.
# Order is preserved because it determines processing order. The first two
# ids use underscores instead of hyphens and are kept exactly as given.
DATASETS = [
    f"nhagar/{crawl_id}_urls"
    for crawl_id in (
        "CC_MAIN_2017_47",
        "CC_MAIN_2024_18",
        "CC-MAIN-2021-17",
        "CC-MAIN-2016-40",
        "CC-MAIN-2017-13",
        "CC-MAIN-2017-17",
        "CC-MAIN-2017-34",
        "CC-MAIN-2017-43",
        "CC-MAIN-2018-51",
        "CC-MAIN-2019-39",
        "CC-MAIN-2020-05",
        "CC-MAIN-2020-34",
        "CC-MAIN-2021-31",
        "CC-MAIN-2022-40",
        "CC-MAIN-2023-06",
        "CC-MAIN-2016-18",
        "CC-MAIN-2014-15",
        "CC-MAIN-2015-32",
        "CC-MAIN-2013-48",
        "CC-MAIN-2014-10",
        "CC-MAIN-2014-23",
        "CC-MAIN-2014-35",
        "CC-MAIN-2014-41",
        "CC-MAIN-2014-42",
        "CC-MAIN-2014-49",
        "CC-MAIN-2014-52",
        "CC-MAIN-2015-06",
        "CC-MAIN-2015-11",
        "CC-MAIN-2015-14",
        "CC-MAIN-2015-18",
        "CC-MAIN-2015-35",
        "CC-MAIN-2015-22",
        "CC-MAIN-2015-27",
        "CC-MAIN-2015-40",
        "CC-MAIN-2015-48",
        "CC-MAIN-2016-07",
        "CC-MAIN-2016-30",
        "CC-MAIN-2016-22",
        "CC-MAIN-2016-26",
        "CC-MAIN-2016-36",
        "CC-MAIN-2016-44",
        "CC-MAIN-2016-50",
        "CC-MAIN-2017-04",
        "CC-MAIN-2017-09",
        "CC-MAIN-2017-22",
        "CC-MAIN-2017-51",
        "CC-MAIN-2018-05",
        "CC-MAIN-2017-26",
        "CC-MAIN-2018-09",
        "CC-MAIN-2018-17",
        "CC-MAIN-2018-22",
        "CC-MAIN-2018-26",
        "CC-MAIN-2018-30",
        "CC-MAIN-2018-34",
        "CC-MAIN-2018-39",
        "CC-MAIN-2018-43",
        "CC-MAIN-2018-47",
        "CC-MAIN-2019-04",
        "CC-MAIN-2019-09",
        "CC-MAIN-2018-13",
        "CC-MAIN-2019-18",
        "CC-MAIN-2019-22",
        "CC-MAIN-2019-13",
        "CC-MAIN-2019-30",
        "CC-MAIN-2019-35",
        "CC-MAIN-2019-43",
        "CC-MAIN-2019-26",
        "CC-MAIN-2019-51",
        "CC-MAIN-2019-47",
        "CC-MAIN-2020-16",
        "CC-MAIN-2020-24",
        "CC-MAIN-2020-10",
        "CC-MAIN-2020-40",
        "CC-MAIN-2020-50",
        "CC-MAIN-2020-29",
        "CC-MAIN-2021-10",
        "CC-MAIN-2021-25",
        "CC-MAIN-2021-43",
        "CC-MAIN-2021-49",
        "CC-MAIN-2017-30",
        "CC-MAIN-2022-21",
        "CC-MAIN-2022-27",
        "CC-MAIN-2021-04",
        "CC-MAIN-2022-05",
        "CC-MAIN-2023-23",
        "CC-MAIN-2023-40",
        "CC-MAIN-2023-50",
        "CC-MAIN-2024-10",
        "CC-MAIN-2023-14",
        "CC-MAIN-2013-20",
        "CC-MAIN-2021-39",
        "CC-MAIN-2022-33",
        "CC-MAIN-2022-49",
        "CC-MAIN-2017-39",
        "CC-MAIN-2021-21",
        "CC-MAIN-2020-45",
    )
]

# Load Datasets

In [None]:
import pandas as pd
import dask.dataframe as dd
import os
import glob
from tqdm import tqdm
from huggingface_hub import hf_hub_download, list_repo_files

# Report how many dataset repositories are listed in DATASETS.
print(f"There are {len(DATASETS)} datasets")

In [None]:
# Aggregate per-host URL counts across every repository in DATASETS.
#
# For each repository: list its parquet files, download them into the local
# "hf_cache" directory, sum `url_count` per `url_host_name` with Dask, and fold
# that into the running total held in `results_df`. A failure in one dataset
# is reported, a progress snapshot is written, and the loop moves on to the
# next dataset. The combined totals are written to
# "final_combined_results.csv" at the end.
import gc

# Create cache directory
os.makedirs("hf_cache", exist_ok=True)

# Running totals: one row per host with the URL count summed so far.
results_df = pd.DataFrame(columns=['url_host_name', 'url_count'])

# Datasets that raised, so an incomplete final result is visible at the end.
failed_datasets = []

# Process each dataset
for dataset in tqdm(DATASETS):
    try:
        # Get all parquet files from the repository
        files = [f for f in list_repo_files(dataset, repo_type="dataset")
                 if f.endswith('.parquet')]

        if not files:
            print(f"No parquet files found for {dataset}, skipping")
            continue

        print(f"Found {len(files)} parquet files for {dataset}")

        # Download every file exactly once and keep the local path that
        # hf_hub_download returns. Reading these explicit paths (instead of
        # globbing the first file's directory) also covers repositories whose
        # parquet files live in more than one subdirectory.
        print("Downloading all parquet files...")
        local_paths = [
            hf_hub_download(
                repo_id=dataset,
                filename=file,
                repo_type="dataset",
                cache_dir="hf_cache",
            )
            for file in files
        ]

        # Process data with Dask
        print("Processing...")
        df = dd.read_parquet(local_paths)
        current_data = df.groupby('url_host_name')['url_count'].sum().reset_index().compute()

        # Merge with existing results. The still-empty accumulator is left out
        # of the first concat: it has object-dtype columns, and pandas has
        # deprecated ignoring empty frames when picking the result dtypes.
        print("Concatenating...")
        frames = [current_data] if results_df.empty else [results_df, current_data]
        results_df = (
            pd.concat(frames, ignore_index=True)
            .groupby('url_host_name', as_index=False)['url_count']
            .sum()
        )

        # Free memory; `frames` still references the previous accumulator.
        print("Freeing memory...")
        del df, current_data, frames
        gc.collect()

    except Exception as e:
        # Per-dataset boundary: report, snapshot, and continue with the rest.
        print(f"Error with {dataset}: {e}")
        failed_datasets.append(dataset)
        # Save progress on error
        dataset_name = dataset.replace('nhagar/', '')
        results_df.to_csv(f"progress_until_{dataset_name}.csv", index=False)

if failed_datasets:
    print(f"{len(failed_datasets)} dataset(s) failed and are missing from the results: {failed_datasets}")

# Save final results
results_df.to_csv("final_combined_results.csv", index=False)

# Combine Domain Counts

In [None]:
# Order hosts from the highest URL count to the lowest, then export:
#   - all_domains_sorted.csv : host names with their counts
#   - all_domains.csv        : host names only, in the same order
#   - all_domains.parquet    : host names with their counts
results_df = results_df.sort_values(by='url_count', ascending=False)
results_df.to_csv('all_domains_sorted.csv', index=False)
results_df.loc[:, ['url_host_name']].to_csv('all_domains.csv', index=False)
results_df.to_parquet('all_domains.parquet', index=False)