<a href="https://colab.research.google.com/github/NHagar/cc-genealogy/blob/main/analysis/check_datasets_for_urls.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install
Because colab still doesn't have datasets by default smh

In [1]:
!pip install datasets



# Check for URLs in the dataset

In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [3]:
ds_df = pd.read_csv('https://raw.githubusercontent.com/NHagar/cc-genealogy/refs/heads/main/hf_dataset_registry/dataset_pool_text_1m%2B.csv')

In [4]:
ds_df_sample = ds_df.sample(100)
ds_df_sample

Unnamed: 0,id,hf_url
653,alexantonov/chuvash_mono,https://huggingface.co/datasets/alexantonov/chuvash_mono
12406,NeutrinoPit/OpenSubtitles2024-en-ar-batch49,https://huggingface.co/datasets/NeutrinoPit/OpenSubtitles2024-en-ar-batch49
6715,yiyic/oscar_es_train,https://huggingface.co/datasets/yiyic/oscar_es_train
7901,gowitheflow/supervised-multilingual,https://huggingface.co/datasets/gowitheflow/supervised-multilingual
1499,Babelscape/multinerd,https://huggingface.co/datasets/Babelscape/multinerd
...,...,...
6411,mteb/eurlex-multilingual,https://huggingface.co/datasets/mteb/eurlex-multilingual
1437,FremyCompany/BioLORD-Dataset,https://huggingface.co/datasets/FremyCompany/BioLORD-Dataset
9164,KathirKs/CC-MAIN-2017-13_row_wise_20240926_154028,https://huggingface.co/datasets/KathirKs/CC-MAIN-2017-13_row_wise_20240926_154028
10101,1231czx/stage2_ep1_6_first8,https://huggingface.co/datasets/1231czx/stage2_ep1_6_first8


In [5]:
from datasets import load_dataset, get_dataset_split_names, get_dataset_config_names
import itertools
import logging
import re

logging.getLogger("datasets").setLevel(logging.ERROR)

# URL-related terms
url_related_terms = [
    'url', 'link', 'domain', 'website', 'site', 'web',
    'http', 'https', 'www', 'uri', 'href', 'address', 'hyperlink',
    'path', 'redirect', 'source', 'anchor', 'api'
]

url_regex = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+|(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?://(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,}')

# regex pattern for faster matching
url_pattern = re.compile('|'.join(url_related_terms), re.IGNORECASE)
global total_errors
total_errors = 0

def check_for_urls_in_dataset(dataset_id):
    try:
        # check if the dataset requires a config
        config_name = None

        try:
            available_configs = get_dataset_config_names(dataset_id, trust_remote_code=True)
            if available_configs:
                # first available config as default
                config_name = available_configs[0]
                print(f"Using config '{config_name}' for dataset {dataset_id}")
        except Exception as config_error:
            print(f"Warning when checking configs for {dataset_id}: {config_error}")

        # check available splits - include config when checking splits too
        try:
            if config_name:
                # For datasets with configs, provide the config name
                available_splits = get_dataset_split_names(dataset_id, config_name=config_name, trust_remote_code=True)
            else:
                available_splits = get_dataset_split_names(dataset_id, trust_remote_code=True)

            split_to_use = 'train' if 'train' in available_splits else available_splits[0]
        except Exception as split_error:
            # default to 'train'
            split_to_use = 'train'
            print(f"Warning: Could not get splits for {dataset_id}: {split_error}")

        # When loading the dataset, include the config if needed
        if config_name:
            dataset = load_dataset(dataset_id, config_name, split=split_to_use,
                                  streaming=True, trust_remote_code=True)
        else:
            dataset = load_dataset(dataset_id, split=split_to_use,
                                  streaming=True, trust_remote_code=True)

        # check access to features
        if dataset.features is None:
            print(f"Warning: No features found for {dataset_id}")
            return False

        column_names = dataset.features.keys()

        # check if any column names suggest URL content
        url_related_columns = [column for column in column_names if url_pattern.search(column)]

        if url_related_columns:
            sample_size = 10
            sample_dataset = dataset.take(sample_size)

            # check if any URL-related columns actually contain URLs
            for column in url_related_columns:
                for row in sample_dataset:
                    value = row.get(column, "")
                    # if value is a string and contains a URL
                    if isinstance(value, str) and url_regex.search(value):
                        return True
            return False

        # no URL-related column names were found
        return False

    except Exception as e:
        global total_errors
        total_errors += 1
        print(f"Error {total_errors} processing {dataset_id}: {e}")
        return False

In [6]:
import logging
from tqdm.auto import tqdm

# Apply with progress bar
tqdm.pandas(desc="Checking for URLs")
ds_df_sample['does_have_urls'] = ds_df_sample['id'].progress_apply(check_for_urls_in_dataset)

# reorder columns
cols = ['does_have_urls'] + [col for col in ds_df_sample.columns if col != 'does_have_urls']
ds_df_sample = ds_df_sample[cols]
# display
ds_df_sample

Checking for URLs:   0%|          | 0/100 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/490 [00:00<?, ?B/s]

Using config 'default' for dataset alexantonov/chuvash_mono


README.md:   0%|          | 0.00/309 [00:00<?, ?B/s]

Using config 'default' for dataset NeutrinoPit/OpenSubtitles2024-en-ar-batch49


README.md:   0%|          | 0.00/286 [00:00<?, ?B/s]

Using config 'default' for dataset yiyic/oscar_es_train


README.md:   0%|          | 0.00/344 [00:00<?, ?B/s]

Using config 'default' for dataset gowitheflow/supervised-multilingual


README.md:   0%|          | 0.00/5.66k [00:00<?, ?B/s]

Using config 'default' for dataset Babelscape/multinerd


README.md:   0%|          | 0.00/5.88k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/214 [00:00<?, ?it/s]

Using config 'default' for dataset rainbowbridge/x_dataset_20722


Resolving data files:   0%|          | 0/214 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/214 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/562 [00:00<?, ?B/s]

Using config 'default' for dataset OEvortex/uncensored-vortex


README.md:   0%|          | 0.00/453 [00:00<?, ?B/s]

Using config 'default' for dataset crumbly/tinycode-a


README.md:   0%|          | 0.00/458 [00:00<?, ?B/s]

Using config 'default' for dataset MarkGG/Pierse-movie-dataset


README.md:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

dataset_infos.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

Using config 'default' for dataset Bingsu/KcBERT_Pre-Training_Corpus


Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Using config 'default' for dataset Jamie762/b4


README.md:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Using config 'default' for dataset LinkSoul/instruction_merge_set


Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Using config 'default' for dataset Birchlabs/openai-prm800k-phase2_train-stepwise-critique


README.md:   0%|          | 0.00/312 [00:00<?, ?B/s]

Using config 'default' for dataset samreen27/combined_english_hindi


Resolving data files:   0%|          | 0/29 [00:00<?, ?it/s]

dataset_infos.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Using config 'default' for dataset carlosejimenez/flickr30k_clip-ViT-B-32-caption_pairs


Resolving data files:   0%|          | 0/29 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/29 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

Using config 'train' for dataset open-vdb/sift-128-euclidean


README.md:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/33 [00:00<?, ?it/s]

Using config 'default' for dataset wormtooth/MNBVC-epubs


Resolving data files:   0%|          | 0/33 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/33 [00:00<?, ?it/s]



README.md:   0%|          | 0.00/5.33k [00:00<?, ?B/s]

Using config 'default' for dataset futuremoon/x_dataset_15


README.md:   0%|          | 0.00/506 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/44 [00:00<?, ?it/s]

Using config 'default' for dataset KathirKs/CC-MAIN-2016-18_row_wise_20240904_113940


Resolving data files:   0%|          | 0/44 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/44 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/44 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/44 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/201 [00:00<?, ?B/s]

Using config 'default' for dataset fjcanyue/wikipedia-zh-cn


README.md:   0%|          | 0.00/6.08k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Using config 'aya_english' for dataset SotirisLegkas/kalamaki_corpora


Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/8.62k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Using config 'default' for dataset DenyTranDFW/AmeriCredit_Automobile_Receivables_Trust_2018_1_1729361


Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Using config 'default' for dataset lewington/laion2B-multi-joined-translated-to-en-smol


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Using config 'default' for dataset BackpropBuff/CCAligned.en-es


README.md:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

Using config 'default' for dataset Calvin-Xu/FLFL-Aozora-Speech-Train
Using config 'default' for dataset Vipplav/phase_3_1M


README.md:   0%|          | 0.00/716 [00:00<?, ?B/s]

Using config 'v2' for dataset asusevski/check_repeated_tokens


README.md:   0%|          | 0.00/5.93k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Using config 'default' for dataset zephyr-1111/x_dataset_0707238


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/286 [00:00<?, ?B/s]

Using config 'default' for dataset ohsuz/fineweb-edu-2024-10-from-7M-to-8M


README.md:   0%|          | 0.00/773 [00:00<?, ?B/s]

Using config 'default' for dataset ContextSearchLM/context_search_vietnamese_english_prompt_97_minilmtok_finetune


Resolving data files:   0%|          | 0/2213 [00:00<?, ?it/s]

Using config 'default' for dataset cuifeng/flicker-cc


Resolving data files:   0%|          | 0/2213 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/2213 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Using config 'default' for dataset Query-of-CC/Knowledge_Pile


Resolving data files:   0%|          | 0/359 [00:00<?, ?it/s]

Using config 'default' for dataset leiwx52/CC_eng_url


Resolving data files:   0%|          | 0/359 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/359 [00:00<?, ?it/s]



README.md:   0%|          | 0.00/5.71k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Using config 'default' for dataset marry-1111/x_dataset_050976


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

Using config 'default' for dataset seonglae/data-rtsum


README.md:   0%|          | 0.00/587 [00:00<?, ?B/s]

Using config 'default' for dataset jeggers/more_crosswords


README.md:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Using config 'default' for dataset xinyu1205/recognize-anything-dataset-14m


README.md:   0%|          | 0.00/5.32k [00:00<?, ?B/s]

Using config 'default' for dataset kimbuja/x_dataset_1


README.md:   0%|          | 0.00/895 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Using config 'default' for dataset gyboo/ACN_Dataset


Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


README.md:   0%|          | 0.00/570 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Using config 'default' for dataset DopeorNope/Pandora_source_no_math-v2


Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/15.5k [00:00<?, ?B/s]

Using config 'default' for dataset chcaa/dagw-word-frequencies-by-domain


README.md:   0%|          | 0.00/449 [00:00<?, ?B/s]

Using config 'default' for dataset mickume/alt_pantheon


README.md:   0%|          | 0.00/474 [00:00<?, ?B/s]

Using config 'default' for dataset nsarrazin/lichess-games-2017-11


README.md:   0%|          | 0.00/384 [00:00<?, ?B/s]

Using config 'default' for dataset HugginJake/ccmatrix10-it-de


README.md:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Using config 'default' for dataset marcov/dbpedia_14_promptsource


README.md:   0%|          | 0.00/576 [00:00<?, ?B/s]

Using config 'default' for dataset lucianosb/cetacean-ptbr


README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Using config 'default' for dataset Saxo/en_ko_translation_tech_science_linkbricks_single_dataset_with_prompt_text_huggingface


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Using config 'default' for dataset BAAI/IndustryCorpus2_petrochemical


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/756 [00:00<?, ?B/s]

Using config 'instructions' for dataset khangdzox/vietnamese-finetune-various


README.md:   0%|          | 0.00/20.0 [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

Using config 'default' for dataset Plim/language_model_fr
Using config 'default' for dataset amongglue/books3-subset-raw


README.md:   0%|          | 0.00/447 [00:00<?, ?B/s]

Using config 'default' for dataset jerome-white/arena-bt-stan
Using config 'default' for dataset breadlicker45/test-bread-tokenizer


README.md:   0%|          | 0.00/5.64k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Using config 'default' for dataset SAVE0x0/reddit_dataset_218


Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Using config 'default' for dataset pkuAI4M/test_extract_mathlib_notype


README.md:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

Using config 'default' for dataset Linly-AI/Chinese-pretraining-dataset


README.md:   0%|          | 0.00/475 [00:00<?, ?B/s]

Using config 'default' for dataset tyzhu/synpre_copy_1M


README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

Using config 'default' for dataset nateraw/us-accidents


README.md:   0%|          | 0.00/492 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

Using config 'default' for dataset sade-adrien/redpajama_v2_sample_10M


Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Using config 'default' for dataset apple/DataComp-12M


README.md:   0%|          | 0.00/363 [00:00<?, ?B/s]

Using config 'default' for dataset muhammadnoman76/lughaat-urdu-dataset-llm
Using config 'default' for dataset SetFit/amazon_polarity


README.md:   0%|          | 0.00/333 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/223 [00:00<?, ?it/s]

Using config 'default' for dataset amuvarma/emilia-snac-merged-18m-mod7-delay


Resolving data files:   0%|          | 0/223 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/223 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/223 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/223 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/472 [00:00<?, ?B/s]

Using config 'default' for dataset nsarrazin/lichess-games-2016-07


README.md:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Using config 'lenti_MPRA_HepG2' for dataset gonzalobenegas/tang_koo_eval


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/2559 [00:00<?, ?it/s]

Using config 'default' for dataset labofsahil/github-event-dataset-2014


Resolving data files:   0%|          | 0/2559 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/2559 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/442 [00:00<?, ?B/s]

Using config 'default' for dataset yxdu/Translation2019zh
Using config 'default' for dataset Tommert25/completemoroccorp


README.md:   0%|          | 0.00/283 [00:00<?, ?B/s]

Using config 'default' for dataset yiyic/oscar_cmn_Hani_train


README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Using config 'default' for dataset Owen1u/FinePersonas-emails


README.md:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

Using config 'cleaned_formulas' for dataset OleehyO/latex-formulas


README.md:   0%|          | 0.00/375 [00:00<?, ?B/s]

Using config 'default' for dataset ibivibiv/math_instruct_smaller_18


README.md:   0%|          | 0.00/332 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Using config 'default' for dataset ncfrey/calm


Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/477 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

Using config 'default' for dataset jan-hq/instruction-convert-audio-whispervq-llama3.2-dedup


Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/3.48k [00:00<?, ?B/s]

Using config 'default' for dataset Roronotalt/bluesky-ten-million
Using config 'default' for dataset zichengno1/GC_depth_1


README.md:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

Using config 'default' for dataset loubnabnl/kaggle_scripts_new_format_subset


Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/510 [00:00<?, ?B/s]

Using config 'default' for dataset armanibadboy/kazllmdataset


README.md:   0%|          | 0.00/84.0 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Using config 'default' for dataset binwang/RSE-sentence-relational-data


Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


README.md:   0%|          | 0.00/405 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Using config 'default' for dataset OLAResearch/KOREAN-WEBTEXT


Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/367 [00:00<?, ?B/s]

Using config 'default' for dataset nhagar/CC-MAIN-2019-35_urls


README.md:   0%|          | 0.00/291 [00:00<?, ?B/s]

Using config 'default' for dataset gorkemsevinc/Customer_Support_on_Twitter


README.md:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Using config 'default' for dataset cahya/soda-id


README.md:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

dataset_infos.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Using config 'default' for dataset tomekkorbak/pile-pii-scrubadub


Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/334 [00:00<?, ?B/s]

Using config 'default' for dataset haor/OpenMid-Dataset


README.md:   0%|          | 0.00/651 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/170 [00:00<?, ?it/s]

Using config 'human' for dataset bloyal/oas-paired-sequence-data


Resolving data files:   0%|          | 0/170 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/170 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/170 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/170 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Error 1 processing fastx-ai/Fastx-Law-2016-2021-Full-Dataset: Dataset 'fastx-ai/Fastx-Law-2016-2021-Full-Dataset' is a gated dataset on the Hub. Visit the dataset page at https://huggingface.co/datasets/fastx-ai/Fastx-Law-2016-2021-Full-Dataset to ask for access.


dataset_infos.json:   0%|          | 0.00/767 [00:00<?, ?B/s]

Using config 'default' for dataset h4iku/coconut_java2006_preprocessed


README.md:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

Using config 'default' for dataset Ganz00/reddit-comments-cleaned-for-tg


README.md:   0%|          | 0.00/417 [00:00<?, ?B/s]

Error 2 processing vietgpt/OSCAR-2201: Dataset 'vietgpt/OSCAR-2201' is a gated dataset on the Hub. Visit the dataset page at https://huggingface.co/datasets/vietgpt/OSCAR-2201 to ask for access.


README.md:   0%|          | 0.00/6.19k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/137 [00:00<?, ?it/s]

Using config 'subset_0' for dataset japanese-asr/en_asr.mls


Resolving data files:   0%|          | 0/137 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/137 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/137 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/137 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/325 [00:00<?, ?B/s]

Using config 'default' for dataset Geonmo/sam-llava-captions-only
Using config 'default' for dataset bromleyp/weather_data_bavaria


README.md:   0%|          | 0.00/5.20k [00:00<?, ?B/s]

Using config 'default' for dataset cthorrez/EsportsBenchTest
Using config 'default' for dataset Alignment-Lab-AI/Linearized-raw


README.md:   0%|          | 0.00/25.4k [00:00<?, ?B/s]

Using config 'bg' for dataset mteb/eurlex-multilingual


README.md:   0%|          | 0.00/6.30k [00:00<?, ?B/s]

Error 3 processing FremyCompany/BioLORD-Dataset: Dataset 'FremyCompany/BioLORD-Dataset' is a gated dataset on the Hub. Visit the dataset page at https://huggingface.co/datasets/FremyCompany/BioLORD-Dataset to ask for access.


README.md:   0%|          | 0.00/506 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Using config 'default' for dataset KathirKs/CC-MAIN-2017-13_row_wise_20240926_154028


Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/517 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

Using config 'default' for dataset 1231czx/stage2_ep1_6_first8


Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

Using config 'default' for dataset hieule/news_corpus_v1_p1


Unnamed: 0,does_have_urls,id,hf_url
653,False,alexantonov/chuvash_mono,https://huggingface.co/datasets/alexantonov/chuvash_mono
12406,False,NeutrinoPit/OpenSubtitles2024-en-ar-batch49,https://huggingface.co/datasets/NeutrinoPit/OpenSubtitles2024-en-ar-batch49
6715,False,yiyic/oscar_es_train,https://huggingface.co/datasets/yiyic/oscar_es_train
7901,False,gowitheflow/supervised-multilingual,https://huggingface.co/datasets/gowitheflow/supervised-multilingual
1499,False,Babelscape/multinerd,https://huggingface.co/datasets/Babelscape/multinerd
...,...,...,...
6411,False,mteb/eurlex-multilingual,https://huggingface.co/datasets/mteb/eurlex-multilingual
1437,False,FremyCompany/BioLORD-Dataset,https://huggingface.co/datasets/FremyCompany/BioLORD-Dataset
9164,False,KathirKs/CC-MAIN-2017-13_row_wise_20240926_154028,https://huggingface.co/datasets/KathirKs/CC-MAIN-2017-13_row_wise_20240926_154028
10101,False,1231czx/stage2_ep1_6_first8,https://huggingface.co/datasets/1231czx/stage2_ep1_6_first8


In [8]:
ds_df_sample.sort_values(by="does_have_urls").to_csv('sample_with_url_labels.csv',index=False)
ds_df_sample.does_have_urls.value_counts()

Unnamed: 0_level_0,count
does_have_urls,Unnamed: 1_level_1
False,94
True,6


# Check for Multiple Sources