<a href="https://colab.research.google.com/github/NHagar/cc-genealogy/blob/automate-labels/analysis/get_hf_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get all TextGen Datasets

In [1]:
from huggingface_hub import HfApi

In [2]:
api = HfApi()

In [3]:
ds = list(api.list_datasets(
    task_categories=["text-generation"]
))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
dataset_urls = [f"https://huggingface.com/datasets/{i.id}" for i in ds]

In [5]:
import pandas as pd

df = pd.DataFrame(dataset_urls, columns=['url'])
#df.to_csv('../data/hf_datasets.csv', index=False)

In [6]:
#df.sample(n=100).to_csv('../data/hf_datasets_sample.csv', index=False)

# Get Datasets used by HF Models

In [7]:
# Get list of models
models = list(api.list_models(
    #TODO run without limit
    limit=100,
    task="text-generation"
))
error_count = 0
linked_datasets = []
for model in models:
    try:
        # Get model info
        info = api.model_info(model.id)

        # Check if model has dataset info
        if hasattr(info, 'cardData') and info.cardData.get('datasets'):
            # Create a row for each dataset
            for dataset in info.cardData['datasets']:
                linked_datasets.append({
                    'model': model.id,
                    'dataset_url': f"https://huggingface.co/datasets/{dataset}"
                })

    except Exception as e:
        error_count +=1
        print(f"Error processing model {model.id}: {str(e)}")
        continue

# Create DataFrame and save
print(f"Errors: {error_count}/{len(models)}")
df_from_models = pd.DataFrame(linked_datasets)
df_from_models
#df.to_csv('model_dataset_links.csv', index=False)

Errors: 0/100


Unnamed: 0,model,dataset_url
0,agentica-org/DeepScaleR-1.5B-Preview,https://huggingface.co/datasets/AI-MO/NuminaMa...
1,agentica-org/DeepScaleR-1.5B-Preview,https://huggingface.co/datasets/KbsdJames/Omni...
2,agentica-org/DeepScaleR-1.5B-Preview,https://huggingface.co/datasets/RUC-AIBOX/STIL...
3,agentica-org/DeepScaleR-1.5B-Preview,https://huggingface.co/datasets/hendrycks/comp...
4,smirki/UIGEN-T1-Qwen-7b,https://huggingface.co/datasets/smirki/UI_Reas...
...,...,...
138,prithivMLmods/SmolLM2_135M_Grpo_Checkpoint,https://huggingface.co/datasets/openai/gsm8k
139,TinyLlama/TinyLlama-1.1B-Chat-v1.0,https://huggingface.co/datasets/cerebras/SlimP...
140,TinyLlama/TinyLlama-1.1B-Chat-v1.0,https://huggingface.co/datasets/bigcode/starco...
141,TinyLlama/TinyLlama-1.1B-Chat-v1.0,https://huggingface.co/datasets/HuggingFaceH4/...


# Cross-Reference

In [8]:
print(f"Total datasets: {len(df)}")
df.sample(5)

Total datasets: 8039


Unnamed: 0,url
5955,https://huggingface.com/datasets/multimolecule...
7413,https://huggingface.com/datasets/james-1111/x_...
7542,https://huggingface.com/datasets/kimbuja/x_dat...
4007,https://huggingface.com/datasets/HachiML/ameno...
1584,https://huggingface.com/datasets/dmayhem93/Cha...


In [9]:
print(f"Datasets cited by HF models: {len(df_from_models.dataset_url.unique())}")
df_from_models.sample(5).dataset_url.tolist()
# https://huggingface.co/datasets/milashkaarshif/MoeGirlPedia_wikitext_raw_archive

Datasets cited by HF models: 109


['https://huggingface.co/datasets/OpenCoder-LLM/opc-sft-stage1',
 'https://huggingface.co/datasets/NousResearch/hermes-function-calling-v1',
 'https://huggingface.co/datasets/HuggingFaceTB/smoltalk',
 'https://huggingface.co/datasets/openai/gsm8k',
 'https://huggingface.co/datasets/PocketDoc/Dans-Taskmaxx-DataPrepper']

In [11]:
# Convert URLs to strings and create output list
model_urls = df_from_models['dataset_url'].astype(str).tolist()
ds_urls = df['url'].astype(str).tolist()

# Function to normalize URLs
def normalize_url(url):
    return url.replace('huggingface.com', 'huggingface.co').replace('huggingface.co', 'huggingface.co')

# Normalize all URLs in both lists
model_urls_normalized = [normalize_url(url) for url in model_urls]
ds_urls_normalized = [normalize_url(url) for url in ds_urls]

matching_rows = []

# Loop through each model-dataset pair
for index, row in df_from_models.iterrows():
    dataset_url = normalize_url(str(row['dataset_url']))
    model = row['model']

    # Check if this dataset URL exists in the filtered datasets
    if dataset_url in ds_urls_normalized:
        matching_rows.append({
            'model': model,
            'dataset_url': row['dataset_url']  # Keep original URL in output
        })

# Create new dataframe from matching rows
merged_df = pd.DataFrame(matching_rows)
merged_df

Unnamed: 0,model,dataset_url
0,agentica-org/DeepScaleR-1.5B-Preview,https://huggingface.co/datasets/AI-MO/NuminaMa...
1,cognitivecomputations/Dolphin3.0-R1-Mistral-24B,https://huggingface.co/datasets/NousResearch/h...
2,cognitivecomputations/Dolphin3.0-R1-Mistral-24B,https://huggingface.co/datasets/AI-MO/NuminaMa...
3,cognitivecomputations/Dolphin3.0-R1-Mistral-24B,https://huggingface.co/datasets/AI-MO/NuminaMa...
4,bartowski/agentica-org_DeepScaleR-1.5B-Preview...,https://huggingface.co/datasets/AI-MO/NuminaMa...
5,PocketDoc/Dans-PersonalityEngine-V1.2.0-24b,https://huggingface.co/datasets/PocketDoc/Dans...
6,AXCXEPT/phi-4-deepseek-R1K-RL-EZO,https://huggingface.co/datasets/AI-MO/NuminaMa...
7,cognitivecomputations/Dolphin3.0-Mistral-24B,https://huggingface.co/datasets/NousResearch/h...
8,cognitivecomputations/Dolphin3.0-Mistral-24B,https://huggingface.co/datasets/AI-MO/NuminaMa...
9,cognitivecomputations/Dolphin3.0-Mistral-24B,https://huggingface.co/datasets/AI-MO/NuminaMa...
