<a href="https://colab.research.google.com/github/NHagar/cc-genealogy/blob/automate-labels/analysis/get_hf_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get all text-generation datasets

In [39]:
from huggingface_hub import HfApi
import pandas as pd
from tqdm import tqdm

In [2]:
# Hugging Face Hub client reused by the cells below for list_datasets and
# list_models; no token or endpoint is passed explicitly here.
api = HfApi()

In [3]:
# Materialize the listing of Hub datasets filtered to the text-generation task
# category, so it can be counted and iterated over more than once.
task_filter = ["text-generation"]
ds = list(api.list_datasets(task_categories=task_filter))

# Get datasets that meet the inclusion criteria
- Listed as a training dataset by at least one model on the Hugging Face Hub

In [35]:
# For every dataset, look up the models that declare it as a training dataset
# and keep one {dataset_id: [model_ids]} mapping per dataset with any match.
datasets_with_models = []

for dataset_info in tqdm(ds):
    # One Hub request per dataset; collect just the model IDs from the results.
    model_ids = [model.id for model in api.list_models(trained_dataset=dataset_info.id)]
    if model_ids:
        datasets_with_models.append({dataset_info.id: model_ids})

100%|██████████| 8106/8106 [09:47<00:00, 13.81it/s]  


# Cross-Reference and format for export

In [36]:
# Report how many datasets the task filter returned, then preview the first five.
total_datasets = len(ds)
print(f"Total datasets: {total_datasets}")
ds[:5]

Total datasets: 8106


[DatasetInfo(id='Congliu/Chinese-DeepSeek-R1-Distill-data-110k', author='Congliu', sha='8520b649430617c2be4490f424d251d09d835ed3', created_at=datetime.datetime(2025, 2, 17, 11, 45, 9, tzinfo=datetime.timezone.utc), last_modified=datetime.datetime(2025, 2, 21, 2, 18, 8, tzinfo=datetime.timezone.utc), private=False, gated=False, disabled=False, downloads=2825, downloads_all_time=None, likes=340, paperswithcode_id=None, tags=['task_categories:text-generation', 'task_categories:text2text-generation', 'task_categories:question-answering', 'language:zh', 'license:apache-2.0', 'size_categories:100K<n<1M', 'format:json', 'modality:tabular', 'modality:text', 'library:datasets', 'library:pandas', 'library:mlcroissant', 'library:polars', 'region:us'], trending_score=324, card_data=None, siblings=None),
 DatasetInfo(id='facebook/natural_reasoning', author='facebook', sha='99eea5dc6bfa45a925eb42600e81dc90377ba237', created_at=datetime.datetime(2025, 1, 30, 23, 29, 32, tzinfo=datetime.timezone.utc),

In [37]:
# Summarize how many datasets are referenced by at least one model, both as a
# raw count and as a share of all datasets, then preview the first five mappings.
cited_count = len(datasets_with_models)
cited_percent = cited_count / len(ds) * 100
print(f"Datasets cited by HF models: {cited_count}")
print(f"Percent of total: {cited_percent:.2f}%")
datasets_with_models[:5]

Datasets cited by HF models: 1729
Percent of total: 21.33%


[{'Congliu/Chinese-DeepSeek-R1-Distill-data-110k': ['Ansh989/Ansh',
   'Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v4-reasoning',
   'mradermacher/NQLSG-Qwen2.5-14B-MegaFusion-v4-reasoning-GGUF',
   'Ansh989/Chatbot',
   'YuRiVeRTi/VQ1',
   'Richie420/Richiejay',
   'fedoravel/test',
   'Abdelrahman-Ahmed-Sobhy-dev/Islamic-AI']},
 {'facebook/natural_reasoning': ['Albi96/iii']},
 {'Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT': ['YuRiVeRTi/VQ1']},
 {'FreedomIntelligence/medical-o1-reasoning-SFT': ['FreedomIntelligence/HuatuoGPT-o1-7B',
   'FreedomIntelligence/HuatuoGPT-o1-8B',
   'FreedomIntelligence/HuatuoGPT-o1-70B',
   'FreedomIntelligence/HuatuoGPT-o1-72B',
   'bartowski/HuatuoGPT-o1-8B-GGUF',
   'Triangle104/HuatuoGPT-o1-7B-Q4_K_M-GGUF',
   'bartowski/HuatuoGPT-o1-7B-GGUF',
   'QuantFactory/HuatuoGPT-o1-8B-GGUF',
   'QuantFactory/HuatuoGPT-o1-7B-GGUF',
   'mlx-community/HuatuoGPT-o1-72B-4bit',
   'carsenk/llama3.2_1b_2025_uncensored_v2',
   'kingabzpro/DeepSeek-R1-Medical-COT',
   '

In [48]:
# Flatten the list of single-key {dataset_id: [model_ids]} dicts into one
# record per dataset.
dataset_model_records = [
    {"dataset": dataset_id, "models": model_ids}
    for mapping in datasets_with_models
    for dataset_id, model_ids in mapping.items()
]
# Declare the columns in the constructor: this replaces the redundant
# post-hoc `df.columns = [...]` assignment, which raised a length-mismatch
# ValueError whenever no dataset had any associated model (empty input).
df = pd.DataFrame(dataset_model_records, columns=["dataset", "models"])

# Turn each dataset ID into its Hugging Face Hub URL (vectorized concatenation
# instead of a per-row lambda).
df["dataset_url"] = "https://huggingface.co/datasets/" + df["dataset"]

# Keep only the unique dataset URLs for export.
df_datasets = df[["dataset_url"]].drop_duplicates()
# Write the URL list to CSV without the index column.
df_datasets.to_csv("datasets_with_models.csv", index=False)