# 创建自己的数据集

## 1、获取数据

In [1]:
import requests

url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)

In [2]:
response.status_code

200

In [3]:
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/6641',
  'repository_url': 'https://api.github.com/repos/huggingface/datasets',
  'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/6641/labels{/name}',
  'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/6641/comments',
  'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/6641/events',
  'html_url': 'https://github.com/huggingface/datasets/issues/6641',
  'id': 2116963132,
  'node_id': 'I_kwDODunzps5-Lks8',
  'number': 6641,
  'title': "unicodedecodeerror: 'utf-8' codec can't decode byte 0xac in position 25: invalid start byte",
  'user': {'login': 'Hughhuh',
   'id': 109789057,
   'node_id': 'U_kgDOBos_gQ',
   'avatar_url': 'https://avatars.githubusercontent.com/u/109789057?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/Hughhuh',
   'html_url': 'https://github.com/Hughhuh',
   'followers_url': 'https://api.github.com/users/Hughhuh/

In [4]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [5]:
import os
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")  # Copy your GitHub token here
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [6]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(
            f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl",
               orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [8]:
# Depending on your internet connection, this can take several minutes to run...
fetch_issues()

  0%|          | 0/100 [00:00<?, ?it/s]

Reached GitHub rate limit. Sleeping for one hour ...
Downloaded all the issues for datasets! Dataset stored at ./datasets-issues.jsonl


In [13]:
from datasets import load_dataset
issues_dataset = load_dataset(
    "json", data_files="./data/issues.jsonl", split="train")
issues_dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

## 2、清理数据

In [None]:
sample = issues_dataset.shuffle(seed=666).select(range(3))

# Print out the URL and pull request entries
for url, pr in zip(sample["html_url"], sample["pull_request"]):
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

In [None]:
def get_comments(issue_number):
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    return [r["body"] for r in response.json()]


# Test our function works as expected
get_comments(2792)

In [None]:
# Depending on your internet connection, this can take a few minutes...
issues_with_comments_dataset = issues_dataset.map(
    lambda x: {"comments": get_comments(x["number"])}
)

## 3、扩充数据

In [None]:
issue_number = 2792
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

## 4、上传huggingface

登录huggingface

In [None]:
from huggingface_hub import notebook_login

notebook_login()

# 命令行输入
# huggingface-cli login

上传数据

In [None]:
issues_with_comments_dataset.push_to_hub("github-issues")

下载使用

In [None]:
remote_dataset = load_dataset("lewtun/github-issues", split="train")
remote_dataset

创建数据集卡片

创建信息性数据集卡片的指南
https://github.com/huggingface/datasets/blob/main/templates/README_guide.md