In [1]:
import os

In [2]:
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')


In [None]:
import requests
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [5]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  ## 每页返回的 issue 的数量
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # 使用 state=all 进行查询来获取 open 和 closed 的issue
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # 重置batch
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [6]:
from datasets import load_dataset

In [7]:
issues_dataset = load_dataset('json',data_files='datasets-issues.jsonl',split='train')

In [8]:
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'type', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'sub_issues_summary', 'issue_dependencies_summary'],
    num_rows: 5000
})

In [9]:
sample = issues_dataset.shuffle(seed=666).select(range(10))

In [10]:
sample

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'type', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'sub_issues_summary', 'issue_dependencies_summary'],
    num_rows: 10
})

In [11]:
for url,pr in zip(sample['html_url'],sample['pull_request']):
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

>> URL: https://github.com/huggingface/datasets/pull/7209
>> Pull request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/7209', 'html_url': 'https://github.com/huggingface/datasets/pull/7209', 'diff_url': 'https://github.com/huggingface/datasets/pull/7209.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/7209.patch', 'merged_at': datetime.datetime(2024, 10, 9, 16, 4, 7)}

>> URL: https://github.com/huggingface/datasets/pull/4659
>> Pull request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/4659', 'html_url': 'https://github.com/huggingface/datasets/pull/4659', 'diff_url': 'https://github.com/huggingface/datasets/pull/4659.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/4659.patch', 'merged_at': datetime.datetime(2022, 7, 12, 11, 18, 25)}

>> URL: https://github.com/huggingface/datasets/pull/7052
>> Pull request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/7052', 'html_url': 'https://github.com

In [12]:
issues_dataset = issues_dataset.map(lambda x: {'is_pull-rerquest': False if x['pull_request'] is None else True})

In [13]:
issues_dataset[0]['number']

7933

In [14]:
issue_number = 7933

In [15]:
url = f'https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments'

In [16]:
response2 = requests.get(url,headers=headers)

In [17]:
def get_comments(issue_number):
    url = f'https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments'
    response = requests.get(url,headers=headers)
    return [r['body'] for r in response.json()]

In [19]:
issues_with_comments_dataset = issues_dataset.map(lambda x: {'comments': get_comments(x['number'])})

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [21]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
issues_with_comments_dataset.push_to_hub('github-issues')


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/YiMeng-SYSU/github-issues/commit/3d0748ad9c580c769b2e1f56db8d7f3b44715e11', commit_message='Upload dataset', commit_description='', oid='3d0748ad9c580c769b2e1f56db8d7f3b44715e11', pr_url=None, repo_url=RepoUrl('https://hf-mirror.com/datasets/YiMeng-SYSU/github-issues', endpoint='https://hf-mirror.com', repo_type='dataset', repo_id='YiMeng-SYSU/github-issues'), pr_revision=None, pr_num=None)

In [23]:
remote_dataset = load_dataset('YiMeng-SYSU/github-issues',split='train')
remote_dataset

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/9.71M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'type', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'sub_issues_summary', 'issue_dependencies_summary', 'is_pull-rerquest'],
    num_rows: 5000
})