In [None]:
import requests
# Smoke-test the GitHub REST API by requesting a single issue from the
# huggingface/datasets repository (first page, one item per page).
url = 'https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url=url, timeout=30)
# Fail here, with the HTTP status, instead of letting a later cell display
# an error payload as if it were issue data.
response.raise_for_status()

In [None]:
# Show the decoded JSON body of the test request as the cell's output.
response.json()

In [None]:
import os

# Read the personal access token from the GITHUB_TOKEN environment variable
# so that no credential is ever stored in the notebook itself.
github_token = os.environ.get('GITHUB_TOKEN', '')
# Only send an Authorization header when a token is actually available; with
# an empty dict the requests go out unauthenticated (GitHub documents a much
# lower rate limit for unauthenticated calls, so set GITHUB_TOKEN for bulk runs).
headers = {'Authorization': f'token {github_token}'} if github_token else {}

In [None]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    """Download a repository's issues from the GitHub REST API into a JSON Lines file.

    Pages through ``/repos/{owner}/{repo}/issues`` with ``state=all`` and
    writes every returned record to ``{issues_path}/{repo}-issues.jsonl``.
    Requests are sent with the module-level ``headers`` dict.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name; also used as the prefix of the output file.
        num_issues: Target number of issues; determines how many pages of
            100 are requested (rounded up to a whole page).
        rate_limit: Number of buffered issues (not requests) after which the
            buffer is flushed and the function sleeps for one hour.
        issues_path: Directory for the output file (``str`` or ``Path``);
            created, including parents, when it does not exist.

    Raises:
        requests.HTTPError: If GitHub answers a page request with an error
            status (for example when the token is invalid or rate-limited).
    """
    issues_path = Path(issues_path)
    issues_path.mkdir(parents=True, exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    # GitHub's `page` parameter is 1-based; iterating from 0 never requested
    # the final page.
    for page in tqdm(range(1, num_pages + 1)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        response = requests.get(
            f"{base_url}/{owner}/{repo}/{query}", headers=headers, timeout=30
        )
        # An error response is a JSON object; extending the batch with it
        # would silently add its keys as if they were issues.
        response.raise_for_status()
        page_issues = response.json()
        if not page_issues:
            # An empty page means the repository has no more issues.
            break
        batch.extend(page_issues)

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print("Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [None]:
# Run the download with the default arguments: the huggingface/datasets
# repository, up to 10,000 issues, written to the current directory.
fetch_issues()

In [None]:
import json
# Fields to keep for every issue; everything else (including the timestamp
# columns) is dropped from the trimmed copy.
output_keys = [
    'url', 'id', 'number', 'title', 'user', 'labels',
    'state', 'comments', 'closed_at', 'pull_request', 'body', 'html_url',
]

# Stream the raw dump line by line and write a trimmed copy next to it.
with open('datasets-issues.jsonl', 'r', encoding='utf-8') as fin, \
     open('github-issues-no-timestamp.jsonl', 'w', encoding='utf-8') as fout:
    for line in fin:
        full_record = json.loads(line.strip())
        # Filtering the items keeps the surviving keys in their original order.
        record = {
            key: value
            for key, value in full_record.items()
            if key in output_keys
        }
        fout.write(json.dumps(record) + '\n')

In [None]:
# Sanity check: decode and print only the first record of the trimmed file.
with open('github-issues-no-timestamp.jsonl', 'r', encoding='utf-8') as f:
    first_line = next(f, None)  # None when the file is empty
    if first_line is not None:
        record = json.loads(first_line.strip())
        print(record)

In [None]:
from datasets import load_dataset
# Load the trimmed JSON Lines file with the generic 'json' loader and take
# its 'train' split.
issues_dataset = load_dataset(
    'json',
    data_files="github-issues-no-timestamp.jsonl",
    split='train',
)

In [None]:
# Look at three shuffled examples (fixed seed) to compare each issue URL with
# its `pull_request` field.
sample = issues_dataset.shuffle(seed=42).select(range(3))
sample_urls = sample['html_url']
sample_pull_requests = sample['pull_request']
for html_url, pull_request in zip(sample_urls, sample_pull_requests):
    print(f'>> url:{html_url}', f'>> pull_request:{pull_request}\n', sep='\n')

In [None]:
# Add a boolean `is_pull_request` column: True whenever the `pull_request`
# field of the record is populated.
issues_dataset = issues_dataset.map(
    lambda issue: {'is_pull_request': issue['pull_request'] is not None}
)

In [None]:
import time
def get_comments(issue_number):
    """Return the bodies of all comments on a huggingface/datasets issue.

    Calls the issue's ``/comments`` endpoint with the module-level ``headers``
    and follows the ``next`` pagination link until every page has been read,
    sleeping one second after each request.

    Args:
        issue_number: Number of the issue (or pull request) in the repository.

    Returns:
        A list with the ``body`` of each comment, in the order returned by
        the API; empty when the issue has no comments.

    Raises:
        requests.HTTPError: If GitHub answers with an error status.
    """
    url = f'https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments'
    params = {'per_page': 100}  # fewer round trips than the API's default page size
    bodies = []
    while url:
        response = requests.get(url=url, headers=headers, params=params, timeout=30)
        # An error payload is a JSON object, not a list of comments; surface
        # the HTTP error instead of failing inside the comprehension below.
        response.raise_for_status()
        time.sleep(1)  # pause for one second after every request
        bodies.extend(r['body'] for r in response.json())
        # `links` is parsed from the Link header; no "next" entry means this
        # was the last page.
        url = response.links.get('next', {}).get('url')
        params = None  # the "next" URL already carries its own query string
    return bodies

In [None]:
# For every issue, fetch its comment bodies and return them under the
# 'comments' key (one or more API calls per row, so this cell is slow).
issues_dataset_with_comments = issues_dataset.map(
    lambda issue: {'comments': get_comments(issue['number'])}
)