# 创建GitHub问题语料库

## 1、获取数据

### 在Python中发出HTTP请求的常用方式是使用requests库

In [1]:
import requests

# Ask the GitHub REST API for the issues of huggingface/datasets, limited to
# the first page with a single result per page (see the query string).
url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)

### 该对象包含许多有关请求的有用信息，包括 HTTP 状态代码

In [2]:
# HTTP status code of the response (200 in the recorded run below)
response.status_code

200

### HTTP状态代码列表
https://en.wikipedia.org/wiki/List_of_HTTP_status_codes

In [3]:
# Decode the response payload as JSON; the recorded output is a list holding one issue dict
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/5814',
  'repository_url': 'https://api.github.com/repos/huggingface/datasets',
  'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/5814/labels{/name}',
  'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/5814/comments',
  'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/5814/events',
  'html_url': 'https://github.com/huggingface/datasets/pull/5814',
  'id': 1693216778,
  'node_id': 'PR_kwDODunzps5PoOQ9',
  'number': 5814,
  'title': 'Repro windows crash',
  'user': {'login': 'maddiedawson',
   'id': 106995444,
   'node_id': 'U_kgDOBmCe9A',
   'avatar_url': 'https://avatars.githubusercontent.com/u/106995444?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/maddiedawson',
   'html_url': 'https://github.com/maddiedawson',
   'followers_url': 'https://api.github.com/users/maddiedawson/followers',
   'following_url': 'https://api.github.c

### 未经身份验证时，GitHub每小时限制60个请求；创建个人访问令牌后，限额可提高到每小时5000个请求

In [None]:
# Placeholder personal access token. It is a string literal so the cell runs
# as written; replace "xxx" with your own token (and avoid committing it).
GITHUB_TOKEN = "xxx"  # Copy your GitHub token here
# The token is sent with every API request through the Authorization header.
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

### 为防止GITHUB_TOKEN泄露，可以将token存入.env文件，并使用python-dotenv库作为环境变量自动加载该文件

In [4]:
from dotenv import load_dotenv

# Load the variables stored in ./data/.env as environment variables so the
# token does not have to appear in the notebook; the call returned True in the
# recorded run.
load_dotenv(dotenv_path="./data/.env", verbose=True)

True

In [7]:
from dotenv import dotenv_values

# Read ./data/.env as a mapping of variable names to values (an OrderedDict in
# the recorded output). Displaying it prints the stored token, so clear the
# cell output before sharing the notebook.
config = dotenv_values("./data/.env")
config

OrderedDict([('TOKEN', 'my_token')])

In [8]:
import os

# Read the token from the TOKEN environment variable (populated from the .env
# file by load_dotenv); os.getenv returns None when the variable is missing.
GITHUB_TOKEN = os.getenv("TOKEN")
# Build the request headers from the token that was just loaded, so later
# requests authenticate with it rather than with a stale or undefined value.
# Without a token, fall back to unauthenticated requests instead of sending
# the literal string "token None".
headers = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}
GITHUB_TOKEN

'my_token'

### 下载数据
    结果存储在repository_name-issues.jsonl，每一行都是一个JSON对象，表示一个问题

In [None]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    """Download a repository's issues from the GitHub REST API to JSON Lines.

    Issues are requested 100 per page with ``state=all`` (open and closed) and
    written to ``{issues_path}/{repo}-issues.jsonl``, one JSON object per line.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name; also used as the output file prefix.
        num_issues: Upper bound used to compute how many pages to request.
            Whole pages are kept, so up to one page more may be stored.
        rate_limit: Once more than this many issues have accumulated in the
            current batch, the function sleeps for one hour before going on.
        issues_path: Directory for the output file (``str`` or ``Path``);
            created, including missing parents, if it does not exist.

    Raises:
        requests.HTTPError: If GitHub answers a page request with an error
            status (for example bad credentials or an exhausted rate limit).
    """
    issues_path = Path(issues_path)
    # parents=True lets a nested output directory be created in one call.
    issues_path.mkdir(parents=True, exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    # GitHub's `page` parameter is 1-based (its documented default is 1).
    # Iterating 0..num_pages-1 never requests the final page.
    for page in tqdm(range(1, num_pages + 1)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        response = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        # Fail loudly on an error response; otherwise the error payload (a
        # dict) would be silently extended into the batch as if it were issues.
        response.raise_for_status()
        page_issues = response.json()
        if not page_issues:
            # An empty page means the repository has no further issues.
            break
        batch.extend(page_issues)

        # NOTE(review): this compares the number of *issues* in the batch with
        # rate_limit, although GitHub's limit counts requests — confirm intent.
        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print("Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [None]:
# Depending on your internet connection, this can take several minutes to run...
# With the default arguments this targets huggingface/datasets and writes
# datasets-issues.jsonl into the current directory.
fetch_issues()

In [None]:
# `load_dataset` comes from the Hugging Face `datasets` library. It is imported
# here because no earlier cell brings it into scope, so the call below would
# otherwise fail with NameError.
from datasets import load_dataset

# Load the downloaded JSON Lines file as a single "train" split.
issues_dataset = load_dataset("json", data_files="datasets-issues.jsonl", split="train")
issues_dataset

In [None]:
"""
Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app'],
    num_rows: 3019
})
"""

## 2、清理数据

### pull请求也是问题，pull_request可以区分问题和pull请求

In [None]:
# Draw three examples after shuffling with a fixed seed.
sample = issues_dataset.shuffle(seed=666).select(range(3))

# Show each example's HTML URL next to its pull_request entry.
urls, pull_requests = sample["html_url"], sample["pull_request"]
for url, pr in zip(urls, pull_requests):
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

In [None]:
"""
>> URL: https://github.com/huggingface/datasets/pull/850
>> Pull request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/850', 'html_url': 'https://github.com/huggingface/datasets/pull/850', 'diff_url': 'https://github.com/huggingface/datasets/pull/850.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/850.patch'}

>> URL: https://github.com/huggingface/datasets/issues/2773
>> Pull request: None

>> URL: https://github.com/huggingface/datasets/pull/783
>> Pull request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/783', 'html_url': 'https://github.com/huggingface/datasets/pull/783', 'diff_url': 'https://github.com/huggingface/datasets/pull/783.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/783.patch'}
"""

### 通过Pull request区分问题与pull请求

In [None]:
def _flag_pull_request(example):
    """Return the `is_pull_request` flag for one example of the dataset."""
    return {"is_pull_request": example["pull_request"] is not None}


# Add a boolean `is_pull_request` column: False when the `pull_request` field
# is None, True otherwise.
issues_dataset = issues_dataset.map(_flag_pull_request)

1. Dataset.filter()函数对于过滤掉pull请求和未关闭（open）的问题非常有用
2. Dataset.set_format()函数可以将数据集转换为DataFrame，从而轻松地操作created_at和closed_at时间戳。作为加分练习，可以计算关闭一个问题所需的平均时间。

### 虽然我们可以通过删除或重命名一些列来进一步清理数据集，但在这个阶段可能保持数据集的原始状态会更好

## 3、增强数据集

### 增加注释以丰富信息来源
    GitHub REST API提供了一个Comments端点，该端点返回与一个问题号相关联的所有注释。

In [None]:
# Fetch the comments attached to one issue number through the Comments
# endpoint, sending the authentication headers defined earlier.
issue_number = 2792
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
# Decode the JSON payload: the recorded output is a list of comment dicts
response.json()

In [None]:
"""
[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/comments/897594128',
  'html_url': 'https://github.com/huggingface/datasets/pull/2792#issuecomment-897594128',
  'issue_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792',
  'id': 897594128,
  'node_id': 'IC_kwDODunzps41gDMQ',
  'user': {'login': 'bhavitvyamalik',
   'id': 19718818,
   'node_id': 'MDQ6VXNlcjE5NzE4ODE4',
   'avatar_url': 'https://avatars.githubusercontent.com/u/19718818?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/bhavitvyamalik',
   'html_url': 'https://github.com/bhavitvyamalik',
   'followers_url': 'https://api.github.com/users/bhavitvyamalik/followers',
   'following_url': 'https://api.github.com/users/bhavitvyamalik/following{/other_user}',
   'gists_url': 'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/bhavitvyamalik/subscriptions',
   'organizations_url': 'https://api.github.com/users/bhavitvyamalik/orgs',
   'repos_url': 'https://api.github.com/users/bhavitvyamalik/repos',
   'events_url': 'https://api.github.com/users/bhavitvyamalik/events{/privacy}',
   'received_events_url': 'https://api.github.com/users/bhavitvyamalik/received_events',
   'type': 'User',
   'site_admin': False},
  'created_at': '2021-08-12T12:21:52Z',
  'updated_at': '2021-08-12T12:31:17Z',
  'author_association': 'CONTRIBUTOR',
  'body': "@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = 'gooaq'\r\n\r\n    def test_load_dataset(self, dataset_name):\r\n        configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n>       self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n    self.parent.assertTrue(len(dataset[split]) > 0)\r\nE   AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?",
  'performed_via_github_app': None}]
"""

### 注释在body字段中

In [None]:
def get_comments(issue_number, owner="huggingface", repo="datasets"):
    """Return the text of every comment on one issue or pull request.

    Args:
        issue_number: Number of the issue (or pull request) to look up.
        owner: Account or organisation that owns the repository.
        repo: Repository name.

    Returns:
        A list with the ``body`` string of each comment, in the order the
        API returns them; empty when there are no comments.

    Raises:
        requests.HTTPError: If GitHub answers with an error status.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/comments"
    per_page = 100  # Largest page size documented for the GitHub REST API
    bodies = []
    page = 1
    # Walk every page: a single request only returns the first page of
    # comments, which would truncate long discussions.
    while True:
        response = requests.get(
            url, headers=headers, params={"per_page": per_page, "page": page}
        )
        # Surface API errors instead of indexing into an error payload.
        response.raise_for_status()
        comments = response.json()
        bodies.extend(comment["body"] for comment in comments)
        if len(comments) < per_page:
            # A short (or empty) page is the last one.
            return bodies
        page += 1


# Test our function works as expected
get_comments(2792)

In [None]:
"""
["@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = 'gooaq'\r\n\r\n    def test_load_dataset(self, dataset_name):\r\n        configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n>       self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n    self.parent.assertTrue(len(dataset[split]) > 0)\r\nE   AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?"]
"""

### 为每个问题添加一个新的注释列

In [None]:
# Depending on your internet connection, this can take a few minutes...
# get_comments() is called once per example, i.e. one API lookup per issue.
# NOTE(review): the dataset already lists a `comments` feature; presumably this
# map replaces it with the list of comment bodies — confirm.
issues_with_comments_dataset = issues_dataset.map(
    lambda x: {"comments": get_comments(x["number"])}
)

## 4、将数据集上传到Hugging Face Hub

### 登录Hugging Face Hub

In [None]:
from huggingface_hub import notebook_login

notebook_login()
# Log in to the Hugging Face Hub by entering your username and password; the API token is stored in ~/.huggingface/token
# From a terminal, run   huggingface-cli login

### 上传数据集

In [None]:
# Upload the dataset (issues plus their comments) to the Hub as "github-issues"
issues_with_comments_dataset.push_to_hub("github-issues")

### 数据集下载

In [None]:
# Download the "train" split of the lewtun/github-issues dataset from the Hub.
# NOTE(review): load_dataset is not imported in this cell; presumably
# `from datasets import load_dataset` is required beforehand — confirm.
remote_dataset = load_dataset("lewtun/github-issues", split="train")
remote_dataset

In [None]:
"""
Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 2855
})
"""

## 5、创建数据集卡

1. 创建YAML格式的元数据标记，用于各种搜索功能，便于查找。需要克隆 https://huggingface.co/spaces/huggingface/datasets-tagging 到本地运行
2. 创建数据卡指南  https://github.com/huggingface/datasets/blob/main/templates/README_guide.md
3. 创建README.md