In [92]:
import os
import time
from datetime import datetime

import pandas as pd
from github import Github
from github.GithubException import GithubException
from github.GithubException import RateLimitExceededException

# Root folder for the data used by this notebook.
DATA_DIR = './data'
# Each entry of this folder is mapped to a GitHub repository further down.
DATASETS_DIR = './data/datasets'
# GitHub API token. It is read from the GITHUB_OAUTH_TOKEN environment variable
# so a real credential never has to be saved inside the notebook; the literal
# 'secret' placeholder is kept as the fallback when the variable is not set.
OAUTH_TOKEN = os.environ.get('GITHUB_OAUTH_TOKEN', 'secret')

In [8]:
# GitHub API client authenticated with OAUTH_TOKEN; the fetch loop below uses it
# to look up repositories and to poll the remaining rate limit.
g = Github(OAUTH_TOKEN)

In [106]:
# Dataset directory names containing more than one hyphen: replacing every
# hyphen with a slash would not yield an "owner/repository" path, so the
# correct path is spelled out explicitly.
# NOTE(review): Superset was published as apache/incubator-superset; confirm
# that 'apache-incubator/superset' actually resolves on GitHub.
AMBIGUOUS_NAMES = {
    'apache-incubator-superset': 'apache-incubator/superset',
    'keras-team-keras': 'keras-team/keras',
    'pandas-dev-pandas': 'pandas-dev/pandas',
    'rust-lang-rust': 'rust-lang/rust',
    'scikit-learn-scikit-learn': 'scikit-learn/scikit-learn'
}
# Maps every dataset directory name to its GitHub "owner/repository" path.
DIR_GITHUB_MAPPING = {}

# sorted() makes the crawl order reproducible; os.listdir order is arbitrary.
for dir_name in sorted(os.listdir(DATASETS_DIR)):
    # os.listdir also returns plain files (e.g. .DS_Store, README); those are
    # not projects and would otherwise be turned into bogus repository paths.
    if not os.path.isdir(os.path.join(DATASETS_DIR, dir_name)):
        continue
    if dir_name in AMBIGUOUS_NAMES:
        github_path = AMBIGUOUS_NAMES[dir_name]
    else:
        # Unambiguous names are "<owner>-<repository>".
        github_path = dir_name.replace('-', '/')
    DIR_GITHUB_MAPPING[dir_name] = github_path

In [116]:
# Accumulators filled by the fetch loop below:
# issue id -> issue record, and comment id -> comment record.
issues_data = dict()
comments_data = dict()

# Attribute names copied as-is from every issue object.
descriptors = [
    'body',
    'closed_at',
    'comments',
    'created_at',
    'html_url',
    'number',
    'state',
    'title',
    'updated_at',
]
# Attribute names copied as-is from every issue comment.
comments_descriptors = [
    'body',
    'created_at',
    'updated_at',
]

In [None]:
def extract_issue_data(issues_data, comments_data, issue, github_path):
    """Copy one issue and all of its comments into the given accumulators.

    Args:
        issues_data: dict updated in place; receives ``issue.id -> record``.
        comments_data: dict updated in place; receives ``comment.id -> record``
            for every comment returned by ``issue.get_comments()``.
        issue: object exposing the attributes listed in the module-level
            ``descriptors`` plus ``id``, ``closed_by``, ``user``, ``assignee``,
            ``assignees``, ``labels``, ``milestone``, ``pull_request`` and
            ``get_comments()``.
        github_path: value stored under the ``'project'`` key of the record.

    Returns:
        None. The issue record is stored last, after all comments.
    """

    def attr_or_none(obj, attr):
        # Falsy objects (e.g. a missing milestone) are recorded as None.
        return getattr(obj, attr) if obj else None

    def attrs_or_none(objs, attr):
        # Empty collections are recorded as None rather than as [].
        return [getattr(obj, attr) for obj in objs] if objs else None

    for remark in issue.get_comments():
        remark_record = {field: getattr(remark, field) for field in comments_descriptors}
        remark_record['parent'] = issue.id
        comments_data[remark.id] = remark_record

    record = {field: getattr(issue, field) for field in descriptors}
    record['closed_by'] = attr_or_none(issue.closed_by, 'login')
    record['user'] = issue.user.login
    record['assignee'] = attr_or_none(issue.assignee, 'login')
    record['assignees'] = attrs_or_none(issue.assignees, 'login')
    record['labels'] = attrs_or_none(issue.labels, 'name')
    record['milestone'] = attr_or_none(issue.milestone, 'title')
    record['pull_request'] = attr_or_none(issue.pull_request, 'html_url')
    record['project'] = github_path
    issues_data[issue.id] = record

# A non-rate-limit API error is retried this many times per issue before the
# issue is skipped, so one permanently failing issue cannot stall the crawl.
MAX_ERROR_RETRIES = 5


def wait_for_rate_limit(github_path):
    """Sleep in 10-minute steps until at least 4000 core requests are available.

    Args:
        github_path: repository being fetched; only used in the log message.
    """
    print('Rate limit exceeded. Sleeping (time: {})... ({})'.format(datetime.now(), github_path))
    remaining = 0
    while remaining < 4000:
        time.sleep(10 * 60)
        remaining = g.get_rate_limit().core.remaining


def fetch_issue(issue, github_path):
    """Store one issue in ``issues_data``/``comments_data``, retrying on API errors.

    Rate-limit errors are waited out and retried without limit. Any other
    ``GithubException`` is printed and retried after 60 seconds, at most
    ``MAX_ERROR_RETRIES`` times.

    Returns:
        True if the issue is stored (or was stored already), False if it was
        skipped after exhausting the retries.
    """
    failures = 0
    while True:
        try:
            if issue.id not in issues_data:
                extract_issue_data(issues_data, comments_data, issue, github_path)
            return True
        except RateLimitExceededException:
            wait_for_rate_limit(github_path)
        except GithubException as exc:
            print(exc)
            failures += 1
            if failures >= MAX_ERROR_RETRIES:
                print('Giving up on issue {} of {} after {} failed attempts'.format(
                    issue.number, github_path, failures))
                return False
            time.sleep(60)


for github_path in DIR_GITHUB_MAPPING.values():
    print('Fetching issues for {}'.format(github_path))
    listed = False
    while not listed:
        try:
            repo = g.get_repo(github_path)
            # get_issues() is paginated lazily, so iterating it performs API
            # requests that can hit the rate limit as well.
            for issue in repo.get_issues(state='all'):
                fetch_issue(issue, github_path)
            listed = True
        except RateLimitExceededException:
            # The listing cannot be resumed mid-page: wait, then list again.
            # Issues already stored are skipped by fetch_issue, so only the
            # page requests are repeated.
            wait_for_rate_limit(github_path)

Fetching issues for pandas-dev/pandas
Rate limit exceeded. Sleeping (time: 2018-11-25 00:48:16.554675)... (pandas-dev/pandas)
Rate limit exceeded. Sleeping (time: 2018-11-25 01:47:02.831549)... (pandas-dev/pandas)


In [109]:
# One row per collected issue; the keys of issues_data (issue ids) become the index.
issues_df = pd.DataFrame.from_dict(issues_data, orient='index')

In [110]:
# One row per collected comment; the keys of comments_data (comment ids) become the index.
comments_df = pd.DataFrame.from_dict(comments_data, orient='index')

In [111]:
# Count the number of retrieved issues for each project.
# Elements 3 and 4 of the '/'-split html_url are joined as "<owner>/<repository>"
# (e.g. pandas-dev/pandas in the output below).
issues_df.html_url.str.split('/').apply(lambda x: x[3] + '/' + x[4]).value_counts()

pandas-dev/pandas    1963
Name: html_url, dtype: int64

In [113]:
# Write both tables under DATA_DIR so the output location follows the notebook
# configuration instead of a second, hard-coded copy of the path.
comments_df.to_csv(os.path.join(DATA_DIR, 'github_comments.csv'))
issues_df.to_csv(os.path.join(DATA_DIR, 'github_issues.csv'))