# 1. Arranging required packages

## 1.1. Installing packages



> Install PyGithub package :



In [1]:
%%capture
pip install PyGithub


## 1.2. Importing packages



> Import necessary packages :



In [2]:
from github import UnknownObjectException
from github import Github
from time import sleep
import csv
import sys


# 2. Environment preparation

## 2.1. GitHub API Token



> Use the token from your GitHub account :



In [3]:
# Personal access token handed to the PyGithub client created in section 2.2.
# NOTE(review): avoid saving/committing the notebook with a real token in it.
ACCESS_TOKEN = '' # Put your GitHub API Token here


## 2.2. Initialize Github Client

In [4]:
# Single PyGithub client shared by the extraction cells (g.get_user / g.get_repo).
g = Github(ACCESS_TOKEN)


## 2.3. Utility Functions

> Write CSV file header :

In [5]:
def save_header(file_path, header):
    """Create (or truncate) the CSV at `file_path` and write `header` as its only row.

    The file is opened in 'w' mode, so any rows it already held are lost.
    """
    with open(file_path, 'w', newline='') as handle:
        csv.writer(handle).writerow(header)


> Save in CSV :

In [6]:
def save_csv(file_path, row):
    """Append `row` as one CSV record at the end of the file at `file_path`.

    The file is opened in 'a' mode, so existing content is kept.
    """
    with open(file_path, 'a', newline='') as handle:
        csv.writer(handle).writerow(row)




> Change `PATH` to match your Drive folder :



In [7]:
# Root folder for the CSV files of sections 3 and 4. File paths are built by
# plain string concatenation (e.g. PATH + 'Users/Input.csv'), so keep the
# trailing slash.
# NOTE(review): PATH is reassigned in sections 5 and 6; re-run this cell and the
# path cells before going back to sections 3 or 4.
PATH = './Example/Data/'


# 3. Extract Data for [Users]

This section is related to the download of **Users** Data :

In [8]:
def save_user(file_path, user):
    """Append one CSV row with the profile fields of `user` to `file_path`.

    The attributes are read in the order listed below, which is also the
    column order of the row handed to save_csv.
    """
    fields = (
        'login',
        'type',
        'site_admin',
        'created_at',
        'updated_at',
        'public_repos',
        'public_gists',
        'followers',
        'following',
        'email',
    )
    save_csv(file_path, [getattr(user, field) for field in fields])


In [9]:
# Users section files: the input list of logins (first column), the downloaded
# user rows, and the log of logins whose download failed.
src_path = PATH + 'Users/Input.csv'
res_path = PATH + 'Users/Output.csv'
log_path = PATH + 'Users/Log.csv'


## 3.1. First Time Execution



> Run this block ONLY the first time: it writes the headers of the CSV files. Running it again overwrites the downloaded data !



In [10]:
# Column layout of the Users output file; the order mirrors the row built in save_user.
header = [
    'Author', 'Type', 'Site_admin', 'Created_at', 'Updated_at',
    'Public_repos', 'Public_gists', 'Followers', 'Following', 'Email',
]
# The log file only records the login that could not be downloaded.
log_header = ['Author']

# save_header opens in 'w' mode: both files are truncated before the header is written.
save_header(res_path, header)
save_header(log_path, log_header)


## 3.2. Data Extraction

In [11]:
# Download every user listed in the first column of the input CSV.
# `count` starts at -1 so the first row (the header) can be recognised and
# skipped; afterwards it counts processed logins.
count = -1
with open(src_path, 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        if count == -1:
            # Skip the header
            count += 1
            continue
        if not row:
            # Blank line in the input: there is no login to look up.
            continue
        try:
            user = g.get_user(row[0])
            save_user(res_path, user)
        except Exception:
            # Best effort: record the login that failed and keep going.
            # Catching Exception (instead of a bare except) lets a kernel
            # interrupt / Ctrl-C still stop the download.
            save_csv(log_path, [row[0]])
        count += 1
        if (count % 60 == 0):
            # Due to API Rate Limit
            sleep(60)


# 4. Extract Data for [Repos], [PRs] & [Commits]

This section is related to the download of **Repositories**, **Pull Requests** and **Commits** Data :

In [12]:
def find_repo(repositories, owner, repo_name):
    """Return the first entry of `repositories` owned by `owner` and named `repo_name`.

    Each entry must expose `.owner.login` and `.name`. Returns None when no
    entry matches.
    """
    matches = (
        candidate
        for candidate in repositories
        if candidate.owner.login == owner and candidate.name == repo_name
    )
    return next(matches, None)


In [13]:
# Input list of pull requests (owner, repo, number).
# NOTE: this rebinds src_path, which the Users section pointed at Users/Input.csv.
src_path      = PATH + 'PRs/Input.csv'

# Pull request rows and the log of pull requests that were not found.
pr_res_path   = PATH + 'PRs/Output.csv'
pr_log_path   = PATH + 'PRs/Log.csv'

# Repository rows and the log of repositories that were not found.
repo_res_path = PATH + 'Repos/Output.csv'
repo_log_path = PATH + 'Repos/Log.csv'

# Commit rows; the commit log file only ever receives its header in this notebook.
comm_res_path = PATH + 'Commits/Output.csv'
comm_log_path = PATH + 'Commits/Log.csv'


## 4.1. First Time Execution



> Run these blocks ONLY the first time: they write the headers of the CSV files. Running them again overwrites the downloaded data !



In [14]:
# Column layout of the pull request output file; the order mirrors save_pull.
# 'Body' receives the length of the description and 'Assignees' /
# 'Requested_reviewers' receive counts, not the underlying values.
pr_header = [
    'Owner', 'Repo', 'Number', 'Author',
    'Created_at', 'Updated_at', 'Closed_at', 'Merged_at',
    'State', 'Title', 'Body',
    'Assignee', 'Assignees', 'Requested_reviewers', 'Labels',
    'Mergeable', 'Mergeable_state',
    'Comments', 'Review_comments', 'Commits',
    'Additions', 'Deletions', 'Changed_files',
]
# Pull requests that could not be fetched are logged with these three columns.
pr_log_header = ['Owner', 'Repo', 'Pull']

# save_header truncates the target files before writing the header row.
save_header(pr_res_path, pr_header)
save_header(pr_log_path, pr_log_header)


In [15]:
# Column layout of the repository output file; the order mirrors save_repo.
repo_header = [
    'Owner', 'Repo', 'Private', 'Description', 'Fork',
    'Created_at', 'Updated_at', 'Pushed_at',
    'Size', 'Stars', 'Watchers', 'Language',
    'Archived', 'Forks', 'Open_issues',
]
# Repositories that could not be fetched are logged by owner and name.
repo_log_header = ['Owner', 'Repo']

# save_header truncates the target files before writing the header row.
save_header(repo_res_path, repo_header)
save_header(repo_log_path, repo_log_header)


In [16]:
# Column layout of the commit output file; the order mirrors save_commit.
comm_header = [
    'Owner', 'Repo', 'Number', 'SHA',
    'Author', 'A_Date',
    'Committer', 'C_Date',
    'Message',
]
comm_log_header = ['Owner', 'Repo', 'Pull']

# save_header truncates the target files before writing the header row.
save_header(comm_res_path, comm_header)
save_header(comm_log_path, comm_log_header)


## 4.2. Data Extraction :



> Run this block to download data and continue the download where you left off :



### 4.2.1. Functions :

In [17]:
def save_repo(file_path, repo):
    """Append one CSV row describing `repo` to `file_path`.

    The description is flattened first: newline, carriage return and
    backspace characters become spaces and ';' becomes ','. A missing
    description is written as None (an empty cell).
    """
    description = repo.description
    if description is not None:
        # Single-pass equivalent of the chained one-character replacements.
        description = description.translate(
            str.maketrans({"\n": " ", "\r": " ", "\b": " ", ";": ","})
        )
    row = [
        repo.owner.login, repo.name, repo.private, description, repo.fork,
        repo.created_at, repo.updated_at, repo.pushed_at,
        repo.size, repo.stargazers_count, repo.watchers_count, repo.language,
        repo.archived, repo.forks, repo.open_issues,
    ]
    save_csv(file_path, row)


In [18]:
def save_pull(file_path, repo, pull):
    """Append one CSV row describing `pull` (a pull request of `repo`) to `file_path`.

    Column notes:
      * 'Body' holds the length of the description, not its text
        (0 when the pull request has no description).
      * 'Assignees' and 'Requested_reviewers' hold counts.
      * 'Labels' holds the list of label names.
    The title is flattened: newline, carriage return and backspace become
    spaces and ';' becomes ','.
    """
    labels = [label.name for label in pull.get_labels()]

    assignee = None
    if pull.assignee is not None:
        assignee = pull.assignee.login

    # A pull request without a description has body None; the previous
    # len(body) raised TypeError in that case and aborted the whole download.
    # The one-for-one character replacements never changed the length, so the
    # raw body can be measured directly.
    body_length = len(pull.body) if pull.body is not None else 0

    title = None
    if pull.title is not None:
        title = pull.title.replace("\n", " ").replace("\r", " ").replace("\b", " ").replace(";", ",")

    # get_review_requests() returns a (users, teams) pair of paginated lists,
    # so len() of the result is always 2. Count the pending requests instead.
    requested_users, requested_teams = pull.get_review_requests()
    requested_reviewers = sum(1 for _ in requested_users) + sum(1 for _ in requested_teams)

    data = [
            repo.owner.login,
            repo.name,
            pull.number,
            pull.user.login,
            pull.created_at,
            pull.updated_at,
            pull.closed_at,
            pull.merged_at,
            pull.state,
            title,
            body_length,
            assignee,
            len(pull.assignees),
            requested_reviewers,
            labels,
            pull.mergeable,
            pull.mergeable_state,
            pull.comments,
            pull.review_comments,
            pull.commits,
            pull.additions,
            pull.deletions,
            pull.changed_files
    ]
    save_csv(file_path, data)


In [19]:
def save_commit(file_path, repo, pull, commit):
    """Append one CSV row describing `commit` (belonging to `pull` of `repo`) to `file_path`.

    The commit message is flattened: newline, carriage return and backspace
    become spaces and ';' becomes ','. A missing message is written as None.
    """
    git_commit = commit.commit
    message = git_commit.message
    if message is not None:
        # Single-pass equivalent of the chained one-character replacements.
        message = message.translate(
            str.maketrans({"\n": " ", "\r": " ", "\b": " ", ";": ","})
        )
    row = [
        repo.owner.login, repo.name, pull.number, commit.sha,
        git_commit.author.name, git_commit.author.date,
        git_commit.committer.name, git_commit.committer.date,
        message,
    ]
    save_csv(file_path, row)


### 4.2.2. Execution:

In [20]:
# In-memory cache of repository objects already fetched in this session;
# the extraction loop looks here (via find_repo) before calling the API.
repositories = []




> This subsection makes it possible to skip the lines that were already handled :



In [21]:
# Raise the csv module's per-field size limit as high as it will accept:
# start from sys.maxsize and shrink tenfold each time the value is rejected.
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # Rejected as too large; retry with a smaller candidate.
        maxInt = int(maxInt/10)
    else:
        break


In [22]:
# Count the data rows already present in the pull request output file
# (header excluded); the extraction loop uses this number to skip input rows
# handled in a previous run.
# The context manager closes the handle, which the bare open() never did.
with open(pr_res_path) as output_file:
    output_read = csv.reader(output_file)
    output_size = len(list(output_read)) - 1
print(output_size)


0


> Start the extraction :

In [23]:
# Resume-aware download of repositories, pull requests and their commits.
# Each input row is read as (owner, repo, pull request number).
#   requested - True once an API request was attempted for the current row
#   index     - number of leading input rows skipped so far (resume support)
#   count     - -1 until the first non-skipped row is consumed, then a counter
#               bumped after each repository and pull request fetch
requested = False
index = 0
count = -1
with open(src_path, 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        requested = False
        # Resume: skip as many leading input rows as there are data rows in
        # the pull request output file (output_size, computed in the cell above).
        if (index < output_size):
            index += 1
            continue
        print([count, row[0], row[1], row[2]])
        # The first row that gets past the skip above is dropped as the header.
        # NOTE(review): on a resumed run the real header is already consumed by
        # the skip, so the row dropped here is a data row; this appears to
        # compensate for the skip counting the header - confirm.
        if count == -1:
            count += 1
            continue
        # Reuse the repository object when it was already fetched this session.
        repo = find_repo(repositories, row[0], row[1])
        try:
          if (repo is None):
              requested = True
              repo = g.get_repo(row[0] + '/' + row[1])
              save_repo(repo_res_path, repo)
              repositories.append(repo)
              count += 1
        except UnknownObjectException:
          # Unknown repository: log it and move on to the next input row.
          print("Repository not found !")
          save_csv(repo_log_path, [row[0], row[1]])
          count += 1
          continue
        try:
          requested = True
          pull = repo.get_pull(int(row[2]))
          save_pull(pr_res_path, repo, pull)
          count += 1
        except UnknownObjectException:
          # Unknown pull request: log it and move on to the next input row.
          print("Pull Request not found !")
          count += 1
          save_csv(pr_log_path, [row[0], row[1], row[2]])
          continue
        # Only UnknownObjectException is handled above; any other error stops
        # the loop, and the run can be resumed from the output file.
        commits = pull.get_commits()
        for commit in commits:
            save_commit(comm_res_path, repo, pull, commit)
        # NOTE(review): count can advance by 2 for one row (new repository +
        # pull request), so it may step over a multiple of 60, and the
        # `continue` branches above never reach this check - the pause can be
        # skipped. Confirm whether that is acceptable.
        if (count % 60 == 0 and requested):
            print("Waiting ...")
            # Due to API Rate Limit
            sleep(60)


[-1, 'owner', 'repo', 'number']
[0, 'rajeshdh', 'twittersentiments', '79']
[2, 'shobhitsingh29', 'saga_multiple_page', '33']
[4, 'danish45007', 'maddit', '14']
[6, 'developer-student-club-thapar', 'officialWebsite', '438']
[8, 'STAC-IITMandi', 'STAC-IITMandi.github.io', '210']
[10, 'k2glyph', 'react-material-admin-template', '40']
[12, 'sambhav2612', 'keystone', '74']
[14, 'khrj', 'teledrive', '70']
[16, 'rajeshdh', 'jsStarterKit', '60']
[18, 'Jorge-Bill', 'blog', '52']
[20, 'TrigenSoftware', 'i18n-for-browser', '370']
[22, 'keindev', 'string-lookup-manager', '493']
[24, 'TrigenSoftware', 'i18n-for-react', '237']
[26, '2001-800080', 'graceshopperrepo', '126']
[28, 'freedomlang', 'VueWordCloud', '152']
[30, 'react-component', 'gulp-jsx2example', '97']
[32, 'geeeger', 'ws-uploader', '160']
[34, 'ngoctrantl', 'erc20-explorer', '29']
[36, 'DIYgod', 'diygod.me', '365']
[38, 'jeremyrajan', 'json-markdown', '134']
[40, 'Badi-Cal', 'badi-events-app', '41']
[42, 'SweakNetwork', 'chikuwagoddon',

# 5. Extract Data for PR [Merged by] Who ?

This section downloads data about the user who **merged** each PR :

## 5.1. Environment Preparation :

In [25]:
%%capture
!pip install octokitpy
from octokit import Octokit
import requests




> Use the token from your GitHub account :



In [26]:
# Octokit client used by the "merged by" section (octo.pulls.get).
# NOTE(review): avoid saving/committing the notebook with a real token in it.
octo = Octokit(auth='token', token='') # Put your GitHub API Token here


In [27]:
# Files of the "merged by" section.
# NOTE: this rebinds PATH, src_path and res_path, which earlier sections also
# use; re-run their path cells before going back to them.
PATH = './Example/Data/By/Merged/'
src_path = PATH + 'Input.csv'
res_path = PATH + 'Output.csv'


## 5.2. First Time Execution

In [28]:
# Column layout of the "merged by" output file. The 'comment_url' column
# receives the pull request's 'comments_url' value (see handle_request).
# save_header opens the file in 'w' mode, so existing rows are discarded.
header = ['owner', 'repo', 'number', 'merged_by', 'comment_url', 'html_url']
save_header(res_path, header)


## 5.3. Data Extraction :

### 5.3.1. Functions :

In [29]:
def save_add_info(owner, repo, number, merged_by, comment_url, html_url):
    """Append one "merged by" row to the CSV at the module-level `res_path`.

    The values are written in parameter order.
    """
    save_csv(res_path, [owner, repo, number, merged_by, comment_url, html_url])


In [30]:
def handle_request(owner, repo, num):
    """Fetch one pull request through the Octokit client and record who merged it.

    Writes owner, repo, number, the login found under 'merged_by' (None when
    that key is missing or null), 'comments_url' and 'html_url' via
    save_add_info.
    """
    payload = octo.pulls.get(owner=owner, repo=repo, pull_number=num).json
    merger = payload.get('merged_by')
    merged_by = None if merger is None else merger.get('login')
    save_add_info(
        owner,
        repo,
        num,
        merged_by,
        payload.get('comments_url'),
        payload.get('html_url'),
    )




### 5.3.2. Execution :



In [31]:
# Raise the csv module's per-field size limit as high as it will accept:
# start from sys.maxsize and shrink tenfold each time the value is rejected.
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # Rejected as too large; retry with a smaller candidate.
        maxInt = int(maxInt/10)
    else:
        break


In [32]:
# Count the data rows already present in the "merged by" output file
# (header excluded); the extraction loop uses this number to skip input rows
# handled in a previous run.
# The context manager closes the handle, which the bare open() never did.
with open(res_path) as output_file:
    output_read = csv.reader(output_file)
    output_size = len(list(output_read)) - 1
print(output_size)


0


In [34]:
# Resume-aware download of the "merged by" information.
# Each input row is read as (owner, repo, pull request number).
requested = False
index = 0
count = -1
with open(src_path, 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        requested = False
        # Resume: skip as many leading input rows as there are data rows in
        # the output file (output_size, computed in the cell above).
        if (index < output_size):
            index += 1
            continue
        print([(count + 1), row[0], row[1], row[2]])
        # The first row that gets past the skip above is dropped as the header.
        if count == -1:
            count += 1
            continue
        try:
          handle_request(row[0], row[1], row[2])
          requested = True
          count += 1
        # NOTE(review): UnknownObjectException comes from PyGithub, while
        # handle_request here goes through the Octokit client - confirm this
        # branch can actually be reached.
        except UnknownObjectException:
          print("Pull Request not found !")
          requested = True
          count += 1
        # Pause for a minute every 60 processed rows.
        if (count % 60 == 0 and requested):
            print("Waiting ...")
            sleep(60)


[0, '\ufeffOwner', 'Repo', 'Number']
[1, 'DIYgod', 'diygod.me', '365']
[2, 'ruevko', 'slides', '1']
[3, 'Vyvy-vi-archives', 'lets-troll-ryan', '6']
[4, 'Vyvy-vi-archives', 'lets-troll-ryan', '3']
[5, 'mondeja', 'remove-labels-gh-action', '3']
[6, 'aemc', 'goal-coach', '22']
[7, 'uwpokerclub', 'app', '32']
[8, 'aemc', 'goal-coach', '21']
[9, 'piemasters', 'MiniToTheMax', '1539']
[10, 'decentraland', 'builder', '1288']
[11, 'remotehour', 'rrule-duration', '10']
[12, 'ember-learn', 'deprecation-app', '831']


# 6. Extract Data for PR [Closed by] Who ?

This section downloads data used to determine the user who **closed** each PR :

## 6.1. Environment Preparation :

In [35]:
# Files of the "closed by" section.
# NOTE: this rebinds PATH, src_path and res_path, which earlier sections also
# use; re-run their path cells before going back to them.
PATH = './Example/Data/By/Closed/'
src_path = PATH + 'Input.csv'
res_path = PATH + 'Output.csv'


## 6.2. First Time Execution

In [36]:
# Column layout of the "closed by" output file; the order mirrors save_add_info.
# save_header opens the file in 'w' mode, so existing rows are discarded.
header = ['owner', 'repo', 'number', 'comment_by', 'comment_body', 'html_url']
save_header(res_path, header)


## 6.3. Data Extraction :

### 6.3.1. Functions :

In [37]:
def save_add_info(owner, repo, number, comment_by, comment_body, html_url):
    """Append one "closed by" row to the CSV at the module-level `res_path`.

    The values are written in parameter order. This definition replaces the
    save_add_info defined for the "merged by" section.
    """
    save_csv(res_path, [owner, repo, number, comment_by, comment_body, html_url])


In [38]:
def handle_request(owner, repo, num):
    """Record the author and text of the last comment on issue/PR `num` of owner/repo.

    `num` is expected as a string: it is converted with int() for the API
    call and concatenated into the pull request URL. When the thread has no
    comments, both comment fields are written as None. The comment text is
    flattened (newline, carriage return and backspace become spaces, ';'
    becomes ',') and stored as a one-element list.
    """
    thread = g.get_repo(owner + '/' + repo).get_issue(int(num))
    comments = thread.get_comments()
    comment_by = None
    comment_body = None
    if (comments.totalCount > 0):
        last_comment = comments[(comments.totalCount - 1)]
        comment_by = last_comment.user.login
        flattened = last_comment.body.replace("\n", " ").replace("\r", " ").replace("\b", " ").replace(";", ",")
        comment_body = [flattened]
    html_url = 'https://github.com/' + owner + '/' + repo + '/pull/' + num
    save_add_info(owner, repo, num, comment_by, comment_body, html_url)


### 6.3.2. Execution :

In [39]:
# Raise the csv module's per-field size limit as high as it will accept:
# start from sys.maxsize and shrink tenfold each time the value is rejected.
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # Rejected as too large; retry with a smaller candidate.
        maxInt = int(maxInt/10)
    else:
        break


In [40]:
# Count the data rows already present in the "closed by" output file
# (header excluded); the extraction loop uses this number to skip input rows
# handled in a previous run.
# The context manager closes the handle, which the bare open() never did.
with open(res_path) as output_file:
    output_read = csv.reader(output_file)
    output_size = len(list(output_read)) - 1
print(output_size)


0


In [41]:
# Resume-aware download of the "closed by" information.
# Each input row is read as (owner, repo, pull request number).
requested = False
index = 0
count = -1
with open(src_path, 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        requested = False
        # Resume: skip as many leading input rows as there are data rows in
        # the output file (output_size, computed in the cell above).
        if (index < output_size):
            index += 1
            continue
        print([(count + 1), row[0], row[1], row[2]])
        # The first row that gets past the skip above is dropped as the header.
        if count == -1:
            count += 1
            continue
        try:
          handle_request(row[0], row[1], row[2])
          requested = True
          # count advances by 3 per successful row - presumably one per API
          # call made in handle_request (repo, issue, comments); confirm.
          count += 3
        except Exception as e:
          # Failures are only printed: no output row and no log entry is
          # written, and count is left unchanged.
          # NOTE(review): since resuming relies on the number of output rows,
          # failed rows may shift the resume position - confirm.
          print(e)
          requested = True
        # Pause for a minute whenever count lands on a multiple of 60.
        if (count % 60 == 0 and requested):
            print("Waiting ...")
            sleep(60)


[0, '\ufeffOwner', 'Repo', 'Number']
[1, 'shobhitsingh29', 'saga_multiple_page', '33']
[4, 'danish45007', 'maddit', '14']
[7, 'developer-student-club-thapar', 'officialWebsite', '438']
[10, 'STAC-IITMandi', 'STAC-IITMandi.github.io', '210']
[13, 'k2glyph', 'react-material-admin-template', '40']
[16, 'khrj', 'teledrive', '70']
[19, 'Jorge-Bill', 'blog', '52']
[22, 'TrigenSoftware', 'i18n-for-browser', '370']
[25, 'keindev', 'string-lookup-manager', '493']
[28, 'TrigenSoftware', 'i18n-for-react', '237']
[31, 'react-component', 'gulp-jsx2example', '97']
[34, 'geeeger', 'ws-uploader', '160']
[37, 'ngoctrantl', 'erc20-explorer', '29']
[40, 'jeremyrajan', 'json-markdown', '134']
[43, 'Badi-Cal', 'badi-events-app', '41']
[46, 'seijikohara', 'json-tree-view-vue3', '110']
[49, 'takumi091111', 'website-nuxtjs', '160']
[52, 'react-component', 'rc-server', '43']
[55, 'webxmsj', 'blog-admin', '130']
[58, 'TrigenSoftware', 'flexis-favicons', '364']
Waiting ...
[61, 'aaronhayes', 'hasura-sdk', '69']
