In [1]:
import pandas
import matplotlib
import requests
import warnings

import sys
sys.path.append('..')

from helper import load_data , load_repo , GIT_API
warnings.filterwarnings('ignore')

%matplotlib inline

import os
import json
import time

In [2]:
# Root folder of the on-disk JSON cache used by the comment fetchers below.
json_store_dir = os.path.join('..', 'data', 'json')

In [3]:
# Repository scraped by the driver cells of this notebook.
testrepo = 'https://github.com/rust-lang/cargo'

# Kind selectors understood by get_comments (they pick the cache sub-folder).
ISSUE, PULL = 'issue', 'pull'

# State selectors appended to the `is%3A` part of the listing query string.
OPEN, CLOSED = 'open', 'closed'

# Seconds to sleep before every HTTP request.
time_sleep = 0.1

In [4]:
def get_open_closed_issues(repo):
    """Scrape the open and closed issue counts from a repository's issues page.

    Parameters
    ----------
    repo : str
        Repository URL; ``'/issues'`` is appended to it verbatim.

    Returns
    -------
    tuple of (int, int)
        ``(open_issues, closed_issues)``.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    IndexError, ValueError
        If the expected markers are not present in the returned HTML.
    """
    time.sleep(time_sleep)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(repo + '/issues', timeout=30)
    # Fail here with the real HTTP error instead of a cryptic IndexError
    # while parsing an error page below.
    response.raise_for_status()
    html = response.text

    def count_before_label(icon_marker, occurrence, label):
        # Take the text between the `occurrence`-th icon marker and the label,
        # drop everything up to the closing </svg>, and parse what is left
        # (thousands separators removed) as an integer.
        after_icon = html.split(icon_marker)[occurrence]
        before_label = after_icon.split(label)[0]
        return int(before_label.split('</svg>')[1].strip().replace(",", ""))

    open_issues = count_before_label('<svg class="octicon octicon-issue-opened"', 2, 'Open')
    closed_issues = count_before_label('<svg class="octicon octicon-check"', 1, 'Closed')

    return open_issues, closed_issues

In [5]:
def get_comments(repo,issue_id,is_pull_issue):
    """Fetch the comments of one issue or pull request as a DataFrame.

    The raw API response is cached as JSON under ``json_store_dir``; when the
    cache file already exists it is read instead of calling the API.

    Parameters
    ----------
    repo : str
        Repository URL of the form ``https://github.com/<owner>/<name>``.
    issue_id : int or str
        Issue / pull request number.
    is_pull_issue : str
        ``ISSUE`` or ``PULL``; selects the cache sub-folder
        (``issue_comments`` vs ``pull_comments``).

    Returns
    -------
    pandas.DataFrame
        One row per comment with columns project_name, issue_number,
        comment_id, user_login, created_at, updated_at, author_assoc.

    Raises
    ------
    ValueError
        If ``is_pull_issue`` is neither ``ISSUE`` nor ``PULL``.
    requests.HTTPError
        If the API answers with an error status (nothing is cached then).
    """
    columns = ["project_name","issue_number","comment_id","user_login","created_at","updated_at","author_assoc"]

    repo_parts = repo.split('/')
    repo_full = repo_parts[-2]+'/'+repo_parts[-1]
    print(repo_full)

    if is_pull_issue==ISSUE:
        cache_kind = 'issue_comments'
    elif is_pull_issue==PULL:
        cache_kind = 'pull_comments'
    else:
        # Previously this fell through and crashed later with an unbound
        # local variable; report the real problem instead.
        raise ValueError('is_pull_issue must be {!r} or {!r}, got {!r}'.format(ISSUE, PULL, is_pull_issue))

    par_folder = json_store_dir+os.sep+cache_kind+os.sep+repo_full.replace('/',os.sep)
    json_store_location = par_folder+os.sep+str(issue_id)+'.json'

    print(json_store_location)
    if os.path.exists(json_store_location):
        with open(json_store_location) as f:
            comments = json.load(f)
    else:
        issue_comment = 'https://api.github.com/repos/{}/{}/issues/{}/comments'.format(
                repo_parts[3] ,
                repo_parts[4] ,
                issue_id)
        # presumably GIT_API[11] is a 'client_id:client_secret' string -- verify in helper
        credentials = GIT_API[11].split(':')
        # The credentials are sent via HTTP basic auth rather than in the query
        # string, so the printed URL no longer leaks the client secret.
        print(issue_comment)
        time.sleep(time_sleep)
        req = requests.get(issue_comment, auth=(credentials[0], credentials[1]), timeout=30)
        # Do not cache (or try to iterate over) an API error payload.
        req.raise_for_status()
        comments = req.json()
        # NOTE(review): only the first page of the API response is fetched;
        # long threads may be truncated -- confirm whether pagination is needed.
        os.makedirs(par_folder, exist_ok=True)
        with open(json_store_location, 'w') as outfile:
            json.dump(comments, outfile)

    # Build all rows first and create the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [{
        "project_name": repo,
        "issue_number": issue_id,
        "comment_id": comment['id'],
        "user_login": comment['user']['login'],
        "created_at": comment['created_at'],
        "updated_at": comment['updated_at'],
        "author_assoc": comment['author_association'] } for comment in comments]
    return pandas.DataFrame(records, columns=columns)

In [6]:
def get_open_closed_pulls(repo):
    """Scrape the open and closed pull request counts from a repository's pulls page.

    Parameters
    ----------
    repo : str
        Repository URL; ``'/pulls'`` is appended to it verbatim.

    Returns
    -------
    tuple of (int, int)
        ``(open_pulls, closed_pulls)``.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    IndexError, ValueError
        If the expected markers are not present in the returned HTML.
    """
    time.sleep(time_sleep)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(repo + '/pulls', timeout=30)
    # Fail here with the real HTTP error instead of a cryptic IndexError
    # while parsing an error page below.
    response.raise_for_status()
    html = response.text

    def count_before_label(icon_marker, occurrence, label):
        # Take the text between the `occurrence`-th icon marker and the label,
        # drop everything up to the closing </svg>, and parse what is left
        # (thousands separators removed) as an integer.
        after_icon = html.split(icon_marker)[occurrence]
        before_label = after_icon.split(label)[0]
        return int(before_label.split('</svg>')[1].strip().replace(",", ""))

    open_pulls = count_before_label('<svg class="octicon octicon-git-pull-request"', 2, 'Open')
    closed_pulls = count_before_label('<svg class="octicon octicon-check"', 1, 'Closed')

    return open_pulls, closed_pulls

In [7]:
def get_pull_comments(repo,pr_id):
    """Fetch the review comments of one pull request as a DataFrame.

    The raw API response is cached as JSON under
    ``json_store_dir/code_review_comments``; when the cache file already
    exists it is read instead of calling the API.

    Parameters
    ----------
    repo : str
        Repository URL of the form ``https://github.com/<owner>/<name>``.
    pr_id : int or str
        Pull request number.

    Returns
    -------
    pandas.DataFrame
        One row per review comment with columns project_name, pr_id,
        pr_review_id, comment_id, commit_id, original_commit_id, user_login,
        created_at, updated_at, author_assoc.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (nothing is cached then).
    """
    columns = ["project_name",
               "pr_id",
               "pr_review_id",
               "comment_id",
               "commit_id",
               "original_commit_id",
               "user_login",
               "created_at",
               "updated_at",
               "author_assoc"]

    repo_parts = repo.split('/')
    repo_full = repo_parts[-2]+'/'+repo_parts[-1]
    print(repo_full)

    par_folder = json_store_dir+os.sep+'code_review_comments'+os.sep+repo_full.replace('/',os.sep)
    json_store_location = par_folder+os.sep+str(pr_id)+'.json'

    if os.path.exists(json_store_location):
        with open(json_store_location) as f:
            comments = json.load(f)
    else:
        pr_comment = 'https://api.github.com/repos/{}/{}/pulls/{}/comments'.format(
                repo_parts[3] ,
                repo_parts[4] ,
                pr_id)
        # presumably GIT_API[11] is a 'client_id:client_secret' string -- verify in helper
        credentials = GIT_API[11].split(':')
        time.sleep(time_sleep)
        # Credentials go through HTTP basic auth instead of the query string,
        # so the secret never ends up inside a URL.
        req = requests.get(pr_comment, auth=(credentials[0], credentials[1]), timeout=30)
        # Do not cache (or try to iterate over) an API error payload.
        req.raise_for_status()
        comments = req.json()
        # NOTE(review): only the first page of the API response is fetched;
        # long reviews may be truncated -- confirm whether pagination is needed.

        os.makedirs(par_folder, exist_ok=True)
        with open(json_store_location, 'w') as outfile:
            json.dump(comments, outfile)

    # Build all rows first and create the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [{
        "project_name": repo,
        "pr_id": pr_id,
        "pr_review_id" : comment['pull_request_review_id'],
        "comment_id": comment['id'],
        "commit_id" : comment['commit_id'],
        "original_commit_id" : comment['original_commit_id'],
        "user_login": comment['user']['login'],
        "created_at": comment['created_at'],
        "updated_at": comment['updated_at'],
        "author_assoc": comment['author_association'] } for comment in comments]
    return pandas.DataFrame(records, columns=columns)

# Dataframes

In [23]:
# Empty frames with the column layouts produced by the comment fetchers.
# Issue comments and pull request comments share one layout; each name gets
# its own DataFrame object.
issue_comments, pulls_comments = (
    pandas.DataFrame(columns=[
        "project_name",
        "issue_number",
        "comment_id",
        "user_login",
        "created_at",
        "updated_at",
        "author_assoc",
    ])
    for _ in range(2)
)
pulls_review_comments = pandas.DataFrame(columns=[
    "project_name",
    "pr_id",
    "pr_review_id",
    "comment_id",
    "commit_id",
    "original_commit_id",
    "user_login",
    "created_at",
    "updated_at",
    "author_assoc",
])


In [24]:
# Scrape the open/closed totals of issues and pull requests for the test repository.
(open_issue, closed_issue), (open_pr, closed_pr) = (
    get_open_closed_issues(testrepo),
    get_open_closed_pulls(testrepo),
)

# Open/Closed Issues Comments

In [36]:
def get_issues_comments(repo,is_open_closed,issues_number):
    """Collect the comments of every listed issue in the given state.

    Walks the repository's HTML issue listing page by page, extracts the
    issue numbers from the ``id="issue_<n>"`` attributes and fetches the
    comments of each issue whose listing entry contains the
    ``octicon octicon-comment`` marker (presumably issues that have at
    least one comment -- confirm against the page markup).

    Parameters
    ----------
    repo : str
        Repository URL, e.g. ``https://github.com/<owner>/<name>``.
    is_open_closed : str
        State appended to the ``is%3A`` search qualifier (``OPEN`` / ``CLOSED``).
    issues_number : int
        Total number of issues in that state; determines how many listing
        pages are requested (assumes 25 entries per page).

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-issue frames returned by ``get_comments``
        with a fresh 0..n-1 index; a column-less empty frame when no issue
        qualified.
    """
    frames = []
    # The "+2" requests one page past the end when issues_number is an exact
    # multiple of 25; a page without issue entries simply contributes nothing.
    for i in range(1,(issues_number//25)+2):

        issues_page_url = repo + ('/issues?page={}&q=is%3Aissue+is%3A'+is_open_closed).format(i)
        print(issues_page_url)
        time.sleep(time_sleep)
        req = requests.get(issues_page_url, timeout=30)
        if not req.ok:
            # An error page holds no issue entries; make the gap visible
            # instead of skipping the page silently.
            print('warning: HTTP {} for {}'.format(req.status_code, issues_page_url))
            continue
        for issue in req.text.split('id="issue_'):
            # The chunk before the first issue entry is the page header.
            if 'DOCTYPE' in issue:
                continue

            if 'octicon octicon-comment' in issue:
                issue_id = issue.split('"')[0]
                frames.append(get_comments(repo,issue_id,ISSUE))

    # DataFrame.append was removed in pandas 2.0 and re-copied the growing
    # frame on every call; collect the pieces and concatenate once.
    if not frames:
        return pandas.DataFrame()
    return pandas.concat(frames, ignore_index=True)
# Fetch the comments of open and closed issues and tag each frame with its state.
open_issue_comments = get_issues_comments(testrepo,OPEN,open_issue)
open_issue_comments['is_open']=1
closed_issue_comments = get_issues_comments(testrepo,CLOSED,closed_issue)
closed_issue_comments['is_open']=0

# DataFrame.append was removed in pandas 2.0; pandas.concat stacks the two
# frames the same way (each keeps its own index labels).
issues = pandas.concat([open_issue_comments, closed_issue_comments])

https://github.com/rust-lang/cargo/issues?page=1&q=is%3Aissue+is%3Aopen
rust-lang/cargo
../data/json/issue_comments/rust-lang/cargo/6214.json
rust-lang/cargo
../data/json/issue_comments/rust-lang/cargo/6211.json
rust-lang/cargo
../data/json/issue_comments/rust-lang/cargo/6208.json
rust-lang/cargo
../data/json/issue_comments/rust-lang/cargo/6207.json
rust-lang/cargo
../data/json/issue_comments/rust-lang/cargo/6205.json
rust-lang/cargo
../data/json/issue_comments/rust-lang/cargo/6204.json
rust-lang/cargo
../data/json/issue_comments/rust-lang/cargo/6199.json
rust-lang/cargo
../data/json/issue_comments/rust-lang/cargo/6198.json
rust-lang/cargo
../data/json/issue_comments/rust-lang/cargo/6197.json
rust-lang/cargo
../data/json/issue_comments/rust-lang/cargo/6195.json
rust-lang/cargo
../data/json/issue_comments/rust-lang/cargo/6189.json
rust-lang/cargo
../data/json/issue_comments/rust-lang/cargo/6184.json
rust-lang/cargo
../data/json/issue_comments/rust-lang/cargo/6180.json
rust-lang/cargo
..

# Open/Closed Pull Request Comments

In [48]:
def get_pulls_comments(repo,is_open_closed,pulls_number,max_pages=None):
    """Collect conversation and review comments of every listed pull request.

    Walks the repository's HTML pull request listing page by page, extracts
    the numbers from the ``id="issue_<n>"`` attributes and, for each entry
    containing the ``octicon octicon-comment`` marker, fetches both its
    conversation comments (``get_comments`` with ``PULL``) and its review
    comments (``get_pull_comments``).

    Parameters
    ----------
    repo : str
        Repository URL, e.g. ``https://github.com/<owner>/<name>``.
    is_open_closed : str
        State appended to the ``is%3A`` search qualifier (``OPEN`` / ``CLOSED``).
    pulls_number : int
        Total number of pull requests in that state; determines how many
        listing pages are requested (assumes 25 entries per page).
    max_pages : int or None, optional
        Upper bound on the number of listing pages to scrape. ``None``
        (default) scrapes all pages. Earlier versions stopped after the first
        page because of a stray ``break``; pass ``max_pages=1`` to get that
        behaviour back.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(pulls_comments, pulls_review_comments)``, each with a fresh
        0..n-1 index and the column layout of the respective fetcher.
    """
    # Seed each list with an empty frame so the result always carries the
    # expected columns, even when no pull request qualifies.
    comment_frames = [pandas.DataFrame(columns=["project_name","issue_number","comment_id","user_login","created_at","updated_at","author_assoc"])]
    review_frames = [pandas.DataFrame(columns=["project_name","pr_id","pr_review_id","comment_id","commit_id","original_commit_id","user_login","created_at","updated_at","author_assoc"])]
    for i in range(1,(pulls_number//25)+2):
        # Explicit, opt-in page limit replaces the unconditional `break` that
        # silently truncated every run to page 1.
        if max_pages is not None and i > max_pages:
            break
        pull_page_url = repo + ('/pulls?page={}&q=is%3Apr+is%3A'+is_open_closed).format(i)
        time.sleep(time_sleep)
        req = requests.get(pull_page_url, timeout=30)
        if not req.ok:
            # An error page holds no entries; make the gap visible instead of
            # skipping the page silently.
            print('warning: HTTP {} for {}'.format(req.status_code, pull_page_url))
            continue
        for issue in req.text.split('id="issue_'):
            # The chunk before the first entry is the page header.
            if 'DOCTYPE' in issue:
                continue
            if 'octicon octicon-comment' in issue:
                pr_id = issue.split('"')[0]
                comment_frames.append(get_comments(repo,pr_id,PULL))
                review_frames.append(get_pull_comments(repo,pr_id))

    # DataFrame.append was removed in pandas 2.0 and re-copied the growing
    # frame on every call; concatenate the collected pieces once.
    pulls_comments = pandas.concat(comment_frames, ignore_index=True)
    pulls_review_comments = pandas.concat(review_frames, ignore_index=True)
    return pulls_comments, pulls_review_comments
                
# Fetch conversation and review comments of open and closed pull requests
# and tag every frame with its state.
open_pull_comemnts, open_pr_revs = get_pulls_comments(testrepo,OPEN,open_pr)
open_pr_revs['is_open']=1
open_pull_comemnts['is_open']=1
closed_pull_comemnts, closed_pr_revs = get_pulls_comments(testrepo,CLOSED,closed_pr)
closed_pr_revs['is_open']=0
closed_pull_comemnts['is_open']=0

# DataFrame.append was removed in pandas 2.0; pandas.concat stacks the frames
# the same way (each keeps its own index labels).
pr_revs = pandas.concat([open_pr_revs, closed_pr_revs])
pull_comments = pandas.concat([open_pull_comemnts, closed_pull_comemnts])

rust-lang/cargo
../data/json/pull_comments/rust-lang/cargo/6213.json
rust-lang/cargo
rust-lang/cargo
../data/json/pull_comments/rust-lang/cargo/6212.json
rust-lang/cargo
rust-lang/cargo
../data/json/pull_comments/rust-lang/cargo/6194.json
rust-lang/cargo
rust-lang/cargo
../data/json/pull_comments/rust-lang/cargo/6150.json
rust-lang/cargo
rust-lang/cargo
../data/json/pull_comments/rust-lang/cargo/6133.json
rust-lang/cargo
rust-lang/cargo
../data/json/pull_comments/rust-lang/cargo/6130.json
rust-lang/cargo
rust-lang/cargo
../data/json/pull_comments/rust-lang/cargo/6070.json
rust-lang/cargo
rust-lang/cargo
../data/json/pull_comments/rust-lang/cargo/6019.json
rust-lang/cargo
rust-lang/cargo
../data/json/pull_comments/rust-lang/cargo/5915.json
rust-lang/cargo
rust-lang/cargo
../data/json/pull_comments/rust-lang/cargo/5728.json
rust-lang/cargo
rust-lang/cargo
../data/json/pull_comments/rust-lang/cargo/5725.json
rust-lang/cargo
rust-lang/cargo
../data/json/pull_comments/rust-lang/cargo/5611.j

In [52]:
# Preview the first five pull request conversation comments.
pull_comments.head(5)

Unnamed: 0,project_name,issue_number,comment_id,user_login,created_at,updated_at,author_assoc,is_open
0,https://github.com/rust-lang/cargo,6213,432533987,Xanewok,2018-10-24T06:49:45Z,2018-10-24T06:49:45Z,MEMBER,1
1,https://github.com/rust-lang/cargo,6212,432356610,rust-highfive,2018-10-23T18:05:19Z,2018-10-23T18:05:19Z,NONE,1
2,https://github.com/rust-lang/cargo,6194,431582586,rust-highfive,2018-10-20T13:34:46Z,2018-10-20T13:34:46Z,NONE,1
3,https://github.com/rust-lang/cargo,6194,431613647,alexcrichton,2018-10-20T20:01:25Z,2018-10-20T20:01:25Z,MEMBER,1
4,https://github.com/rust-lang/cargo,6194,431616673,bors,2018-10-20T20:44:44Z,2018-10-20T20:44:44Z,CONTRIBUTOR,1


In [53]:
# Preview the first five pull request review comments.
pr_revs.head(5)

Unnamed: 0,project_name,pr_id,pr_review_id,comment_id,commit_id,original_commit_id,user_login,created_at,updated_at,author_assoc,is_open
0,https://github.com/rust-lang/cargo,6070,157767771,219556287,2b7a7d3161c0ef506caab0362ff2da9aeb2c0322,c1288f344d18d032c1221e2a5acdfada95180179,matthiaskrgr,2018-09-21T16:32:08Z,2018-09-26T22:40:09Z,CONTRIBUTOR,1
1,https://github.com/rust-lang/cargo,6070,157767771,219556616,2b7a7d3161c0ef506caab0362ff2da9aeb2c0322,c1288f344d18d032c1221e2a5acdfada95180179,matthiaskrgr,2018-09-21T16:33:29Z,2018-09-26T22:40:09Z,CONTRIBUTOR,1
2,https://github.com/rust-lang/cargo,5915,149837279,213077061,8dfdc197f8f92a3c97537beae42c58caaa778123,8dfdc197f8f92a3c97537beae42c58caaa778123,sfackler,2018-08-27T18:52:39Z,2018-08-27T18:52:39Z,MEMBER,1
3,https://github.com/rust-lang/cargo,5728,144188270,208388055,e6a64ea1e4f80fd3995fb052b87285d030047b91,4652dcaaf90d4169f0c853e380d57d3f3ca08eb9,dwijnand,2018-08-07T21:15:56Z,2018-09-21T08:31:28Z,MEMBER,1
4,https://github.com/rust-lang/cargo,5728,144191057,208390306,e6a64ea1e4f80fd3995fb052b87285d030047b91,4652dcaaf90d4169f0c853e380d57d3f3ca08eb9,detrumi,2018-08-07T21:24:31Z,2018-09-21T08:31:28Z,NONE,1
