In [1]:
import pandas
import matplotlib
import requests
import warnings

import sys
sys.path.append('..')

from helper import load_data , load_repo , GIT_API
warnings.filterwarnings('ignore')

%matplotlib inline

import os
import json
import time

In [2]:
# Root folder of the on-disk cache of GitHub API responses (one JSON file per thread).
json_store_dir = os.path.join('..', 'data', 'json')

In [3]:
# Sample repository used by the smoke-test cell further down.
testrepo = 'https://github.com/rust-lang/cargo'

# Thread kinds accepted by get_comments (they select the cache sub-folder).
ISSUE, PULL = 'issue', 'pull'

# State filters interpolated into the GitHub search query (is:open / is:closed).
OPEN, CLOSED = 'open', 'closed'

# Pause, in seconds, taken before every HTTP request.
time_sleep = 0.1

In [4]:
def get_open_closed_issues(repo):
    """Scrape the open and closed issue counters from a repository's issues page.

    Parameters
    ----------
    repo : str
        Repository URL; ``'/issues'`` is appended to it to build the page URL.

    Returns
    -------
    tuple of (int, int)
        ``(open_issues, closed_issues)``. Both are 0 when the expected markup
        cannot be located or the counter text is not a number.
    """
    add = repo + '/issues'
    time.sleep(time_sleep)
    req = requests.get(add)
    data = req.text

    try:
        # The counter text sits between the icon's closing </svg> tag and the
        # "Open" / "Closed" label; thousands separators are stripped.
        open_chunk = data.split('<svg class="octicon octicon-issue-opened"')[2]
        open_issues = int(open_chunk.split('Open')[0].split('</svg>')[1].strip().replace(",", ""))
        closed_chunk = data.split('<svg class="octicon octicon-check"')[1]
        closed_issues = int(closed_chunk.split('Closed')[0].split('</svg>')[1].strip().replace(",", ""))
    except (IndexError, ValueError):
        # Markup missing (IndexError) or counter not numeric (ValueError).
        # A bare ``except`` would also swallow KeyboardInterrupt, making a
        # long crawl impossible to stop cleanly.
        open_issues = 0
        closed_issues = 0

    return open_issues, closed_issues

In [5]:
def get_comments(repo,issue_id,is_pull_issue,is_open):
    """Return the conversation comments of one issue or pull request as a DataFrame.

    The raw API response is cached as JSON under ``json_store_dir``; when a
    cached list exists it is used instead of calling the GitHub API.

    Parameters
    ----------
    repo : str
        Repository URL; parts 3 and 4 of the '/'-split URL are used as owner
        and name in the API request.
    issue_id : int or str
        Issue / pull request number.
    is_pull_issue : str
        ``ISSUE`` or ``PULL``; selects the cache sub-folder.
    is_open : bool or str
        Truthy for an open thread. The state string ``CLOSED`` is treated as
        "not open" (callers pass the ``OPEN`` / ``CLOSED`` strings, and a
        plain truthiness test would flag every thread as open).

    Returns
    -------
    pandas.DataFrame
        One row per comment, with the columns listed in ``columns`` below.

    Raises
    ------
    ValueError
        If ``is_pull_issue`` is neither ``ISSUE`` nor ``PULL``.
    RuntimeError
        If the API answers with something other than a list of comments
        (an error payload); nothing is cached in that case.

    Notes
    -----
    A single request is made per thread, so only the first page of the API
    response is stored. NOTE(review): long threads are presumably truncated
    by GitHub's default page size -- confirm against the API documentation.
    """
    columns = ["project_name","issue_number","comment_id","user_login",
               "created_at","updated_at","author_assoc","is_open"]

    repo_parts = repo.split('/')
    repo_full = repo_parts[-2]+'/'+repo_parts[-1]

    if is_pull_issue==ISSUE:
        cache_kind = 'issue_comments'
    elif is_pull_issue==PULL:
        cache_kind = 'pull_comments'
    else:
        raise ValueError("is_pull_issue must be {!r} or {!r}, got {!r}".format(ISSUE, PULL, is_pull_issue))
    par_folder = json_store_dir+os.sep+cache_kind+os.sep+repo_full.replace('/',os.sep)
    json_store_location = par_folder+os.sep+str(issue_id)+'.json'

    comments = None
    if os.path.exists(json_store_location):
        with open(json_store_location) as f:
            comments = json.load(f)

    # A cached non-list is an API error payload stored by an earlier run;
    # ignore it and fetch again.
    if not isinstance(comments, list):
        api_url = 'https://api.github.com/repos/{}/{}/issues/{}/comments'.format(
                repo_parts[3] ,
                repo_parts[4] ,
                issue_id)
        credentials = GIT_API[4].split(':')

        # Log the endpoint only: the query string carries the client secret
        # and must not end up in the notebook output.
        print(api_url)
        time.sleep(time_sleep)
        req = requests.get(api_url + '?client_id={}&client_secret={}'.format(
                credentials[0], credentials[1]))
        comments = req.json()
        print(req.headers.get('X-RateLimit-Remaining'))
        if not isinstance(comments, list):
            raise RuntimeError('GitHub API did not return a comment list for {}: {!r}'.format(
                    api_url, comments))
        os.makedirs(par_folder, exist_ok=True)
        with open(json_store_location, 'w') as outfile:
            json.dump(comments, outfile)

    open_flag = 1 if (is_open and is_open != CLOSED) else 0

    # Build all rows first and create the frame once; DataFrame.append in a
    # loop is quadratic and no longer exists in pandas 2.x.
    records = [{
        "project_name": repo,
        "issue_number": issue_id,
        "comment_id": comment['id'],
        "user_login": comment['user']['login'] if comment['user'] else 'null',
        "created_at": comment['created_at'],
        "updated_at": comment['updated_at'],
        "author_assoc": comment['author_association'],
        "is_open": open_flag} for comment in comments]
    return pandas.DataFrame(records, columns=columns)

In [6]:
def get_open_closed_pulls(repo):
    """Scrape the open and closed pull-request counters from a repository's pulls page.

    Parameters
    ----------
    repo : str
        Repository URL; ``'/pulls'`` is appended to it to build the page URL.

    Returns
    -------
    tuple of (int, int)
        ``(open_pulls, closed_pulls)``. Both are 0 when the expected markup
        cannot be located or the counter text is not a number.
    """
    add = repo + '/pulls'
    time.sleep(time_sleep)
    req = requests.get(add)
    data = req.text

    try:
        # The counter text sits between the icon's closing </svg> tag and the
        # "Open" / "Closed" label; thousands separators are stripped.
        open_chunk = data.split('<svg class="octicon octicon-git-pull-request"')[2]
        open_pulls = int(open_chunk.split('Open')[0].split('</svg>')[1].strip().replace(",", ""))
        closed_chunk = data.split('<svg class="octicon octicon-check"')[1]
        closed_pulls = int(closed_chunk.split('Closed')[0].split('</svg>')[1].strip().replace(",", ""))
    except (IndexError, ValueError):
        # Markup missing (IndexError) or counter not numeric (ValueError).
        # A bare ``except`` would also swallow KeyboardInterrupt, making a
        # long crawl impossible to stop cleanly.
        open_pulls = 0
        closed_pulls = 0

    return open_pulls, closed_pulls

In [7]:
def get_pull_comments(repo,pr_id,is_open):
    """Return the review comments of one pull request as a DataFrame.

    The raw API response is cached as JSON under
    ``json_store_dir/code_review_comments``; when a cached list exists it is
    used instead of calling the GitHub API.

    Parameters
    ----------
    repo : str
        Repository URL; parts 3 and 4 of the '/'-split URL are used as owner
        and name in the API request.
    pr_id : int or str
        Pull request number.
    is_open : bool or str
        Truthy for an open pull request. The state string ``CLOSED`` is
        treated as "not open" (callers pass the ``OPEN`` / ``CLOSED``
        strings, and a plain truthiness test would flag every PR as open).

    Returns
    -------
    pandas.DataFrame
        One row per review comment, with the columns listed in ``columns``.

    Raises
    ------
    RuntimeError
        If the API answers with something other than a list of comments
        (an error payload); nothing is cached in that case.

    Notes
    -----
    A single request is made per pull request, so only the first page of the
    API response is stored. NOTE(review): long reviews are presumably
    truncated by GitHub's default page size -- confirm against the API docs.
    """
    columns = ["project_name",
               "pr_id",
               "pr_review_id",
               "comment_id",
               "commit_id",
               "original_commit_id",
               "user_login",
               "created_at",
               "updated_at",
               "author_assoc",
               "is_open"]

    repo_parts = repo.split('/')
    repo_full = repo_parts[-2]+'/'+repo_parts[-1]

    par_folder = json_store_dir+os.sep+'code_review_comments'+os.sep+repo_full.replace('/',os.sep)
    json_store_location = par_folder+os.sep+str(pr_id)+'.json'

    comments = None
    if os.path.exists(json_store_location):
        with open(json_store_location) as f:
            comments = json.load(f)

    # A cached non-list is an API error payload stored by an earlier run;
    # ignore it and fetch again.
    if not isinstance(comments, list):
        api_url = 'https://api.github.com/repos/{}/{}/pulls/{}/comments'.format(
                repo_parts[3] ,
                repo_parts[4] ,
                pr_id)
        credentials = GIT_API[6].split(':')
        time.sleep(time_sleep)
        req = requests.get(api_url + '?client_id={}&client_secret={}'.format(
                credentials[0], credentials[1]))
        print(req.headers.get('X-RateLimit-Remaining'))
        comments = req.json()
        # Log the endpoint only: the query string carries the client secret
        # and must not end up in the notebook output.
        print(api_url)
        if not isinstance(comments, list):
            raise RuntimeError('GitHub API did not return a comment list for {}: {!r}'.format(
                    api_url, comments))
        os.makedirs(par_folder, exist_ok=True)
        with open(json_store_location, 'w') as outfile:
            json.dump(comments, outfile)

    open_flag = 1 if (is_open and is_open != CLOSED) else 0

    # Build all rows first and create the frame once; DataFrame.append in a
    # loop is quadratic and no longer exists in pandas 2.x.
    records = [{
        "project_name": repo,
        "pr_id": pr_id,
        "pr_review_id" : comment['pull_request_review_id'],
        "comment_id": comment['id'],
        "commit_id" : comment['commit_id'],
        "original_commit_id" : comment['original_commit_id'],
        "user_login": comment['user']['login'] if comment['user'] else 'null',
        "created_at": comment['created_at'],
        "updated_at": comment['updated_at'],
        "author_assoc": comment['author_association'],
        "is_open": open_flag} for comment in comments]
    return pandas.DataFrame(records, columns=columns)

# Dataframes

In [8]:
# Smoke test: scrape the issue and pull-request counters of the sample repository.
open_issue, closed_issue = get_open_closed_issues(testrepo)
open_pr, closed_pr = get_open_closed_pulls(testrepo)

# Open/Closed Issues Comments

In [9]:
def get_issues_comments(repo,is_open_closed,issues_number):
    """Collect the comments of every commented issue of ``repo`` in a given state.

    Walks the HTML issue listing page by page, and calls ``get_comments`` for
    each listed issue whose markup contains the comment icon. Pages whose URL
    is already recorded in ``prcoessed_addresses`` are skipped; each page that
    is walked is recorded through ``add_address``.

    Parameters
    ----------
    repo : str
        Repository URL.
    is_open_closed : str
        ``OPEN`` or ``CLOSED``; interpolated into the listing search query.
    issues_number : int
        Number of issues in that state; determines how many pages are walked.

    Returns
    -------
    pandas.DataFrame
        All comment rows concatenated; an empty, column-less frame when
        nothing was fetched.
    """
    per_page = 25  # page size assumed for the HTML listing
    frames = []
    for i in range(1,(issues_number//per_page)+2):
        issues_page_url = repo + ('/issues?page={}&q=is%3Aissue+is%3A'+is_open_closed).format(i)
        print(str(i)+ " -- "+ issues_page_url)
        known_urls = set(prcoessed_addresses.url)
        if repo in known_urls or issues_page_url in known_urls:
            continue

        time.sleep(time_sleep)
        data = requests.get(issues_page_url).text
        for issue in data.split('id="issue_'):
            # The chunk before the first issue anchor is the page header.
            if 'DOCTYPE' in issue:
                continue
            if 'octicon octicon-comment' in issue:
                issue_id = issue.split('"')[0]
                # Pass a real boolean: the state string itself is always
                # truthy, which would mark closed issues as open.
                frames.append(get_comments(repo,issue_id,ISSUE,is_open_closed == OPEN))
        add_address(issues_page_url)

    if not frames:
        return pandas.DataFrame()
    # One concat at the end replaces the removed DataFrame.append.
    return pandas.concat(frames, ignore_index=True)

# Open/Closed Pull Request Comments

In [10]:
def get_pulls_comments(repo,is_open_closed,pulls_number):
    """Collect conversation and review comments of every commented PR of ``repo``.

    Walks the HTML pull-request listing page by page, and calls
    ``get_comments`` and ``get_pull_comments`` for each listed pull request
    whose markup contains the comment icon. Pages whose URL is already
    recorded in ``prcoessed_addresses`` are skipped; each page that is walked
    is recorded through ``add_address``.

    Parameters
    ----------
    repo : str
        Repository URL.
    is_open_closed : str
        ``OPEN`` or ``CLOSED``; interpolated into the listing search query.
    pulls_number : int
        Number of pull requests in that state; determines how many pages are
        walked.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(pulls_comments, pulls_review_comments)``; each is an empty,
        column-less frame when nothing was fetched.
    """
    per_page = 25  # page size assumed for the HTML listing
    comment_frames = []
    review_frames = []
    for i in range(1,(pulls_number//per_page)+2):
        pull_page_url = repo + ('/pulls?page={}&q=is%3Apr+is%3A'+is_open_closed).format(i)
        print(str(i) + " -- " +pull_page_url)
        known_urls = set(prcoessed_addresses.url)
        if repo in known_urls or pull_page_url in known_urls:
            continue

        time.sleep(time_sleep)
        data = requests.get(pull_page_url).text
        for issue in data.split('id="issue_'):
            # The chunk before the first anchor is the page header.
            if 'DOCTYPE' in issue:
                continue
            if 'octicon octicon-comment' in issue:
                pr_id = issue.split('"')[0]
                # Pass a real boolean: the state string itself is always
                # truthy, which would mark closed pull requests as open.
                pr_is_open = is_open_closed == OPEN
                comment_frames.append(get_comments(repo,pr_id,PULL,pr_is_open))
                review_frames.append(get_pull_comments(repo,pr_id,pr_is_open))
        add_address(pull_page_url)

    # One concat at the end replaces the removed DataFrame.append.
    pulls_comments = pandas.concat(comment_frames, ignore_index=True) if comment_frames else pandas.DataFrame()
    pulls_review_comments = pandas.concat(review_frames, ignore_index=True) if review_frames else pandas.DataFrame()
    return pulls_comments, pulls_review_comments
                
#open_pull_comemnts, open_pr_revs = get_pulls_comments(testrepo,OPEN,open_pr)
#open_pr_revs['is_open']=1
#open_pull_comemnts['is_open']=1
#closed_pull_comemnts, closed_pr_revs = get_pulls_comments(testrepo,CLOSED,closed_pr)
#closed_pr_revs['is_open']=0
#closed_pull_comemnts['is_open']=0

#pr_revs = open_pr_revs.append(closed_pr_revs)
#pull_comments = open_pull_comemnts.append(closed_pull_comemnts)

In [11]:
# Ledger of listing-page URLs that have already been crawled. Only the 'url'
# column is read, so an index column written by an earlier save does not pile
# up as 'Unnamed: 0', 'Unnamed: 0.1', ... on every save/load cycle.
try:
    prcoessed_addresses = pandas.read_csv('../data/padd.csv', usecols=['url'])
except (OSError, ValueError):
    # Missing/unreadable file (OSError), or empty/malformed content or no
    # 'url' column (ValueError): start with an empty ledger.
    prcoessed_addresses = pandas.DataFrame(columns=["url"])


def add_address(add):
    """Record ``add`` (a URL string) as processed in the global ledger.

    Rebinds the module-level ``prcoessed_addresses`` frame to a new frame
    with one extra row; nothing is written to disk here.
    """
    global prcoessed_addresses
    # pandas.concat replaces DataFrame.append, which was removed in pandas 2.0.
    prcoessed_addresses = pandas.concat(
        [prcoessed_addresses, pandas.DataFrame({"url": [add]})],
        ignore_index=True)
    

In [12]:
def load_cached_frame(path):
    """Load a gzip-compressed CSV checkpoint, or return an empty DataFrame.

    Parameters
    ----------
    path : str
        Location of the ``.csv.gz`` checkpoint.

    Returns
    -------
    pandas.DataFrame
        The stored rows without any ``Unnamed: *`` columns (those are index
        columns written by an earlier save and would otherwise accumulate on
        every save/load cycle). An empty, column-less frame when the file is
        missing, unreadable, empty or malformed.
    """
    try:
        frame = pandas.read_csv(path, compression='gzip')
    except (OSError, ValueError):
        # OSError: missing file or not a gzip archive.
        # ValueError: empty or unparsable CSV (pandas' errors derive from it).
        return pandas.DataFrame()
    leftover_index_cols = [col for col in frame.columns if str(col).startswith('Unnamed:')]
    return frame.drop(columns=leftover_index_cols)


# Resume from the checkpoints of a previous run when they exist.
issue_comments = load_cached_frame('../data/issue_comments.csv.gz')
pulls_comments = load_cached_frame('../data/pulls_comments.csv.gz')
pulls_review_comments = load_cached_frame('../data/pulls_review_comments.csv.gz')

In [15]:
# Repositories to crawl, taken in reverse file order.
selected_repos = pandas.read_csv('../data/to_get_comments.csv').iloc[::-1]

In [None]:


# Crawl every selected repository and checkpoint the accumulated frames to
# disk after each one, so an interrupted run can be resumed.
for repo in selected_repos.project_name.unique():
    # The scraped counters decide how many listing pages are walked.
    open_issue , closed_issue = get_open_closed_issues(repo)
    open_pr , closed_pr = get_open_closed_pulls(repo)

    print(repo )
    print("open issues : " + str(open_issue))
    print("closed issues : " + str(closed_issue))
    print("open pulls : " + str(open_pr))
    print("closed pulls : " + str(closed_pr))

    # pandas.concat replaces DataFrame.append, which was removed in pandas 2.0.
    issue_comments = pandas.concat(
        [issue_comments,
         get_issues_comments(repo,OPEN,open_issue),
         get_issues_comments(repo,CLOSED,closed_issue)],
        ignore_index=True)
    # index=False: the positional index carries no information, and writing
    # it adds an extra 'Unnamed: 0' column each time the file is re-read.
    issue_comments.to_csv('../data/issue_comments.csv.gz',compression='gzip',index=False)

    open_pc, open_prc = get_pulls_comments(repo,OPEN,open_pr)
    closed_pc, closed_prc = get_pulls_comments(repo,CLOSED,closed_pr)
    pulls_comments = pandas.concat([pulls_comments, open_pc, closed_pc],ignore_index=True)
    pulls_review_comments = pandas.concat([pulls_review_comments, open_prc, closed_prc],ignore_index=True)
    pulls_comments.to_csv('../data/pulls_comments.csv.gz',compression='gzip',index=False)
    pulls_review_comments.to_csv('../data/pulls_review_comments.csv.gz',compression='gzip',index=False)
    prcoessed_addresses.to_csv('../data/padd.csv',index=False)

https://github.com/kimond/ferris-print
open issues : 0
closed issues : 0
open pulls : 0
closed pulls : 2
1 -- https://github.com/kimond/ferris-print/issues?page=1&q=is%3Aissue+is%3Aopen
1 -- https://github.com/kimond/ferris-print/issues?page=1&q=is%3Aissue+is%3Aclosed
1 -- https://github.com/kimond/ferris-print/pulls?page=1&q=is%3Apr+is%3Aopen
1 -- https://github.com/kimond/ferris-print/pulls?page=1&q=is%3Apr+is%3Aclosed
https://api.github.com/repos/kimond/ferris-print/issues/2/comments?client_id=<redacted>&client_secret=<redacted>
4999
4999
https://api.github.com/repos/kimond/ferris-print/pulls/2/comments?client_id=<redacted>&client_secret=<redacted>
https://api.github.com/repos/kimond/ferris-print/issues/1/comments?client_id=<redacted>&client_secret=<redacted>
4998
4998
https://api.github.com/repos/kimond/ferris-print/pulls/1/comments?client_id=093a7ed958a206f60a37&