# Pull Request Analysis

In [2]:
import psycopg2
import pandas as pd 
# from sqlalchemy.types import Integer, Text, String, DateTime
import sqlalchemy as s
import matplotlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

# Read connection settings from config.json; the keys used below are
# 'user', 'password', 'host', 'port' and 'database'.
with open("config.json") as config_file:
    config = json.load(config_file)

# Use the "postgresql" dialect name: the legacy "postgres" alias was removed in
# SQLAlchemy 1.4 and raises NoSuchModuleError there, while "postgresql" is
# accepted by older and newer releases alike.
# NOTE(review): the values are interpolated verbatim, so a user name or password
# containing URL-reserved characters (e.g. '@', '/', ':') would need escaping.
database_connection_string = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
    config['user'],
    config['password'],
    config['host'],
    config['port'],
    config['database'],
)

# Set the connection's search_path to this schema so the queries below can
# refer to tables without a schema prefix.
dbschema = 'augur_data'
engine = s.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

## Pull Request Filter

In [3]:
## Repository IDs that the report covers; each one is queried separately below.
repo_set = {
    25760,  # spring-boot (per the recorded query output)
    25663,  # spring-framework (per the recorded query output)
}

# This query reports, for each repository, the total number of issues opened (excluding pull requests) and the average, maximum, minimum, and standard deviation of the number of comments per issue.

## Getting the Data

In [7]:
# Per-repository issue statistics: issue count plus average / max / min / stddev
# of comments per issue, with pull requests filtered out of the issues table.
#
# The statement is built once, outside the loop, and the repository ID is passed
# as the bound parameter :repo_id instead of being formatted into the SQL text.
# sqlalchemy is imported as `s` in this notebook.
pr_query = s.sql.text("""
                      SELECT
        repo.repo_id,
        repo.repo_name,
        repo_groups.rg_name,
        E.issues_count,
        AVG ( D.comment_count ) AS average_comments,
        MAX ( D.comment_count ) AS max_comments,
        MIN ( D.comment_count ) AS min_comments,
        stddev( D.comment_count ) AS stddev_comments 
    FROM
        repo
        LEFT OUTER JOIN (
        SELECT
            issues.issue_id,
            issues.repo_id,
            COUNT ( K.issue_msg_ref_id ) AS comment_count 
        FROM
            issues
            LEFT OUTER JOIN issue_message_ref K ON issues.issue_id = K.issue_id 
        WHERE
            pull_request IS NULL -- GitHub provides pull requests in their issues API, as well as their pull requests API. We do not exclude this data from collection because it would make the provenance of the data we collect less transparent. We apply filters in queries and API endpoints, but not collection.
            
        GROUP BY
            issues.issue_id,
            issues.repo_id 
        ORDER BY
            issues.repo_id 
        ) D ON repo.repo_id = D.repo_id,
        repo_groups,
        ( -- subquery table to provide issues count in context 
        SELECT
            repo.repo_id,
            COUNT ( issue_id ) AS issues_count 
        FROM
            repo
            LEFT OUTER JOIN (
            SELECT
                repo.repo_id,
                issues.issue_id --the "double left outer join here seems puzzling. TO preserve "one row per repo" and exclude pull requests, we FIRST need to get a list of issues that are not pull requests, then count those. WIthout the "double left outer join", we would exclude repos that use pull requests, but not issues on GitHub
                
            FROM
                repo
                LEFT OUTER JOIN issues ON issues.repo_id = repo.repo_id 
            WHERE
                issues.pull_request IS NULL -- here again, excluding pull_requests at data analysis, but preserving GitHub API Provenance
                
            ) K ON repo.repo_id = K.repo_id 
        GROUP BY
            repo.repo_id 
        ) E -- this subquery table is what gives us the issue count per repo as context for deciding if repos with very small issue counts are excluded from some analyses.
        
    WHERE
        repo.repo_group_id = repo_groups.repo_group_id 
        AND repo.repo_id = E.repo_id 
        AND repo.repo_id = :repo_id
    GROUP BY
        repo.repo_id,
        repo.repo_name,
        repo_groups.rg_name,
        repo_groups.repo_group_id,
        E.issues_count 
    ORDER BY
        rg_name,
        repo_name;

        """)

# Run the query once per repository and collect the results, then combine them
# in a single concat rather than growing the frame inside the loop.
repo_frames = []
for repo_id in repo_set:
    pr_a = pd.read_sql(pr_query, con=engine, params={"repo_id": repo_id})
    repo_frames.append(pr_a)

# An empty repo_set yields an empty DataFrame (pd.concat raises on an empty list).
# The index is deliberately not reset, so each row keeps the index it had in its
# own query result.
pr_all = pd.concat(repo_frames) if repo_frames else pd.DataFrame()
        


In [8]:
# Show the combined per-repository statistics; leaving the frame as the cell's
# last expression uses the notebook's rich display instead of plain-text print.
pr_all

   repo_id         repo_name          rg_name  issues_count  average_comments  \
0    25760       spring-boot  Spring-projects         16275          0.378679   
0    25663  spring-framework  Spring-projects         19216          0.309221   

   max_comments  min_comments  stddev_comments  
0           725             0        15.007403  
0           547             0        12.372669  
