In [9]:
import psycopg2
import pandas as pd 
# from sqlalchemy.types import Integer, Text, String, DateTime
import sqlalchemy as s
import matplotlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

# Database credentials live outside the notebook: config.json must provide
# the keys "user", "password", "host", "port" and "database".
with open("config.json") as config_file:
    config = json.load(config_file)

# Use the canonical "postgresql" dialect name. The legacy "postgres" alias was
# deprecated and then removed in SQLAlchemy 1.4, where it fails to load;
# "postgresql+psycopg2" is accepted by old and new releases alike.
database_connection_string = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
    config['user'],
    config['password'],
    config['host'],
    config['port'],
    config['database'],
)

# Every connection sets search_path to this schema, so the queries below can
# refer to tables (repo, issues, ...) without a schema prefix.
dbschema = 'augur_data'
engine = s.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

In [10]:
# Repository names to look up. A name is not unique across organisations or
# forks, so the result can hold several rows (and repo_ids) per name.
repo_names = ['concourse', 'postfacto', 'clarity', 'gpdb', 'kpack']

# The names are passed as a bound parameter (psycopg2 adapts a Python list to
# a PostgreSQL array) instead of being spelled out in a chain of ORs.
repo_list_query = """
SELECT repo_id, repo_name, repo_path
FROM repo
WHERE repo_name = ANY(%(repo_names)s);
"""
repo_list = pd.read_sql_query(
    repo_list_query,
    con=engine,
    params={'repo_names': repo_names},
)
print(repo_list)

   repo_id  repo_name                              repo_path
0    26235  concourse               github.com/pcfdev-forks/
1    28051  concourse                  github.com/concourse/
2    27913    clarity                     github.com/vmware/
3    26983  postfacto                    github.com/pivotal/
4    27169       gpdb                github.com/pivotal-gss/
5    25857       gpdb               github.com/greenplum-db/
6    26600       gpdb  github.com/Pivotal-Field-Engineering/
7    27043      kpack                    github.com/pivotal/


In [31]:
# repo_id of github.com/pivotal/postfacto, taken from the repo lookup output.
POSTFACTO_REPO_ID = 26983

# Set of repo_ids analysed by the cells below; add more ids to widen the scope.
repo_set = {POSTFACTO_REPO_ID}

In [32]:
# Issue count per repository: one row per repo_id in repo_set, giving the repo
# name, its repo group name and the number of issues that are not pull requests.
#
# Result frames are collected in a list and concatenated once at the end, which
# avoids re-copying the accumulated frame on every iteration and yields a clean
# 0..n-1 index. Sorting the ids makes the row order reproducible across runs.
#
# NOTE(review): the first LEFT OUTER JOIN (alias D) contributes no selected
# columns, and the outer GROUP BY collapses the extra rows it produces. It
# looks removable; confirm before simplifying the query.
issue_count_frames = []
for repo_id in sorted(repo_set):

    pr_query = f"""
    SELECT
        repo.repo_id,
        repo.repo_name,
        repo_groups.rg_name,
        E.issues_count
    FROM
        repo
        LEFT OUTER JOIN (
        SELECT
            issues.issue_id,
            issues.repo_id 
        FROM
            issues
            LEFT OUTER JOIN issue_message_ref K ON issues.issue_id = K.issue_id 
        WHERE
            pull_request IS NULL -- GitHub provides pull requests in their issues API, as well as their pull requests API. We do not exclude this data from collection because it would make the provenance of the data we collect less transparent. We apply filters in queries and API endpoints, but not collection.
            
        GROUP BY
            issues.issue_id,
            issues.repo_id 
        ORDER BY
            issues.repo_id 
        ) D ON repo.repo_id = D.repo_id,
        repo_groups,
        ( -- subquery table to provide issues count in context 
        SELECT
            repo.repo_id,
            COUNT ( issue_id ) AS issues_count 
        FROM
            repo
            LEFT OUTER JOIN (
            SELECT
                repo.repo_id,
                issues.issue_id --the "double left outer join here seems puzzling. TO preserve "one row per repo" and exclude pull requests, we FIRST need to get a list of issues that are not pull requests, then count those. WIthout the "double left outer join", we would exclude repos that use pull requests, but not issues on GitHub
                
            FROM
                repo
                LEFT OUTER JOIN issues ON issues.repo_id = repo.repo_id 
            WHERE
                issues.pull_request IS NULL -- here again, excluding pull_requests at data analysis, but preserving GitHub API Provenance
                
            ) K ON repo.repo_id = K.repo_id 
        GROUP BY
            repo.repo_id 
        ) E -- this subquery table is what gives us the issue count per repo as context for deciding if repos with very small issue counts are excluded from some analyses.
        
    WHERE
        repo.repo_group_id = repo_groups.repo_group_id 
        AND repo.repo_id = E.repo_id 
        AND repo.repo_id = {repo_id}
    GROUP BY
        repo.repo_id,
        repo.repo_name,
        repo_groups.rg_name,
        repo_groups.repo_group_id,
        E.issues_count 
    ORDER BY
        rg_name,
        repo_name;

        """
    pr_a = pd.read_sql(pr_query, con=engine)
    issue_count_frames.append(pr_a)

# pd.concat raises on an empty list, so an empty repo_set yields an empty frame.
if issue_count_frames:
    pr_all = pd.concat(issue_count_frames, ignore_index=True)
else:
    pr_all = pd.DataFrame()

print(pr_all)

   repo_id  repo_name  rg_name  issues_count
0    26983  postfacto  pivotal           110


In [33]:
import datetime

# Reporting window used by the queries below. Each bound is also kept as a
# single-quoted string (e.g. "'2020-02-29'") because the queries interpolate
# these values directly into their SQL text as date literals.

# Today's date, for reference.
current = datetime.date.today()
today = f"'{current}'"
print(today)

# Window end: the last day of the previous calendar month, computed as the day
# before the first day of the current month.
first_current = current.replace(day=1)
last_month = first_current - datetime.timedelta(days=1)
end_date = f"'{last_month}'"
print(end_date)

# Window start: 365 days before the window end.
# NOTE(review): with a fixed 365-day offset the start usually falls on the last
# day of a month (e.g. 2019-06-30 -> 2018-06-30), so the window covers twelve
# months plus one day; confirm whether the first of the month was intended.
start = last_month - datetime.timedelta(days=365)
year_ago = f"'{start}'"
print(year_ago)

'2020-03-12'
'2020-02-29'
'2019-03-01'


In [39]:
# Issues (pull requests excluded) created inside the reporting window, for
# every repo_id in repo_set, ordered by creation time within each repo.
#
# year_ago and end_date are pre-quoted date strings (e.g. '2020-02-29').
# created_at is a timestamp, so "created_at <= '2020-02-29'" would compare
# against midnight and silently drop everything created during the last day of
# the window. The upper bound is therefore written as "strictly before the day
# after end_date", which keeps the whole final day.
#
# Frames are collected in a list and concatenated once, giving a clean 0..n-1
# index; sorting the ids makes the row order reproducible across runs.
issue_frames = []
for repo_id in sorted(repo_set):

    pr_query = f"""
    SELECT issue_id, issue_title, repo_id, pull_request, created_at, updated_at FROM issues
    WHERE
        repo_id = {repo_id}
        AND pull_request is NULL
        AND created_at >= {year_ago}
        AND created_at < CAST({end_date} AS date) + INTERVAL '1 day'
    ORDER BY
        created_at;
    """
    pr_a = pd.read_sql(pr_query, con=engine)
    issue_frames.append(pr_a)

# pd.concat raises on an empty list, so an empty repo_set yields an empty frame.
if issue_frames:
    pr_all = pd.concat(issue_frames, ignore_index=True)
else:
    pr_all = pd.DataFrame()

print(pr_all)

    issue_id                                        issue_title  repo_id  \
0     387660  Heroku deploy doesn't allow users to log in to...    26983   
1     387653  Retro list page errors when auth token out of ...    26983   
2     387650                Investigate "forced logout" feature    26983   
3     387649  Felicity wants creating an action to leave her...    26983   
4     387648                                 Oh no! It's broken    26983   
5     387647            Use of sed in deploy.sh is mac-specific    26983   
6     387646  Deploy.sh incorrectly filling/missing placehol...    26983   
7     387645  PG::UndefinedColumn: ERROR:  column "auth_toke...    26983   
8     387643                  Heroku deploy script doesn't work    26983   
9     387642                       Ruby version 2.6.1 not found    26983   
10    387639  Ability to vote on items while another one is ...    26983   
11    387638  PWS: frontend and backend cannot connect becau...    26983   
12    387636

In [48]:
# Issue events for issues (pull requests excluded) created inside the reporting
# window, for every repo_id in repo_set: one row per (issue, event) pair,
# ordered by the issue's creation time within each repo.
#
# year_ago and end_date are pre-quoted date strings (e.g. '2020-02-29').
# issues.created_at is a timestamp, so "<= '2020-02-29'" would compare against
# midnight and drop issues created during the last day of the window; the
# upper bound is written as "strictly before the day after end_date" instead.
#
# NOTE(review): the result has two columns both labelled "created_at" (the
# issue's and the event's, in that order). The labels are left unchanged in
# case later cells rely on them; consider aliasing them so that column
# selection by name is unambiguous.
#
# Frames are collected in a list and concatenated once, giving a clean 0..n-1
# index; sorting the ids makes the row order reproducible across runs.
issue_event_frames = []
for repo_id in sorted(repo_set):

    pr_query = f"""
    SELECT issues.issue_id, issues.created_at, issue_events.action, issue_events.created_at
    FROM issues
    JOIN issue_events ON issues.issue_id = issue_events.issue_id
    WHERE
        issues.repo_id = {repo_id}
        AND issues.pull_request is NULL
        AND issues.created_at >= {year_ago}
        AND issues.created_at < CAST({end_date} AS date) + INTERVAL '1 day'
    ORDER BY
        issues.created_at;
    """
    pr_a = pd.read_sql(pr_query, con=engine)
    issue_event_frames.append(pr_a)

# pd.concat raises on an empty list, so an empty repo_set yields an empty frame.
if issue_event_frames:
    pr_all = pd.concat(issue_event_frames, ignore_index=True)
else:
    pr_all = pd.DataFrame()

print(pr_all)

    issue_id          created_at            action          created_at
0     387567 2020-01-15 13:57:15         mentioned 2020-01-30 14:15:21
1     387567 2020-01-15 13:57:15        subscribed 2020-01-30 14:15:21
2     387567 2020-01-15 13:57:15         mentioned 2020-01-30 15:15:56
3     387567 2020-01-15 13:57:15        subscribed 2020-01-30 15:15:56
4     387567 2020-01-15 13:57:15  added_to_project 2020-02-03 18:43:34
5     387567 2020-01-15 13:57:15           labeled 2020-02-03 18:43:38
6     387567 2020-01-15 13:57:15         mentioned 2020-02-03 18:43:41
7     387567 2020-01-15 13:57:15        subscribed 2020-02-03 18:43:41
8     387567 2020-01-15 13:57:15         mentioned 2020-02-03 19:06:21
9     387567 2020-01-15 13:57:15        subscribed 2020-02-03 19:06:21
10    387567 2020-01-15 13:57:15         mentioned 2020-02-03 19:06:21
11    387567 2020-01-15 13:57:15        subscribed 2020-02-03 19:06:21
12    387567 2020-01-15 13:57:15         mentioned 2020-02-08 10:46:46
13    