# Supporting code and data for "..."

In [1]:
%matplotlib inline

import os
import sys
print(f'Python {sys.version}')

import IPython
# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
print(f'IPython {IPython.__version__}')

print('\nLibraries:\n')

import csv
print(f'csv {csv.__version__}')

import matplotlib
import matplotlib.pyplot as plt
print(f'matplotlib {matplotlib.__version__}')

import numpy as np
print(f'numpy {np.__version__}')

import pandas as pd
from pandas.plotting import register_matplotlib_converters
print(f'pandas {pd.__version__}')

import re
print(f're {re.__version__}')

import requests
print(f'requests {requests.__version__}')

#import scipy
#import scipy.stats
#print(f'scipy {scipy.__version__}')


#import statsmodels
#import statsmodels.formula.api as smf
#from statsmodels.stats.outliers_influence import summary_table
#print(f'statsmodels {statsmodels.__version__}')

Python 3.9.6 (default, Jun 28 2021, 08:57:49) 
[GCC 10.3.0]
IPython 7.24.1

Libraries:

csv 1.0
matplotlib 3.4.2
numpy 1.20.3
pandas 1.2.4
re 2.2.1
requests 2.25.1


## Data collection

We use the GitHub GraphQL API because it allows fetching only the information we need, and at a much faster rate (we can get up to 100 nodes in a single request). Getting all the objects of a certain type then requires repeating the request until we have gone through all the pages of results.

You need to provide a personal `api_token` if you want to get fresh data from GitHub. Otherwise, this notebook will skip the data collection step and load the CSV files from the local filesystem.

In [2]:
# Read the GitHub personal access token from the GITHUB_API_TOKEN environment
# variable so that it never has to be pasted into (and saved with) the notebook.
# When the variable is unset the token is empty and data collection is skipped.
api_token = os.environ.get('GITHUB_API_TOKEN', '')

In [3]:
def requestAllPages(query,rows_and_next_variables,filename,columns):
  """Run a paginated GitHub GraphQL query and save the collected rows as CSV.

  Does nothing (and returns None) when the module-level `api_token` is empty.

  Parameters
  ----------
  query : str
      GraphQL query text sent with every request.
  rows_and_next_variables : callable
      Called once with `None` to obtain the initial rows and the initial list
      of variable dicts, then once per response with that response's `data`
      payload. Must return a `(rows, next_variables)` pair of lists.
  filename : str
      Path of the CSV file to write. It is only written when at least one row
      was collected.
  columns : list
      Header row of the CSV file.

  Raises
  ------
  requests.HTTPError
      If a request comes back with an unsuccessful HTTP status.
  RuntimeError
      If a response carries no `data` payload (for example a GraphQL error).
  """
  if api_token == '':
    return
  headers = {'Authorization': f'token {api_token}'}
  url = 'https://api.github.com/graphql'
  rows, pending = rows_and_next_variables(None)
  # Pending variable sets are processed last-in, first-out.
  while pending:
    payload = {'query':query,'variables':pending.pop()}
    r = requests.post(url=url, json=payload, headers=headers)
    if r.status_code == 403:
      print('Unauthorized request:')
      print(payload)
    r.raise_for_status() # Abort if unsuccessful request
    response = r.json()
    data = response.get('data')
    if data is None:
      # The callback interprets `None` as "initial call"; forwarding a missing
      # payload would silently restart the pagination instead of failing.
      raise RuntimeError(
          f"GraphQL request returned no data: {response.get('errors')}")
    new_rows, next_variables = rows_and_next_variables(data)
    rows += new_rows
    pending += next_variables
  if len(rows) > 0:
    # newline='' is required by the csv module (avoids blank rows on Windows);
    # UTF-8 matches the default encoding pandas uses when reading the file back.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
      writer = csv.writer(f)
      writer.writerow(columns)
      writer.writerows(rows)

We look for all the pull requests where the bot has proposed to minimize the CI failures. We get them by searching for the words "coqbot ci minimize".
This query is redundant with the next one, but useful if one only wants the list of PRs.
Uncomment the last line and make sure to provide an `api_token` to run it.

In [4]:
def fetch_prs():
  """Save the numbers of the pull requests matching the search to 'pullrequests.csv'.

  The search is for "coqbot ci minimize" in the coq/coq repository; paging
  and CSV writing are delegated to `requestAllPages`.
  """

  query = """
    query getPullRequestList($cursor: String) {
      search(query: "repo:coq/coq coqbot ci minimize", type:ISSUE, first: 100, after: $cursor) {
        nodes {
          ... on PullRequest {
            number
          }
        }
        pageInfo {
          hasNextPage
          endCursor
        }
      }
    }
  """

  def parse_page(data):
    # Initial call: no rows yet, one request without any cursor.
    if data is None:
      return [], [{}]
    search = data['search']
    # Nodes that are not pull requests come back without a 'number' field.
    numbers = [[node['number']] for node in search['nodes'] if 'number' in node]
    page_info = search['pageInfo']
    if not page_info['hasNextPage']:
      return numbers, []
    return numbers, [{'cursor': page_info['endCursor']}]

  requestAllPages(query, parse_page, 'pullrequests.csv', ['number'])

# fetch_prs()

Here, we search again for all PRs where CI minimization was proposed, but this time we also retrieve all of their comments to find out what happened. Uncomment the last line and make sure to provide an `api_token` to re-run this.

In [5]:
def fetch_pr_comments():
  """Save the comments of every pull request matching the search to 'pr_comments.csv'.

  The same query serves two purposes, selected by the `$single` variable:
  paging through the search results (10 pull requests per page, each with its
  first 50 comments), and paging through the remaining comments of one pull
  request identified by `$number`. Paging and CSV writing are delegated to
  `requestAllPages`.
  """

  query = """
    query commentQuery($number: Int!, $single: Boolean!, $prCursor: String, $commentCursor: String) {
      search(query: "repo:coq/coq coqbot ci minimize", type:ISSUE, first: 10, after: $prCursor) @skip (if: $single) {
        pageInfo {
          endCursor
          hasNextPage
        }
        nodes {
          ... pullRequest
        }
      }
      repository(owner: "coq", name: "coq") @include (if: $single) {
        pullRequest(number: $number) {
          ... pullRequest
        }
      }
    }

    fragment pullRequest on PullRequest {
      number
      comments(first: 50, after: $commentCursor) {
        pageInfo {
          endCursor
          hasNextPage
        }
        nodes {
          createdAt
          author { login }
          bodyText
        }
      }
    }
  """

  def treat_pr(pr):
    """Return the CSV rows of one pull request node and its follow-up request, if any."""
    rows, variables = [], []
    number = pr['number']
    for comment in pr['comments']['nodes']:
      # Drop the timezone so that the CSV holds naive timestamps.
      date = pd.to_datetime(comment['createdAt']).tz_localize(None)
      # `author` is null when the commenting account no longer exists; GitHub
      # attributes such content to the placeholder user "ghost".
      author = comment['author']
      login = author['login'] if author else 'ghost'
      # Keep at most 500 characters and escape newlines: one comment per CSV line.
      body = comment['bodyText'][:500].replace('\n','\\n')
      rows.append([number,date,login,body])
    if pr['comments']['pageInfo']['hasNextPage']:
      # More comments remain: ask for the next page of this pull request only.
      variables += [{
          'single':True,
          'number':number,
          'commentCursor':pr['comments']['pageInfo']['endCursor']
      }]
    return rows, variables

  def rows_and_next_variables(data):
    """Callback for `requestAllPages`: extract rows and the next requests to issue."""
    if data is None:
      # Initial call: start with the first page of search results. `number` is
      # declared non-null in the query, so a dummy value is always supplied.
      return [], [{'single':False,'number':0}]
    else:
      if 'search' in data:
        prs = data['search']
        rows, variables = [], []
        for pr in prs['nodes']:
          # Search results that are not pull requests carry no 'number' field.
          if 'number' in pr:
            new_rows, new_variables = treat_pr(pr)
            rows += new_rows
            variables += new_variables
        if prs['pageInfo']['hasNextPage']:
          variables += [{
              'single':False,
              'number':0,
              'prCursor':prs['pageInfo']['endCursor']
          }]
        return rows, variables
      else:
        return treat_pr(data['repository']['pullRequest'])

  requestAllPages(
      query,
      rows_and_next_variables,
      'pr_comments.csv',
      ['number','date','author','body']
  )

# fetch_pr_comments()

## Data processing

In [6]:
def load_csv(filename):
  """Read a CSV file into a DataFrame indexed by its second column.

  The second column (position 1) becomes the index and is parsed as dates.
  A confirmation line naming the file is printed before returning.
  """
  frame = pd.read_csv(filename, index_col=1, parse_dates=True)
  print(f'File retrieved from local file system: {filename}')
  return frame

In [7]:
# Load the collected PR comments from the CSV file (path relative to the working directory).
pr_comments = load_csv('pr_comments.csv')

File retrieved from local file system: pr_comments.csv


In [8]:
# Show the loaded comments (pandas abbreviates the rendering of long frames).
pr_comments

Unnamed: 0_level_0,number,author,body
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-08-05 22:01:30,14748,coqbot-app,"Hey, I have detected that there were CI failur..."
2021-08-14 13:41:48,14748,JasonGross,@coqbot ci minimize ci-hott
2021-08-14 13:41:49,14748,coqbot-app,I am now running minimization at commit 77e336...
2021-08-14 13:45:05,14748,coqbot-app,Error: Could not minimize file (from ci-hott)...
2021-08-14 15:41:17,14748,coqbot-app,"Hey, I have detected that there were CI failur..."
...,...,...,...
2021-08-14 22:09:47,14579,coqbot-app,Minimized File /github/workspace/builds/coq/co...
2021-08-14 23:40:50,14579,coqbot-app,Minimized File /github/workspace/builds/coq/co...
2021-08-15 01:47:43,14579,coqbot-app,Minimized File /github/workspace/builds/coq/co...
2021-09-16 02:16:01,14579,coqbot-app,"The ""needs: rebase"" label was set more than 30..."


In [9]:
# Keep the comments that invoke "@coqbot ci minimize" (case-insensitive, with an
# optional colon after the bot name), leaving out the listed accounts
# (presumably the bot itself and its developers; confirm), ordered by PR number.
ci_minimize_comments = (
    pr_comments
    .loc[lambda df: df['body'].str.contains('@coqbot:? ci minimize', case=False)
                    & ~ df['author'].isin(['coqbot-app', 'JasonGross', 'Zimmi48'])]
    .sort_values('number')
)
ci_minimize_comments

Unnamed: 0_level_0,number,author,body
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-07-25 16:58:40,11966,olaure01,@coqbot ci minimize
2021-08-31 21:07:07,12512,ppedrot,@coqbot ci minimize
2021-05-27 15:31:10,13895,ana-borges,@coqbot ci minimize
2021-05-26 12:15:38,13969,mattam82,@coqbot ci minimize ci-fourcolor
2021-06-14 09:42:44,14137,SkySkimmer,@coqbot ci minimize
2021-05-16 20:50:22,14234,liyishuai,@coqbot ci minimize\n(Did the bot just talk to...
2021-06-05 20:15:02,14234,liyishuai,@coqbot ci minimize
2021-05-15 23:18:53,14252,jfehrle,@coqbot: ci minimize
2021-05-26 13:51:37,14391,SkySkimmer,@coqbot ci minimize fiat-crypto-legacy
2021-06-10 17:23:25,14480,SkySkimmer,@coqbot ci minimize ci-iris


In [10]:
# Keep the first remaining row for each (PR number, author) pair, ordered by author.
(
    ci_minimize_comments
    .drop_duplicates(subset=['number', 'author'])
    .sort_values('author')
)

Unnamed: 0_level_0,number,author,body
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-09-02 13:49:34,14819,Alizter,@coqbot ci minimize
2021-08-13 12:20:58,14777,Alizter,@coqbot ci minimize
2021-08-11 21:54:13,14758,Alizter,@coqbot ci minimize
2021-08-04 14:04:24,14733,SkySkimmer,@coqbot ci minimize
2021-08-14 11:39:06,14785,SkySkimmer,@coqbot ci minimize
2021-08-14 07:59:42,14783,SkySkimmer,@coqbot ci minimize\n(error at https://github....
2021-09-24 10:00:58,14924,SkySkimmer,@coqbot ci minimize bedrock2
2021-06-10 17:23:25,14480,SkySkimmer,@coqbot ci minimize ci-iris
2021-10-05 17:13:30,14986,SkySkimmer,@coqbot ci minimize
2021-06-14 09:42:44,14137,SkySkimmer,@coqbot ci minimize
