# Supporting code and data for "..."

In [1]:
%matplotlib inline

import os
import sys
print(f'Python {sys.version}')

import IPython
from IPython.core.display import display, HTML
print(f'IPython {IPython.__version__}')

print('\nLibraries:\n')

import csv
print(f'csv {csv.__version__}')

import matplotlib
import matplotlib.pyplot as plt
print(f'matplotlib {matplotlib.__version__}')

import numpy as np
print(f'numpy {np.__version__}')

import pandas as pd
from pandas.plotting import register_matplotlib_converters
print(f'pandas {pd.__version__}')

import re
print(f're {re.__version__}')

import requests
print(f'requests {requests.__version__}')

#import scipy
#import scipy.stats
#print(f'scipy {scipy.__version__}')


#import statsmodels
#import statsmodels.formula.api as smf
#from statsmodels.stats.outliers_influence import summary_table
#print(f'statsmodels {statsmodels.__version__}')

Python 3.9.6 (default, Jun 28 2021, 08:57:49) 
[GCC 10.3.0]
IPython 7.24.1

Libraries:

csv 1.0
matplotlib 3.4.2
numpy 1.20.3
pandas 1.2.4
re 2.2.1
requests 2.25.1


## Data collection

We use the GitHub GraphQL API because it allows fetching only the information we need, and at a much faster rate (we can get up to 100 nodes in a single request). Getting all the objects of a certain type requires then to repeat the request to go through all the pages of results.

You need to provide a personal `api_token` if you want to get fresh data from GitHub. Otherwise, this notebook will skip the data collection step and load the CSV files from the local filesystem.

In [2]:
api_token = ''

In [3]:
def requestAllPages(query,rows_and_next_variables,filename,columns):
  if api_token == '':
    return
  headers = {'Authorization': f'token {api_token}'}
  url = 'https://api.github.com/graphql'
  rows, variables = rows_and_next_variables(None)
  while len(variables)>0:
    json = {'query':query,'variables':variables.pop()}
    r = requests.post(url=url, json=json, headers=headers)
    if r.status_code == 403:
      print('Unauthorized request:')
      print(json)
    r.raise_for_status() # Abort if unsuccessful request
    new_rows, next_variables = rows_and_next_variables(r.json()['data'])
    rows += new_rows
    variables += next_variables
  if len(rows) > 0:
    with open(filename, 'w') as f:
      writer = csv.writer(f)
      writer.writerow(columns)
      writer.writerows(rows)

We search all PRs where CI minimization was proposed (excluding those authored by Jason Gross, which were mostly to debug the minimizer) and we retrieve all the comments from coqbot-app to know what happened. We only keep the first 15 lines of each comment, to reduce the size of the CSV file, because these lines will contain all the information we need.

Make sure to uncomment the last line and to provide an `api_token` to re-run this.

In [4]:
def fetch_pr_comments():

  query = """
    query commentQuery($number: Int!, $single: Boolean!, $prCursor: String, $commentCursor: String) {
      search(query: "repo:coq/coq coqbot ci minimize", type:ISSUE, first: 10, after: $prCursor) @skip (if: $single) {
        pageInfo {
          endCursor
          hasNextPage
        }
        nodes {
          ... pullRequest
        }
      }
      repository(owner: "coq", name: "coq") @include (if: $single) {
        pullRequest(number: $number) {
          ... pullRequest
        }
      }
    }

    fragment pullRequest on PullRequest {
      number
      author { login }
      comments(first: 50, after: $commentCursor) {
        pageInfo {
          endCursor
          hasNextPage
        }
        nodes {
          createdAt
          author { login }
          bodyText
          databaseId
        }
      }
    }
  """

  def treat_pr(pr):
    rows, variables = [], []
    number = pr['number']
    if pr['author']['login'] != 'JasonGross':
      for comment in pr['comments']['nodes']:
        if comment['author']['login'] == "coqbot-app":
          date = pd.to_datetime(comment['createdAt']).tz_localize(None)
          body = '\\n'.join(comment['bodyText'].split('\n')[:15])
          rows.append([comment['databaseId'],number,date,body])
    if pr['comments']['pageInfo']['hasNextPage']:
      variables += [{
          'single':True,
          'number':number,
          'commentCursor':pr['comments']['pageInfo']['endCursor']
      }]
    return rows, variables

  def rows_and_next_variables(data):
    if data is None:
      return [], [{'single':False,'number':0}]
    else:
      if 'search' in data:
        prs = data['search']
        rows, variables = [], []
        for pr in prs['nodes']:
          if 'number' in pr:
            new_rows, new_variables = treat_pr(pr)
            rows += new_rows
            variables += new_variables
        if prs['pageInfo']['hasNextPage']:
          variables += [{
              'single':False,
              'number':0,
              'prCursor':prs['pageInfo']['endCursor']
          }]
        return rows, variables
      else:
        return treat_pr(data['repository']['pullRequest'])

  requestAllPages(
      query,
      rows_and_next_variables,
      'pr_comments.csv',
      ['id','number','date','body']
  )

# fetch_pr_comments()

## Data processing

In [5]:
coqbot_comments = pd.read_csv('pr_comments.csv',parse_dates=['date'],index_col=0)

In [6]:
ci_minimize_results = coqbot_comments[coqbot_comments['body'].str.contains('Minimized File')].sort_values('number')
ci_minimize_results

Unnamed: 0_level_0,number,date,body
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
986083609,11966,2021-12-04 19:51:01,Minimized File /github/workspace/builds/coq/co...
985996564,11966,2021-12-04 09:16:38,Minimized File /github/workspace/builds/coq/co...
985961859,11966,2021-12-04 03:59:43,Minimized File /github/workspace/builds/coq/co...
985891061,11966,2021-12-03 22:42:41,Minimized File /github/workspace/builds/coq/co...
886395867,11966,2021-07-26 05:49:12,Minimized File /github/workspace/builds/coq/co...
...,...,...,...
1017955960,15518,2022-01-20 21:47:05,Minimized File /github/workspace/builds/coq/co...
1017957113,15518,2022-01-20 21:48:56,Minimized File /github/workspace/builds/coq/co...
1017959688,15518,2022-01-20 21:53:07,Minimized File /github/workspace/builds/coq/co...
1018009908,15518,2022-01-20 23:14:59,Minimized File /github/workspace/builds/coq/co...


In [7]:
minimized_prs = ci_minimize_results['number'].drop_duplicates()
len(minimized_prs)

50

### Matching CI minimize comments

We look for comments marking the beginning and the end of the minimization. We only keep the first run for each pull request and minimized project to avoid double counting.

In [8]:
minimization_started_comments = coqbot_comments[coqbot_comments['body'].str.match(r'I (?:have initiated|am now running) minimization at commit [a-z0-9]* (?:for the suggested targets? | on)?')]
minimization_started_comments = minimization_started_comments[~minimization_started_comments.index.duplicated(keep='first')]
targets = minimization_started_comments['body'].str.extractall(r'(?P<target>ci-[^,.\s]*)')
minimization_started_comments = targets.join(minimization_started_comments).sort_values('date').drop_duplicates(subset=['target','number']).set_index(['target','number'])[['date']]
minimization_started_comments

Unnamed: 0_level_0,Unnamed: 1_level_0,date
target,number,Unnamed: 2_level_1
ci-mathcomp,13969,2021-05-21 22:34:08
ci-fourcolor,13969,2021-05-26 14:10:29
ci-equations,13969,2021-05-26 14:10:29
ci-iris,13969,2021-05-26 14:10:29
ci-perennial,13969,2021-05-26 14:10:29
...,...,...
ci-quickchick,15518,2022-01-20 21:17:11
ci-perennial,15518,2022-01-20 21:17:11
ci-metacoq,15518,2022-01-20 21:17:11
ci-math_classes,15518,2022-01-20 21:17:11


In [9]:
minimization_success_comments = coqbot_comments[coqbot_comments['body'].str.startswith('Minimized File') & ~coqbot_comments['body'].str.contains('interrupted by timeout, being automatically continued')]
minimization_success_comments = minimization_success_comments.assign(runtime=minimization_success_comments['body'].str.extract(r'Expected coqc runtime on this file: (?P<runtime>[0-9\.]*) sec'))
minimization_success_comments

Unnamed: 0_level_0,number,date,body,runtime
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
996116369,15274,2021-12-16 19:31:28,Minimized File /github/workspace/builds/coq/co...,0.213
1001010494,15400,2021-12-25 12:16:25,Minimized File /github/workspace/builds/coq/co...,1.018
1017939088,15518,2022-01-20 21:22:16,Minimized File /github/workspace/builds/coq/co...,
1017939694,15518,2022-01-20 21:23:10,Minimized File /github/workspace/builds/coq/co...,0.117
1017939985,15518,2022-01-20 21:23:38,Minimized File /github/workspace/builds/coq/co...,0.134
...,...,...,...,...
872424689,13107,2021-07-01 17:27:36,Minimized File /github/workspace/builds/coq/co...,
872633626,13107,2021-07-02 00:39:37,Minimized File /github/workspace/builds/coq/co...,
872648442,13107,2021-07-02 01:28:51,Minimized File /github/workspace/builds/coq/co...,
872665027,13107,2021-07-02 02:22:08,Minimized File /github/workspace/builds/coq/co...,


In [10]:
minimization_failure_comments = coqbot_comments[coqbot_comments['body'].str.startswith('Error: Could not minimize file')]
minimization_failure_comments

Unnamed: 0_level_0,number,date,body
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1017937526,15518,2022-01-20 21:20:13,Error: Could not minimize file (from ci-sf) (...
1017937646,15518,2022-01-20 21:20:22,Error: Could not minimize file (from ci-itaut...
1017937663,15518,2022-01-20 21:20:24,Error: Could not minimize file (from ci-relat...
1017937985,15518,2022-01-20 21:20:54,Error: Could not minimize file (from ci-metac...
1017944365,15518,2022-01-20 21:29:39,Error: Could not minimize file (from ci-categ...
1018010794,15518,2022-01-20 23:16:34,Error: Could not minimize file (from ci-categ...
1018023671,15518,2022-01-20 23:41:13,Error: Could not minimize file (from ci-sf) (...
1018023737,15518,2022-01-20 23:41:23,Error: Could not minimize file (from ci-metac...
1018023811,15518,2022-01-20 23:41:33,Error: Could not minimize file (from ci-relat...
1018023841,15518,2022-01-20 23:41:38,Error: Could not minimize file (from ci-categ...


In [11]:
minimization_finished_comments = minimization_success_comments.assign(success=True).append(minimization_failure_comments.assign(success=False))
targets = minimization_finished_comments['body'].str.extract(r'(?P<target>ci-[^)]*)')
minimization_finished_comments = targets.join(minimization_finished_comments).sort_values('date').drop_duplicates(subset=['target','number']).set_index(['target','number'])[['date','success','runtime']]
minimization_finished_comments

Unnamed: 0_level_0,Unnamed: 1_level_0,date,success,runtime
target,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ci-mathcomp,13969,2021-05-21 22:37:15,False,
ci-iris,13969,2021-05-26 14:13:55,False,
ci-equations,13969,2021-05-26 14:14:19,True,
ci-fourcolor,13969,2021-05-26 14:15:27,True,
ci-perennial,13969,2021-05-26 14:16:08,True,
...,...,...,...,...
ci-iris,15518,2022-01-20 21:48:56,True,0.265
ci-compcert,15518,2022-01-20 21:53:07,True,0.339
ci-coqprime,15518,2022-01-20 22:25:52,True,0.488
ci-argosy,15518,2022-01-20 23:14:59,True,0.714


In [12]:
minimization_pairs = minimization_started_comments.join(minimization_finished_comments,lsuffix='_start',rsuffix='_end')
minimization_pairs = minimization_pairs.assign(duration=(minimization_pairs['date_end'] - minimization_pairs['date_start']).dt.seconds)
minimization_pairs

Unnamed: 0_level_0,Unnamed: 1_level_0,date_start,date_end,success,runtime,duration
target,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ci-mathcomp,13969,2021-05-21 22:34:08,2021-05-21 22:37:15,False,,187.0
ci-fourcolor,13969,2021-05-26 14:10:29,2021-05-26 14:15:27,True,,298.0
ci-equations,13969,2021-05-26 14:10:29,2021-05-26 14:14:19,True,,230.0
ci-iris,13969,2021-05-26 14:10:29,2021-05-26 14:13:55,False,,206.0
ci-perennial,13969,2021-05-26 14:10:29,2021-05-26 14:16:08,True,,339.0
...,...,...,...,...,...,...
ci-quickchick,15518,2022-01-20 21:17:11,2022-01-20 21:25:12,True,0.130,481.0
ci-perennial,15518,2022-01-20 21:17:11,2022-01-20 21:47:05,True,0.256,1794.0
ci-metacoq,15518,2022-01-20 21:17:11,2022-01-20 21:20:54,False,,223.0
ci-math_classes,15518,2022-01-20 21:17:11,2022-01-20 21:32:19,True,0.169,908.0


In [13]:
successful_minimization_pairs = minimization_pairs[minimization_pairs['success'] == True][['duration','runtime']]
successful_minimization_pairs

Unnamed: 0_level_0,Unnamed: 1_level_0,duration,runtime
target,number,Unnamed: 2_level_1,Unnamed: 3_level_1
ci-fourcolor,13969,298.0,
ci-equations,13969,230.0,
ci-perennial,13969,339.0,
ci-quickchick,13969,534.0,
ci-interval,13895,353.0,
...,...,...,...
ci-unimath,15518,562.0,0.150
ci-rewriter,15518,359.0,0.117
ci-quickchick,15518,481.0,0.130
ci-perennial,15518,1794.0,0.256


Proportion of the time the minimizer was able to produce a minimized file:

In [14]:
len(successful_minimization_pairs)/len(minimization_pairs)

0.8391959798994975

In [15]:
len(minimization_success_comments.dropna(subset=['runtime']))

54

In [16]:
successful_minimization_pairs.groupby(['target']).agg(['median','count'])

Unnamed: 0_level_0,duration,duration
Unnamed: 0_level_1,median,count
target,Unnamed: 1_level_2,Unnamed: 2_level_2
ci-aac_tactics,232.0,1
ci-argosy,3163.5,4
ci-bbv,619.0,3
ci-bedrock2,671.0,11
ci-bignums,8529.0,1
ci-category_theory,457.0,5
ci-color,2564.0,6
ci-compcert,1197.0,5
ci-coq_performance_tests,232.0,1
ci-coqprime,4121.0,1


In [17]:
minimization_pairs_reset = minimization_pairs.reset_index()
minimization_pairs_reset[minimization_pairs_reset['target'].isin(['ci-bignums'])]

Unnamed: 0,target,number,date_start,date_end,success,runtime,duration
51,ci-bignums,13107,2021-06-29 14:41:10,2021-06-29 17:03:19,True,,8529.0


In [18]:
successful_minimization_pairs['duration'].describe(percentiles=[0.5,0.6,0.7,0.8,0.9])

count      167.000000
mean      5419.125749
std      11047.054423
min        165.000000
50%        919.000000
60%       1506.400000
70%       3191.200000
80%       6133.000000
90%      19204.000000
max      73072.000000
Name: duration, dtype: float64