# Cumulative time spent in browser test step for each PR

## 🚧 Work in Progress 🚧

In [47]:
# All imports live here.

import pandas as pd
pd.options.plotting.backend = 'plotly'

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Common utility functions
def describe_detail(df):
    df.describe()

In [48]:
# Load and parse raw data.
with open('../data/raw/jobs.json') as f:
    jobs = pd.read_json(f)

print('Loaded jobs with %d rows. Columns:' % (jobs.shape[0],))
print(jobs.dtypes)

Loaded jobs with 171742 rows. Columns:
workflow_id                           int64
workflow_run_attempt                  int64
job_status                           object
job_conclusion                       object
job_started_at          datetime64[ns, UTC]
job_completed_at        datetime64[ns, UTC]
job_name                             object
step_name                            object
step_status                          object
step_conclusion                      object
step_started_at                      object
step_completed_at                    object
dtype: object


In [49]:
# Load and parse raw data.
with open('../data/raw/playwright.json') as f:
    tests = pd.read_json(f)
print('Loaded tests with %d rows. Columns:' % (tests.shape[0],))
print(tests.dtypes)


Loaded tests with 145119 rows. Columns:
meta_id                                  int64
workflow_id                              int64
workflow_run_number                      int64
workflow_run_attempt                     int64
workflow_created_at        datetime64[ns, UTC]
workflow_event                          object
workflow_head_branch                    object
commit_sha                              object
build_flavor                            object
composite                               object
pw_suite_title                          object
pw_spec_title                           object
pw_test_project_name                    object
pw_test_expected_status                 object
pw_test_status                          object
pw_result_status                        object
pw_result_duration                       int64
pw_result_retry                          int64
dtype: object


In [50]:
# Add simple inferred columns to raw data

jobs['job_started_date'] = jobs['job_started_at'].dt.normalize()
jobs['job_duration'] = jobs['job_completed_at'] - jobs['job_started_at']

jobs['workflow_attempt_uid'] = jobs['workflow_id'].astype(str) + '_' + jobs['workflow_run_attempt'].astype(str)

e2e_dimensions = {
    'Call Composite automation test (stable)': ('call', 'stable'),
    'Call Composite automation test (beta)': ('call', 'beta'),
    'Call With Chat Composite automation test (stable)': ('callWithChat', 'stable'),
    'Call With Chat Composite automation test (beta)': ('callWithChat', 'beta'),
    'Chat Composite automation test (stable)': ('chat', 'stable'),
    'Chat Composite automation test (beta)': ('chat', 'beta'),
}
jobs['e2e-composite'] = jobs['job_name'].map(lambda j: e2e_dimensions.get(j, (None, None))[0])
jobs['e2e-flavor'] = jobs['job_name'].map(lambda j: e2e_dimensions.get(j, (None, None))[1])

print('Augmented jobs with inferred columns:')
print(jobs.dtypes)

Augmented jobs with inferred columns:
workflow_id                           int64
workflow_run_attempt                  int64
job_status                           object
job_conclusion                       object
job_started_at          datetime64[ns, UTC]
job_completed_at        datetime64[ns, UTC]
job_name                             object
step_name                            object
step_status                          object
step_conclusion                      object
step_started_at                      object
step_completed_at                    object
job_started_date        datetime64[ns, UTC]
job_duration                timedelta64[ns]
workflow_attempt_uid                 object
e2e-composite                        object
e2e-flavor                           object
dtype: object


In [51]:
# Filter down to test jobs

e2e_jobs = jobs[jobs['e2e-composite'].notnull()]
# Each job has multiple steps. Drop down to one row per job.
e2e_jobs_no_steps = e2e_jobs[[
    'workflow_id',
    'workflow_run_attempt',
    'job_status',
    'job_conclusion',
    'job_started_at',
    'job_completed_at',
    'job_name',
    'job_started_date',
    'job_duration',
    'workflow_attempt_uid',
    'e2e-composite',
    'e2e-flavor']].drop_duplicates()

In [52]:
# Correlate with head_branch

df = tests[['workflow_id', 'workflow_head_branch']]
df = pd.merge(e2e_jobs_no_steps, df, how="left", on="workflow_id")
e2e_jobs_with_branch = df

In [64]:
# Summarize for each branch
df = e2e_jobs_with_branch[['job_started_date', 'workflow_head_branch']].copy()
df['job_duration_hours'] = e2e_jobs_with_branch['job_duration'].map(lambda x: x.total_seconds() / 3600)
df = df.groupby(['workflow_head_branch']).apply(lambda s: pd.Series({
    'first_job_started_date': s['job_started_date'].min(),
    'job_duration_hours': s['job_duration_hours'].sum(),
}))
e2e_jobs_runtimes = df
df

Unnamed: 0_level_0,first_job_started_date,job_duration_hours
workflow_head_branch,Unnamed: 1_level_1,Unnamed: 2_level_1
alkwa/endcall-PoC,2022-07-19 00:00:00+00:00,243.273333
alkwa/fix-contextmenu-with-lots-of-participants,2022-06-29 00:00:00+00:00,679.996944
alkwa/update-js-storybook,2022-07-26 00:00:00+00:00,884.169722
anjulgarg/2940249-videotile-call-connection-state,2022-08-02 00:00:00+00:00,639.963333
anjulgarg/2940468-participant-state,2022-08-02 00:00:00+00:00,226.191389
...,...,...
prprabhu/rush-use-workspaces,2022-07-18 00:00:00+00:00,531.557500
prprabhu/scratch-cc-dialpad,2022-07-21 00:00:00+00:00,191.759444
prprabhu/skip-sb-on-stable,2022-07-21 00:00:00+00:00,229.775000
prprabhu/tiny-prerelease-action-fix,2022-06-29 00:00:00+00:00,3278.073333


In [66]:
df = e2e_jobs_runtimes    
df = df.groupby(['first_job_started_date']).apply(lambda s: pd.Series({
    '50': s['job_duration_hours'].quantile(0.5),
    '90': s['job_duration_hours'].quantile(0.9),
    '95': s['job_duration_hours'].quantile(0.95),
}))
fig = df.plot(title='Bot time spent in e2e CI step for each PR', labels=dict(index='PR creation date', value='Runtime (hours)', variable='%ile'))
fig.update_traces(mode='markers+lines')
fig.show()