In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before code is executed and edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import time
import glob 
import os
import pandas as pd
import numpy as np
import qgrid

# Analysis

In [None]:
# Locate the most recently changed results pickle under data/ and load it.
result_paths = glob.glob("data/res-*")
if not result_paths:
    # max() on an empty list raises an opaque "max() arg is an empty sequence";
    # fail with a message that names the actual problem instead.
    raise FileNotFoundError("No result files matching 'data/res-*' were found")
# NOTE: os.path.getctime is the creation time on Windows but the time of the
# last metadata change on Unix, so "latest" means "most recently changed" there.
latest_res = max(result_paths, key=os.path.getctime)
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
res = pd.read_pickle(latest_res)
res.info()

## Error analysis

In [None]:
# Distribution of the retest flag over all result rows.
res.loc[:, "retest"].value_counts()

In [None]:
# Problem: only 3 different bodies show up in the retest? Everything else occurred.
# Number of distinct values per column among the retest rows, broken down by
# browser and inclusion method.
(
    res[res["retest"].eq(True)]
    .groupby(["browser", "inc_method"])
    .agg(["nunique"])
)

In [None]:
# Cast the measured download-bar height to integers so it can be compared numerically.
res["gp_download_bar_height"] = res["gp_download_bar_height"].astype("int")
# Boolean flag: is the bar height larger than 45? A vectorised comparison yields
# the same boolean column as the former row-wise apply(lambda ...), without
# calling a Python function for every row.
# NOTE(review): 45 is presumably the cut-off related to the automation banner
# mentioned in the unstable-methods notes below -- confirm.
res["gp_download_bar_height_bin"] = res["gp_download_bar_height"] > 45

In [None]:
# Collect every (browser_id, test_id) pair that occurs more than once. These are
# the intentionally duplicated entries (once with retest=True, once with
# retest=False) plus the ones doubled for unknown reasons (8 tests for the main
# run, 2 with different results, + another 16000).
# For interactive inspection: qgrid.show_grid(his), or
# qgrid.show_grid(his.groupby(["browser_id", "test_id"]).agg("nunique"), show_toolbar=True)
his = (
    res[res.duplicated(subset=["browser_id", "test_id"], keep=False)]
    .groupby(["browser_id", "test_id"])
    .agg("nunique")
)

# Keep only the columns whose distinct-value count differs from the first
# group's count in at least one group.
his = his.loc[:, his.ne(his.iloc[0]).any()]
# The timing columns are left out of the comparison.
his = his.drop(columns=["loading_time", "complete_time"], errors="ignore")
# Drop every column whose name ends in "id".
his = his.drop(columns=his.filter(regex="id$").columns)

# Full result rows for all duplicated (browser_id, test_id) pairs.
tis = his.reset_index()[["browser_id", "test_id"]].merge(
    res, how="inner", on=["browser_id", "test_id"]
)

# All entries that have more than one value
# (interactive alternative: qgrid.show_grid(his, show_toolbar=True))
his

In [None]:
# Non-null entry count of every column in `tis`, per retest flag and browser.
tis.groupby(["retest", "browser_id"]).agg("count")

In [None]:
# Show every recorded row for one specific test id.
res[res["test_id"].eq(4414621)]

In [None]:
# Show all unstable methods with some info.
# Notes:
# - gp_window_onblur is unstable in Firefox (do not use?).
# - gp_download_bar_height is unstable in Chromium-based browsers (automation
#   banner) -> replace with a new column smaller/larger than 45? That should be
#   stable? Or remove it?
# - For some methods the timeout/wait_time was not high enough? Do we need to
#   increase it? (The old wait_time was 120ms.)
# - event_set, op_frame_count, op_win_window, op_win_opener: all differences are
#   due to a too small wait_time? (Maybe not for Firefox?)
# - op_win_history_length was buggy because it does not reset between page loads
#   (now a new tab is opened for every URL, which should fix it; 50 is the maximum).
# - Retest of the window.open results: all op_ methods are stable? (Except for
#   Chrome, where the retest was buggy? Almost all diffs follow the same scheme
#   between the first try and the retest try.)
# - Do we have to use Chrome and Edge now? (Or redo Chrome; it seems that in the
#   retest the re-navigation of the window was often not possible.)
unstable_dict = {}
columns = his.columns
for column in columns:
    if column not in ("id_x", "retest"):
        # Keep only rows whose value for this column occurs exactly once within
        # their (browser_id, test_id) pair.
        unstable_dict[column] = (
            tis.groupby(["browser_id", "test_id", column])
            .filter(lambda group: len(group) == 1)
            [["browser", "test_id", "url_id", "inc_method", "retest", column]]
        )
        display(unstable_dict[column])

In [None]:
# Columns for which unstable rows were collected.
print(unstable_dict.keys())
# Column to inspect in detail.
prop = "op_win_window"
# Number of distinct tests with an unstable value for this column.
display(unstable_dict[prop]["test_id"].nunique())
# Distinct affected tests per browser / inclusion method / retest flag / value.
temp = (
    unstable_dict[prop]
    .groupby(["browser", "inc_method", "retest", prop])["test_id"]
    .agg(["nunique"])
)
# Only show combinations that actually occur.
qgrid.show_grid(temp[temp["nunique"].ne(0)])

In [None]:
# Find the url_id(s) of the Firefox window.open rows with one specific
# combination of response body, status code and headers.
res.loc[
    res["body"].eq("ecocnt_html=post_message=mes1")
    & res["browser"].eq("firefox")
    & res["Status-Code"].eq(501)
    & res["Content-Type"].eq("text/html")
    & res["Location"].eq("http://172.17.0.1:8000")
    & res["inc_method"].eq("window.open")
    & res["X-Frame-Options"].eq("deny")
    & res["Content-Disposition"].eq("empty"),
    ["url_id"],
]

In [None]:
# Frequency of each event_set value per browser for the "audio" inclusion method.
(
    res[res["inc_method"].eq("audio")]
    .groupby(["browser"])["event_set"]
    .value_counts()
)

In [None]:
# Here the wait_time was not high enough for Chrome in the second run.
# The window needs some time to load (switch origins); that time is not
# covered by the load event of the page.
res.loc[
    res["url_id"].eq(24776) & res["inc_method"].eq("window.open"),
    ["browser", "op_frame_count", "op_win_window", "op_win_opener"],
]

In [None]:
# Event lists recorded per browser for one URL included via link-prefetch.
res.loc[
    res["url_id"].eq(24676) & res["inc_method"].eq("link-prefetch"),
    ["browser", "event_list"],
]

In [None]:
# List all column names of the results frame.
res.columns

## Timing information!

In [None]:
# Chromium-based browsers are very slow for window.open.
# They load fast for audio, video and img, but part of that is because they fire
# the load event before parsing/error handling is done; Firefox does not do this.

# Mean/std/min/max of both timing columns per browser and inclusion method.
timing_table = (
    res.groupby(["browser", "inc_method"])[["loading_time", "complete_time"]]
    .agg(["mean", "std", "min", "max"])
    .reset_index()
)
# Leave out MicrosoftEdge and index the table by (inc_method, browser).
timing_table = (
    timing_table[timing_table["browser"].ne("MicrosoftEdge")]
    .sort_values(["inc_method", "browser"])
    .set_index(["inc_method", "browser"])
)
# Show the min/max columns as integers.
extreme_cols = [
    (metric, stat)
    for metric in ("loading_time", "complete_time")
    for stat in ("min", "max")
]
timing_table[extreme_cols] = timing_table[extreme_cols].astype(int)
timing_table

In [None]:
# Export the timing table, rounded to two decimals, as LaTeX source.
with open("tables/timing_incs", mode="w") as latex_file:
    latex_source = timing_table.round(2).to_latex()
    latex_file.write(latex_source)