In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from helper_dyn import get_pipeline_overview, get_distances, get_cookie_stats, get_only_both

from dil_postprocess import get_dyn_results, get_working_urls_channels
from dil_preprocess import get_url_data, basic_pruning
from dil_predict import init, predict_trees, reduce_leaky_endpoints


import pandas as pd
import json
import redis 
r = redis.Redis()

In [None]:
# Load trees in (init comes from dil_predict); the result is kept in `models`
models = init()

In [None]:
# Get info about tree_preprocess from redis.
# r.get() returns None for a key that does not exist and json.loads(None) raises
# an unhelpful TypeError, so missing keys are reported explicitly instead.
for redis_key in ("known_unhandled_bodies", "untreated_cts"):
    raw_value = r.get(redis_key)
    if raw_value is None:
        print(f"redis key {redis_key!r} is not set")
    else:
        display(json.loads(raw_value))

## (Manual) analysis

- Reasons for (almost) no URLs
    - bot detection (bisnis.com) (Crawler uses puppeteer (now with stealth plugin), cookiehunter uses selenium with xvfb)
    - direct redirection: (ziprecruiter.com) (Cookiehunter and crawler use a different site name, thus the crawler does not crawl)
    - TLS errors (geekbuying.com, takungpao.com) (Cookiehunter ignores cert errors, the crawler does not (as a normal? user would also not ignore them))
    - unclear (appbrain.com, ...) (maybe timeout due to blocked or rate-limited, ...)
- Reasons for (almost) no URLs after pruning (if many URLs were found)
    - login area is separate from logout area and was not found (builderall.com)
    - timeout on homepage means only homepage got crawled, and the homepage is static (deliverr.com)
    - no differences for logged in users/no real logged in content or not much (ontvtime.ru, adtarget.com.tr, motorplus-online.com), ...
    - cookie session sharing was not successful, thus both accounts are in the "same" state: (?)
    - investigate: other reasons
- Reasons for (big) differences between chrome/firefox: check hinative.com, moneyforward.com, flipboard.com, dn.se, etc!
    - Why does it only work in firefox (samesite default?), why does it only work in chrome? (no idea)
        - log-in/cookie-session-sharing did not work in one browser only
           - probably what happened on hinative.com firefox
           - probably what happened on flipboard.com chrome
        - samesite not set, different defaults in browsers
           - if only window.open works in chrome, in firefox also other leak type works (e.g., moneyforward.com), probably the case
           - just because the session cookies are Lax does not mean that nothing can leak; other cookies can also leak user info (e.g., what settings a user has)
        - FP in one browser only (by chance):
            - postMessage FPs? only on one site for some reason: firefox received the same pMs often for the states, chrome did not (e.g., dn.se); In general a problem? Just checking 2 times might not be enough for pMs..., exclude ones with too low jaro distance? or with both values occurring for both value_cookies, value_no_cookies on different URLs
        - Differences in methods/parsing of browsers (e.g., in tarafdari.com):
            - CORB makes some leaks work/not work in Chrome
            - media_error only works in chrome
            - nosniff is different in both browsers (e.g., firefox fires error event, chrome load event on nosniff responses to script tags), ...
    - ~~Why does image never work? (in chrome not tested as it is the same as embed-img?), but in firefox?~~ Works but only very rarely
    -
- State-Differences
    - we have logged-in account (usually nothing done), and logged-out account (at the beginning even unvisited)
    - diff does not have to be logged-in/logged-out
    - can also be cookie policy consented/cookie policy not consented
    - specific user/not specific user
    - (user with ad blocker, user without ad blocker)
    - user with "urgent" task, user without
        - e.g., hinative/general if frame_count of value_cookies is 0 and value_no_cookies is N: user with urgent task is redirected to another page (e.g., some survey necessary after registration/login), another reason for this can be that logged-in users are ad-free
- ~~Redo groupon.com to betterteam.com: problem db-server had an error/db was incorrect~~
- ~~Redo avvo.com to extendoffice.com: problem leaky-service was down :( -> URL dict version contained line break -> attack_page.html had syntax error -> failed~~
    - ~~(the some successful ones, e.g., of stickermule.com, must have had empty/timedout responses for url_dict_version)~~

In [None]:
# Pipeline overview (from helper_dyn); later cells filter it by its "site" and "crawl_end" columns
dat = get_pipeline_overview()

In [None]:
# Show all rows/columns of the entries whose "crawl_end" is non-empty, ordered by "cookie_end".
# The fully qualified option name is required: the short pattern "max_rows" also
# matches styler.render.max_rows on pandas >= 1.4, and an ambiguous pattern raises
# an OptionError.
with pd.option_context("display.max_columns", None, "display.max_rows", None):
    #display(c_res.loc[c_res["crawl_end"] != ""].sort_values(["num_basic_pruning", "num_urls"]))
    display(dat.loc[dat["crawl_end"] != ""].sort_values("cookie_end"))

In [None]:
site = "vidio.com"  # tarafdari.com, humblebundle.com, duplichecker.com, idntimes.com, imgflip.com, yellowpages.com, cafepress.com, ahrefs.com, deliverr.com, logrocket.com, logrocket.com-unpruned

# The plain site is processed first and its "-unpruned" variant second.
# info, df and working_df keep the values of the last iteration, while url_dat
# is only fetched for the variant without the "-unpruned" suffix.
for site in (site, f"{site}-unpruned"):
    info = dat.loc[dat["site"] == site]
    df = get_dyn_results(site)
    if "-unpruned" not in site:
        url_dat = get_url_data(site)

    working_df, _, _ = get_working_urls_channels(df, log=False)
    display(working_df)
    display(working_df[["method", "inc_method"]].value_counts())

In [None]:
# Rows of url_dat whose "req_url" equals this one specific URL
url_dat.loc[url_dat["req_url"] == "https://www.vidio.com/purchased/status?content_id=3337"]

In [None]:
# Rows of `dat` for the site that was processed last in the loading loop
info

In [None]:
# Peek at the first rows of the working leak channels
working_df.head()

In [None]:
def get_extended_info(row, info, df):
    """Describe how the leak channel of ``row`` fared in the other browser.

    For every entry this tells:
      * whether the same leak channel works in the other browser,
      * whether it was tested in the other browser (only inclusion methods are
        checked; if not, tree pruning is responsible),
      * whether it was re-tested in the other browser (only inclusion methods
        are checked; if not, SameSite and co. could be responsible).

    Parameters
    ----------
    row : mapping-like (e.g. one row of a DataFrame)
        Must provide "test_id", "browser", "method", "inc_method" and "apg_url".
    info : pandas.DataFrame
        Its first row must hold, in the columns "dyn_conf_urls" and
        "dyn_conf_retest_urls", a mapping from browser name to a container of URLs.
    df : pandas.DataFrame
        Frame with "browser", "method", "inc_method" and "test_id" columns that
        is searched for the matching entries of the other browser.

    Returns
    -------
    tuple
        ``(n_matching_rows, tested, retested)``: the number of rows in ``df``
        for the other browser with the same method, inc_method and test_id,
        and two booleans telling whether the browser-independent URL is listed
        for the other browser in "dyn_conf_urls" / "dyn_conf_retest_urls".
    """
    browser = row["browser"]
    # Every browser other than chrome is compared against chrome.
    other_browser = "firefox" if browser == "chrome" else "chrome"
    # Keep only the part of the URL in front of the "&browser=<name>" parameter,
    # so it can be looked up in the URL lists of the other browser.
    apg_url = row["apg_url"].split(f"&browser={browser}")[0]

    same_channel = (
        (df["browser"] == other_browser)
        & (df["method"] == row["method"])
        & (df["inc_method"] == row["inc_method"])
        & (df["test_id"] == row["test_id"])
    )
    works_other_browser = len(df.loc[same_channel])

    tested_other_browser = apg_url in info["dyn_conf_urls"].values[0][other_browser]
    retested_other_browser = apg_url in info["dyn_conf_retest_urls"].values[0][other_browser]
    return works_other_browser, tested_other_browser, retested_other_browser
    
# Target columns for the three values returned by get_extended_info, in order
extended_cols = [
    "leak_channels_works_other_browser",
    "leak_channel_tested_other_browser",
    "leak_channel_retested_other_browser",
]
working_df[extended_cols] = working_df.apply(
    get_extended_info,
    axis=1,
    result_type="expand",
    info=info,
    df=working_df,
)

In [None]:
# Count how often each combination of the three other-browser columns occurs
working_df[["leak_channels_works_other_browser", "leak_channel_tested_other_browser", "leak_channel_retested_other_browser"]].value_counts()

In [None]:
# Tarafdari: none of the channels that worked in firefox were even tested in chrome
# Working channels are script-event_set and -window.onerror
# They do not work in chrome, partly due to CORB
# and partly due to different script handling
# Collect the original url_dat rows of all URLs that have a working leak channel
working_urls = working_df["url"].unique().tolist()
working_org_data = url_dat.loc[url_dat["req_url"].isin(working_urls)].sort_values(["req_url", "cookies"])
display(working_org_data.head())

# Check headers/tree results
# display(working_org_data["resp_headers"].values.tolist()[:4])
# Re-run the pruning and the tree prediction on the rows of the working URLs only
af, d, poss, results = basic_pruning(working_org_data)
display(af.sort_values(["URL", "cookies"]))
leaky_endpoints = predict_trees(af, log=True)
# Last expression of the cell: its return value is rendered as the cell output
reduce_leaky_endpoints(leaky_endpoints)

- Get info on whether the leak_channel was even tested/retested
    - for entries in working_df get if it also works in the other browser + if the leak channel was even tested/re-tested in the other browser
    - (for the cases where only one browser worked; i.e., how often did tree-pruning say the case does only work in one browser?)
    - Also get the raw responses from the crawler! to see if it makes sense
    - (Problem: we have no guarantee that the same response was returned that we observed with the crawler!)

In [None]:
# Pull the per-site URL collections out of the first row of `info` and pass each
# one to get_only_both (imported from helper_dyn).
# NOTE(review): only the return value of the last call is rendered as cell output;
# the first two calls show something only if get_only_both prints/displays itself - confirm.
dyn_conf_urls = info["dyn_conf_urls"].values[0]
get_only_both(dyn_conf_urls)
dyn_conf_retest_urls = info["dyn_conf_retest_urls"].values[0]
get_only_both(dyn_conf_retest_urls)
confirmed_urls = info["confirmed_leak_urls"].values[0]
get_only_both(confirmed_urls)

In [None]:
# Show the site info, the working channels of browser_id 1 and the cookie stats
# without truncating rows or columns.
# The fully qualified option names are required: the short patterns "max_rows" and
# "max_columns" also match the styler.render.* options on pandas >= 1.4, and an
# ambiguous pattern raises an OptionError.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(info)
    display(working_df.loc[working_df["browser_id"] == 1])
    display(get_cookie_stats(info[["cookies"]]))
    # display(working_df[["apg_url", "browser", "value_cookies", "value_no_cookies"]])

In [None]:
# All results of one browser, ordered by test
browser_id = 2
brow = df.loc[df["browser_id"] == browser_id]
display(brow.sort_values("test_id"))

# Value distribution of every id column per (browser, cookies, retest) group;
# the grouping does not depend on the column, so it is built once.
per_state = df.groupby(["browser_id", "cookies", "retest_num"])
for id_val in ("events_id", "global_properties_id", "object_properties_id", "window_properties_id"):
    id_counts = per_state[id_val].value_counts()
    display(id_counts.to_frame())

In [None]:
# Display differences in postMessages that are very similar
# get_distances(working_df)