# Results for Q2 vulns in the thesis

- (Potentially) vulnerable endpoints
- ...

In [None]:
# Load IPython's autoreload extension and enable mode 2, so edits to the imported
# helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import json
import requests
from requests.exceptions import SSLError, ConnectTimeout, ConnectionError
from publicsuffix2 import PublicSuffixList
from urllib.parse import urlparse

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from helper_dyn import (get_pipeline_overview, get_cookie_stats, get_pipeline_stats, show_only_first, get_leak_data, display_timing,
                        process_responses, display_response_summary, display_changed,
                        parse_method_url, get_query, info_grouping, row_sym, get_conf_dfs,
                        get_info_frames, get_only_both, parse_apg_url, url_list_to_tuples,
                        get_predictions_retroactive, save_div, get_basic_pruning_reduction, 
                        get_combs_after_basic_pruning, get_stats, get_acc, calc_info_frames)
from dil_preprocess import get_url_data, basic_pruning

# (Potentially) Vulnerable endpoints

In [None]:
# Fetch the pipeline overview; later cells filter this frame by its "site" and
# "tranco_rank" columns.
site_results = get_pipeline_overview()

In [None]:
# Display general stats on the pipeline.
# Only sites with a positive tranco_rank are kept; the "-unpruned" re-runs and the
# 172.17.0.1:44320 host (presumably a local test server — confirm) are excluded.
# Every dot of the IP address is escaped so that it is matched literally.
excluded_site_pattern = r"-unpruned|172\.17\.0\.1:44320"
is_ranked_site = (~site_results["site"].str.contains(excluded_site_pattern)) & (site_results["tranco_rank"] > 0)
dat, conf_both, conf_any = get_pipeline_stats(site_results.loc[is_ranked_site])


# For a couple of websites, we needed to retest them, but the retest did not work. Exclude them from the rest of the analysis
print("Remove ones that did not login correctly\n\n")
# Problem: 3 of the first URLs tested are reported to be vulnerable here (without a confirmed_df_dict)
# However, they are not vulnerable (in chrome) according to our newer definition of "same"
# NOTE(review): no explicit exclusion is visible in this cell; the second call just
# re-runs get_pipeline_stats on the frame returned by the first call.
dat, conf_both, conf_any = get_pipeline_stats(dat)

In [None]:
# Show how many URLs were tested, retested and confirmed,
# grouped by site, browser and method (and combinations thereof).
url_columns = ["dyn_conf_urls", "dyn_conf_retest_urls", "confirmed_leak_urls"]
groupings = [["site"], ["site", "browser"], ["method"], ["browser", "method"], ["site", "browser", "method"]]

for col in url_columns:
    print(col)
    # parse_method_url appends its parsed records to `acc` for every row of `dat`.
    acc = []
    dat.apply(parse_method_url, col=col, acc=acc, axis=1)
    dyn_conf = pd.DataFrame(acc)

    with pd.option_context('display.max_rows', 20):
        for grouping in groupings:
            print(grouping)
            dyn_browser = dyn_conf.groupby(grouping)[["url"]].count()
            # First the min/max/mean of the per-group counts, then the counts themselves.
            display(
                dyn_browser.agg(["min", "max", "mean"]),
                dyn_browser.sort_values("url", ascending=False),
            )

In [None]:
# Combine all dfs of the confirmed XS-Leaks into one dataframe.
# The input is conf_any, the third value returned by get_pipeline_stats above.
df_all = get_conf_dfs(conf_any) #dat.loc[dat["crawl_end"] != ""

In [None]:
# Summarise the confirmed vulns: once over the whole frame, once per (site, browser).
summary_funcs = ["nunique", "unique", "count"]
display(df_all.agg(summary_funcs))
display(df_all.groupby(["site", "browser"]).agg(summary_funcs))

In [None]:
# Show which sites are responsible for most results.
# The observed-value columns are excluded before counting distinct values per site.
df_all_summ = (
    df_all[df_all.columns.difference(["value_cookies", "value_no_cookies"])]
    .groupby("site")
    .nunique()
    .sort_values("url", ascending=False)
)
display(df_all_summ.agg(["count", "mean", "max", "sum"]))
# Use the fully-qualified option name: the bare "max_rows" pattern is ambiguous on
# pandas versions that also define styler.render.max_rows.
with pd.option_context("display.max_rows", 50):
    display(df_all_summ[["apg_url", "url", "browser", "inc_method", "method"]])

In [None]:
# Get all leak URL (apg URL) data.
# With this we can look only at the leak URLs tested in both browsers,
# or only at the ones tested in a single browser.
sites = dat["site"].tolist()
dyn_conf_url_lists = dat["dyn_conf_urls"].tolist()
leak_urls = url_list_to_tuples(dyn_conf_url_lists, sites)
# One plain tuple per row (index not included), collected into a set.
leak_url_set = set(leak_urls.itertuples(index=None, name=None))

In [None]:
# Count the tested leak URLs per method within each browser.
# Fully-qualified option name: the bare "max_rows" pattern is ambiguous on newer pandas.
with pd.option_context("display.max_rows", 1000):
    display(leak_urls.groupby(["browser"])["method"].value_counts().to_frame())

### Methods that do not occur

In [None]:
# All inclusion methods/leak methods
# On how many URLs/sites do they work? For chrome, firefox, and both!

# Only 18 leak methods are tested in dil_postprocess#L433, get_working_channel
# The others are excluded as they are either identical or should not work?!
# The three tested ones that do not work are duration, videoHeight and videoWidth
info_frame_0, _ = get_info_frames(df_all, None, conv_method=True)

# Show every row; fully-qualified option name because the bare "max_rows"
# pattern is ambiguous on newer pandas.
with pd.option_context("display.max_rows", None):
    display(info_frame_0)

## Summary/Info frames for several settings

In [None]:
# Load the pickled header frame; later cells look rows up by its "url" column and
# display response-header columns such as "code" and "content-type".
# NOTE(review): unpickling executes arbitrary code — only load a trusted local file.
header_frame = pd.read_pickle("data/header_frame")

In [None]:
# Get the info frames

# BGA (boardgamearena.com) dominates the stats for many types (not leakable in chrome as sameSite=None without secure)
# Remove it to see what remains
# df_all = df_all.loc[df_all["site"] != "boardgamearena.com"]

# Complete frame
info_frame, info_frame_new = get_info_frames(df_all, None)
# Prune all leak URLs only tested in one browser
info_frame_both, info_frame_new_both = get_info_frames(df_all, leak_url_set, leave=[2])
# Prune all leak URLs tested in both browsers
info_frame_only, info_frame_new_only = get_info_frames(df_all, leak_url_set, leave=[1])

In [None]:
# Show the complete info frame (up to 1000 rows).
# Fully-qualified option name: the bare "max_rows" pattern is ambiguous on newer pandas.
with pd.option_context("display.max_rows", 1000):
    display(info_frame_new)

In [None]:
# gp_securitypolicyviolation is raised for frame-ancestors of the embedded document.
# For XFO this leak channel does not exist, so one could distinguish whether a frame
# is blocked because of XFO or because of frame-ancestors.
is_csp_violation = df_all["method"] == "gp_securitypolicyviolation"
not_iframe_csp = df_all["inc_method"] != "iframe-csp"
df_all.loc[is_csp_violation & not_iframe_csp]

In [None]:
# All confirmed leaks that were observed through the op_el_naturalHeight method.
is_natural_height = df_all["method"] == "op_el_naturalHeight"
df_all.loc[is_natural_height]

In [None]:
# iframe, object, embed, embed-img
# (Firefox can (could, does not work anymore?) code 3XX and code 200 for most of these inclusion methods as it will fire load either once or twice)
# The actual reason for most of these cases is: csp frame-ancestors!, some might have other reasons though (e.g., bga does not set frame-ancestors?)
# Other reasons could be timeouts or invalid locations leading to infinite reload

# Firefox can distinguish 4XX and XFO from other codes or no XFO for object, embed and embed-img
# Chrome can (only) distinguish XFO on the object tag (bug?)

# Chrome can distinguish valid images from no valid images on embed-img as it is treated the same as img

# Chrome can count client-side redirects (refresh header, meta-refresh tag, javascript) on iframe, embed, and object
# Firefox can do it as well but needs higher wait_time timeouts

# Response properties summarised for the matching URLs.
header_cols = ["cookies", "site", "code", "content-type", "x-frame-options", "content-disposition", "cross-origin-resource-policy", "x-content-type-options", "cross-origin-opener-policy"]

iframe_event_list = df_all.loc[(df_all["method"] == "event_list") & (df_all["inc_method"] == "object")] # & (df_all["browser"] == "firefox")]
display(iframe_event_list)
print(iframe_event_list["url"].head(5).to_list())
responses = header_frame.loc[header_frame["url"].isin(iframe_event_list["url"])]
# Fully-qualified option name: the bare "max_rows" pattern is ambiguous on newer pandas.
with pd.option_context("display.max_rows", 101):
    display(responses[header_cols].value_counts().to_frame())

In [None]:
# Frame count on window.open!
# Chrome not possible if ... (the original note was left unfinished)

# Response properties summarised for the matching URLs.
header_cols = ["cookies", "site", "code", "content-type", "x-frame-options", "content-disposition", "cross-origin-resource-policy", "x-content-type-options", "cross-origin-opener-policy"]

win_frame = df_all.loc[(df_all["method"] == "op_frame_count") & (df_all["inc_method"] == "window.open")]# & (df_all["browser"] == "chrome")]
# Keep only the rows where at least one of the two states reported "Not possible".
win_frame = win_frame.loc[(win_frame["value_cookies"] == "Not possible") | (win_frame["value_no_cookies"] == "Not possible")]
print(len(win_frame))
display(win_frame)
#win_frame["value_cookies"] = win_frame["value_cookies"].apply(tuple)
#win_frame["value_no_cookies"] = win_frame["value_no_cookies"].apply(tuple)
print(win_frame["url"].head(5).to_list())
responses = header_frame.loc[header_frame["url"].isin(win_frame["url"])]
# Fully-qualified option name: the bare "max_rows" pattern is ambiguous on newer pandas.
with pd.option_context("display.max_rows", 200):
    #display(win_frame.pivot_table(index=["value_cookies", "value_no_cookies"], aggfunc="count"))

    display(responses[header_cols].value_counts().to_frame())

## Main vuln tables including export to latex!

In [None]:
def _drop_helper_columns(frame):
    """Return `frame` without the columns whose name contains "base", "leak" or "channel"."""
    return frame.filter(regex="^(?!.*base)", axis=1).filter(regex="^(?!.*leak)", axis=1).filter(regex="^(?!.*channel)", axis=1)


print(info_frame_new["type"].value_counts())
info_type = info_frame_new["type"]
# One table per group of row types; each is later exported to LaTeX.
vuln_overview = _drop_helper_columns(info_frame_new.loc[info_type.isin(["browsers", "inc_methods", "leak_methods"])])
vuln_channels = _drop_helper_columns(info_frame_new.loc[info_type.isin(["leak_channels"])])
vuln_sites = _drop_helper_columns(info_frame_new.loc[info_type.isin(["sites"])])

In [None]:
# Export the three vuln tables, plus a describe() summary of each, to LaTeX.
# Column-name substitutions applied for the thesis tables (all plain, literal text).
column_renames = [("confirmed ", ""), ("sites", "Sites"), ("firefox", "Firefox"), ("chrome", "Chrome")]
# Groups that are left out of the exported tables.
hidden_groups = ["event-set", "load-count", "CSS2", "op-win-window", "naturalHeight"]

for title, vuln in [("vuln_overview", vuln_overview), ("vuln_sites", vuln_sites), ("vuln_channels", vuln_channels)]:
    # Fully-qualified option name: the bare "max_rows" pattern is ambiguous on newer pandas.
    with pd.option_context("display.max_rows", 50):
        # NOTE(review): errors="ignore" is deprecated in recent pandas releases;
        # kept because it leaves non-numeric columns untouched.
        vuln = vuln.apply(pd.to_numeric, errors="ignore")
        # LaTeX-friendly text: underscores become hyphens in every string cell.
        vuln = vuln.replace(r"_", "-", regex=True)
        vuln["subtype"] = vuln["subtype"].apply(str).replace(r"_", "-", regex=True)
        vuln = vuln.loc[:, vuln.columns != "type"]
        vuln = vuln.rename(columns={"subtype": "Group"})
        vuln = vuln.replace({"browsers": "all"})
        for old, new in column_renames:
            # regex=False: literal substitutions, independent of the pandas default.
            vuln.columns = vuln.columns.str.replace(old, new, regex=False)
        # This single pass also covers the load-count/event-set filter that was
        # previously applied a second time for vuln_channels only.
        for hidden in hidden_groups:
            vuln = vuln.loc[~vuln["Group"].str.contains(hidden, regex=False)]

        vuln = vuln.loc[:, vuln.columns != "Sites any browser"]

        with open(f"tables/{title}.tex", "w") as f:
            display(vuln)
            # Column headers are rotated by 90 degrees to keep the table narrow.
            f.write(vuln.to_latex(index=False, escape=False, header=['\\rotatebox{90}{' + c + '}' for c in vuln.columns]))

        # Summary statistics over the non-"Sites" columns.
        desc = vuln.filter(regex="^(?!.*Sites)", axis=1).describe().round(2)
        # Escape "%" for LaTeX; raw strings avoid the invalid "\%" escape sequence.
        desc.index = desc.index.str.replace("%", r"\%", regex=False)
        desc = desc.loc[desc.index.isin(["mean", "std", "min", r"50\%", "max"])]
        with open(f"tables/{title}_sum.tex", "w") as f:
            display(desc)
            #f.write(desc.to_latex(escape=False ,header=['\\rotatebox{90}{' + c + '}' for c in desc.columns]))
            f.write(desc.to_latex(escape=False))

### Additional investigations

In [None]:
# Get responses where gp-window-onerror works!
onerror = df_all.loc[df_all["method"] == "gp_window_onerror"]
# onerror = onerror.loc[onerror["browser"] == "chrome"]
onerror_responses = header_frame.loc[header_frame["url"].isin(onerror["url"])]
# If statuscode is 400 or 401, no error will be thrown
# If nosniff is set (on html) no error will be thrown
# In chrome if the body is html (and some other), no error will be thrown due to CORB
# Chrome defaults to Lax
# The boardgame arena thing is strange?
# It now throws errors for both states (responses are still the same; sometimes the onerror handler seems to fail, but then not only one state should have observed errors and the other no errors, but a mix?)
# Investigate in the old firefox version used?! or retest bga?, maybe FP (e.g., because the requests timed out?)
# (Possible reason, it had nosniff like pandadoc.com?, but unlikely)

# Response properties summarised for the matching URLs.
header_cols = ["cookies", "site", "code", "content-type", "x-frame-options", "content-disposition", "cross-origin-resource-policy", "x-content-type-options", "cross-origin-opener-policy"]
# Fully-qualified option name: the bare "max_rows" pattern is ambiguous on newer pandas.
with pd.option_context("display.max_rows", 101):
    display(onerror_responses[header_cols].value_counts().to_frame())

In [None]:
# gp_window_onerror rows for boardgamearena.com (the odd case discussed above).
is_bga = onerror["site"] == "boardgamearena.com"
onerror.loc[is_bga]

In [None]:
# Number of iframe/event_list rows per site.
is_iframe_event_list = (df_all["inc_method"] == "iframe") & (df_all["method"] == "event_list")
df_all.loc[is_iframe_event_list].groupby("site").size()

In [None]:
# Number of iframe-csp/op_win_origin rows per site.
is_iframe_csp_origin = (df_all["inc_method"] == "iframe-csp") & (df_all["method"] == "op_win_origin")
df_all.loc[is_iframe_csp_origin].groupby("site").size()

In [None]:
# All rows found through the link-prefetch inclusion method.
is_link_prefetch = df_all["inc_method"] == "link-prefetch"
df_all.loc[is_link_prefetch]

In [None]:
# Get responses where link-stylesheet works!
style = df_all.loc[df_all["inc_method"] == "link-stylesheet"]
style_responses = header_frame.loc[header_frame["url"].isin(style["url"])]

# Chrome accepts almost all content-types, so it can be used to distinguish code only
# Firefox only accepts text/css (and empty)

# Response properties summarised for the matching URLs ("site" is not included here).
header_cols = ["cookies", "code", "content-type", "x-frame-options", "content-disposition", "cross-origin-resource-policy", "x-content-type-options", "cross-origin-opener-policy"]
# Fully-qualified option name: the bare "max_rows" pattern is ambiguous on newer pandas.
with pd.option_context("display.max_rows", 101):
    display(style_responses[header_cols].value_counts().to_frame())

In [None]:
# Get responses where op_win_CSS2Properties works for iframe
iframe_win = df_all.loc[(df_all["inc_method"] == "iframe-csp") & (df_all["method"] == "op_win_CSS2Properties")]
iframe_responses = header_frame.loc[header_frame["url"].isin(iframe_win["url"])]

# If a frame is blocked due to CSP, firefox can directly access the frame as it falls back to an empty frame of the origin of the current page
# In chrome one cannot access the frame as it is a special "blocked" frame with the original origin

# einpresswire.com probably has the wrong data in the crawl data (either in the dyn confirmation only one redirected, or one redirect did not cause a CSP block)

# Response properties summarised for the matching URLs.
header_cols = ["cookies", "site", "code", "content-type", "x-frame-options", "content-disposition", "cross-origin-resource-policy", "x-content-type-options", "cross-origin-opener-policy"]
# Fully-qualified option name: the bare "max_rows" pattern is ambiguous on newer pandas.
with pd.option_context("display.max_rows", 101):
    display(iframe_responses[header_cols].value_counts().to_frame())

In [None]:
# Short preview of the iframe-csp / op_win_CSS2Properties rows.
# Fully-qualified option name: the bare "max_rows" pattern is ambiguous on newer pandas.
with pd.option_context("display.max_rows", 10):
    display(iframe_win)

## Explain differences chrome/firefox

- Remove explained by sameSite
    - estimate one: 60
    - estimate two: 54
    - (estimate three: 121)
- Remove explained by tree pruning (corb, different parsing and co)
    - info_frame_only and both
    - in addition and without removing sameSite
- Remove all URLs also found by another inc channel/leak method, i.e., keep only the URLs that were found by exactly one inc channel

In [None]:
# Get all sites that have >=1 non window.open case in firefox and 0 non window.open cases in chrome
# From the table with only URLs tested by both
# These should be sites with sameSite rules preventing chrome from being leaky (not 100%, but almost?)
# Remove them from the above table, to investigate other cases that are harder to explain

# regex=False: "window.open" is meant literally, the dot must not act as a wildcard.
is_non_window_inc_site = (info_frame_new["type"] == "inc_sites") & (~info_frame_new["subtype"].apply(str).str.contains("window.open", regex=False))
# .copy() so that the column assignments below modify an independent frame instead
# of a slice of info_frame_new (avoids SettingWithCopyWarning).
inf = info_frame_new.loc[is_non_window_inc_site, ["subtype", "confirmed URLs firefox", "confirmed URLs chrome"]].copy()
# Each subtype is unpacked as a (site, inc_method) pair.
inf["site"], inf["inc_method"] = zip(*inf["subtype"])
inf["confirmed URLs firefox"] = inf["confirmed URLs firefox"].apply(int)
inf["confirmed URLs chrome"] = inf["confirmed URLs chrome"].apply(int)

# Confirmed URL counts per site, summed over all remaining inclusion methods.
inf_piv = inf.pivot_table(index="site", values=["confirmed URLs firefox", "confirmed URLs chrome"], aggfunc="sum")
display(inf_piv)
# Estimate 1
samesite_chrome_sites = inf_piv.loc[(inf_piv["confirmed URLs chrome"] <= 0) & (inf_piv["confirmed URLs firefox"] >= 1)].reset_index()["site"].tolist()
# Estimate 2
# samesite_chrome_sites = inf_piv.loc[(inf_piv["confirmed URLs chrome"] <= 1) & (inf_piv["confirmed URLs firefox"] >= 2)].reset_index()["site"].tolist()

print(f"{len(samesite_chrome_sites)} sites do not leak for any URL requiring sameSite=None,secure")

In [None]:
# Same site selection as for the general pipeline stats: positive tranco_rank, no
# "-unpruned" re-runs, no 172.17.0.1:44320 host. Every dot of the IP address is
# escaped so that it is matched literally.
excluded_site_pattern = r"-unpruned|172\.17\.0\.1:44320"
filtered = site_results.loc[(~site_results["site"].str.contains(excluded_site_pattern)) & (site_results["tranco_rank"] > 0)]
# Sites excluded by name (going by the variable name, the second cookie-hunter run failed for them — confirm).
cookie_hunter_second_failed = ['allevents.in', 'whowhatwear.com', 'creative-tim.com', 'extendoffice.com', 'lepoint.fr', 'hallmark.com', 'flourish.studio', 'dramacool.fm', 'pdfdrive.com', 'jmty.jp', 'readymag.com', 'gridoto.com', 'grubhub.com', 'asana.com', 'familyeducation.com', 'entireweb.com', 'christianpost.com', 'cutt.us', 'tiexue.net', 'lejdd.fr', 'brisbanetimes.com.au']
filtered = filtered.loc[~filtered["site"].isin(cookie_hunter_second_failed)]

# Additional filters
# Remove all URLs that work in more than one inclusion method
(_, inf_new_rem), (_, inf_new_both_rem), (_, inf_new_only_rem) = calc_info_frames(filtered, remove_multiple="method")  # Only look at URLs that only worked for a single inc_method (or method), ...
# (_, inf_new_rem), (_, inf_new_both_rem), (_, inf_new_only_rem) = calc_info_frames(filtered, remove_multiple="inc_method")  # Only look at URLs that only worked for a single inc_method (or method), ...


# Remove sameSite non working, ...
filtered = filtered.loc[(~filtered["site"].isin(samesite_chrome_sites))]
(_, inf_new), (_, inf_new_both), (_, inf_new_only) = calc_info_frames(filtered)


In [None]:
# Remove leak URLs not tested
# Fully-qualified option names: the bare "max_rows"/"max_columns" patterns are
# ambiguous on newer pandas. One context manager can set both options.
with pd.option_context("display.max_rows", None, "display.max_columns", 25):
    for typ in ["browsers", "inc_methods", "leak_methods", "leak_channels", "inc_sites"]:
    # When all incs except window.open do not work for chrome, this site can be explained by sameSite
    #for typ in ["browsers", "inc_sites"]:
        for df, info in [(info_frame_new, "all"), (info_frame_new_only, "leak URLs not tested in other browser"), (info_frame_new_both, "leak URLs tested in other browser")]:
            print(info)
            # Columns whose name contains "base" are hidden.
            display(df.loc[df["type"] == typ].filter(regex="^(?!.*base)", axis=1))

In [None]:
# Remove leak URLs not tested + remove sameSite estimated pages
# Fully-qualified option names: the bare "max_rows"/"max_columns" patterns are
# ambiguous on newer pandas. One context manager can set both options.
with pd.option_context("display.max_rows", None, "display.max_columns", 25):
    #for typ in ["browsers", "inc_methods", "leak_methods", "leak_channels", "inc_sites"]:
    for typ in ["browsers", "inc_methods", "leak_channels"]:
        for df, info in [(inf_new, "all"), (inf_new_only, "leak URLs not tested in other browser"), (inf_new_both, "leak URLs tested in other browser")]:
            print(info)
            # Columns whose name contains "base" are hidden.
            display(df.loc[df["type"] == typ].filter(regex="^(?!.*base)", axis=1))

In [None]:
# Leak channels that remain after removing the sites estimated to be insecure only
# because of samesite=not set or samesite=None&secure=False.
# Table might be suited for the thesis!
is_leak_channel = inf_new["type"] == "leak_channels"
leak_channels_wsamesite = inf_new.loc[is_leak_channel].filter(regex="^(?!.*base)", axis=1)
display(leak_channels_wsamesite)
with open("tables/leak_channels_wsamesite", "w") as tex_file:
    tex_file.write(leak_channels_wsamesite.to_latex(index=False))

In [None]:
# Display the (leak channels or leak methods) that are the only working "method" for that URL in that browser
# Compare found URLs (any) vs found URLs (only found with one inc_method)
df = inf_new_rem.merge(info_frame_new, on=["type", "subtype"], suffixes=("_only", "_all"), how="outer").rename(columns={"type": "atype", "subtype": "bsubtype"})
# Sort the columns alphabetically and keep only the leak_channels rows.
df = df.reindex(sorted(df.columns), axis=1).loc[df["atype"] == "leak_channels"]
df.loc["Total"] = df.sum()
# Fully-qualified option name: the bare "max_rows" pattern is ambiguous on newer pandas.
with pd.option_context("display.max_rows", 100):
    display(df[["bsubtype", "confirmed URLs chrome_all", "confirmed URLs chrome_only", "confirmed URLs firefox_all", "confirmed URLs firefox_only"]])

## Investigate unpruned data

- Differences cannot be due to errors in our trees, as everything is tested
- (we only selected websites vulnerable in both browsers: selection bias towards sites having sameSite none,secure?)

In [None]:
# Look at the unpruned data: there the results cannot be caused by us not testing a leak channel!
retro_names = ["pier1.com", "chartink.com", "twitcasting.tv", "pdffiller.com", "staples.ca", "tool.lu", "freelogodesign.org", "duplichecker.com", "miro.com", "mnml.la", "office.com", "pbslearningmedia.org", "redtube.com", "whatfontis.com", "glosbe.com", "wideads.com", "standardmedia.co.ke", "gyazo.com", "playground.xyz", "megogo.net", "zennioptical.com", "truecar.com", "powtoon.com", "italki.com", "themehorse.com", "amazon.in", "versobooks.com", "coursera.org", "yourstory.com", "korrespondent.net", "transifex.com", "ankiweb.net", "imgflip.com", "moneyweb.co.za", "wordpress.com", "iplocation.net", "porch.com", "youporn.com", "tmj4.com", "nimbusweb.me", "classifiedads.com", "myvidster.com", "cafepress.com", "viewsonic.com", "pakwheels.com", "idntimes.com", "mhthemes.com", "newswise.com", "universe.com", "aboutus.com"]
failed = ["twitcasting.tv", "tool.lu", "office.com", "pbslearningmedia.org", "playground.xyz", "truecar.com", "amazon.in", "coursera.org", "imgflip.com", "moneyweb.co.za", "wordpress.com", "porch.com", "viewsonic.com", "newswise.com"]
# Names of the unpruned runs: every retro site that is not listed in `failed`,
# with the "-unpruned" suffix appended.
unpruned_sites = [f"{site}-unpruned" for site in retro_names if site not in failed]

is_unpruned_run = site_results["site"].isin(unpruned_sites)
filtered = site_results.loc[is_unpruned_run]
(_, inf_new), (_, inf_new_both), (_, inf_new_only) = calc_info_frames(filtered)
# The "only" frame is expected to be empty for the unpruned runs.
assert len(inf_new_only) == 0

In [None]:
# Show the unpruned results per row type.
# Fully-qualified option names: the bare "max_rows"/"max_columns" patterns are
# ambiguous on newer pandas. One context manager can set both options.
with pd.option_context("display.max_rows", None, "display.max_columns", 25):
    #for typ in ["browsers", "inc_methods", "leak_methods", "leak_channels", "inc_sites"]:
    for typ in ["browsers", "leak_channels", "inc_sites"]:
        for df, info in [(inf_new, "all")]:
            print(info)
            # Columns whose name contains "base" are hidden.
            display(df.loc[df["type"] == typ].filter(regex="^(?!.*base)", axis=1))

## FP stuff

- Try to find FPs
    - complete sites that failed
    - general unstable methods (that need more than 2 tests and/or stricter criteria to count as a "same" observation in both tests)
- ...

In [None]:
# Show methods with worst and best distance
# Problem: Jaro similarity only makes sense for postMessages/strings
# Fully-qualified option name: the bare "max_rows" pattern is ambiguous on newer pandas.
with pd.option_context("display.max_rows", 100):
    for group_key in ["site"]:
        for method in ["gp_window_postMessage", "op_frame_count", "gp_securitypolicyviolation", "op_el_media_error", "op_win_history_length"]:
            # Filter and sort once per method instead of repeating it for every display.
            method_rows = df_all.loc[df_all["method"] == method]
            by_jaro = method_rows.sort_values("jaro", ascending=False)
            display(by_jaro.head(2))
            display(by_jaro.tail(2))
            # Mean jaro value per group (plus the "All" margin), sorted ascending.
            piv = method_rows.pivot_table(index=group_key, values="jaro", margins=True).sort_values("jaro")
            # Extra row holding the mean over all rows of the pivot table.
            piv.loc["zMean"] = piv.mean()
            print(method)
            display(piv)

In [None]:
# display(df_all[["value_cookies", "value_no_cookies"]].value_counts().to_frame())
# display(df_all["value_cookies"].value_counts().to_frame())
# display(df_all["value_no_cookies"].value_counts().to_frame())

In [None]:
# Main FP analysis!!
# It is strange if a site has one value sometimes for cookies and sometimes for no cookies for the same method?

# For every inc_channel-site, count the number of unique occurrences of cookie values/no-cookie values
# Prune "almost duplicate" channels
df_pruned = df_all.loc[~df_all["method"].isin(["event_set", "load_count", "op_win_CSS2Properties", "op_win_window", "op_el_naturalHeight"])]
groups = df_pruned.groupby(["inc_method", "method", "site", "browser"])
# One output row per (inc_method, method, site, browser, value pair) with its number of occurrences.
res = pd.DataFrame(columns=["inc_method", "method", "site", "browser", "value_cookies", "value_no_cookies", "count"])
for (inc_method, method, site, browser), group in groups:
    try:
        # NOTE(review): row[0] relies on the count column being labelled 0 after
        # reset_index(); newer pandas releases may label it "count" — confirm
        # against the pandas version in use.
        for i, row in group[["value_cookies", "value_no_cookies"]].apply(tuple).value_counts().to_frame().reset_index().iterrows():
            res.loc[len(res)] = [inc_method, method, site, browser, row["value_cookies"], row["value_no_cookies"], int(row[0])]
    except TypeError:
        # Groups that raise a TypeError (presumably unhashable values — confirm) are
        # only printed and contribute no rows to `res`.
        print(inc_method, method, site)

In [None]:
# Main FP analysis!!
# It is strange if a site has one value sometimes for cookies and sometimes for no cookies for the same method?

# Display all observed properties for all sites
# If a value pair occurs in both permutations, this is strange and it could be a FP due to server-side randomness
# Fully-qualified option name, consistent with the other display options and not
# dependent on pandas' pattern matching of abbreviated option names.
with pd.option_context("display.min_rows", 10):
    display(res.loc[res["count"] > 1].set_index(["inc_method", "method", "site", "browser"]))

# Get all permutations
# If a pair occurs roughly the same in both states, it could be unstable in general
# If a pair occurs often in one state and seldom in the other, it could either be an exception URL or the test failed
res["vals"] = res[["value_cookies", "value_no_cookies"]].values.tolist()
# Sorting makes (a, b) and (b, a) compare equal, so both directions of a pair collide.
res["vals"] = res["vals"].apply(lambda x: sorted(x)).apply(tuple)
# keep=False marks every member of a duplicated group, i.e. both directions of the pair.
pot_fp = res[res.duplicated(subset=["inc_method", "method", "site", "vals", "browser"], keep=False)].set_index(["inc_method", "method", "site", "browser"])

In [None]:
# Value pairs recorded for dn.se.
is_dn_se = res["site"] == "dn.se"
res.loc[is_dn_se]

In [None]:
print(f"Len all value pairs observed (by inc-channel/site) {len(res)}, len all value pairs that occur in both directions {len(pot_fp)}")

# Get stats for all pairs of value cookies/no cookies
# Count is the number of unique pairs per site
# Sum is the number of affected URLs
# On the complete data (org), and on the potential fp data (pot_fp)
# The potential FP data only has pairs where the pair combination occurs in both directions
res["count"] = res["count"].apply(int)
pot_fp["count"] = pot_fp["count"].apply(int)
piv_org = res.pivot_table(index=["inc_method", "method", "browser"], values="count", aggfunc=["count", "sum"])
piv_pot_fp = pot_fp.reset_index().pivot_table(index=["inc_method", "method", "browser"], values="count", aggfunc=["count", "sum"])
# Fully-qualified option name: the bare "max_rows" pattern is ambiguous on newer pandas.
with pd.option_context("display.max_rows", None):
    method_fp_overview = piv_org.merge(piv_pot_fp, on=["inc_method", "method", "browser"], how="outer", suffixes=("_org", "_pot_fp")).droplevel(1, axis=1)
    # Combinations without potential FPs have no match in the merge: count them as 0.
    method_fp_overview = method_fp_overview.fillna(0).astype(int)
    # Move the browser into the column index to fix the column order per browser ...
    method_fp_overview = method_fp_overview.unstack().swaplevel(axis=1)
    method_fp_overview = method_fp_overview[[("chrome", "count_org"), ("chrome", "sum_org"), ("chrome", "count_pot_fp"), ("chrome", "sum_pot_fp"),
                                            ("firefox", "count_org"), ("firefox", "sum_org"), ("firefox", "count_pot_fp"), ("firefox", "sum_pot_fp")]]
    method_fp_overview = method_fp_overview.rename(columns={"count_org": "Pairs", "sum_org": "URLs", "count_pot_fp": "FP pairs", "sum_pot_fp": "FP URLs", "chrome": "Chrome", "firefox": "Firefox"})
    # ... then move it back into the row index for the final table.
    method_fp_overview = method_fp_overview.stack(0).astype(int)
    method_fp_overview = method_fp_overview[["Pairs", "URLs", "FP pairs", "FP URLs"]]
    display(method_fp_overview)
    with open("tables/method_fp_overview_.tex", "w") as f:
        f.write(method_fp_overview.to_latex())

In [None]:
# Potentially FPs!
# Fully-qualified option name: the bare "max_rows" pattern is ambiguous on newer pandas.
with pd.option_context("display.max_rows", None):
    print(len(pot_fp))
    display(pot_fp.sort_values(["site", "inc_method", "method"]))

In [None]:
# op_frame_count rows for reddit.com.
is_reddit_frame_count = (df_all["method"] == "op_frame_count") & (df_all["site"] == "reddit.com")
df_all.loc[is_reddit_frame_count]

## URL defense stuff

In [None]:
# Look at URLs: show the combined frame of confirmed leaks (pandas truncates the
# rendered output according to the display options).
df_all

In [None]:
# Histogram of url_query_len: long queries might carry session ids or similar and
# might be unexploitable, short query strings are probably exploitable.
print(df_all["url_query_len"].value_counts())
query_len_bins = [0,1,10,20,30,40,50,100,500,1000,2000]
df_all[["url_query_len"]].plot.hist(bins=query_len_bins)

In [None]:
# Get path length
# Split "scheme://rest" once, then "host/path" once. `n` is passed by keyword
# because newer pandas releases only accept it as a keyword argument.
df_all[["https", "url_base2"]] = df_all["url_base"].str.split("://", n=1, expand=True)
df_all[["url_site", "url_path"]] = df_all["url_base2"].str.split("/", n=1, expand=True)
# Number of characters after the first "/" of the host/path part.
df_all["url_path_len"] = df_all["url_path"].str.len()

In [None]:
# URL len: histogram of path and query lengths in buckets of 10 characters.
print(df_all["url_path_len"].value_counts())
#df_all[["url_path_len", "url_query_len"]].plot(kind="hist", bins=[0,10,20,30,40,50,100,2000], subplots=True, xticks=[0,10,20,30,40,50,100])
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
fig, ax = plt.subplots()
# Lengths are clipped to the bin range, so everything >= 90 lands in the last
# bucket, which is therefore labelled "90+".
_, bins, patches = ax.hist([np.clip(df_all["url_path_len"], bins[0], bins[-1]), np.clip(df_all["url_query_len"], bins[0], bins[-1])], density=False, bins=bins, label=["Path", "Query"])
# The labels are spelled out directly; the labels previously derived from the bin
# edges were overwritten immediately and never used.
xlabels = ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90+"]

N_labels = len(xlabels)
ax.set_xlim([0, 100])

# One tick in the middle of every bucket.
ax.set_xticks(10 * np.arange(N_labels) + 5)
ax.set_xticklabels(xlabels)

#plt.yticks([])
ax.set_title('')
plt.setp(patches, linewidth=0)
ax.legend(loc='upper right')
ax.set_xlabel("Length in characters")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig("tables/url_len_hist.pdf")

In [None]:
# Box plot of path and query lengths (outliers hidden), saved as PDF.
url_lengths = df_all[["url_path_len", "url_query_len"]]
ax = url_lengths.plot.box(showfliers=False, figsize=(6,4))
ax.set_xlabel("Group")
ax.set_ylabel("Characters")
box_fig = ax.figure
box_fig.tight_layout()
box_fig.savefig("tables/url_len_box.pdf")
ax

In [None]:
# Bucket both lengths into the same left-closed intervals [a, b).
len_bin_edges = [0,10,20,30,40,50,100,500,2000]
for len_col in ["url_path_len", "url_query_len"]:
    df_all[f"{len_col}_pruned"] = pd.cut(df_all[len_col], len_bin_edges, right=False)

In [None]:
# Number of URLs per (path-length bucket, query-length bucket), largest first;
# empty combinations are dropped.
bucket_counts = df_all.pivot_table(index=["url_path_len_pruned", "url_query_len_pruned"], values="url", aggfunc="count")
count = bucket_counts.sort_values("url", ascending=False)
count = count.loc[count["url"] != 0]
count

In [None]:
# Summary statistics of both lengths, then the five URLs with the shortest and the
# five with the longest query strings.
display(df_all[["url_path_len", "url_query_len"]].describe())
urls_by_query_len = df_all.sort_values("url_query_len")["url"]
display(urls_by_query_len.head(5).to_list())
display(urls_by_query_len.tail(5).to_list())