# Results for Q2 responses in the thesis

- Response statistics (headers, status codes, etc.)
- Response pairs and SD-URLs
- ...

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import json
import requests
from requests.exceptions import SSLError, ConnectTimeout, ConnectionError
from publicsuffix2 import PublicSuffixList
from urllib.parse import urlparse

import pandas as pd
import numpy as np

from helper_dyn import (get_pipeline_overview, get_cookie_stats, get_pipeline_stats, show_only_first, get_leak_data, display_timing,
                        process_responses, display_response_summary, display_changed,
                        parse_method_url, get_query, info_grouping, row_sym, get_conf_dfs,
                        get_info_frames, get_only_both, parse_apg_url, url_list_to_tuples,
                        get_predictions_retroactive, save_div, get_basic_pruning_reduction, 
                        get_combs_after_basic_pruning, get_stats, get_acc)
from dil_preprocess import get_url_data, basic_pruning

## Security headers and responses

In [None]:
# Load the URL data; None is passed instead of a site name
# (NOTE(review): presumably this selects the data of all sites — confirm in dil_preprocess).
df = get_url_data(None)
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()

In [None]:
# General stats on the crawled data
# Remove all retest/evaluation data. The .copy() makes the filtered frame independent,
# so the column assignment below does not write into a slice of the unfiltered frame.
is_crawl_row = (~df["site"].str.endswith("unpruned")) & (df["site"] != "172.17.0.1:44320")
df = df.loc[is_crawl_row].copy()
df["req_method"] = df["req_method"].apply(str)
c_group = df.groupby(["cookies"])

# Requests per HTTP method, one row per cookie state.
# The table is built before the file is opened, so a failure here cannot leave an empty .tex file behind.
all_reqs = c_group["req_method"].value_counts().unstack().sort_index(ascending=False)
all_reqs = all_reqs[["GET", "POST", "OPTIONS", "CONNECT", "HEAD", "PUT", "PATCH", "GET_HISTORY"]]
display(all_reqs)
with open("tables/all_requests.tex", "w") as f:
    f.write(all_reqs.to_latex())

# Number of distinct sites per cookie state
display(c_group["site"].nunique())
display(c_group["real_site"].nunique())

# One array of distinct sites per cookie state. The first group is used as "no cookies" and the
# second as "cookies"; .iloc makes this positional access explicit instead of relying on integer
# keys being treated as positions on a non-integer index.
sites_per_state = c_group["site"].unique()
nc_sites = pd.DataFrame(sites_per_state.iloc[0])
c_sites = pd.DataFrame(sites_per_state.iloc[1])
only = get_only_both({"c": c_sites, "nc": nc_sites}, ("c", "nc"), log=True)
display(only)

In [None]:
# We are only interested in first-party GET requests, so restrict the data to that subset.
# Both site columns are cast to str before they are compared
# (NOTE(review): presumably their stored dtype cannot be compared directly — confirm).
df["site"] = df["site"].astype("str")
df["real_site"] = df["real_site"].astype("str")
first_party = df.loc[(df["site"] == df["real_site"]) & (df["req_method"] == "GET")]


# Only keep the URLs where we have one entry with cookies and one without cookies.
# Per (req_url, site, real_site): "nunique" = distinct cookie states seen, "count" = number of rows.
count_urls = first_party.drop(columns=["resp_body_tika_info", "resp_body_info"]).groupby(["req_url", "site", "real_site"])["cookies"].agg(["nunique", "count"])

# If count == 1, only the request with cookies succeeded; the other failed for some reason (e.g., the mitm proxy crashed, the celery saving task crashed (contains a null byte?), ...)
# If count > 2, we observed different response codes for the same URL in the same state (e.g., we got rate limited in the meantime (429), the resource moved (302), or it is not modified (304), ...)
# We exclude all with count == 1 (they have errors)
# We exclude all with count > 2 (they are unstable within one state)
display(count_urls.value_counts())
count_index = count_urls[(count_urls["nunique"] == 2) & (count_urls["count"] == 2)].index
# From here on first_party is indexed by (req_url, site, real_site); later cells rely on that index.
first_party = first_party.set_index(["req_url", "site", "real_site"])
response_pairs = first_party.loc[count_index].reset_index()

In [None]:
# Frame sizes at each stage: all data, first-party GET only, and complete response pairs
for frame in (df, first_party, response_pairs):
    print(frame.shape)

# Investigate the exceptions: URLs observed in a single cookie state only (nunique == 1)
# that nevertheless have two rows (count == 2).
single_state = count_urls["nunique"] == 1
two_rows = count_urls["count"] == 2
count_index = count_urls[single_state & two_rows].index
first_party.loc[count_index]

### Stats

In [None]:
# Distinct URLs and sites among the considered response pairs, split by cookie state
key_columns = ["req_url", "site", "real_site"]
display(response_pairs.groupby("cookies")[key_columns].nunique())

# Run process_responses on every response row; the returned values are expanded into columns
header_frame = response_pairs.apply(process_responses, axis=1, result_type="expand")
display(header_frame)

In [None]:
# Which headers exist in our data: mapping from get_acc(), ordered by descending value
header_counts = get_acc()
acc = {
    header: count
    for header, count in sorted(header_counts.items(), key=lambda entry: entry[1], reverse=True)
}

In [None]:
# The 15 most common headers (acc is already ordered by descending count)
most_common = (
    pd.DataFrame.from_dict(acc, orient="index")
    .reset_index()
    .rename(columns={0: "Count", "index": "Header"})
    .head(15)
)
with open("tables/most_common.tex", "w") as f:
    display(most_common)
    f.write(most_common.to_latex(index=False))

In [None]:
# Number of distinct values per property, one column per cookie state (hash columns excluded)
hash_columns = ["hash_all", "hash_min_pruned", "hash_basic_pruned"]
resp_over = (
    header_frame.drop(hash_columns, axis=1)
    .groupby("cookies")
    .nunique()
    .T
    .reset_index()
    .rename(columns={"index": "Property", False: "No cookies", True: "Cookies"})
)
with open("tables/response_overview.tex", "w") as f:
    resp_over = resp_over[["Property", "Cookies", "No cookies"]]
    # real_site is dropped from the overview; "code" is labelled as "status-code"
    resp_over = resp_over.loc[resp_over["Property"] != "real_site"]
    resp_over = resp_over.replace({"code": "status-code"})
    display(resp_over)
    f.write(resp_over.to_latex(index=False))

In [None]:
# Summary of all headers and related properties, for requests both with and without cookies.
# The result is used as a mapping from property name to a summary table in the export loop further down.
table_dict = display_response_summary(header_frame)

In [None]:
# Warning: this will crash the browser window or be very slow!
# Same summary as above, but restricted to the "body" property via check=["body"].
res = display_response_summary(header_frame, check=["body"])

In [None]:
# Add the separately computed body summary to the other summary tables
table_dict["body"] = res["body"]

In [None]:
def get_sites(row, df, prop):
    """Count the distinct sites in `df` whose `prop` column equals the first entry of `row`.

    Args:
        row: Series whose first positional entry is the value to look up.
        df: Frame with a "site" column and a column named `prop`.
        prop: Name of the column in `df` that is compared against the value.

    Returns:
        Number of unique "site" values among the matching rows.
    """
    wanted = row.iloc[0]
    matching_sites = df.loc[df[prop] == wanted, "site"]
    return matching_sites.nunique()

def get_body_info(row):
    """Return a short description for the response body whose hash is stored in row["body"].

    Looks up the first entry of the notebook-level `response_pairs` frame with that
    body hash and extracts a part of its "resp_body_info" string: the text after the
    first ": " up to the first literal backslash-n sequence (two characters, not a newline).
    """
    same_body = response_pairs.loc[response_pairs["resp_body_hash"] == row["body"]]
    info = same_body.iloc[0]["resp_body_info"]
    after_label = info.split(": ")[1]
    return after_label.split("\\n")[0]


# Export one LaTeX table per summarized property.
# Properties listed here get a longer table (27 rows) than all others (10 rows).
LONG_TABLE_KEYS = ["x-frame-options", "code", "content-type"]

for key in table_dict:
    print(key, len(table_dict[key].T))
    if key == "total":
        continue
    n = 27 if key in LONG_TABLE_KEYS else 10

    # Cut the table to its final length first: the rows are not re-sorted afterwards,
    # so the per-row site lookup below only has to run for the rows that are exported.
    resp = table_dict[key].T.head(n)
    resp = resp.rename(columns={False: "No cookies", True: "Cookies"})
    resp = resp[["Cookies", "No cookies"]].reset_index()
    # Number of distinct sites on which each value of the property was observed
    resp["Sites"] = resp.apply(get_sites, df=header_frame, prop=key, axis=1)
    if key == "body":
        # Replace the body hash by a short description of the body
        resp["body"] = resp.apply(get_body_info, axis=1)

    display(resp)
    with open(f"tables/response_{key}.tex", "w") as f:
        f.write(resp.to_latex(index=False))

In [None]:
# Groupby site, or even by site and cookies

# Danger! Slow as it will output many tables!
# display_response_summary(header_frame, index="site")  # ["site", "cookies"]

## Response pairs

In [None]:
# Show the response pairs (one row per response, with and without cookies, for every kept URL)
response_pairs

In [None]:
# Process response headers (and status codes).
# This repeats the computation from the "Stats" section on the same input.
# NOTE(review): if process_responses feeds the counter returned by get_acc(), running it
# a second time may change those counts — confirm in helper_dyn.
header_frame = response_pairs.apply(process_responses, axis=1, result_type="expand")

In [None]:
# Save the header frame to disk (pickle format)
header_frame.to_pickle("data/header_frame")

In [None]:
# Number of distinct content-security-policy values per URL, largest first
csp_diff = (
    header_frame.groupby("url")["content-security-policy"]
    .nunique()
    .sort_values(ascending=False)
    .to_frame()
)
# Show the URLs with two different CSP values.
# The full option name is used: the abbreviation "max_rows" is ambiguous (and raises an
# OptionError) on pandas versions that also define "styler.render.max_rows".
with pd.option_context("display.max_rows", 200):
    display(csp_diff.loc[csp_diff["content-security-policy"] == 2])

In [None]:
# Per (site, url): number of distinct hashes for each of the three hash variants
hash_columns = ["hash_all", "hash_min_pruned", "hash_basic_pruned"]
sd_urls = header_frame.groupby(["site", "url"])[hash_columns].nunique().reset_index()

In [None]:
# Value counts for every hash variant on its own, followed by the combination of all three
hash_variants = ["hash_all", "hash_min_pruned", "hash_basic_pruned"]
selections = [[variant] for variant in hash_variants] + [hash_variants]
for selection in selections:
    display(sd_urls[selection].value_counts().to_frame())

In [None]:
# Print overview of SD-URLs
hash_columns = ["hash_all", "hash_min_pruned", "hash_basic_pruned"]
# Count how often each combination of hash values occurs. reset_index(name=...) names the
# count column explicitly, independent of how the installed pandas version labels it.
sd_pairs = (
    sd_urls[hash_columns]
    .value_counts()
    .reset_index(name="Count")
    .sort_values(hash_columns)
)
# Translate 1/2 distinct hashes into labels in the hash columns only; a frame-wide replace
# would also rewrite a count of 1 or 2 into "non SD"/"SD".
sd_pairs[hash_columns] = sd_pairs[hash_columns].replace({1: "non SD", 2: "SD"})
sd_pairs = sd_pairs.rename(
    {"hash_all": "Complete response", "hash_min_pruned": "Minimal pruning", "hash_basic_pruned": "Basic pruning"},
    axis=1,
)
with open("tables/sd_pairs.tex", "w") as f:
    display(sd_pairs)
    f.write(sd_pairs.to_latex(index=False))

### Compare results in db with pruning applied now

In [None]:
# Pipeline overview, restricted to the sites that occur in the response pairs
site_results = get_pipeline_overview()
sites_with_pairs = response_pairs["site"].tolist()
pairs_dyn = site_results.loc[site_results["site"].isin(sites_with_pairs)]

In [None]:
# Per-site URL counts as stored in the pipeline results
count_columns = ["num_urls", "num_basic_pruning"]
pairs_pivot = pairs_dyn[["site"] + count_columns].sort_values("site").copy()
pairs_pivot["site"] = pairs_pivot["site"].astype(str)
display(pairs_pivot.describe())

# Append a "Total" row. Only the count columns are summed: summing the string column "site"
# would concatenate every site name into one huge cell. The site of the Total row is left
# empty, so it still matches no site in later merges on "site".
total_row = pd.Series({"site": "", **pairs_pivot[count_columns].sum().to_dict()})
pairs_pivot.loc["Total"] = total_row
display(pairs_pivot)

In [None]:
def count_sd(x):
    """Number of entries equal to 2, i.e. URLs with two distinct hashes (SD)."""
    return x.eq(2).sum()


def count_non_sd(x):
    """Number of entries equal to 1, i.e. URLs with a single hash (non SD)."""
    return x.eq(1).sum()


# The function names become the column labels of the pivot table
count_sd.__name__ = "SD URLs"
count_non_sd.__name__ = "Non SD URLs"

# Keep the aggregations as a tuple: the cell below indexes the result as (hash column, aggregate).
aggregations = ("count", count_sd, count_non_sd, "mean")
sd_pivot = sd_urls.pivot_table(
    index="site",
    values=["hash_all", "hash_min_pruned", "hash_basic_pruned"],
    aggfunc=aggregations,
)

display(sd_pivot.describe())

# sd_pivot.loc["Mean"] = sd_pivot.mean()
# Append a row with the column sums over all sites
sd_pivot.loc["Total"] = sd_pivot.sum()
display(sd_pivot)

In [None]:
# Differences between the data in the site_result table and the values calculated from the url_data now
sd_pivot_comp = (
    sd_pivot[[("hash_all", "count"), ("hash_basic_pruned", "SD URLs")]]
    .reset_index()
    .droplevel(1, axis=1)
    .rename(columns={"hash_all": "num_urls", "hash_basic_pruned": "num_basic_pruning"})
)
# Stored under its own name: reusing `df` would overwrite the URL data frame that the cells
# at the top of the notebook filter and read, which breaks re-running them.
pipeline_diff = sd_pivot_comp.merge(pairs_pivot, on=["site"], suffixes=("_sd", "_pairs"))
urls_differ = pipeline_diff["num_urls_sd"] != pipeline_diff["num_urls_pairs"]
pruning_differs = pipeline_diff["num_basic_pruning_sd"] != pipeline_diff["num_basic_pruning_pairs"]
# Full option name: the abbreviation "max_rows" is ambiguous on pandas versions that also
# define "styler.render.max_rows".
with pd.option_context("display.max_rows", 20):
    display(pipeline_diff.loc[urls_differ | pruning_differs])

# If we run the basic pruning step now, it gives the same results as we have calculated from the url_data
# This means that the url_data has changed since the run (e.g., the pipeline was started before all results got saved, or because of an incorrect rerun we changed some data)
# As the differences are not large, it does not matter
# The exact code from the dil pipeline, now generates the same results!
af, d, poss, results = basic_pruning(get_url_data("bitly.com"))
print(d["URL"].nunique(), poss["URL"].nunique())

### Body analysis

In [None]:
# Of the URLs that remain SD after basic pruning: how many distinct response bodies does each URL have?
basic_pruned = sd_urls.loc[sd_urls["hash_basic_pruned"] == 2]
sd_url_list = basic_pruned["url"].tolist()
remaining_pairs = response_pairs.loc[response_pairs["req_url"].isin(sd_url_list)]
remaining_pairs.groupby("req_url")["resp_body_hash"].nunique().value_counts()

### Tree SD-URLs analysis

In [None]:
# Remaining URLs after tree pruning
# Get unique URLs from this
sites = pairs_dyn["site"].tolist()
conf_urls = pairs_dyn["dyn_conf_urls"].tolist()
leak_urls = url_list_to_tuples(conf_urls, sites)

# Distinct URLs per browser
display(leak_urls.groupby("browser")["url"].nunique())

# Distinct methods per (browser, url), summarized per browser and overall
leak_methods_urls = leak_urls.groupby(["browser", "url"])["method"].nunique()
display(leak_methods_urls.groupby("browser").describe())
display(leak_methods_urls.describe().to_frame())

In [None]:
# Summary statistics of the number of methods per URL, one row per browser
summary_stats = ["mean", "std", "min", "median", "max"]
pairs_trees = leak_methods_urls.groupby("browser").agg(summary_stats).round(2)
with open("tables/pairs_trees.tex", "w") as f:
    display(pairs_trees)
    f.write(pairs_trees.to_latex())


### ? investigation zone

In [None]:
# Manual inspection: show the processed responses for one specific URL
header_frame.loc[header_frame["url"] == "https://fotos.perfil.com/2021/09/11/trim/248/140/20210912juez-deloredo-1229108.jpg?webp"]

In [None]:
# Split the processed responses by cookie state and join both halves on the URL:
# every property then exists twice, as <name>_c (with cookies) and <name>_n (no cookies).
sent_cookies = header_frame["cookies"] == True
sent_no_cookies = header_frame["cookies"] == False
header_frame_cookies = header_frame.loc[sent_cookies]
header_frame_no_cookies = header_frame.loc[sent_no_cookies]
pairs = header_frame_cookies.merge(
    header_frame_no_cookies,
    on="url",
    suffixes=("_c", "_n"),
)
# Alphabetical column order puts the _c/_n columns of a property next to each other
pairs = pairs.reindex(columns=sorted(pairs.columns))

In [None]:
# URLs whose cross-origin-opener-policy is set with cookies but has the value "Empty" without cookies
pairs.loc[(pairs["cross-origin-opener-policy_c"] == "same-origin-allow-popups;report-to=\"coop\"") & (pairs["cross-origin-opener-policy_n"] == "Empty")]["url"].tolist()

In [None]:
def disp_anti_pattern(pairs, prop, val_c, val_n):
    """Show and return the sites on which a property has one value with cookies and another without.

    Args:
        pairs: Frame with the columns f"{prop}_c", f"{prop}_n" and "site_c".
        prop: Name of the property (without the _c/_n suffix).
        val_c: Value the property must have in the _c column.
        val_n: Value the property must have in the _n column.

    Returns:
        The unique "site_c" values of all matching rows.
    """
    print(prop, val_c, val_n)
    is_match = (pairs[f"{prop}_c"] == val_c) & (pairs[f"{prop}_n"] == val_n)
    matching_sites = pairs.loc[is_match]["site_c"]
    # Displayed: the number of affected sites and the sites themselves
    display(matching_sites.agg(["nunique", "unique"]).to_frame())
    return matching_sites.agg("unique")



# Sites whose security headers differ between the response with cookies (first value)
# and the response without cookies (second value), e.g. nosniff or XFO only set in one state.
# max_colwidth=None keeps the displayed site lists from being truncated.
with pd.option_context("max_colwidth", None):    
    
    # x-frame-options: present in only one state, or different values per state
    same_empty = disp_anti_pattern(pairs, "x-frame-options", "SAMEORIGIN", "Empty")
    disp_anti_pattern(pairs, "x-frame-options", "DENY", "SAMEORIGIN")
    empty_same = disp_anti_pattern(pairs, "x-frame-options", "Empty", "SAMEORIGIN")
    disp_anti_pattern(pairs, "x-frame-options", "DENY", "deny")
    
    # Sites that show both directions (SAMEORIGIN only with cookies and only without cookies)
    print(set(same_empty) & set(empty_same))
    
    disp_anti_pattern(pairs, "content-disposition", "Empty", "inline")

    # x-content-type-options: nosniff set in only one of the two states
    no_em = disp_anti_pattern(pairs, "x-content-type-options", "nosniff", "Empty")
    em_no = disp_anti_pattern(pairs, "x-content-type-options", "Empty", "nosniff")
    # Sites that show both directions
    print(set(no_em) & set(em_no))
    
    
    # cross-origin-resource-policy: set in only one of the two states
    disp_anti_pattern(pairs, "cross-origin-resource-policy", "cross-origin", "Empty")
    disp_anti_pattern(pairs, "cross-origin-resource-policy", "Empty", "cross-origin")

    # Status code 200 with cookies but 429 without cookies
    disp_anti_pattern(pairs, "code", 200, 429)




### Response pairs every header!

In [None]:
# Show the joined response pairs (one row per URL, with _c and _n columns)
pairs

In [None]:
def count_pair_sites(row, pairs, prop):
    """Count the distinct sites on which a property takes a given pair of values.

    This helper has its own name so that it does not replace the earlier
    `get_sites(row, df, prop)`, which has a different signature and is still needed
    when the response-table export cell is re-run.

    Args:
        row: Series whose first entry is the value with cookies and whose second
            entry is the value without cookies.
        pairs: Frame with the columns f"{prop}_c", f"{prop}_n" and "site_c".
        prop: Name of the property (without the _c/_n suffix).

    Returns:
        Number of unique "site_c" values among the matching rows.
    """
    is_match = (pairs[f"{prop}_c"] == row.iloc[0]) & (pairs[f"{prop}_n"] == row.iloc[1])
    return pairs.loc[is_match, "site_c"].nunique()


# Response pair changes on one property at a time
for prop in ["content-type", "x-frame-options", "content-disposition", "cross-origin-opener-policy", "x-content-type-options", "cross-origin-resource-policy", "content-security-policy", "location", "code"]:
    # Number of URLs per (value with cookies, value without cookies) combination.
    # reset_index(name=...) names the count column explicitly, independent of how the
    # installed pandas version labels the result of value_counts().
    vc = pairs[[f"{prop}_c", f"{prop}_n"]].value_counts().reset_index(name="URLs")
    # Keep only the combinations where the two states differ; copy so that the new column
    # is not written into a slice of the unfiltered frame.
    vc = vc.loc[vc[f"{prop}_c"] != vc[f"{prop}_n"]].copy()
    # A plain list also works when no combination differs (row-wise apply on an empty
    # frame does not return a single column).
    vc["sites"] = [count_pair_sites(row, pairs, prop) for _, row in vc.iterrows()]
    vc = vc.sort_values(["sites", "URLs"], ascending=False)
    # Full option name: the abbreviation "max_rows" is ambiguous on pandas versions that
    # also define "styler.render.max_rows".
    with pd.option_context("display.max_rows", 300):
        display(vc.head(30))
    with open(f"tables/pairs_{prop}.tex", "w") as f:
        f.write(vc.head(30).to_latex(index=False))

## Cookie stats

In [None]:
# Cookie stats
# For 21 sites, we have lost the saved cookies (due to the server crash); remove them from the analysis
has_saved_cookies = pairs_dyn["cookies"] != {}
cookie_stats = (
    pairs_dyn.loc[has_saved_cookies][["cookies", "site"]]
    .groupby(["site"], group_keys=False)
    .apply(get_cookie_stats)
)
cookie_stats["site"] = cookie_stats["site"].astype(str)

In [None]:
# Cookies and distinct sites per (sameSite, secure, httpOnly) combination
attribute_columns = ["sameSite", "secure", "httpOnly"]
stats = cookie_stats.pivot_table(
    index=attribute_columns,
    values=["site"],
    aggfunc=["count", "nunique"],
)
stats = stats.droplevel(1, axis=1).rename(columns={"count": "Cookies", "nunique": "Sites"})
display(stats)
with open("tables/cookies_overview.tex", "w") as f:
    f.write(stats.to_latex())

In [None]:
# Overall description, followed by the value distribution of each cookie attribute
display(cookie_stats.describe())
for attribute in ("sameSite", "httpOnly", "secure"):
    display(cookie_stats[attribute].value_counts().to_frame())
# display(cookie_stats)

In [None]:
# Full option name: the abbreviation "max_rows" is ambiguous on pandas versions that also
# define "styler.render.max_rows".
with pd.option_context("display.max_rows", 50):
    # Cookie settings per site: number of distinct sameSite values and the values themselves
    cs = cookie_stats.pivot_table(index="site", values=["sameSite"], aggfunc=("nunique", "unique")).droplevel(0, axis=1)
    # Turn each array of values into a sorted, comma-separated string so it can be counted
    cs["unique"] = cs["unique"].apply(lambda values: ", ".join(sorted(values)))
    # Number of sites per combination of sameSite values. reset_index(name=...) names the
    # count column explicitly, independent of how the installed pandas version labels it.
    cookie_samesite = (
        cs.value_counts()
        .reset_index(name="sites")
        .rename(columns={"unique": "values"})[["values", "sites"]]
    )
    with open("tables/cookies_samesite.tex", "w") as f:
        display(cookie_samesite)
        f.write(cookie_samesite.to_latex(index=False))

    # Additional?
    # Same idea for secure and sameSite together, including the number of cookies per site
    cs = cookie_stats.pivot_table(index="site", values=["secure", "sameSite"], aggfunc=("nunique", "unique", "count"))

    display(cs.sort_values(("sameSite", "nunique")))
    # Arrays are not hashable; convert them to strings before counting the combinations
    cs[("sameSite", "unique")] = cs[("sameSite", "unique")].apply(lambda values: str(sorted(values)))
    cs[("secure", "unique")] = cs[("secure", "unique")].apply(lambda values: str(sorted(values)))

    display(cs.value_counts().to_frame())


In [None]:
# Get all cookies with sameSite=None + secure=False (invalid in spec and chrome -> behavior: lax)
is_samesite_none = cookie_stats["sameSite"] == "None"
is_insecure = cookie_stats["secure"] == False
none_insecure = cookie_stats.loc[is_samesite_none & is_insecure]
display(none_insecure)
none_insecure.describe()