# Results for Pipeline Evaluation in the thesis

- State creator
- Stateful crawler
- Static pruner
    - Basic
    - Advanced
- Dynamic confirmation

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import json
import requests
from requests.exceptions import SSLError, ConnectTimeout, ConnectionError
from publicsuffix2 import PublicSuffixList
from urllib.parse import urlparse

import pandas as pd
import numpy as np

from helper_dyn import (get_pipeline_overview, get_cookie_stats, get_pipeline_stats, show_only_first, get_leak_data, display_timing,
                        process_responses, display_response_summary, display_changed,
                        parse_method_url, get_query, info_grouping, row_sym, get_conf_dfs,
                        get_info_frames, get_only_both, parse_apg_url, url_list_to_tuples,
                        get_predictions_retroactive, save_div, get_basic_pruning_reduction, 
                        get_combs_after_basic_pruning, get_stats, get_crawl_data)
from dil_preprocess import get_url_data

In [None]:
# Load the pipeline overview via the project helper; later cells filter this frame by site and rank.
site_results = get_pipeline_overview()

## State creator stats

In [None]:
# Keep only regular runs: drop the "-unpruned" re-runs and rows without a positive Tranco rank.
is_unpruned_run = site_results["site"].str.contains("-unpruned")
has_positive_rank = site_results["tranco_rank"] > 0
attempts = site_results.loc[~is_unpruned_run & has_positive_rank]
attempts.info()

In [None]:
# Attempts with a recorded URL count are the ones kept as "login_success".
login_success = attempts.loc[attempts["num_urls"].notna()]
display(login_success.head(2))
# Peek at kept sites that have no "num_input_rows" value.
without_input_rows = login_success["num_input_rows"].isna()
display(login_success.loc[without_input_rows].head(2))

In [None]:
# Hard-coded reference numbers (not computed in this cell).
org_success_top_1k = 95
org_success_total = 25242
org_total = 1585964
selenium_tried = 25

# Attempts whose Tranco rank lies in 0..1000.
top_1k = attempts.loc[attempts["tranco_rank"].isin(range(0, 1001))]

# A login log containing "actual site:" marks a Selenium login.
used_selenium = top_1k["login"].str.contains("actual site:")
selenium_success = top_1k.loc[used_selenium]
selenium_success_len = len(selenium_success)

# Top-1k sites with a URL count that were not logged in via Selenium.
top_1k_cookie_success = top_1k.loc[top_1k["num_urls"].notna() & ~used_selenium]
top_1k_success = len(top_1k_cookie_success) + 2  # vimeo and ?

attempted_len = len(attempts)
total_success_len = len(login_success)
cookie_success_len = total_success_len - selenium_success_len

print(f"Cookiehunter success on {top_1k_success}/{len(top_1k)}. Total success: {total_success_len}. Cookiehunter success: {cookie_success_len}. Selenium success: {selenium_success_len}. Total attempted: {attempted_len}.")


def check_cookie_hunter(df, info):
    """Print how often each known failure message occurs in ``df["login"]``.

    Args:
        df: DataFrame with a string column ``login``.
        info: Heading that is printed before the counts.

    Each output line is ``<label> <count>``. The count is the sum of
    ``Series.str.count`` over all rows, so a message that occurs twice in
    one log entry is counted twice.
    """
    import re  # local import so the function does not depend on other cells

    # (label, regex) pairs. The end-anchored patterns are raw strings; the
    # literal messages are escaped so "." only matches a real dot.
    checks = [
        ("Ends with .. (Fatal crash)", r"\.\.$"),
        ("Ends with Will reset (Fatal SSO crash)", r"Will reset$"),
        ("Not enough", re.escape("Not enough")),
        ("Did not find any", re.escape("Did not find any")),
        ("Timeout", re.escape("took more")),
        ("Could not fetch", re.escape("Could not fetch")),
        ("CAPTCHA", re.escape("CAPTCHA")),
        ("FP", re.escape("Got false positive. Aborting.")),
        (
            "FP/relogin failed",
            re.escape("Could not re-login in sensitive mode. Aborting auditor."),
        ),
    ]

    login = df["login"]
    print(info)
    for label, pattern in checks:
        print(label, login.str.count(pattern).sum())

    
# Report the failure-message counts for the top-1k subset and for all attempted sites.
check_cookie_hunter(top_1k, "Top 1k:")
check_cookie_hunter(attempts, f"Top {attempted_len}:")

## Stateful crawler

In [None]:
#display(login_success.loc[login_success["num_basic_pruning"] == 0])
# Re-request the landing page of every site whose crawl yielded 0 URLs and
# count how many of these requests fail.
tls_errors = 0
crawling_failed = login_success.loc[login_success["num_urls"] == 0]
for site in crawling_failed["site"].tolist():
    try:
        requests.get(f"https://{site}/", timeout=10)
        print(f"{site} has no errors, other problem?")
    # NOTE: despite the "TLS" wording, connect timeouts and generic connection
    # errors are counted here as well; the printed exception type tells them apart.
    # NOTE(review): other requests exceptions (e.g. a read timeout) are not
    # caught and would abort the loop - confirm whether that is intended.
    except (SSLError, ConnectTimeout, ConnectionError) as e:
        print(f"{site} has TLS errors. {type(e)}")
        tls_errors += 1
print(f"Total TLS errors: {tls_errors} of {len(crawling_failed)} failed crawls (0 URLs)")

# Hard-coded value, presumably the count observed in an earlier run - TODO confirm.
tls_failed_0 = 25  # This can change, when they fix certs and co!

In [None]:
# For i = 0..20: number of sites with exactly i URLs and the running total so far.
acc = 0
url_counts = login_success["num_urls"]
for i in range(21):
    new = int((url_counts == i).sum())
    acc += new
    print(f"Num URLs == {i} for {new} sites. Num URLs <= {i}: {acc}")

In [None]:
# Get from the db (site) how many documents were crawled per site!
site_data = get_crawl_data()
# Problem: some results are missing (top 1k), as the table was reset :(
# site_data = site_data.loc[site_data["site"].isin(login_success["site"].tolist())]
# Left merge on "site": every row of login_success is kept, even without matching crawl data.
login_success = login_success.merge(site_data, on="site", how="left")

In [None]:
# Even if only one URL is crawled, many requests to different URLs can be observed?!
# Distribution of "num_urls" among the sites whose "counter" equals 1.
login_success.loc[login_success["counter"] == 1]["num_urls"].value_counts()

In [None]:
psl = PublicSuffixList()


# Re-request the landing page of every site with only 1-20 collected URLs and
# check whether the final URL belongs to a different site (cross-site redirect)
# or whether the request fails.
redirects = 0
# Failures are counted per cell so they do not get mixed into the result of
# the 0-URL check (``tls_errors``).
conn_errors = 0
crawling_failed = login_success.loc[login_success["num_urls"].isin(range(1, 21))]
#crawling_failed = login_success.loc[(login_success["counter"] == 1) & login_success["num_urls"] > 0]
for site in crawling_failed["site"].tolist():
    # Only the request itself is guarded; the URL handling below cannot raise
    # these exceptions.
    try:
        r = requests.get(f"https://{site}/", timeout=10)
    except (SSLError, ConnectTimeout, ConnectionError) as e:
        print(f"{site} has TLS errors. {type(e)}")
        conn_errors += 1
        continue

    print(r.url)
    # ``hostname`` (unlike ``netloc``) carries neither a port nor user info,
    # either of which would make the comparison below fail spuriously.
    domain = urlparse(r.url).hostname
    site_new = psl.get_public_suffix(domain)
    if site_new != site:
        print(f"{site} redirects cross-site!")
        redirects += 1
    else:
        print(f"{site} has no errors, other problem?")
print(f"Total redirects: {redirects} of {len(crawling_failed)} failed crawls (1-20 URLs)")
print(f"Total connection errors: {conn_errors} of {len(crawling_failed)} failed crawls (1-20 URLs)")

# Hard-coded placeholder kept under its existing (misspelled) name.
tls_faild_1_10 = 0

In [None]:
# Summary statistics of the per-site pipeline columns, restricted to sites with at least one URL.
login_success.loc[login_success["num_urls"] > 0][["num_urls", "num_basic_pruning", "num_input_rows", "dyn_conf_chrome", "dyn_conf_firefox", "dyn_conf_retest_chrome", "dyn_conf_retest_firefox", "confirmed_urls_chrome", "confirmed_urls_firefox"]].describe()

In [None]:
# Box plot (outliers hidden) of "num_urls" and "num_basic_pruning" for sites with at least one URL.
login_success.loc[login_success["num_urls"] > 0][["num_urls", "num_basic_pruning"]].plot(kind="box", showfliers=False)

In [None]:
# Histogram of the "counter" column merged in from the crawl data.
login_success[["counter"]].plot(kind="hist", )

In [None]:
# Shape (rows, columns) of login_success split by the value of the "counter" column.
crawl_counter = login_success["counter"]
print(f"Lost node_crawler data: {login_success.loc[crawl_counter.isna()].shape}")
print(f"Node_crawler data: {login_success.loc[crawl_counter.notna()].shape}")
print(f"Crawl only one URL: {login_success.loc[crawl_counter < 2].shape}")
print(f"Crawl until 100 limit: {login_success.loc[crawl_counter == 100].shape}")
print(f"Crawl other limit or problem: {login_success.loc[crawl_counter.isin(range(2, 100))].shape}")

## Static pruner

- basic 
- advanced

In [None]:
# Basic pruning stats on complete dataset!

# Size reduction in starting URLs
# If we test every pruned URL in all inclusion methods (12), a lot will be tested
# Take an explicit copy: the columns added below would otherwise be assigned
# on a slice of login_success (pandas SettingWithCopyWarning).
df = login_success.loc[login_success["num_urls"] > 0].copy()
df["basic_pruning_reduction"] = df.apply(get_basic_pruning_reduction, axis=1)
# Number of tests if every URL left after basic pruning were tested with all 12 inclusion methods.
df["dyn_all"] = df["num_basic_pruning"] * 12
url_pruning_sum = df[["site", "counter", "num_urls", "num_basic_pruning", "basic_pruning_reduction", "dyn_all", "dyn_conf_chrome", "dyn_conf_firefox"]].describe()
display(url_pruning_sum)
df[["num_urls", "num_basic_pruning"]].plot(kind="line", legend=True, use_index=False)

In [None]:
# Turn the describe() summary into the LaTeX table tables/url_pruning.tex.
u_p = url_pruning_sum.drop(["counter"], axis=1)
# Literal (non-regex) replacements; raw strings keep the LaTeX escape "\%"
# from being read as an (invalid) Python escape sequence.
u_p.columns = u_p.columns.str.replace("_", "-", regex=False)
u_p.index = u_p.index.str.replace("%", r"\%", regex=False)
u_p = u_p.rename(columns={"num-urls": "Original URLs", "num-basic-pruning": "Basic pruning URLs", "basic-pruning-reduction": "Reduction", "dyn-all": "Leak URLs",
                          "dyn-conf-chrome": "Chrome tests", "dyn-conf-firefox": "Firefox tests"})
u_p = u_p.filter(items=["mean", "std", "min", r"50\%", "max"], axis=0)
tab = u_p.loc[:, u_p.columns != "Reduction"].round(2)
display(tab)
# Open the file only once the table exists, so a failure above does not leave
# a truncated file behind.
with open("tables/url_pruning.tex", "w") as f:
    #f.write(tab.to_latex(escape=False ,header=['\\rotatebox{90}{' + c + '}' for c in tab.columns]))
    f.write(tab.to_latex(escape=False))

In [None]:
# Histogram of the per-site basic pruning reduction.
df["basic_pruning_reduction"].plot(kind="hist")

In [None]:
# Relative reduction achieved by basic pruning, computed on the mean and on the maximum URL count.
mean = df["num_urls"].mean()
mean_p = df["num_basic_pruning"].mean()
maximum = df["num_urls"].max()
maximum_p = df["num_basic_pruning"].max()
basic_mean = (mean - mean_p) / mean
basic_max = (maximum - maximum_p) / maximum
print(f"Mean reduction by basic pruning {basic_mean}, mean: {mean}, after pruning: {mean_p}")
print(f"Reduction of maximum by basic pruning {basic_max}, max: {maximum}, after pruning: {maximum_p}")


# Baseline: mean number of URLs after basic pruning times the 12 inclusion methods.
dyn_all_avg = mean_p * 12
chrome = df["dyn_conf_chrome"].mean()
firefox = df["dyn_conf_firefox"].mean()
# Relative reduction of the mean per-browser test count against that baseline.
tree_chrome = (dyn_all_avg - chrome) / dyn_all_avg
tree_firefox = (dyn_all_avg - firefox) / dyn_all_avg
print(f"Mean advanced reduction chrome: {tree_chrome}, firefox: {tree_firefox}. Dyn all: {dyn_all_avg}, Chrome: {chrome}, Firefox: {firefox}")

### Advanced pruning/unpruned analysis

In [None]:
# Sites selected for the unpruned comparison run.
retro_names = ["pier1.com", "chartink.com", "twitcasting.tv", "pdffiller.com", "staples.ca", "tool.lu", "freelogodesign.org", "duplichecker.com", "miro.com", "mnml.la", "office.com", "pbslearningmedia.org", "redtube.com", "whatfontis.com", "glosbe.com", "wideads.com", "standardmedia.co.ke", "gyazo.com", "playground.xyz", "megogo.net", "zennioptical.com", "truecar.com", "powtoon.com", "italki.com", "themehorse.com", "amazon.in", "versobooks.com", "coursera.org", "yourstory.com", "korrespondent.net", "transifex.com", "ankiweb.net", "imgflip.com", "moneyweb.co.za", "wordpress.com", "iplocation.net", "porch.com", "youporn.com", "tmj4.com", "nimbusweb.me", "classifiedads.com", "myvidster.com", "cafepress.com", "viewsonic.com", "pakwheels.com", "idntimes.com", "mhthemes.com", "newswise.com", "universe.com", "aboutus.com"]
# Relogin worked on 36/50 sites
# Exclude the following sites where the relogin was not successful
# Mostly google SSO, others are marked as FPs this time or other crashes
failed = ["twitcasting.tv", "tool.lu", "office.com", "pbslearningmedia.org", "playground.xyz", "truecar.com", "amazon.in", "coursera.org", "imgflip.com", "moneyweb.co.za", "wordpress.com", "porch.com", "viewsonic.com", "newswise.com"]
retro_names = [site for site in retro_names if site not in failed]
# Get the results
# The unpruned runs are stored under "<site>-unpruned" in site_results.
unpruned_names = [f"{site}-unpruned" for site in retro_names]
unpruned_sites = site_results.loc[site_results["site"].isin(unpruned_names)]
dat_unpruned, conf_both_unpruned, conf_any_unpruned = get_pipeline_stats(unpruned_sites)

In [None]:
# Show the first result of get_pipeline_stats for the unpruned runs.
dat_unpruned

In [None]:
# Retroactively get the predictions and the predictions without the reduced set of trees
dat_unpruned["predictions"] = get_predictions_retroactive(dat_unpruned)
dat_unpruned["predictions_all"] = get_predictions_retroactive(dat_unpruned, methods="all")
# Computed once; a second identical call would only repeat the same work.
dat_unpruned["combinations_after_basic_pruning"] = get_combs_after_basic_pruning(dat_unpruned)

# Get TP, retest urls, all combinations and predictions
sites = dat_unpruned["site"].tolist()
all_combinations = url_list_to_tuples(dat_unpruned["dyn_conf_urls"].tolist(), sites, site_cat=True)
all_combinations_basic = url_list_to_tuples(dat_unpruned["combinations_after_basic_pruning"].tolist(), sites, site_cat=True)
predicted_trees = url_list_to_tuples(dat_unpruned["predictions"].tolist(), sites, site_cat=True)
predicted_trees_all = url_list_to_tuples(dat_unpruned["predictions_all"].tolist(), sites, site_cat=True)
retest_urls = url_list_to_tuples(dat_unpruned["dyn_conf_retest_urls"].tolist(), sites, site_cat=True)
ground_truth = url_list_to_tuples(dat_unpruned["confirmed_leak_urls"].tolist(), sites, site_cat=True)

# How well does advanced pruning work?
# Remove all URLs from ground_truth and retest_urls not in all_combinations_basic
# Otherwise, we compare the wrong things, as the trees work with the output of all_combinations_basic, so vulnerable URLs found that were removed by the basic_pruning cannot be found by the trees
# Columns that identify one leak combination; shared by both merges below.
merge_keys = ["method", "url", "browser", "site", "nogroup"]
# Left merge with indicator, then keep only the rows that are present on both sides.
gt = ground_truth.merge(all_combinations_basic, on=merge_keys, how="left", indicator=True)
gt = gt.loc[gt["_merge"] == "both"][merge_keys]
rt = retest_urls.merge(all_combinations_basic, on=merge_keys, how="left", indicator=True)
rt = rt.loc[rt["_merge"] == "both"][merge_keys]

In [None]:
# Get unique URLs + unique URLs missed
basic_url = get_stats(ground_truth[["url", "browser", "nogroup"]], all_combinations_basic[["url", "browser", "nogroup"]], all_combinations[["url", "browser", "nogroup"]], "Ground truth not in basic pruning")
advanced_url = get_stats(gt[["url", "browser", "nogroup"]], predicted_trees[["url", "browser", "nogroup"]], all_combinations_basic[["url", "browser", "nogroup"]], "Gt not in trees")

basic_leak = get_stats(ground_truth, all_combinations_basic, all_combinations, "Ground truth not in basic pruning")
advanced_leak = get_stats(gt, predicted_trees, all_combinations_basic, "Gt not in trees")

In [None]:
# Stack the URL-level stats of basic and advanced pruning into one table and export it as LaTeX.
# The frames are collected and concatenated once: DataFrame.append was removed in pandas 2.0.
# Assumes every entry of basic_url / advanced_url is a DataFrame - TODO confirm against get_stats.
frames = []
for pruning, stats in (("basic", basic_url), ("advanced", advanced_url)):
    for key in stats:
        temp = stats[key]
        temp["pruning"] = pruning
        frames.append(temp)
df = pd.concat(frames)
df = df.replace({"nogroup": "both"})
df = df[["pruning", "grouping", "all_comb", "pred", "gt", "tp", "fp", "fpr", "fn", "tn", "fnr"]]
display(df)
with open("tables/pruning_fn_urls", "w") as f:
    f.write(df.round(2).to_latex(index=False))

In [None]:
# Stack selected leak-level stats of basic and advanced pruning into one table and export it as LaTeX.
# The frames are collected and concatenated once: DataFrame.append was removed in pandas 2.0.
# Assumes every selected entry of basic_leak / advanced_leak is a DataFrame - TODO confirm against get_stats.
selected = (
    ("basic", basic_leak, ["['nogroup']"]),
    ("advanced", advanced_leak, ["['nogroup']", "['browser']", "['method']"]),
)
frames = []
for pruning, stats, keys in selected:
    for key in keys:
        temp = stats[key]
        temp["pruning"] = pruning
        frames.append(temp)
df = pd.concat(frames)
df = df.replace({"nogroup": "both"})
df = df[["pruning", "grouping", "all_comb", "pred", "gt", "tp", "fp", "fpr", "fn", "tn", "fnr"]]
display(df)
with open("tables/pruning_fn_leaks", "w") as f:
    f.write(df.round(2).to_latex(index=False))

In [None]:
# Pairwise comparisons with logging enabled; only_urls is overwritten by each call,
# so only the result of the last comparison is kept.
# only_urls = get_only_both({"true_positives": true_positives, "all_combinations": all_combinations}, ("true_positives", "all_combinations"), log=True)
only_urls = get_only_both({"ground_truth": ground_truth, "all_combinations_basic": all_combinations_basic}, ("ground_truth", "all_combinations_basic"), log=True)
only_urls = get_only_both({"ground_truth": ground_truth, "predictions": predicted_trees}, ("ground_truth", "predictions"), log=True)

# only_urls = get_only_both({"retest_urls": true_positives, "all_combinations_basic": all_combinations_basic}, ("retest_urls", "all_combinations_basic"), log=True)

only_urls = get_only_both({"all_combinations": all_combinations, "all_combinations_basic": all_combinations_basic}, ("all_combinations", "all_combinations_basic"), log=True)
# One URL missing from all combinations for pdffiller for some reason?
# only_urls.loc[only_urls["key"] == "all_combinations_basic"]

In [None]:
# In total, per site/browser/method
# This is for leak channels/leak urls (i.e., combination of inclusion method and target URL)

# How well does basic pruning work?
get_stats(ground_truth, all_combinations_basic, all_combinations, "Ground truth not in basic pruning")
# get_stats(retest_urls, all_combinations_basic, all_combinations, "Retests not in basic pruning")


# How well do the trees work (reduced set vs. all trees), measured on the basic-pruning output?
get_stats(gt, predicted_trees, all_combinations_basic, "Ground truth not in trees")
get_stats(gt, predicted_trees_all, all_combinations_basic, "Ground truth not in trees_all")
# get_stats(rt, predicted_trees, all_combinations_basic, "Retests not in trees")
# get_stats(rts, predicted_trees_all, all_combinations_basic, "Retests not in trees_all")

## Dynamic confirmation

In [None]:
# Timing overview for the sites in login_success (project helper).
display_timing(login_success)

In [None]:
# Display general stats on the pipeline
dat, conf_both, conf_any = get_pipeline_stats(login_success)

cookie_hunter_second_failed = ['allevents.in', 'whowhatwear.com', 'creative-tim.com', 'extendoffice.com', 'lepoint.fr', 'hallmark.com', 'flourish.studio', 'dramacool.fm', 'pdfdrive.com', 'jmty.jp', 'readymag.com', 'gridoto.com', 'grubhub.com', 'asana.com', 'familyeducation.com', 'entireweb.com', 'christianpost.com', 'cutt.us', 'tiexue.net', 'lejdd.fr', 'brisbanetimes.com.au']

# For a couple of websites, we needed to retest them, but the retest did not work. Exclude them from the rest of the analysis
print("Remove ones that did not login correctly\n\n")
dat, conf_both, conf_any = get_pipeline_stats(dat)

In [None]:
# FNs, FPs,
# Problem: responses were not recorded with a proxy
# Most logs lost due to server restart (as they were in RAM)