In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import json

import pandas as pd
import numpy as np

from helper_dyn import (get_pipeline_overview, get_cookie_stats, get_pipeline_stats, show_only_first, get_leak_data, display_timing,
                        process_responses, display_response_summary, display_changed,
                        parse_method_url, get_query, info_grouping, row_sym, get_conf_dfs,
                        get_info_frames, get_only_both, parse_apg_url, url_list_to_tuples,
                        get_predictions_retroactive, save_div, get_basic_pruning_reduction, 
                        get_combs_after_basic_pruning, get_stats)
from dil_preprocess import get_url_data

# Data preparation

In [None]:
# Load the per-site pipeline overview (helper from helper_dyn).
site_results = get_pipeline_overview()
# site_results = site_results.loc[site_results["tranco_rank"] > 20000]
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# display() only added a stray "None" to the output.
site_results.info()
# Split by Tranco rank. Explicit copies, so that adding columns to these subsets
# later does not trigger SettingWithCopyWarning or touch site_results.
top_1k = site_results.loc[site_results["tranco_rank"].isin(range(0, 1000))].copy()
c_res = site_results.loc[site_results["tranco_rank"].isin(range(1000, 25000))].copy()

In [None]:
# dbcon_leak_result
# Load the leak data through the helper_dyn helper and print the frame's
# column/dtype summary.
leak_results = get_leak_data()
leak_results.info()

In [None]:
# Show the leak rows whose "site" value contains "unpruned".
leak_results.loc[leak_results["site"].str.contains("unpruned")]

In [None]:
# Display tested sites (Tranco rank)
df = site_results

# For every block of 1000 ranks (1-1000, 1001-2000, ..., 19001-20000) collect the
# ranks for which the overview frame has no row.
missing = [
    list(
        set(range(start, start + 1000))
        - set(df.loc[df["tranco_rank"].isin(range(start, start + 1000)), "tranco_rank"].values.tolist())
    )
    for start in range(1, 20000, 1000)  # 25000
]

# Retest the missing ones, every site in the top 20000 should be tried
print(missing)

In [None]:
# Two groups where our test infrastructure failed (retest them)
# Due to the retest this data is messed up

# Rows with a non-empty crawl_end value
has_crawl_end = df["crawl_end"] != ""

# Groupon 5191 - betterteam 7007
in_first_group = df["id"].isin(range(5191, 7008))
fail_1 = df.loc[in_first_group & has_crawl_end].sort_values("id")
display(fail_1.head())
print(json.dumps(fail_1["site"].tolist()))

# avvo.com - extend_office.com
in_second_group = df["id"].isin(range(12012, 13328))
fail_2 = df.loc[in_second_group & has_crawl_end].sort_values("id")
display(fail_2.tail())
print(json.dumps(fail_2["site"].tolist()))

## Analysis

Timing info, general stats

In [None]:
# The timing information is skewed due to restarts and bugs
# NOTE: `df` is the alias of site_results assigned in the "Display tested sites" cell above,
# so that cell has to run first.
display_timing(df)

In [None]:
# Display general stats on the pipeline
# Leave out the "-unpruned" comparison runs and the 172.17.0.1:44320 entries.
# Every dot of the address is escaped so the pattern only matches the literal IP
# (the last dot used to be an unescaped regex wildcard).
dat, conf_both, conf_any = get_pipeline_stats(site_results.loc[~site_results["site"].str.contains(r"-unpruned|172\.17\.0\.1:44320")])

# For a couple of websites, we needed to retest them, but the retest did not work. Exclude them from the rest of the analysis
print("Remove ones that did not login correctly\n\n")
# Second pass: feed the frame returned by the first call back into get_pipeline_stats.
dat, conf_both, conf_any = get_pipeline_stats(dat)

In [None]:
# Show every crawled site (non-empty crawl_end) without truncating rows or columns.
# Fully qualified option keys: the bare "max_rows" pattern also matches
# "styler.render.max_rows" on pandas >= 1.4 and then raises OptionError.
with pd.option_context("display.max_columns", None, "display.max_rows", None):
    #display(c_res.loc[c_res["crawl_end"] != ""].sort_values(["num_basic_pruning", "num_urls"]))
    display(df.loc[df["crawl_end"] != ""].sort_values("cookie_end", ascending=False))
    

## Unpruned runs of a subset to compare how efficient our tree pruning is

- First try (comparing two separate runs): problematic because of the time shift between the runs!
- Retroactive analysis also has problems because the crawl data is old

In [None]:
# Randomly sample a couple of websites where both chrome and firefox worked for the retest
# (15 rows of conf_both, fixed random_state so the draw is repeatable for an unchanged frame)
full_conf_sample = conf_both.sample(15, random_state=5)
# display(full_conf_sample)
sample_names = full_conf_sample["site"].tolist()
print(json.dumps(sample_names)) 
# The dataframe is now different, thus we would get different results
# -> the originally drawn sample is pinned here and overrides the freshly sampled names
sample_names = ["nitroflare.com", "bravenet.com", "any.do", "inoreader.com", "zoopla.co.uk", "har.com", "callupcontact.com", "amazon.in", "faucetcrypto.com", "nationalgeographic.com.es", "logrocket.com", "luckyorange.com", "bloglovin.com", "bshare.cn", "vidio.com"]
# Relogin failed on: bravenet.com, amazon.in, faucetcrypto.com, bshare.cn
failed = ["bravenet.com", "amazon.in", "faucetcrypto.com", "bshare.cn"]
# Keep the remaining sites in their original order
sample_names = [site for site in sample_names if site not in failed]
sample_names

In [None]:
# Timing, vulns found and co comparison of unpruned/pruned sites!
# Display general stats on the pipeline unpruned
# The unpruned runs are stored under "<site>-unpruned" in site_results
unpruned_names = [f"{site}-unpruned" for site in sample_names]
unpruned_sites = site_results.loc[site_results["site"].isin(unpruned_names)]
pruned_sites = site_results.loc[site_results["site"].isin(sample_names)]
# Same stats helper for both subsets; pruned first, then unpruned
dat_pruned, conf_both_pruned, conf_any_pruned = get_pipeline_stats(pruned_sites)
dat_unpruned, conf_both_unpruned, conf_any_unpruned = get_pipeline_stats(unpruned_sites)

# Random unpruned test of 15 sites
# For 4 the login did not work
# For the remaining 11 for 9 again vulnerable URLs were found for both chrome/firefox, for two sites only in firefox
display(conf_any_pruned)
display(conf_any_unpruned)

In [None]:
# Number of confirmed chrome leak URLs recorded for the first row of conf_any_pruned
first_site_leaks = conf_any_pruned["confirmed_leak_urls"].tolist()[0]
len(first_site_leaks["chrome"])

In [None]:
# Get info on the pruned and unpruned runs!
df_unpruned = get_conf_dfs(conf_any_unpruned)
df_pruned = get_conf_dfs(conf_any_pruned)

info_unpruned, info_new_unpruned = get_info_frames(df_unpruned)
info_pruned, info_new_pruned = get_info_frames(df_pruned)
# Outer merge on (type, subtype): rows that exist in only one of the two runs are kept,
# and the overlapping columns get the _unpruned/_pruned suffixes.
info_both = pd.merge(info_unpruned, info_pruned, on=["type", "subtype"], how="outer", suffixes=("_unpruned", "_pruned"))
info_new_both = pd.merge(info_new_unpruned, info_new_pruned, on=["type", "subtype"], how="outer", suffixes=("_unpruned", "_pruned"))

# Fully qualified option keys: the bare "max_rows"/"max_columns" patterns also match the
# "styler.render.*" options on pandas >= 1.4 and then raise OptionError.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(info_both)
    display(info_new_both)

In [None]:
# Get the leak_urls only working in pruned/unpruned run
def get_chrome_firefox(l):
    """Parse the per-browser leak URLs of several entries into (method, url, browser) tuples.

    `l` is an iterable of mappings that may have a "chrome" and/or a "firefox" key,
    each holding a list of URL strings; a missing key counts as an empty list.
    Every URL is run through parse_apg_url.

    Returns a pair: a dict {"chrome": [...], "firefox": [...]} with the parsed tuples
    per browser, and one flat list with all chrome tuples followed by all firefox tuples.
    """
    parsed = {"chrome": [], "firefox": []}
    for entry in l:
        # chrome URLs of an entry are handled before its firefox URLs
        for browser_key in ("chrome", "firefox"):
            for raw_url in entry.get(browser_key, []):
                method, target, browser = parse_apg_url(raw_url)
                parsed[browser_key].append((method, target, browser))
    return parsed, parsed["chrome"] + parsed["firefox"]

# Flat (method, url, browser) lists of the confirmed leak URLs of both runs
_, pruned_leak_urls = get_chrome_firefox(dat_pruned["confirmed_leak_urls"].tolist())
_, unpruned_leak_urls = get_chrome_firefox(dat_unpruned["confirmed_leak_urls"].tolist())


# Leak URLs that only worked in the pruned or the unpruned run:
# Embed not tested in "pruned" as it is unstable in chrome??
# iframe/iframe-csp: main problem seems to be FPs in postMessages (e.g., on logrocket.com)
# Link-prefetch not tested in "pruned" as it is unstable??
# Object not tested in "pruned" as unstable or same as embed??
# Frame Count might not be so stable 
# others are due to time-shift/changes of the websites
# e.g., https://www.vidio.com/purchased/status?content_id=3337 returned 401 in the past and now returns 302 (and is now exploitable, was not in the past)

only_urls = get_only_both({"pruned": pruned_leak_urls, "unpruned": unpruned_leak_urls}, ("pruned", "unpruned"))
# Fully qualified option keys: the bare "max_rows" pattern also matches
# "styler.render.max_rows" on pandas >= 1.4 and then raises OptionError.
with pd.option_context("display.max_rows", None, "display.max_colwidth", 100):
    display(only_urls.sort_values([0, 2, "key", 1]))

In [None]:
# Print every dyn_conf URL of luckyorange.com (pruned run) that contains "object", per browser
d = dat_pruned.loc[dat_pruned["site"] == "luckyorange.com"]["dyn_conf_urls"].values[0]
for browser, urls in d.items():
    for url in urls:
        if "object" not in url:
            continue
        print(browser, url)

In [None]:
# Group the differing URLs by columns 1 and 0 and list, per group, the distinct values
# (and their number) of the "key" column and of column 2, sorted by that number.
display(only_urls.groupby([1, 0])["key"].agg(["unique", "nunique"]).sort_values("nunique"))
display(only_urls.groupby([1, 0])[2].agg(["unique", "nunique"]).sort_values("nunique"))

In [None]:
# Other way to analyze this without timedrift! (retroactive analysis!)
# Problem here: large timeshift! comparing apples to oranges, input to trees is old

# NOTE(review): `get_frame` is not among the names imported from helper_dyn in the import cell
# of this notebook, so on a fresh kernel this cell raises NameError unless it is defined or
# imported elsewhere - TODO confirm where it lives. The "New try (retroactive)" section
# below uses url_list_to_tuples instead.

# Compare all leak_urls found in the unpruned test with the URLs that would have been tested if we used the tree module
# Both the normal tree module and the full tree module (without excluding unstable and "same" ones)
all_combinations = get_frame(dat_unpruned["dyn_conf_urls"].tolist())
predicted_trees = get_frame(dat_pruned["dyn_conf_urls"].tolist())
retest_urls = get_frame(dat_unpruned["dyn_conf_retest_urls"].tolist())
true_positives = get_frame(dat_unpruned["confirmed_leak_urls"].tolist()) #df_pruned[["url", "inc_method", "browser"]].drop_duplicates()
# In total 
# Get how many "true positives" are not in predicted_trees/predicted_trees_all
# Each call overwrites only_urls; only the result of the last comparison is displayed below.
only_urls = get_only_both({"true_positives": true_positives, "predicted_trees": predicted_trees}, ("true_positives", "predicted_trees"))
only_urls = get_only_both({"true_positives": true_positives, "all_combinations": all_combinations}, ("true_positives", "all_combinations"))
only_urls = get_only_both({"retest_urls": retest_urls, "predicted_trees": predicted_trees}, ("retest_urls", "predicted_trees"))

print(f"Size reduction: {len(all_combinations)/len(predicted_trees)}")


# Get size difference in all_combinations/predicted_trees
# Fully qualified option key: the bare "max_columns" pattern also matches
# "styler.render.max_columns" on pandas >= 1.4 and then raises OptionError.
with pd.option_context("display.max_columns", None):
    display(only_urls)
    display(get_frame(dat_pruned["confirmed_leak_urls"].tolist()))

### New try (retroactive)

- get effectiveness and co of both basic pruning and advanced tree pruning
- ..

In [None]:
# Randomly sample a couple of websites where both chrome and firefox worked for the retest
# (50 rows of conf_both, fixed random_state so the draw is repeatable for an unchanged frame)
full_conf_sample = conf_both.sample(50, random_state=75)
# display(full_conf_sample)
retro_names = full_conf_sample["site"].tolist()
print(json.dumps(retro_names)) 
# The dataframe is now different, thus we would get different results
# -> the originally drawn sample is pinned here and overrides the freshly sampled names
retro_names = ["pier1.com", "chartink.com", "twitcasting.tv", "pdffiller.com", "staples.ca", "tool.lu", "freelogodesign.org", "duplichecker.com", "miro.com", "mnml.la", "office.com", "pbslearningmedia.org", "redtube.com", "whatfontis.com", "glosbe.com", "wideads.com", "standardmedia.co.ke", "gyazo.com", "playground.xyz", "megogo.net", "zennioptical.com", "truecar.com", "powtoon.com", "italki.com", "themehorse.com", "amazon.in", "versobooks.com", "coursera.org", "yourstory.com", "korrespondent.net", "transifex.com", "ankiweb.net", "imgflip.com", "moneyweb.co.za", "wordpress.com", "iplocation.net", "porch.com", "youporn.com", "tmj4.com", "nimbusweb.me", "classifiedads.com", "myvidster.com", "cafepress.com", "viewsonic.com", "pakwheels.com", "idntimes.com", "mhthemes.com", "newswise.com", "universe.com", "aboutus.com"]
# Relogin worked on 36/50 sites
# Exclude the following sites where the relogin was not successful
# Mostly google SSO, others are marked as FPs this time or other crashes
failed = ["twitcasting.tv", "tool.lu", "office.com", "pbslearningmedia.org", "playground.xyz", "truecar.com", "amazon.in", "coursera.org", "imgflip.com", "moneyweb.co.za", "wordpress.com", "porch.com", "viewsonic.com", "newswise.com"]
# Keep the remaining sites in their original order
retro_names = [site for site in retro_names if site not in failed]
retro_names

In [None]:
# Get the results
# The unpruned runs are stored under "<site>-unpruned" in site_results.
# NOTE: this rebinds unpruned_names, unpruned_sites and the dat/conf *_unpruned frames
# that the first (15-site) comparison above also used.
unpruned_names = [f"{site}-unpruned" for site in retro_names]
unpruned_sites = site_results.loc[site_results["site"].isin(unpruned_names)]
dat_unpruned, conf_both_unpruned, conf_any_unpruned = get_pipeline_stats(unpruned_sites)

In [None]:
# Retroactively get the predictions and the predictions without the reduced set of trees
dat_unpruned["predictions"] = get_predictions_retroactive(dat_unpruned)
dat_unpruned["predictions_all"] = get_predictions_retroactive(dat_unpruned, methods="all")
# Computed once; the cell used to repeat this identical assignment a second time.
dat_unpruned["combinations_after_basic_pruning"] = get_combs_after_basic_pruning(dat_unpruned)

In [None]:
# Get TP, retest urls, all combinations and predictions
sites = dat_unpruned["site"].tolist()


def _column_to_tuples(column):
    """Run url_list_to_tuples on one column of dat_unpruned, paired with the site names."""
    return url_list_to_tuples(dat_unpruned[column].tolist(), sites)


all_combinations = _column_to_tuples("dyn_conf_urls")
all_combinations_basic = _column_to_tuples("combinations_after_basic_pruning")
predicted_trees = _column_to_tuples("predictions")
predicted_trees_all = _column_to_tuples("predictions_all")
retest_urls = _column_to_tuples("dyn_conf_retest_urls")
ground_truth = _column_to_tuples("confirmed_leak_urls")

In [None]:
# Pairwise comparisons with logging enabled (log=True). only_urls is overwritten by every
# call, so afterwards it holds the result of the last comparison only.
# only_urls = get_only_both({"true_positives": true_positives, "all_combinations": all_combinations}, ("true_positives", "all_combinations"), log=True)
only_urls = get_only_both({"true_positives": ground_truth, "all_combinations_basic": all_combinations_basic}, ("true_positives", "all_combinations_basic"), log=True)
only_urls = get_only_both({"true_positives": ground_truth, "predictions": predicted_trees}, ("true_positives", "predictions"), log=True)

# only_urls = get_only_both({"retest_urls": true_positives, "all_combinations_basic": all_combinations_basic}, ("retest_urls", "all_combinations_basic"), log=True)

only_urls = get_only_both({"all_combinations": all_combinations, "all_combinations_basic": all_combinations_basic}, ("all_combinations", "all_combinations_basic"), log=True)
# One URL missing from all combinations for pdffiller for some reason?
# only_urls.loc[only_urls["key"] == "all_combinations_basic"]

In [None]:
# In total, per site/browse/method
# This is for leak channels/leak urls (i.e., combination of inclusion method and target URL)

# How good does basic pruning works?
get_stats(ground_truth, all_combinations_basic, all_combinations, "Ground truth not in basic pruning")
# get_stats(retest_urls, all_combinations_basic, all_combinations, "Retests not in basic pruning")

# How good does advanced pruning works?
# Remove all URLs from ground_truth and retest_urls not in all_combinations_basic
# Otherwise, we compare the wrong things, as the trees work with the output of all_combinations_basic, so vulnerable URLs found that were removed by the basic_pruning cannot be found by the trees
_key_cols = ["method", "url", "browser", "site", "nogroup"]


def _restrict_to_basic(frame):
    """Return the key columns of those rows of `frame` that also occur in all_combinations_basic."""
    merged = frame.merge(all_combinations_basic, on=_key_cols, how="left", indicator=True)
    in_both = merged["_merge"] == "both"
    return merged.loc[in_both][_key_cols]


gt = _restrict_to_basic(ground_truth)
rt = _restrict_to_basic(retest_urls)

get_stats(gt, predicted_trees, all_combinations_basic, "Ground truth not in trees")
get_stats(gt, predicted_trees_all, all_combinations_basic, "Ground truth not in trees_all")
# get_stats(rt, predicted_trees, all_combinations_basic, "Retests not in trees")
# get_stats(rts, predicted_trees_all, all_combinations_basic, "Retests not in trees_all")

In [None]:
# Get unique URLs + unique URLs missed
# Same comparisons as above, but without the inclusion method and the site columns.
url_cols = ["url", "browser", "nogroup"]
get_stats(ground_truth[url_cols], all_combinations_basic[url_cols], all_combinations[url_cols], "Ground truth not in basic pruning")
# This call compares against the tree predictions, so it is labelled accordingly
# (the label used to be a copy of the basic-pruning one).
get_stats(gt[url_cols], predicted_trees[url_cols], all_combinations_basic[url_cols], "Ground truth not in trees")

In [None]:
# Basic pruning stats on complete dataset!

# Size reduction in starting URLs
# If we test every pruned URL in all inclusion methods (12), a lot will be tested
# Work on a copy so that adding the column neither warns (SettingWithCopyWarning)
# nor touches site_results.
df = site_results.loc[site_results["num_urls"] > 0].copy()
df["basic_pruning_reduction"] = df.apply(get_basic_pruning_reduction, axis=1)
# The column list used to end in a stray `, "`, which made the whole cell a syntax error.
display(df[["site", "num_urls", "num_basic_pruning", "basic_pruning_reduction"]].describe())
df["basic_pruning_reduction"].plot(kind="hist")

## Crawling and Cookies

- Header statistics and co
    - rare header/status-codes
    - rare/common response pairs
    - changes in rare headers/status-codes in response pairs
- Cookies collected (SameSite, HttpOnly, Secure)

In [None]:
# NOTE(review): presumably the maximum per-browser/method URL count (580, see the notes below)
# times the 12 inclusion methods - TODO confirm what this number is used for.
580 * 12

In [None]:
# Load the URL data (helper from dil_preprocess). NOTE: this rebinds `df`.
df = get_url_data(None)
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() only added a stray "None" line.
df.info()

In [None]:
# Process response headers (and status codes)
# process_responses is applied row by row; result_type="expand" turns list-like results into columns.
header_frame = df.apply(process_responses, axis=1, result_type="expand")

In [None]:
# Pass the processed responses to the display_changed helper from helper_dyn.
display_changed(header_frame)

In [None]:
# Response summary with the helper's second argument set to False
# (NOTE(review): meaning of the flag is not visible here - see helper_dyn).
display_response_summary(header_frame, False)

In [None]:
# Same summary with the helper's second argument set to True.
display_response_summary(header_frame, True)

In [None]:
# Restrict dat (from the pipeline-stats cell) to rows with a non-empty crawl_end.
# NOTE: this rebinds `dat`, so the later cells only see the crawled sites.
dat = dat.loc[dat["crawl_end"] != ""]
# Per-site cookie statistics: get_cookie_stats is called once per "site" group.
cookie_stats = dat[["cookies", "site"]].groupby(["site"], group_keys=False).apply(get_cookie_stats)
display(cookie_stats.describe())
# Last expression: frequency of each sameSite value
cookie_stats["sameSite"].value_counts()

# Does it leak

- Old Top 1K info:
    - Start sites: 18 (-1 redirects and crawl fails)
    - Start URLs:
        - min: 84
        - max: 2500
        - average: 650
    - Pruned URLs:
        - min: 2
        - max: 185
        - average: 74
    - Input rows:
        - min: 16
        - max: 1225
        - average: 557
    - Dyn conf per site/site-browser/method/browser-method:
        - min: 2, 1, 45(0, embed), 1(0, audio)
        - max: 598, 323, 1160 (window.open), 580
        - average: 150, 75, 255, 141
        - interesting stuff: firefox has many audio and object urls, chrome has many link-stylesheet and embed urls, other methods are more or less equal
    - Reason about pruning by testing a couple of sites without pruning, to see if we prune too much; it definitely gives us a nice speedup
    - Dyn retest per site/site-browser/method/browser-method:
        - min: 1(0), 1(0), 1(0), 1(0)
        - max: 58, 51, 123 (window.open), 90
        - average: 15, 10, 19, 14
        - interesting stuff: mainly only window.open works (firefox more than chrome, maybe because of chrome default lax SameSite?)
    - Working ones:
        - min: 1(0)
        - max: ~30
        - average: 2
        - Interesting stuff:
            - Only window.open + Iframe-csp(1x) works
                - Manual confirmation:
                    - amazon.in: login does not work
                    - coursera.org: different frame counts, depending on login state (might be due to a big overlay that we need to accept something about privacy after we created an account) + some seem to be FPs as we might not wait long enough for window.open!, + another problem is that we are too strict for postMessage
                    - technologyreview: FPs related to postMessage?; op_win_history seems to work reliably! (we need to add win_history back to the stable methods, after we made it more stable) 
            - Due to SameSite cookies?:
                - All cookies are SameSite/all session cookies are SameSite??
            - Investigate URLs that worked once, but not twice. Is our criterion too strict? (or is there noise that break our exact match requirement?):
                - unstable results, too strict! only compare element in question + smooth some of them?
                
- New stats below
    - dat working URLs

In [None]:
# Summary statistics of the URL/row counts per site
count_cols = ["num_urls", "num_basic_pruning", "num_input_rows", "dyn_conf_firefox", "dyn_conf_chrome"]
display(dat[count_cols].describe())

# Groupings for which the number of URLs is reported
groupings = [["site"], ["site", "browser"], ["method"], ["browser", "method"], ["site", "browser", "method"]]

for col in ["dyn_conf_urls", "dyn_conf_retest_urls", "confirmed_leak_urls"]:
    print(col)
    # parse_method_url is applied per row and fills `acc`, from which the frame is built
    acc = []
    dat.apply(parse_method_url, col=col, acc=acc, axis=1)
    dyn_conf = pd.DataFrame(acc)

    with pd.option_context('display.max_rows', None):
        for grouping in groupings:
            print(grouping)
            dyn_browser = dyn_conf.groupby(grouping)[["url"]].count()
            # min/max/mean of the per-group counts, then the counts themselves (largest first)
            display(dyn_browser.agg(["min", "max", "mean"]))
            display(dyn_browser.sort_values("url", ascending=False))

In [None]:
# Build the frame of confirmed results from conf_any (helper from helper_dyn).
df_all = get_conf_dfs(conf_any) #dat.loc[dat["crawl_end"] != ""

In [None]:
# All confirmed vulns
# Per column: number of distinct values, the distinct values themselves and the non-null count;
# first over the whole frame, then per (site, browser).
display(df_all.agg(["nunique", "unique", "count"]))
display(df_all.groupby(["site", "browser"]).agg(["nunique", "unique", "count"]))

In [None]:
# Show which sites are responsible for most results
# Distinct values per site for every column except the two value_* columns,
# sorted by the number of distinct URLs.
df_all[df_all.columns.difference(["value_cookies", "value_no_cookies"])].groupby("site").nunique().sort_values("url")

In [None]:
# Two info frames for all confirmed results (same helper as used for the pruned/unpruned comparison).
info_frame, info_frame_new = get_info_frames(df_all)

In [None]:
# All inclusion methods/leak methods
# On how many URLs/sites do they work? For chrome, firefox, and both!

# subtract the cases which are explainable by different browser parsing behavior (e.g., corb)
# simply remove all leak_urls rows that were tested at least once in both browsers
# all these non-overlapping cases can be explained by tree pruning/corb and co.
# all the remaining have other reasons that one can investigate
display(info_frame)

In [None]:
# Show the second info frame without row truncation.
# Fully qualified option key: the bare "max_rows" pattern also matches
# "styler.render.max_rows" on pandas >= 1.4 and then raises OptionError.
with pd.option_context("display.max_rows", None):
    display(info_frame_new)

In [None]:
# All gp_window_postMessage results, sorted by their "jaro" column, without row truncation.
# Fully qualified option key: the bare "max_rows" pattern also matches
# "styler.render.max_rows" on pandas >= 1.4 and then raises OptionError.
with pd.option_context("display.max_rows", None):
    display(df_all.loc[df_all["method"] == "gp_window_postMessage"].sort_values("jaro"))

In [None]:
# Plot over url_query_len, long queries might have sessionids or similar in the query and might be unexploitable
# Short query strings are probably exploitable
# Line plot of url_query_len in the row order of df_all
df_all[["url_query_len"]].plot(kind="line")

In [None]:
# Summary statistics of the URL and query lengths
display(df_all[["url_len", "url_query_len"]].describe())
# URLs ordered by query length: show the five shortest and the five longest
urls_by_query_len = df_all.sort_values("url_query_len")["url"]
display(urls_by_query_len.head(5).to_list())
display(urls_by_query_len.tail(5).to_list())

In [None]:
# Frequency of each (value_cookies, value_no_cookies) pair, then of each column on its own
display(df_all[["value_cookies", "value_no_cookies"]].value_counts().to_frame())
display(df_all["value_cookies"].value_counts().to_frame())
display(df_all["value_no_cookies"].value_counts().to_frame())

# Cookiehunter

- Emails received:
    - 10/11 (first run): Vimeo, Pinterest, WordPress, MailChimp, Redfin, python.org, OpenStreetMap, Pocket, Stripe, Dell, OkezoneID, Glassdoor, O'Reilly, GoFundMe, StackOverflow, Wufoo, Statcounter, readthedocs, MIT Technology Review, Houzz; Total of: 20
    - 18/19 (second run): Chess.com, Ning, Amazon.com, Hootsuite, Tumblr, Medium, postgresql (fb), Amazon.es, indiegogo (fb), digg.com (fb), NHS, Amazon.ca, buzzfeed (fb), imdb, naver (fb), amazon.it, openstreetmap (fb), coursera, manoramaoline, stack exchange (fb), globe and mail (fb), timeanddate (fb), pixnet (fb), amazon.co.jp, imageshack, adobe (fb), prezi (fb), dating service (fb), avast, bitnami, opera (fb), imgur (fb), envato, unsplash, goodreads, Amazon.de, Amazon.fr, Indeed: Total: 21 normal + 15 fb = 36
- Google SSO apps: adobe, appsflyer, asos login, atlassian, bitnami, box, change.org, coursera, digg.com, envato, evernote, hackmd, hatena-WWW, hubspot, ilovepdf, imgur, indeed, jimdo, le monde, mamba, medium, openstreetmap, opera, pixabay, pocket, prezi, soundcloud, squarespace, stack exchange, sydney morning, timeanddate, trello, tumblr, ups, urban dictionary, yahoo, zoom: Total: 37
- Facebook SSO apps: hackmd, netflix, pinterest, zoom.us, vk, the new york times, soundcloud, medium, ilovepdf, redfin, vox, squarespace, ning, the conversation, postgresql, indiegogo, digg.com, buzzfeed, naver, openstreetmap, stack exchange, globe and mail, timeanddate, pixnet, adobe, prezi, dating service, opera, imgur, ok.ru, overblog, unsplash, goodreads: Total: 33
- Stats from mongodb:
    - ready (i.e., login + signup URL found): 207
    - sso_ready (i.e., sso URL found): 452 (many fps!, e.g., fb like or share button)
    - registered: 46
    - verified: 2 
    - logged_in: 40
    - false_positive: 18 true, 21 false + a couple that have nothing
        - check for FPs: 41 times
        - could not relogin: 7 times
        - appear to be logged in even without cookies: 13 times
        - 18x start test pipeline (everything worked), not marked as FP in database, but not started: gofundme.com, statcounter.com, orreily (probably artifacts from first vs second run!) 
- Google/FB/Amazon, etc.: how many duplicate URLs are there? Needed to better compare with the stats from the paper. Problem: redirects (e.g., fb.me); for country URLs, a string match of the part before the first dot should work (login success on 95 of Top 1K):
    - Unique sites after basic pruning (exact string match before the first dot): 932, google 28 times, amazon 9 times, ebay 3 times, couple of websites 2 times
    - Cannot easily check redirections
- Problems (look at screenshots, logs, and counts):
    - timeout/took too long: 62 (many asian sites?)
    - Could not fetch homepage: 55 (mostly redirect websites, e.g., fb.com)
    - Did not find any signup/login/sso URLS: 277 (mix of cdns/ads, messengers (telegram, fb, ...), unavailable websites/do not serve a front page, real FPs, ...)
        - Not enough login options for this  domain (includes above): 357
    - Cookiehunter crashed for various reasons: 157
        - 'NoneType' object has no attribute 'send_keys': 73
            - Facebook profile is destroyed: 33
            - Google profile is destroyed: 43
        - Other fatal errors (Message: stale element reference: element is not attached to the page document, proxy issues, move_to requires web element, ...): 84
    - captchas:
        - Found on at least 89 websites
        - Captcha login (mongodb): 9 websites (what does this mean?)
        - Manage to login/register: 1 website (did not have captcha?, avast) + 6 with fallback SSO
    - complicated signup/in forms, e.g., password rules, mobile numbers
    - complicated verifications: e.g., OTP (example amazon)
    - accept cookies form blocks website
    - gmail token revocation: 13 times invalid token grant (manually renewed several times during the crawl)
    - ...

In [None]:
# Site name up to the first dot (e.g. "google" for every google.* domain).
# top_1k is a slice of site_results, so the column is added through assign() on a new
# frame instead of an in-place assignment that triggers SettingWithCopyWarning.
top_1k = top_1k.assign(pruned_site=top_1k["site"].str.split(".").str[0])
# How often each shortened name occurs, rarest first
top_1k.groupby("pruned_site")["pruned_site"].value_counts().sort_values()

In [None]:
# Count the login status messages of the top-1k sites by pattern.
display(top_1k["login"].str.count("Will reset$").sum())
# display(top_1k.loc[top_1k["login"].str.contains("FP.")]["login"].values)

# Raw string: "\." in a normal string literal is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning on Python >= 3.12); the regex itself is unchanged.
display(top_1k["login"].str.count(r"\.\.$").sum())
# display(top_1k.loc[top_1k["login"].str.contains("\.\.$")]["login"].values)

display(top_1k["login"].str.count("Not enough").sum())
# display(top_1k.loc[top_1k["login"].str.contains("Not enough")]["login"].values)

# Celery and Redis stuff 

- Manually check redis and/or restart celery tasks

In [None]:
import redis
import json

# Redis client with the library's default connection settings
r = redis.Redis()
# WARNING: writes to redis - sets the "first_count" key of the 172.17.0.1:44320 entry to 0
r.set("172.17.0.1:44320::first_count", 0)
# Last expression: the stored "second_count" value of the same entry
r.get("172.17.0.1:44320::second_count")

In [None]:
# Get info about tree_preprocess from redis
# Both keys are expected to hold JSON; json.loads fails if a key is missing (r.get returns None).
display(json.loads(r.get("known_unhandled_bodies")))
display(json.loads(r.get("untreated_cts")))

In [None]:
# Load the JSON value stored in redis under the key "wuzzuf.net" and show it.
# NOTE(review): the output may contain live session cookies - clear it before sharing the notebook.
cookies = json.loads(r.get("wuzzuf.net"))
cookies

In [None]:
# WARNING: writes to redis - sets the "first_count" key of chartink.com-unpruned to 0
r.set("chartink.com-unpruned::first_count", 0)

In [None]:
# WARNING: writes to redis - sets the "second_count" key of bizreach.jp to 0
r.set("bizreach.jp::second_count", 0)

In [None]:
from celery import Celery
# Celery client named "helper": AMQP broker on localhost (guest user), RPC result backend
app = Celery("helper", broker="pyamqp://guest@localhost//", backend="rpc://")

In [None]:
# Bug, datadoghq.com and jsfiddle.net did not execute final 4 times, as only one browser got retested
# WARNING: has side effects - queues the "does_it_leak.start" task for megogo.net-unpruned on the "leak" queue
app.send_task("does_it_leak.start", ["megogo.net-unpruned"], queue="leak")

In [None]:
#app.send_task("start_node.test_site", args=["bizreach.jp", cookies], queue="node")