In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# the local dil_* helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import json
import time
import pandas as pd
import numpy as np

from database_connector import connect, postgresql_to_dataframe
from dil_preprocess import get_url_data, basic_pruning
from dil_predict import init, predict_trees, reduce_leaky_endpoints
from dil_postprocess import get_working_incs, get_dyn_urls, get_dyn_results, get_retest_urls, get_working_urls, get_working_urls_channels

import qgrid
#import ipysheet

import redis
# Redis client created with the library's default connection arguments; a later
# cell uses it to read and reset the per-site "::first_count" / "::second_count" keys.
r = redis.Redis()

In [None]:
# Run dil_predict.init() once and keep its return value. `models` is not referenced
# again in this notebook -- presumably init() prepares state that predict_trees
# relies on (TODO confirm).
models = init()

In [None]:
# Select the site under analysis and load its crawl data
# (get_url_data presumably queries the database -- TODO confirm).
site = "bitly.com"
dat = get_url_data(site)
# Print the column/dtype/non-null summary as a quick sanity check of the load.
dat.info()

In [None]:
# Row counts per value of the `cookies` column.
dat.groupby("cookies").count()

In [None]:
# Side-by-side comparison of the `file`-based and the Tika-based response body
# detection. Both seem to have problems (e.g. incomplete HTML, JSON, ...), and
# Tika behaves somewhat strangely for empty files.
body_info = dat[["resp_body_info", "resp_body_tika_info"]]
qgrid.show_grid(body_info)

In [None]:
# Per request URL: number of distinct `cookies` values and number of rows,
# with the URLs that have the most distinct values first.
(
    dat.groupby("req_url")["cookies"]
    .agg(["nunique", "count"])
    .sort_values(by="nunique", ascending=False)
)

In [None]:
# Prune the crawl data. `af` (the "attack frame") is fed to predict_trees below,
# `d` and `poss` are passed on to get_dyn_urls; `results` is not used again in
# this notebook.
af, d, poss, results = basic_pruning(dat, log=True)

In [None]:
# Open problem: many of the URLs in the attack frame may not be
# predictable/guessable by a real attacker.
qgrid.show_grid(af)

In [None]:
# Methods that can leak the same thing are already reduced to one method
# firefox 24 working leak_channels, 11 inc methods (not link-prefetch)
# chromium 19 working leak_channels, 10 inc methods (not link-prefetch, not object)
# Overlap: 18 leak channels, 6 only in firefox, 1 only in chromium

In [None]:
# Plain (non-interactive) rendering of the attack frame returned by basic_pruning.
af

In [None]:
# Apply predict_trees to the attack frame; the result is reduced in the next cell.
leaky_endpoints = predict_trees(af, log=True)

In [None]:
# Reduce the predictions. `leaks` is the input for both get_working_incs and
# get_dyn_urls further down.
leaks = reduce_leaky_endpoints(leaky_endpoints, log=True)

In [None]:
# Interactive table of the reduced result: all URLs x working leak methods.
qgrid.show_grid(leaks)

In [None]:
# Derive `incs` from the reduced leaks (presumably the working inclusion methods --
# TODO confirm), show it, and keep it for get_dyn_urls in the next cell.
incs = get_working_incs(leaks)
incs

In [None]:
# Build the URL collection for the dynamic confirmation in pruned mode
# (unpruned=False). A later cell dumps `urls` to data/<site>.json.
urls = get_dyn_urls(leaks, incs, d, poss, log=True, unpruned=False)

In [None]:
# Get all results of the dynamic confirmation for the current site
#site = "pdffiller.com"
df = get_dyn_results(site)
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output.
df.info()
# Number of rows per value of `timed_out` (shown as the cell result)
df[["timed_out"]].value_counts()

In [None]:
# Rows of browser 1 with retest_num == 1, ordered so that related rows of the
# same test end up next to each other.
is_browser_1 = df["browser_id"] == 1
is_retest_1 = df["retest_num"] == 1
df.loc[is_browser_1 & is_retest_1].sort_values(by=["test_id", "retest_num", "cookies"])

In [None]:
# Full result table first, then -- per (test, browser) -- the number of distinct
# ids observed in each of the four result-id columns.
result_id_cols = ["events_id", "global_properties_id", "object_properties_id", "window_properties_id"]
display(df.sort_values(by=["test_id", "browser_id", "cookies"]))
display(df.groupby(["test_id", "browser_id"])[result_id_cols].nunique())

In [None]:
# Debugging notes: what happened with Firefox (and Chrome)?
# - Firefox: all results identical (maybe setting the cookies did not work?).
# - Chrome: the only differing results are onblur/window.height, i.e. random
#   noise, so it also looked as if setting the cookies did not work.
# - Checked: setting the cookies did work. So the problem might be in our test
#   application (e.g. the cookie is invalid), or the browsers do not send the
#   cookies along for some reason.
# - Secure flag? Does the leaker website have to be https as well? (does not matter too much)
# - Observation: the cookie is only present for one request, then it is gone.
# - Cause: Django kills invalid session cookies by setting a new, empty one.
# - Fix: we now log in directly, which ensures a valid session cookie.
print(df.columns)
browser_1_cols = [
    "test_id", "browser_id", "retest_num", "cookies",
    "events_id", "object_properties_id", "global_properties_id", "window_properties_id",
]
browser_1_rows = df.loc[df["browser_id"] == 1].sort_values(
    by=["test_id", "browser_id", "retest_num", "cookies"]
)
qgrid.show_grid(browser_1_rows[browser_1_cols])

In [None]:
# Step 1: determine which URLs have to be retested, based on the results loaded
# into `df` above. Only the first return value is needed here.
retest, _, _ = get_retest_urls(df, log=True)
# The retest itself has to be run outside of this cell before continuing.
print("run retest")
display(retest)

# Retest done; check whether it worked: we need the same results twice for
# cookies/cookies and for no-cookies/no-cookies, and different results twice for
# cookies/no-cookies (the latter is implicitly given by the first condition, as we
# only check the URLs that had different results in the first test).
# Dynamic FP confirmation is hard/error-prone: e.g., a postMessage carrying a
# timestamp will be counted as a FP by our method
# (it can be a real FP, if both states receive the postMessage with different timestamps).
# Other example: a /random_image/ endpoint that is only available to members will
# always have different image dimensions.
# One solution would be to only check whether a postMessage was received at all
# (but that has another problem: both states may receive a distinct postMessage).

# Step 2: reload the data once the retest is done (fresh get_dyn_results call,
# this time with retest_num=1) and keep the second and third return values.
_, pot, pot_leaks = get_retest_urls(get_dyn_results(site), log=True, retest_num=1)

print("reloaded data")
# Step 3: check that each potential leak is stable (has the same result twice)
working, leak_urls = get_working_urls(pot, pot_leaks, log=True)
display(working)
display(leak_urls)

# Alternative (disabled) implementation of the stability check, kept for reference
#conf = pot.groupby(["browser_id", "test_id", "cookies"])[["events_id", "global_properties_id", "object_properties_id", "window_properties_id"]].agg(["nunique", "count"])
#conf_miss = conf[conf.filter(regex="count", axis=1).isin([1]).all(axis=1)]
#print(f"Dropping missing URLs: {conf_miss.shape}")
#conf = conf.drop(conf_miss.index) # Does not drop the corresponding one (cookie/non-cookie)
#conf = conf[conf.filter(regex="nunique", axis=1).isin([1]).all(axis=1)].reset_index()
##display(conf)
#conf = df.loc[(df["browser_id"].isin(conf["browser_id"])) & (df["test_id"].isin(conf["test_id"])) & (df["retest_num"] == 1)].sort_values("test_id")
#display(conf)
#display(conf[~conf.isin(dup)].dropna())

### Not all URLs were found (and not all methods)
Find out why?
- ~~Initial crawl was incorrect~~ (redo crawl: answer was not the problem)
    - all urls found + cookies are correct
- ~~Preprocessing/basic pruning too strict/incorrect~~ several fixes applied
- ~~Trees are inaccurate (too strict)~~
- ~~Postprocessing is incomplete/has errors~~ several fixes applied
- ...

In [None]:
# Compare which methods work per (url, browser, inc_method) in the pruned run
# against the "<site>-unpruned" run.

def _methods_per_inclusion(working):
    """Return the unique `method` values per (url, browser, inc_method) as a one-column frame."""
    return (
        working.sort_values(["url", "method", "inc_method", "browser"])
        .groupby(["url", "browser", "inc_method"])["method"]
        .unique()
        .to_frame()
    )

# Compute working_df here, from this site's results, instead of relying on a value
# left behind by a later cell (which made this cell fail on Restart & Run All and
# could silently mix in another site's results). This mirrors how the unpruned
# frame is built right below.
working_df, _, _ = get_working_urls_channels(get_dyn_results(site), log=False)
summary = _methods_per_inclusion(working_df)
#display(summary)

df_unpruned = get_dyn_results(f"{site}-unpruned")
working_df_unpruned, _, _ = get_working_urls_channels(df_unpruned, log=False)
summary_unpruned = _methods_per_inclusion(working_df_unpruned)
#display(summary_unpruned)

# only unpruned:
# - /leak14/: only leaks for `sec_fetch_site` == "cross-site", so it is removed by the basic pruning step (should not occur in the wild)
# - ~~others, e.g., /leak16/ iframe: bug in preprocess xfo: fixed~~
# - others, e.g., /leak6/ link-prefetch: equivalent to other methods, or trees not used because often not reliable enough:
#   rethink some of this? event_set of some inclusion methods seems to work?, trees could also be inaccurate? :(
# - /leak9/: redirect, depending on the resulting page (e.g., not existing vs existing) many other methods can work too
#   (for redirect ones, add other methods as well?)

# only pruned:
# ~~iframe-csp: bug, was missing from test~~

# Use the fully qualified option name: the bare "max_rows" pattern is ambiguous on
# newer pandas versions (it also matches styler.render.max_rows) and raises OptionError.
with pd.option_context("display.max_rows", None):
    display(summary.join(summary_unpruned, rsuffix="-unpruned", how="outer"))

In [None]:
# Working channels of the unpruned run whose URL contains "leak9" (the redirect
# case discussed in the notes of the previous cell).
working_df_unpruned.loc[working_df_unpruned["url"].str.contains("leak9")]

In [None]:
# Same lookup in the pruned and in the unpruned results: all rows whose apg_url
# matches the "/img/ ... leak1/" pattern, in a comparable row order.
leak1_img_pattern = r"/img/.*leak1/"
row_order = ["test_id", "browser_id", "retest_num", "cookies"]
for results_frame in (df, df_unpruned):
    matches = results_frame["apg_url"].str.contains(leak1_img_pattern)
    display(results_frame.loc[matches].sort_values(row_order))

In [None]:
# Show the rows of two specific tests.
# NOTE(review): `tf` is not defined in any visible cell of this notebook -- it
# must come from earlier kernel state; confirm where it should be loaded from.
for wanted_test_id in (4659295, 4659375):
    display(tf.loc[tf["test_id"] == wanted_test_id])

In [None]:
import ipywidgets as widgets
# The module itself is needed as well: the fallback branch below and add_column
# call ipysheet.sheet / ipysheet.column, which raised NameError before.
import ipysheet
from ipysheet import Cell, column, from_dataframe, to_dataframe, to_array

# Restore the editable sheet from the CSV written by the save cell; if that is
# not possible, start with an empty one-column sheet of `nrows` rows.
try:
    testapp_frame = pd.read_csv("testapp_frame")
    testapp_frame = testapp_frame.fillna('')
    sheet = from_dataframe(testapp_frame)
except (NameError, FileNotFoundError):
    nrows = 5
    sheet = ipysheet.sheet(columns=1, rows=nrows)
    column1 = ipysheet.column(0, [None] * nrows)

# Controls for growing the sheet; the handlers are attached further down.
row_button = widgets.Button(description='Add Row')
column_button = widgets.Button(description='Add Column')
out = widgets.Output()

def add_row(_):
    """Button callback: append one empty row to the global `sheet`.

    Every entry of ``sheet.cells`` is assumed to be a whole column (this may
    break otherwise): its ``row_end`` is moved down by one and its ``value``
    list is extended with a ``None`` placeholder for the new row.

    The click event argument is ignored.
    """
    sheet.rows += 1
    for col in sheet.cells:
        col.row_end += 1
        # Assign back to the cell's value. The previous code only rebound the
        # local loop variable, so the column never received the new entry.
        col.value = list(col.value) + [None]  # change None to whatever default value you want

def add_column(_):
    """Button callback: append one empty column to the global `sheet`.

    Known limitation: this only works for the initial (empty) sheet, not after
    data has been imported. Adding a column, saving and reloading the frame
    works; adding a column and editing it directly does not.

    The click event argument is ignored.
    """
    # Only individual names are imported from ipysheet at cell level, so bring the
    # module itself into scope here; otherwise the call below raises NameError.
    import ipysheet

    sheet.columns += 1  # Need to increment the column count first to avoid a ValueError
    ipysheet.column(sheet.columns - 1, [None] * sheet.rows)
            
# Attach the handlers to their buttons ...
row_button.on_click(add_row)
column_button.on_click(add_column)

# ... and render the button row above the sheet.
display(widgets.VBox([widgets.HBox([row_button,column_button]),sheet]))

In [None]:
# Snapshot the current sheet contents into a DataFrame and persist it to the
# same "testapp_frame" file that the widget cell reads on start-up.
testapp_frame = pd.DataFrame(to_array(sheet))
testapp_frame.to_csv("testapp_frame", index=False)
testapp_frame

In [None]:
# For every tested site: load its dynamic results and show the working URLs and
# the working channels. Note that the loop rebinds the notebook-level names
# `site`, `df`, `working_df`, `working_urls` and `url_dict`; after the loop they
# hold the values of the last site in the list.
for site in [
    "172.17.0.1:44320",
    "vimeo.com",
    "amazon.in",
    "unsplash.com",
    "goodreads.com",
    "digg.com",
    "coursera.org",
    "epa.gov",
    "chess.com",
    "stripe.com",
    "avast.com",
    "bitnami.com",
    "envato.com",
    "ning.com",
    "postgresql.org",
    "urbandictionary.com",
    "readthedocs.io",
    "technologyreview.com",
    "hackmd.io",
]:
    df = get_dyn_results(site)
    print(f"Doing {site}, df.shape: {df.shape}")
    working_df, working_urls, url_dict = get_working_urls_channels(df, log=False)
    display(working_urls, working_df)
    # pd.DataFrame.from_dict(json.loads(json.dumps(working_df.to_dict("list"))))

In [None]:
# Reset the per-site counters, to be able to retest a site
import json
# Example of storing a site's login cookies in Redis (disabled). The session and
# CSRF token values are redacted on purpose: never commit real credentials.
#r.set("hackmd.io", json.dumps([ {'domain': 'hackmd.io', 'name': 'sectionFilterApplied', 'value': 'true', 'path': '/', 'httpOnly': False, 'secure': False}, {'domain': 'hackmd.io', 'secure': True, 'value': '<REDACTED>', 'expiry': 1725511081, 'path': '/', 'httpOnly': True, 'name': 'connect.sid'},  {'domain': 'hackmd.io', 'secure': True, 'value': 'en-US', 'expiry': 1655837481, 'path': '/', 'httpOnly': False, 'name': 'locale'},  {'domain': 'hackmd.io', 'name': 'sectionsSortStrategy', 'value': 'cat_new_to_old', 'path': '/', 'httpOnly': False, 'secure': False}, {'domain': 'hackmd.io', 'name': 'overviewLayoutStrategy', 'value': '', 'path': '/', 'httpOnly': False, 'secure': False}, {'domain': 'hackmd.io', 'name': '_csrf', 'value': '<REDACTED>', 'path': '/', 'httpOnly': False, 'secure': False}, {'domain': 'hackmd.io', 'name': 'notesSortStrategy', 'value': 'new_to_old', 'path': '/', 'httpOnly': False, 'secure': False}]))
# Print the current first-run counter, then set both counters back to 0.
# Note: this loop rebinds the notebook-level `site` variable.
for site in ["172.17.0.1:44320", "172.17.0.1:44320-unpruned"]:
    print(r.get(f"{site}::first_count"))
    r.set(f"{site}::first_count", 0)
    r.set(f"{site}::second_count", 0)

# OLD stuff below

In [None]:
from helper_functions import get_ef, get_gf, get_of, get_wf

# Load the four lookup frames and left-join each of them onto the potential
# leaks (`pot`) via its id column, so the raw observations can be inspected.
ef, gf, of, wf = get_ef(), get_gf(), get_of(), get_wf()
res = pot
for lookup_frame, id_column in (
    (ef, "events_id"),
    (gf, "global_properties_id"),
    (of, "object_properties_id"),
    (wf, "window_properties_id"),
):
    res = res.merge(lookup_frame, how="left", on=id_column)

row_order = ["test_id", "browser_id", "cookies"]
display(pot.sort_values(row_order))
display(
    res.sort_values(row_order)[
        ["test_id", "browser_id", "cookies", "retest_num", "op_frame_count", "gp_window_postMessage"]
    ]
)
res["event_set"]

In [None]:
# All result rows that belong to a potential-leak test and whose
# window_properties_id is not 109.
# NOTE(review): 109 is a magic id -- presumably a "default"/uninteresting
# window-properties record; confirm and give it a named constant.
qgrid.show_grid(df.loc[df["test_id"].isin(pot["test_id"]) & (df["window_properties_id"] != 109)].sort_values(["test_id", "cookies"]))

In [None]:
import json
import subprocess
import os
# Save the URLs dict as JSON and (manually) start the dynamic confirmation.
# Open points / ideas:
# - Run the automator framework with the correct db settings + dict input + higher timeouts
# - Start the framework twice? Once with cookies and once without cookies?!
# - Add a cookies column to the db / what about test? (maybe better to put it into another table in the db!)
# - XSSI handling needs to be added
# Note: `site` is whatever value the most recently executed cell left behind.
url_dict_path =  f"data/{site}.json"
with open(url_dict_path, "w") as f:
    json.dump(urls, f)
print(site)
# Point pipenv at the project's .env file for the (currently disabled) automator run below.
os.environ["PIPENV_DOTENV_LOCATION"] = "../.env"
#print(subprocess.check_output(["pipenv", "run", "python", "test_browsers.py", "local_grid", f"../analysis/{url_dict_path}", site, "True", "Test"], cwd="../automator"))

In [None]:
import glob
# Old experiment: run every MOJO tree over the whole attack frame and show the
# URLs for which a tree predicts more than one distinct value.
at = af.reset_index(drop=True)
display(at)
start = time.time()
# NOTE: inside a glob character class the comma is literal, so "[1,2]" matches
# the directory names "1", "2" and ",".
for file in glob.glob("trees/tenmin/mojo/[1,2]/*.mojo"):
    # The unconditional break disables the loop body: nothing below it runs.
    break
    if "conflicted" in file:
        continue
    print(file)
    # NOTE(review): `h2o` and `th_headers` are not defined in any visible cell, and
    # the jar path is a hard-coded absolute local path.
    res = h2o.mojo_predict_pandas(at[th_headers], file, genmodel_jar_path="/home/jannis/Downloads/h2o-3.32.1.1/h2o.jar")
    if res["predict"].nunique() > 1:
        # Attach the predictions to the rows and keep the URLs with more than one distinct prediction.
        res = pd.concat([at, res], axis=1)
        valid = res.groupby("URL")["predict"].nunique()
        valid = valid[valid > 1]
        leaky = res.loc[res["URL"].isin(valid.index)]
        display(leaky)
print(f"Took {time.time() - start} seconds")

In [None]:
# Old experiment: per URL group, collect the MOJO trees that predict exactly two
# distinct values for that group's rows.
# NOTE(review): hard-coded absolute local path; `glob` is imported in the previous cell.
gen_path = "/home/jannis/Downloads/h2o-3.32.1.1/h2o.jar"
# Caution: this rebinds `dat` (the crawl DataFrame loaded at the top) to a GroupBy object.
dat = af.groupby("URL")
# Replace by only good working ones?, otherwise we have too many trees!
files = glob.glob("trees/mojo/[1,2]/*.mojo")
files = [file for file in files if not "conflicted" in file]
print(len(files))
for key, item in dat:
    # The unconditional continue disables the loop body: nothing below it runs.
    continue
    # Caution: this would rebind the notebook-level `df` and `working` names.
    df = dat.get_group(key)
    # NOTE(review): `th_headers` and `h2o` are not defined in any visible cell.
    both = df[th_headers]
    working = []
    for file in files:
        # Remove the (erroneous) output of h2o: change file at path: /..../site-packages/h2o/utils/shared_utils.py line 414: to_csv add index=False
        res = h2o.mojo_predict_pandas(both, file, genmodel_jar_path=gen_path)
        if res["predict"].nunique() == 2:
            working.append(file)
    print(working)