In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import pandas as pd
import numpy as np
import sys
import os
import random
from collections import defaultdict, Counter
import statistics
from bs4 import BeautifulSoup
from urllib.parse import urlsplit, urlunsplit
import regex
import difflib
from subprocess import check_output
sys.path.append("../")
from utils import plotly_utils

#### Study html writes on sampled 20 URLs from carta

In [4]:
data = json.load(open('../datacollect/data/carta_urls_100.json', 'r'))
data = random.sample(data, 20)
json.dump(data, open('test_urls.json', 'w+'), indent=2)

#### Number of writes per page

In [37]:
dirs = os.listdir('writes')
number_writes = defaultdict(list)
filelists = ['live', 'archive']
for dirr in dirs:
    # for i in range(5):
    for i in filelists:
        writes = json.load(open(f'writes/{dirr}/{i}.json', 'r'))
        number_writes[dirr].append(len(writes))
num_writes = {k: (min(v),  statistics.median(v), max(v)) for k, v in number_writes.items()}
num_writes

{'chicagoimagists.com_1': (28, 28.5, 29),
 'www.arprim.org_1': (116, 116.5, 117),
 'www.bonhams.com_1': (368, 392.5, 417),
 'canadianart.ca_1': (62, 68.5, 75),
 'www.myragreene.com_1': (0, 0.5, 1),
 'www.artbrussels.com_1': (292, 294.0, 296),
 'soulisauctions.com_1': (26, 26.5, 27),
 'paulwongprojects.com_1': (58, 59.0, 60),
 'www.chloebeaulac.com_1': (13, 13.5, 14),
 'alllightexpanded.com_1': (42598, 42601.5, 42605),
 'nationalmuseumofmexicanart.org_1': (43, 49.0, 55),
 '2022.vbexhibitions.hk_1': (40, 40.5, 41),
 '127garazh.org_1': (84, 84.5, 85),
 'www.gkb-furniture.com_1': (94, 94.5, 95),
 'niadart.org_1': (102, 116.0, 130),
 'www.wallergallery.com_1': (47, 47.0, 47),
 'www.masonfineartandevents.com_1': (25, 25.0, 25),
 'www.visualartscentre.ca_1': (55, 55.0, 55),
 'wyld.gallery_1': (142, 143.0, 144)}

### Order of each element in the writes

In [8]:
dirs = os.listdir('writes_liveweb_only')
stats = {'total_element': 0, 'allpresent_element': 0}
total_empty_xpath = 0
max_order_diffs = []
# filelists = ['live', 'archive']
for dirr in dirs:
    max_order_diff = []
    xpath_order = defaultdict(list)
    for i in range(5):
    # for i in filelists:
        writes = json.load(open(f'writes_liveweb_only/{dirr}/{i}.json', 'r'))
        for j, write in enumerate(writes):
            if len(write['xpath']) == 0:
                total_empty_xpath += 1
                continue
            tup = (write['xpath'][-1], write['method'], write['arg'])
            xpath_order[tup].append(j)
    for xpath, orders in xpath_order.items():
        num_orders = len(orders)
        stats['total_element'] += 1
        if len(orders) == 5:
            stats['allpresent_element'] += 1
        if len(orders) > 2:
            max_order, min_order = max(orders), min(orders)
            max_order_diff.append(max_order - min_order)
    # * Plot type 1
    max_order_diffs += random.sample(max_order_diff, min(100, len(max_order_diff)))
    # * Plot type 2
    # max_order_diff.sort()
    # if len(max_order_diff) > 0:
    #     ninety_percentile = int(len(max_order_diff) * 0.9)
    #     ninety_percentile = max_order_diff[ninety_percentile]
    #     fifty_percentile = int(len(max_order_diff) * 0.5)
    #     fifty_percentile = max_order_diff[fifty_percentile]
    #     hundred_percentile = max_order_diff[-1]
    #     max_order_diffs.append({'50th': fifty_percentile, '90th': ninety_percentile, '100th': hundred_percentile})
            
# * Plot type 1
stats, total_empty_xpath
df = pd.DataFrame({1: max_order_diffs})
plotly_utils.plot_CDF(df, xtitle='Maximum order difference for writes in 5 loads (max 100 writes for each page)', ytitle='CDF across writes')
# * Plot type 2
# df = pd.DataFrame(max_order_diffs)
# plotly_utils.plot_bar(df)

### Intersection of all the xpaths across 5 runs

#### Functions for common prefix

In [3]:
def counter_subset(c1, c2):
    for k, v in c1.items():
        if k not in c2 or c2[k] < v:
            return False
    return True

# Get the longest prefix of two lists contains the same set of elements
def get_longest_common_prefix(l1, l2):
    l1_set, l2_set = Counter(l1), Counter(l2)
    if l1_set == l2_set:
        return min(len(l1), len(l2))
    # if counter_subset(l1_set, l2_set) or counter_subset(l2_set, l1_set):
    #     return min(len(l1), len(l2))
    l1_unique, l2_unique = l1_set - l2_set, l2_set - l1_set
    intersection = l1_set & l2_set
    max_prefix = 0
    l1_prefix, l2_prefix = Counter(), Counter()
    for i in range(min(len(l1), len(l2))):
        l1_prefix[l1[i]] += 1
        l2_prefix[l2[i]] += 1
        if len(l1_prefix-l2_set) > 0 or len(l2_prefix-l1_set) > 0:
            break
        if counter_subset(l1_prefix, intersection) and counter_subset(l2_prefix, intersection):
            if l1_prefix == l2_prefix:
                max_prefix = i+1
    return max_prefix

# Get the longest prefix for liveweb pages compared to all elements in archive pages
def get_longest_live_prefix(l1, l2):
    l1_set, l2_set = Counter(l1), Counter(l2)
    if l1_set == l2_set:
        return min(len(l1), len(l2))
    max_prefix = 0
    l1_prefix = Counter()
    for i in range(min(len(l1), len(l2))):
        l1_prefix[l1[i]] += 1
        if len(l1_prefix-l2_set) > 0:
            break
        max_prefix = i + 1
    return max_prefix

#### Filter and ignore writes

In [3]:
import re
# import regex as re

def ignore_op(write):
    ignore_list = [
        ['setAttribute', ['loading', 'eager']],
        ['setAttribute', ['id']]
    ]
    if 'id' in write['method']:
        return True
    args = [str(a) for a in write['arg'] if '<script' not in str(a)]
    if len(args) == 0:
        return True
    for ii in ignore_list:
        if write['method'] == ii[0] and set(ii[1]).issubset(set(args)):
            return True
    return False

# Eliminate substrings matched by regex
def filter_regex(s):
    # s = s.replace('\n', '')
    rs = [
        re.compile('http://localhost:8080/sync/.*?/'),
        re.compile(' \S*?id="(.|\n|\r)*?"'),
        re.compile(" \S*?id='(.|\n|\r)*?'"),
        re.compile(' rel="(.|\n|\r)*?"'),
        re.compile(' srcset="(.|\n|\r)*?"'),
        re.compile(' loading="(.|\n|\r)*?"'),
        re.compile(' aria.*?="(.|\n|\r)*?"'),
        re.compile(' transform.*?="(.|\n|\r)*?"'),
        re.compile('transform: .*?;'),
        re.compile(' data-ruffle-polyfilled="(.|\n|\r)*?"'),
        re.compile(' novalidate="(.|\n|\r)*?"'),
    ]
    for r in rs:
        # print(r)
        s = r.sub('', s)
    return s

# Order attributes of tags
def order_attributes(arg):
    # return args
    from bs4.formatter import XMLFormatter
    class SortAttributes(XMLFormatter):
        def attributes(self, tag):
            """Reorder a tag's attributes however you want."""
            sorted_attrs = sorted(tag.attrs.keys())
            new_attributes = []
            for attr in sorted_attrs:
                val = tag.attrs[attr]
                if attr == 'style':
                    val = val.split('; ')
                    val = '; '.join(sorted(val))
                new_attributes.append((attr, val))
            return new_attributes
    # spaces, arg = '', arg
    soup = BeautifulSoup(arg.strip(), 'html.parser')
    new_arg = soup.encode(formatter=SortAttributes()).decode()
    return new_arg

def element_normal(s):
    s = filter_regex(s)
    return order_attributes(s)

element_normal('<div id="AAA">')

'<div></div>'

### Directly compare render tree

In [181]:
def tree_match(dirr):
    live_tree = json.load(open(f'{dirr}/live.json', 'r'))
    archive_tree = json.load(open(f'{dirr}/archive.json', 'r'))
    live_queue = live_tree
    archive_queue = archive_tree
    while len(live_queue) > 0:
        if len(archive_queue) == 0:
            print('difference length of queue: live > archive')
            return False
        live_node = live_queue.pop()
        archive_node = archive_queue.pop()
        print(live_node.get('xpath'), archive_node.get('xpath'))
        if live_node['name'] != archive_node['name']:
            print('different node', live_node['xpath'], archive_node['xpath'])
            return False
        if live_node['dimension'] != archive_node['dimension']:
            print('different dimension', '\n', 
                    live_node['xpath'], '\n\t', live_node['dimension'], '\n',
                    archive_node['xpath'], '\n\t', archive_node['dimension'])
            return False
        live_queue.extend(live_node['children'])
        archive_queue.extend(archive_node['children'])
    if len(archive_queue) != 0:
        print('difference length of queue: live < archive')
    return len(archive_queue) == 0

def tree_diff(dirr, debug=False, info_file=False):
    from xmldiff import main as xmlmain
    from xmldiff import formatting as xmlformat
    live_html = open(f'{dirr}/live.html', 'r').read()
    archive_html = open(f'{dirr}/archive.html', 'r').read()
    live_html = filter_regex(live_html)
    archive_html = filter_regex(archive_html)
    formatter = xmlformat.DiffFormatter(pretty_print=True)
    diffs = xmlmain.diff_texts(live_html, archive_html,
                        diff_options={'F': 0.8, 'ratio_mode': 'accurate'})
    if debug:
        pdiffs = xmlmain.diff_texts(live_html, archive_html,
                        diff_options={'F': 0.8, 'ratio_mode': 'accurate'},
                        formatter=formatter)
        print(pdiffs)
    return len(diffs) == 0


def html_match(dirr, compare=['live', 'archive'], info_file=False, debug=False):
    live, archive = compare
    live_html = open(f'{dirr}/{live}.html', 'r').read()
    archive_html = open(f'{dirr}/{archive}.html', 'r').read()
    live_html_lines = live_html.splitlines()
    archive_html_lines = archive_html.splitlines()
    live_html_lines = [(element_normal(l), l) for l in live_html_lines]
    archive_html_lines = [(element_normal(l), l) for l in archive_html_lines]
    
    
    diffs = difflib.ndiff([c[0] for c in live_html_lines], 
                          [c[0] for c in archive_html_lines])
    # print(json.dumps(list(diffs), indent=2))
    live_count, archive_count = 0, 0
    new_diffs = []
    for d in diffs:
        if d[0] == '-':
            live_line = live_html_lines[live_count][1]
            new_diffs.append(d[0] + live_line)
            live_count += 1
        elif d[0] == '+':
            archive_line = archive_html_lines[archive_count][1]
            new_diffs.append(d[0] + archive_line)
            archive_count += 1
        elif d[0] == ' ':
            live_count += 1
            archive_count += 1
    diffs = new_diffs

    def _filter_diff_with_info(live_info, archive_info, diffs):
        diff_xpath_live, diff_xpath_archive = {}, {}
        for diff in diffs:
            if diff[0] == '-':
                text = diff[1:].strip()
                if text in live_info:
                    xpath = live_info[text]['xpath']
                    diff_xpath_live[xpath] = {
                        'text': text,
                        'dimension': {
                            'width': live_info[text]['dimension']['width'],
                            'height': live_info[text]['dimension']['height']
                        } if live_info[text]['dimension'] else None
                    }
            if diff[0] == '+':
                text = diff[1:].strip()
                if text in archive_info:
                    xpath = archive_info[text]['xpath']
                    diff_xpath_archive[xpath] = {
                        'text': text,
                        'dimension': {
                            'width': archive_info[text]['dimension']['width'],
                            'height': archive_info[text]['dimension']['height']
                        } if archive_info[text]['dimension'] else None
                    }
        intersection = set(diff_xpath_live.keys()).intersection(set(diff_xpath_archive.keys()))
        diff_live_same, diff_archive_same = set(), set()
        
        def dimension_match(d1, d2):
            if d1 is None or d2 is None:
                return False
            width_match = abs(d1['width'] - d2['width']) / min(d1['width'], d2['width']) < 0.01
            height_match = abs(d1['height'] - d2['height']) / min(d1['height'], d2['height']) < 0.01
            return width_match and height_match
        
        for its in intersection:
            if dimension_match(diff_xpath_live[its]['dimension'], diff_xpath_archive[its]['dimension']):
                diff_live_same.add(diff_xpath_live[its]['text'])
                diff_archive_same.add(diff_xpath_archive[its]['text'])
        new_diffs = []
        for diff in diffs:
            if diff[0] == '-':
                text = diff[2:].strip()
                if text not in diff_live_same:
                    new_diffs.append(diff)
            if diff[0] == '+':
                text = diff[2:].strip()
                if text not in diff_archive_same:
                    new_diffs.append(diff)
        return new_diffs
    
    if info_file:
        live_info = json.load(open(f'{dirr}/{live}_elements.json', 'r'))
        # live_info = {filter_pipeline(k): v for k, v in live_info.items()}
        archive_info = json.load(open(f'{dirr}/{archive}_elements.json', 'r'))
        # archive_info = {filter_pipeline(k): v for k, v in archive_info.items()}
        diffs = _filter_diff_with_info(live_info, archive_info, diffs)
        
    if debug and len(diffs) > 0:
        print(json.dumps(diffs, indent=2))
        json.dump(diffs, open(f'diffs.json', 'w'), indent=2)
    return len(diffs) == 0, diffs

def format_diffs(diffs, dirr, compare=['live', 'archive']):
    live, archive = compare
    live_html = open(f'{dirr}/{live}.html', 'r').read()
    archive_html = open(f'{dirr}/{archive}.html', 'r').read()
    live_html_lines = live_html.splitlines()
    archive_html_lines = archive_html.splitlines()

    def conseq_sublist(partial, whole):
        conseqs = []
        i, j = 0, 1
        while i < len(partial):
            if j > len(partial):
                break
            n = j-i
            sublist = any([(partial[i:j] == whole[k:k+n]) for k in range(len(whole)-n+1)])
            if not sublist:
                conseqs.append(partial[i:j])
                i = j
            j += 1
        if i < len(partial):
            conseqs.append(partial[i:])
        return conseqs
    # * Merge consequent - and + into change
    changes, dels, adds = [], [], []
    i = 0
    while i < len(diffs):
        if diffs[i][0] == '-':
            # * V1
            # if i < len(diffs)-1 and diffs[i+1][0] == '+':
            #     changes.append((diffs[i][1:], diffs[i+1][1:]))
            #     i += 2
            # else:
            #     assert(i >= len(diffs)-1 or diffs[i+1][0] == '-')
            #     dels.append(diffs[i][1:])
            #     i += 1
            # * V2
            dels.append(diffs[i][1:])
            i += 1
        else:
            assert(diffs[i][0] == '+')
            adds.append(diffs[i][1:])
            i += 1
    dels = conseq_sublist(dels, live_html_lines)
    # dels = ['\n'.join(d) for d in dels]
    adds = conseq_sublist(adds, archive_html_lines)
    # adds = ['\n'.join(a) for a in adds]
    changes_map = {c[0]: c[1] for c in changes}
    conseq_changes = conseq_sublist([c[0] for c in changes], live_html_lines)
    # print(json.dumps(changes, indent=2))
    # print(json.dumps(conseq_changes, indent=2))
    changes = []
    for conseq in conseq_changes:
        changes.append((conseq, [changes_map[c] for c in conseq]))
    return {
        # "changes": changes,
        "deletes": dels,
        "adds": adds
    }

compare = ['live', 'archive']
# compare = ['archive', 'wayback'] 
# compare = ['live', 'wayback']
# m, d = html_match('writes/cloud.gov_1', info_file=True, debug=True)
m, d = html_match('writes/apps.irs.gov_1', compare=compare, info_file=True, debug=True)
# d = format_diffs(d, 'writes/www.lagunapueblo-nsn.gov_1', compare=compare)
d
# tree_diff('writes/test', debug=True)


[
  "+    <iframe id=\"a2a_sm_ifr\" title=\"AddToAny Utility Frame\" transparency=\"true\" allowtransparency=\"true\" frameborder=\"0\" src=\"https://static.addtoany.com/menu/sm.24.html#type=core&amp;event=load\" style=\"height: 1px; width: 1px; border: 0px; left: 0px; top: 0px; position: absolute; z-index: 100000;\" allow=\"autoplay 'self'; fullscreen 'self'\">"
]


['+    <iframe id="a2a_sm_ifr" title="AddToAny Utility Frame" transparency="true" allowtransparency="true" frameborder="0" src="https://static.addtoany.com/menu/sm.24.html#type=core&amp;event=load" style="height: 1px; width: 1px; border: 0px; left: 0px; top: 0px; position: absolute; z-index: 100000;" allow="autoplay \'self\'; fullscreen \'self\'">']

In [None]:
def throw_url_prefix(url):
    hostnames = ['localhost:8080', 'web.archive.org']
    for hostname in hostnames:
        if hostname in url:
            url = re.sub(f'https?://{hostname}/.*?/.*?/', '', url)
            break
    us = urlsplit(url)
    netloc = us.netloc.split(':')[0].split('.')
    if 'www' in netloc[0]:
        netloc = netloc[1:]
    us = us._replace(scheme='', netloc='.'.join(netloc))
    path = us.path[:-1] if us.path != '/' and us.path[-1] == '/' else us.path
    us = us._replace(path=path)
    return urlunsplit(us)

# def network_sets(network):
#     n = set()
#     for v in network.values():
#         for vv in v:
#             n.add((throw_url_prefix(vv['url']), vv['status']//100*100))
#     return n

def network_match(dirr, compare=['live', 'archive'], debug=False):
    def collect_network(tag, extra_info, network):
        def _1(all_attrs):
            if extra_info.get('currentSrc', ""):
                return extra_info['currentSrc']
            return all_attrs.get('src')
        src_attrs = [_1] #[[attr from extrinfo, attr from tag]]
        soup = BeautifulSoup(tag, 'html.parser')
        networks = []
        if not soup.find():
            print(tag)
            return networks
        all_attrs = soup.find().attrs
        for getter in src_attrs:
            val = getter(all_attrs)
            if not val:
                continue
            val = throw_url_prefix(val)
            if val in network and network[val]['method'] == 'GET':
                networks.append(network[val])
        return networks
    
    def map_tag_network(dirr, filename):
        html = open(f'{dirr}/{filename}.html', 'r').read().splitlines()
        elements = json.load(open(f'{dirr}/{filename}_elements.json', 'r'))
        network = json.load(open(f'{dirr}/{filename}_network.json', 'r'))
        network = {throw_url_prefix(v['url']): v for v in network}
        # print(json.dumps(network, indent=2))
        
        network_tags = defaultdict(list)
        for tag in html:
            tag = tag.strip()
            if not tag or BeautifulSoup(tag, 'html.parser').find() is None:
                continue
            extra_attr = elements[tag]['extraAttr']
            # if filename == 'wayback':
            #     print(network)
            
            networks = collect_network(tag, extra_attr, network)
            
            for n in networks:
                network_tags[(throw_url_prefix(n['url']), n['status']//100<=2)].append(tag)
        return network_tags
    
    live, archive = compare
    live_network = map_tag_network(dirr, live)
    archive_network = map_tag_network(dirr, archive)

    live_unique = list(set(live_network.keys()) - set(archive_network.keys()))
    archive_unique = list(set(archive_network.keys()) - set(live_network.keys()))
    if debug:
        print(json.dumps(live_unique, indent=2))
        print(json.dumps(archive_unique, indent=2))

    # live_unique = [live_network[k] for k in live_unique]
    # archive_unique = [archive_network[k] for k in archive_unique]
    return len(live_unique) + len(archive_unique) == 0, {'live': live_unique, 'archive': archive_unique}

# network_match('writes/cloud.gov_1', debug=True)
network_match('writes/wow.uscgaux.info_1', compare=['archive', 'wayback'], debug=True)

### Compare all dirs under writes

#### Pair to pair comparison

In [149]:
import time
metadata = json.load(open('eot_metadata.json', 'r'))
metadata = {v['directory']: v for v in metadata.values()}
prefix = 'writes'
compare = ['archive', 'wayback']
dirs = os.listdir(prefix)
longest_prefix_info = []
onload_match = []
count = 0
num_false, num_true = 0, 0
start = time.time()
for dirr in dirs:
    # if dirr in ['bookamhs.alaska.gov_1']:
    #     continue
    count += 1
    if 'archive.html' not in os.listdir(f'{prefix}/{dirr}') or 'wayback.html' not in os.listdir(f'{prefix}/{dirr}'):
        continue
    print(count, dirr, metadata[dirr]['ts_delta'])
    hm, hd = html_match(f'{prefix}/{dirr}', compare=compare, info_file=True)
    hd = format_diffs(hd, f'{prefix}/{dirr}', compare=compare)
    nm, nd = network_match(f'{prefix}/{dirr}', compare=compare)
    onload_match.append({
        'directory': dirr,
        'html_match': hm,
        'network_match': nm,
        'html_diff': hd,
        'network_diff': nd
    })
    if not hm:
        num_false += 1
        # print("\tAny False")
    else:
        num_true += 1
        print("\tAll True")
end = time.time()
print(len(dirs), num_true)
# print("Time taken: ", end - start)
# json.dump(onload_match, open('onload_match_eot.json', 'w'), indent=2)

1 www.fda.gov_1 0
	All True
2 www.ndtourism.com_1 0
3 edis.usitc.gov_1 -21
5 taf.faa.gov_1 -34
	All True
6 www.nist.gov_1 -2
7 gov.idaho.gov_1 -4
	All True
8 www.nlm.nih.gov_1 -1
9 www.bta.ms.gov_1 -6
	All True
10 newhampshire.jobcorps.gov_1 -18
11 www.supremecourt.gov_1 -75
12 theftaz.azag.gov_1 -21
	All True
13 www.aces.edu_1 -1
14 cloud.gov_1 -16
15 www.uspto.gov_1 0
16 osrp.lanl.gov_1 1
	All True
18 nimhd.nih.gov_1 1
19 www.accessidaho.org_1 -1
20 github.com_1 -87
21 www.ddap.pa.gov_1 -11
23 publicfiles.fcc.gov_1 -17
	All True
24 uccweb.sos.la.gov_1 -371
	All True
25 www.cbp.gov_1 0
26 www.collegedrinkingprevention.gov_1 -7
27 bookamhs.alaska.gov_1 -5
	All True
28 internationalc2institute.org_1 2
	All True
30 waterdata.usgs.gov_1 -87
31 radiate.fnal.gov_1 -27
32 www.ncdoj.gov_1 -79
33 arcnet.arc.gov_1 -31
34 home.treasury.gov_1 -1
	All True
35 tribalvr.mptn-nsn.gov_1 -76
36 www.sandiego.gov_1 -1
37 www.airforce.com_1 -1
38 unicor.gov_1 -222
39 www.illinois.gov_1 -1
40 www.smartgrid

#### Compare live delete

In [178]:
import time
metadata = json.load(open('eot_metadata.json', 'r'))
metadata = {v['directory']: v for v in metadata.values()}
prefix = 'writes'
compares = [['live', 'archive'], ['archive', 'wayback'], ['live', 'wayback']]
dirs = os.listdir(prefix)
longest_prefix_info = []
onload_match = []
count = 0
num_false, num_same_false = 0, 0
results = []
for dirr in dirs:
    count += 1

    if 'live.html' not in os.listdir(f'{prefix}/{dirr}') or 'archive.html' not in os.listdir(f'{prefix}/{dirr}') or 'wayback.html' not in os.listdir(f'{prefix}/{dirr}'):
        continue
    live_dels = [None, None, None]
    matches = [True, True, True]
    print(count, dirr, metadata[dirr]['ts_delta'])
    obj = {
        'directory': dirr,
        'ts_delta': metadata[dirr]['ts_delta'],
    }
    for i, compare in enumerate(compares):
        hm, hd = html_match(f'{prefix}/{dirr}', compare=compare, info_file=True)
        hd = format_diffs(hd, f'{prefix}/{dirr}', compare=compare)
        nm, nd = network_match(f'{prefix}/{dirr}', compare=compare)
        live_dels[i] = ['\n'.join(d) for d in hd['deletes']]
        if not hm or not nm:
            matches[i] = False
        obj[f'{compare[0]} vs. {compare[1]} match'] = matches[i]
    if not matches[0]:
        print("\tFalse")
        num_false += 1
        del_archive = live_dels[0]
        del_wayback = live_dels[2]
        def del_archive_in_wayback(del_archive, del_wayback):
            total = 0
            for a in del_archive:
                for w in del_wayback:
                    if a in w:
                        total += 1
                        break
            return total
        same_false = del_archive_in_wayback(del_archive, del_wayback)
        if same_false > 0:
            print("\tSame False", same_false, len(del_archive), len(del_wayback))
            num_same_false += 1
        obj['del_wayback'] = del_wayback
        obj['del_archive'] = del_archive
        obj['same_false'] = same_false
    results.append(obj)
    json.dump(results, open('eot_match_info.json', 'w'), indent=2)

print(len(dirs), num_false, num_same_false)

1 www.fda.gov_1 0
2 www.ndtourism.com_1 0
	False
	Same False 1 2 10
3 edis.usitc.gov_1 -21
5 taf.faa.gov_1 -34
6 www.nist.gov_1 -2
7 gov.idaho.gov_1 -4
	False
	Same False 1 1 1
8 www.nlm.nih.gov_1 -1
9 www.bta.ms.gov_1 -6
10 newhampshire.jobcorps.gov_1 -18
	False
	Same False 1 1 1
11 www.supremecourt.gov_1 -75
12 theftaz.azag.gov_1 -21
	False
	Same False 1 1 1
13 www.aces.edu_1 -1
	False
	Same False 1 1 3
14 cloud.gov_1 -16
15 www.uspto.gov_1 0
	False
	Same False 1 1 3
16 osrp.lanl.gov_1 1
18 nimhd.nih.gov_1 1
	False
19 www.accessidaho.org_1 -1
	False
	Same False 1 3 6
20 github.com_1 -87
21 www.ddap.pa.gov_1 -11
	False
23 publicfiles.fcc.gov_1 -17
24 uccweb.sos.la.gov_1 -371
25 www.cbp.gov_1 0
	False
26 www.collegedrinkingprevention.gov_1 -7
	False
27 bookamhs.alaska.gov_1 -5
28 internationalc2institute.org_1 2
30 waterdata.usgs.gov_1 -87




31 radiate.fnal.gov_1 -27
	False
	Same False 1 2 3
32 www.ncdoj.gov_1 -79
	False
33 arcnet.arc.gov_1 -31
34 home.treasury.gov_1 -1
35 tribalvr.mptn-nsn.gov_1 -76
36 www.sandiego.gov_1 -1
	False
37 www.airforce.com_1 -1
	False
38 unicor.gov_1 -222
39 www.illinois.gov_1 -1
	False
40 www.smartgrid.gov_1 -4
41 www.tn.gov_1 -43
	False
	Same False 1 1 1
42 www.helpwithmycreditcard.gov_1 -251
	False
	Same False 1 1 13
44 www.flickr.com_1 -92
45 www.fsgb.gov_1 -101
	False
47 www.va.gov_1 -9
48 wow.uscgaux.info_1 -1
	False
49 911commission.gov_1 -3
52 mailboxes.dreamhost.com_1 -24
53 apps.apple.com_1 -226
	False
	Same False 1 36 28
54 house.louisiana.gov_1 2
	False
	Same False 3 4 4
56 www.dsireusa.org_1 -3
	False
	Same False 1 3 4
57 uscgboating.org_1 0
58 occupations.ky.gov_1 -371
59 www.philadelphiafed.org_1 0
	False
61 www.penndot.pa.gov_1 0
	False
	Same False 1 1 8
62 p2pays.org_1 -1
64 lec.akleg.gov_1 -5
66 blm.sciencebase.gov_1 -15
68 communityofgardens.tumblr.com_1 -1574
	False
	Same Fa

### Collect render tree into html format

In [21]:
dirr = 'wyld.gallery_1'
live = open(f'writes/{dirr}/live.html', 'r').read()
archive = open(f'writes/{dirr}/archive.html', 'r').read()


d = difflib.HtmlDiff(wrapcolumn=100)
f = open('diff.html', 'w+')
s = d.make_file(live.splitlines(), archive.splitlines())
f.write(s)
f.close()

# s = difflib.ndiff(live.splitlines(), archive.splitlines())
# list(s)

### Compare different methods with screenshots

In [13]:
dir_suffix = ['2k', '1080']
dir_suffix = ['']
dir_screenshots = defaultdict(lambda: {ds: False for ds in dir_suffix})
for ds in dir_suffix:
    data = json.load(open(f'./screenshot_diff/onload_screenshot_similarity.json', 'r'))
    for d in data:
        dirr = d['directory']
        simi = d['screenshot_similarity']
        dir_screenshots[dirr][ds] = simi >= 1
# unmatched_screenshots = [d for d, v in dir_screenshots.items() if not v['2k']
#                          and not v['1080']]
unmatched_screenshots = [d for d, v in dir_screenshots.items() if not v['']]
all_screenshots = {d for d in dir_screenshots}

#### Render tree vs. Screenshots

In [27]:
onload_match = json.load(open('onload_match_eot.json', 'r'))
# onload_match = {d['directory']: d['match'] for d in onload_match}
onload_match = {d['directory']: d['html_match'] and d['network_match'] for d in onload_match}
print("Total: render tree:", len([v for v in onload_match.values() if not v]), "screenshots", len(unmatched_screenshots))
render_additional = {k: v for k ,v in onload_match.items() if k not in all_screenshots}
print("Unque to render tree:", json.dumps(render_additional, indent=2))

print("Unique False to render tree:", json.dumps([k for k, v in onload_match.items() if v == False and k not in unmatched_screenshots and k not in render_additional], indent=2))

print("Unique False to screenshot:", json.dumps([k for k ,v in onload_match.items() if v == True and k  in unmatched_screenshots], indent=2))


Total: render tree: 53 screenshots 64
Unque to render tree: {}
Unique False to render tree: [
  "www.ddap.pa.gov_1",
  "www.tn.gov_1",
  "leg.wa.gov_1"
]
Unique False to screenshot: [
  "www.bta.ms.gov_1",
  "dgs.navajo-nsn.gov_1",
  "waterdata.usgs.gov_1",
  "home.treasury.gov_1",
  "www.fsgb.gov_1",
  "www.philadelphiafed.org_1",
  "app.oregonstudentaid.gov_1",
  "www.ntsb.gov_1",
  "bera.house.gov_1",
  "www.nsa.gov_1",
  "www.kywd.uscourts.gov_1",
  "www.aoc.gov_1",
  "louisianacda.com_1",
  "www.denali.gov_1"
]


### Clear wayback crawls

In [105]:
from subprocess import call
dirs = os.listdir('writes')
for dirr in dirs:
    call(f'rm writes/{dirr}/wayback*', shell=True)
data = json.load(open('eot_metadata.json', 'r'))
for v in data.values():
    if 'wayback' in v:
        del(v['wayback'])
json.dump(data, open('eot_metadata.json', 'w'), indent=2)

rm: cannot remove 'writes/edis.usitc.gov_1/wayback*': No such file or directory
rm: cannot remove 'writes/nevada.egrantsmanagement.com_1/wayback*': No such file or directory
rm: cannot remove 'writes/taf.faa.gov_1/wayback*': No such file or directory
rm: cannot remove 'writes/www.nist.gov_1/wayback*': No such file or directory
rm: cannot remove 'writes/gov.idaho.gov_1/wayback*': No such file or directory
rm: cannot remove 'writes/www.nlm.nih.gov_1/wayback*': No such file or directory
rm: cannot remove 'writes/www.bta.ms.gov_1/wayback*': No such file or directory
rm: cannot remove 'writes/newhampshire.jobcorps.gov_1/wayback*': No such file or directory
rm: cannot remove 'writes/www.supremecourt.gov_1/wayback*': No such file or directory
rm: cannot remove 'writes/theftaz.azag.gov_1/wayback*': No such file or directory
rm: cannot remove 'writes/cloud.gov_1/wayback*': No such file or directory
rm: cannot remove 'writes/www.uspto.gov_1/wayback*': No such file or directory
rm: cannot remove 

### Calculate wayback difference with archive

In [125]:
from dateutil import parser as dparser
def get_ts(url):
    return re.search(r'https?://.*?/.*?/(.*?)/.*', url).group(1)
metadata = json.load(open('eot_metadata.json', 'r'))
for v in metadata.values():
    if 'wayback' not in v:
        continue
    wayback = v['wayback']
    archive = v['archive']
    try:
        wayback_str = get_ts(wayback)
        archive_str = get_ts(archive)
        wayback_ts = dparser.parse(wayback_str)
        archive_ts = dparser.parse(archive_str)
        ts_delta = wayback_ts-archive_ts
    except:
        print(wayback_str, archive_str)
        continue
    v['ts_delta'] = ts_delta.days
    print(v['directory'], ts_delta.days)
json.dump(metadata, open('eot_metadata.json', 'w'), indent=2)

www.fmcs.gov_1 -1
www.ncdoj.gov_1 -79
occupations.ky.gov_1 -371
www.republican.senate.gov_1 -1
www.accessidaho.org_1 -1
clay.floridahealth.gov_1 2
nimhd.nih.gov_1 1
staging-portal.kansas.gov_1 0
www.aces.edu_1 -1
www.fda.gov_1 0
blm.sciencebase.gov_1 -15
www.smartgrid.gov_1 -4
www.lagunapueblo-nsn.gov_1 0
p2pays.org_1 -1
lec.akleg.gov_1 -5
www.kywd.uscourts.gov_1 -6
www.ndtourism.com_1 0
www.denali.gov_1 -16
app.oregonstudentaid.gov_1 -24
comanchenation.com_1 -9
riskmgt.alabama.gov_1 2
www.tn.gov_1 -43
uscgboating.org_1 0
www.facebook.com_1 -174
communityofgardens.tumblr.com_1 -1574
www.fsgb.gov_1 -101
mchb.hrsa.gov_1 -2
publicfiles.fcc.gov_1 -17
www.peacecorps.gov_1 -136
github.com_1 -87
transparency.ct.gov_1 -73
ethics.wi.gov_1 -48
ncdhh.nebraska.gov_1 -27
internationalc2institute.org_1 2
louisianacda.com_1 0
www.wyo.gov_1 -1
dbc.ca.gov_1 -19
www.va.gov_1 -9
azgovernor.gov_1 -14
29palmsbomi-nsn.gov_1 -28
arcnet.arc.gov_1 -31
cloud.gov_1 -16
uccweb.sos.la.gov_1 -371
www.microsoft.com_