In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import pandas as pd
import sys
sys.path.append("../")
from baseline import pixel_diff
import cv2



## Util functions

In [3]:
def writes_diff(dirname):
    """Return how many more writes the live page recorded than the archive.

    Reads ``writes/{dirname}/live_writes.json`` and
    ``writes/{dirname}/archive_writes.json`` (relative to the current working
    directory) and compares the lengths of their ``'writes'`` entries.

    Args:
        dirname: Name of the page's sub-directory under ``writes/``.

    Returns:
        ``len(live['writes']) - len(archive['writes'])``. The value is
        negative when the archive file lists more writes than the live file.
    """
    # Context managers make sure both file handles are closed promptly
    # instead of lingering until garbage collection.
    with open(f'writes/{dirname}/live_writes.json', 'r') as live_file:
        live_writes = json.load(live_file)
    with open(f'writes/{dirname}/archive_writes.json', 'r') as archive_file:
        archive_writes = json.load(archive_file)
    return len(live_writes['writes']) - len(archive_writes['writes'])

# writes_diff('test')

In [4]:
def screenshot_simi(dirname):
    """Compare the live and archived screenshots stored for ``dirname``.

    Hands ``writes/{dirname}/live.png`` and ``writes/{dirname}/archive.png``
    to ``pixel_diff.diff`` with ``return_diff_img=True`` and returns whatever
    that call returns.

    NOTE(review): the usage sketch kept next to this function indexes the
    result as ``[0]`` (a similarity score) and ``[1]`` (an image written with
    ``cv2.imwrite``) -- confirm against ``baseline.pixel_diff``.
    """
    screenshot_paths = (
        f'writes/{dirname}/live.png',
        f'writes/{dirname}/archive.png',
    )
    return pixel_diff.diff(*screenshot_paths, return_diff_img=True)

# img_simi = screenshot_simi('test')
# print(img_simi[0])
# cv2.imwrite(f'test_{img_simi[0]:.2f}.png', img_simi[1])

### 0. Check if there is any page with more archive writes than live writes
- On 178 sample pages, no page has more archive writes than live writes
- New run produces 12 pages with more archive writes.
    - www.nrcs.usda.gov_1 (-1): No fidelity issue.

In [7]:
# Count the pages whose archived copy recorded more writes than the live copy
# (writes_diff < 0) and print each such page with its difference.
# The similarity file is opened with a context manager so the handle is closed.
with open('screenshots/onload_screenshot_similarity.json', 'r') as similarity_file:
    similarities = json.load(similarity_file)
print("Total number of pages: ", len(similarities))
less_live_writes = 0
for simi_obj in similarities:
    diff = writes_diff(simi_obj['directory'])
    if diff < 0:
        less_live_writes += 1
        print(simi_obj['directory'], diff)
print("Total number of pages with less live writes: ", less_live_writes)

Total number of pages:  178
www.nrcs.usda.gov_1 -1
www.fws.gov_2 -42
www.federalreserve.gov_1 -6
www.transportation.gov_1 -2
www-curator.jsc.nasa.gov_1 -1
www.ce9.uscourts.gov_1 -1
geo.arc.nasa.gov_1 -7
www.glerl.noaa.gov_1 -1
www.speaker.gov_1 -1
www.ncpc.gov_1 -2
edworkforce.house.gov_1 -1
www.usda.gov_1 -4
Total number of pages with less live writes:  12


### 1. Relationship between the screenshots similarity and the writes diff

In [6]:
# 2x2 contingency table. Rows split pages by whether the live and archive
# write counts match; columns split them by whether the screenshot similarity
# reaches 1.
table = [['type', 'similarity=1', 'similarity<1'],
         ['writes_diff=0', 0, 0],
         ['writes_diff>0', 0, 0]]
# Non-mutating set_index keeps this cell free of in-place DataFrame edits.
df = pd.DataFrame(table[1:], columns=table[0]).set_index('type')

# Context manager so the similarity file handle is closed after loading.
with open('screenshots/onload_screenshot_similarity.json', 'r') as similarity_file:
    similarities = json.load(similarity_file)

for simi_obj in similarities:
    directory = simi_obj['directory']
    similarity = simi_obj['screenshot_similarity']
    col = 'similarity=1' if similarity >= 1 else 'similarity<1'
    # NOTE: every non-zero diff lands in the 'writes_diff>0' row, including
    # negative diffs (archive has more writes than live).
    row = 'writes_diff=0' if writes_diff(directory) == 0 else 'writes_diff>0'
    df.loc[row, col] += 1
df

Unnamed: 0_level_0,similarity=1,similarity<1
type,Unnamed: 1_level_1,Unnamed: 2_level_1
writes_diff=0,84,59
writes_diff>0,3,32


#### 1.1 Check certain type of category in the table
- False positives (write detection incorrectly flags a page as having a fidelity problem): 3 cases
    - weblogin.lanl.gov_1: Set attribute of an element twice (with different value)
    - www.whitehouse.gov_1 & www.whitehouse.gov_2: Visibility: hidden element (already fixed)
- True positive check:
    - www.nist.gov_1: Correct
    - www.aoc.gov_1: FIXED. Wrong (the live web page has an additional svg; suspected to be a sync issue between live and archive)
    - www.fws.gov_1: FIXED. Exception triggered on measurements.
    - www.nrel.gov_1: FIXED
    - www.ams.usda.gov_1: FIXED
- False negative check:
    - photojournal.jpl.nasa.gov_1: ? Looks like there is no fidelity issue
    - www.baaqmd.gov_1: False negative. Carousel not working on archive
    - ffb.treasury.gov_1: False negative. Because of 503
    - www.miwd.uscourts.gov_1: True negative. The screenshot seems to capture the search icon at the wrong location
    - www.whitehouse.senate.gov_1: False negative. Google translate banner missing on archive

In [8]:
# Print the directories of all pages that fall into one (row, column) cell of
# the similarity-vs-writes table; change target_row / target_col to inspect a
# different cell.
# Context manager so the similarity file handle is closed after loading.
with open('screenshots/onload_screenshot_similarity.json', 'r') as similarity_file:
    similarities = json.load(similarity_file)

target_row, target_col = 'writes_diff>0', 'similarity=1'
for simi_obj in similarities:
    directory = simi_obj['directory']
    similarity = simi_obj['screenshot_similarity']
    col = 'similarity=1' if similarity >= 1 else 'similarity<1'
    # NOTE: 'writes_diff>0' is assigned for any non-zero diff, so pages with a
    # negative diff are matched by this row label as well.
    row = 'writes_diff=0' if writes_diff(directory) == 0 else 'writes_diff>0'
    if row == target_row and col == target_col:
        print(directory)

www.fws.gov_2
www-curator.jsc.nasa.gov_1
www.ce9.uscourts.gov_1
