In [2]:
import sqlite3
import pandas as pd

In [3]:
import re

In [4]:
pd.set_option("display.max_colwidth",500)
pd.set_option("display.max_rows",500)
features_to_show = ["crawl_id", "func_name", "top_level_url", "symbol", "operation", "value", "arguments"]

In [5]:
DB = 'sample_2018-06_1m_stateless_census_crawl.sqlite'

In [6]:
con = sqlite3.connect(DB)

In [7]:
con.row_factory = sqlite3.Row
cur = con.cursor()
js = pd.read_sql_query("SELECT * FROM javascript", con)

In [8]:
js[js.operation == "call"].symbol.value_counts().head(15)

window.Storage.getItem                  46851
window.Storage.setItem                  18104
window.Storage.removeItem               13812
CanvasRenderingContext2D.fill            7258
CanvasRenderingContext2D.save            7074
CanvasRenderingContext2D.restore         7070
HTMLCanvasElement.getContext             4208
window.Storage.key                       3689
CanvasRenderingContext2D.measureText     3103
CanvasRenderingContext2D.stroke          2393
CanvasRenderingContext2D.arc             1558
CanvasRenderingContext2D.fillRect        1371
CanvasRenderingContext2D.clip            1327
CanvasRenderingContext2D.fillText         858
CanvasRenderingContext2D.rotate           537
Name: symbol, dtype: int64

In [9]:
js[(js.operation == "call") &
   (js.symbol == "CanvasRenderingContext2D.fillText")
  ].arguments.value_counts().head(10)

{"0":"Cwm fjordbank glyphs vext quiz, 😃","1":4,"2":45}    74
{"0":"Cwm fjordbank glyphs vext quiz, 😃","1":2,"2":15}    74
{"0":"!image!","1":2,"2":15}                              39
{"0":"!image!","1":4,"2":17}                              39
{"0":"Soft Ruddy Foothold 2","1":2,"2":2}                 19
{"0":"!H71JCaj)]# 1@#","1":4,"2":8}                       19
{"0":"🇺🇳","1":0,"2":0}                                    18
{"0":"🇺​🇳","1":0,"2":0}                                   18
{"0":"09:30","1":5,"2":130}                               14
{"0":"🕴‍♀️","1":0,"2":0}                                  14
Name: arguments, dtype: int64

<b> Criteria 1 from Englehardt & Narayanan, 2016: </b>

To be a fingerprinting candidate: "1.The canvas element’s height and width properties must
not be set below 16 px."

In [10]:
def width_height_finder(data):
    w = re.compile('[^(line)]width', re.IGNORECASE)
    h = re.compile('[^(line)]height', re.IGNORECASE)
    ww = w.search(data)
    hh = h.search(data)
    if ww!= None:
        return data
    elif hh!=None:
        return data

In [11]:
l = list(map(width_height_finder, list(js[js.operation == "set"].symbol)))
l = [x for x in l if x is not None]

In [12]:
set(l)

{'HTMLCanvasElement.height', 'HTMLCanvasElement.width'}

In [13]:
MIN_CANVAS_IMAGE_WIDTH = 16
MIN_CANVAS_IMAGE_HEIGHT = 16


canvas_size = js[js.symbol.isin(["HTMLCanvasElement.width", "HTMLCanvasElement.height"])].value.astype(float)
index_size_filter = canvas_size.where(canvas_size > MIN_CANVAS_IMAGE_HEIGHT).dropna().index

In [14]:
index_size_filter

Int64Index([   638,    639,   1138,   1139,   1142,   1143,   1192,   1193,
              1241,   1242,
            ...
            501091, 501092, 501186, 501187, 501190, 501191, 501196, 501197,
            501200, 501201],
           dtype='int64', length=7559)

In [15]:
# js.iloc[index_size_filter][features_to_show]

<b> Criteria 2 from Englehardt & Narayanan, 2016: </b> 

To be a fingerprinting candidate: "2.Text must be written to canvas with least two colors or
at least 10 distinct characters."

In [16]:
# js.symbol.value_counts()

In [17]:
# js[js.symbol == "CanvasRenderingContext2D.strokeStyle"]

In [18]:
import json 

def count_distinct_letters(text):
    if text!=None:
        t = json.loads(text, encoding='utf-8')['0']
        if not isinstance(t, int):
            return len(set(t))

In [19]:
CANVAS_WRITE_FUNCS = [
    "CanvasRenderingContext2D.fillText",
    "CanvasRenderingContext2D.strokeText",
    "CanvasRenderingContext2D.fill"
    ]

In [20]:
text = js[js.symbol.isin(CANVAS_WRITE_FUNCS)].arguments.dropna()
indexx = text.index
text = map(count_distinct_letters, text) 
res = pd.Series(text, index= indexx)
index_count_filter = res.where(res>=10).dropna().index

In [21]:
# js.iloc[index_count_filter][features_to_show]

<b> Criteria 3 from Englehardt & Narayanan, 2016 </b>

To be a fingerprinting candidate: "3. The script should not call the save, restore, or addEventListener
methods of the rendering context."

In [22]:
CANVAS_FP_DO_NOT_CALL_LIST = ["CanvasRenderingContext2D.save",
                              "CanvasRenderingContext2D.restore",
                              "HTMLCanvasElement.addEventListener"]

In [23]:
# js[js.operation == "call"][js.arguments!=None].arguments.value_counts().head(15)
index_call_filter = js[~js.symbol.isin(CANVAS_FP_DO_NOT_CALL_LIST)].index
index_call_filter

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            501197, 501198, 501199, 501200, 501201, 501202, 501203, 501204,
            501205, 501206],
           dtype='int64', length=486982)

In [24]:
# js.iloc[index_call_filter][features_to_show]

In [25]:
# js[js.crawl_id==14].symbol.isin(CANVAS_FP_DO_NOT_CALL_LIST)

<b>Criteria 4 from Englehardt & Narayanan, 2016: </b>

To be a fingerprinting candidate: "4. The script extracts an image with toDataURL or with a
single call to getImageData that specifies an area with a
minimum size of 16px × 16px"

In [26]:
# js.symbol.value_counts()

In [27]:
CANVAS_READ_FUNCS = [
    "HTMLCanvasElement.toDataURL",
    "CanvasRenderingContext2D.getImageData"
    ]

In [28]:
# js[js.symbol.isin(CANVAS_READ_FUNCS)].arguments.dropna()

In [29]:
def check_dimensions(image):
    if image!=None:
        img = json.loads(image, encoding='utf-8')
        if len(img)>3 and img['2']>=16 and img['3']>=16:
            return image

In [30]:
image = js[js.symbol.isin(CANVAS_READ_FUNCS)].arguments
indexx = image.index
image = map(check_dimensions, image)
res = pd.Series(image, index = indexx)
index_image_filter = res.index
index_image_filter

Int64Index([   668,    672,   1190,   1240,   1298,   1349,   2246,   2750,
              2754,   3221,
            ...
            500611, 500615, 501080, 501084, 501090, 501094, 501189, 501193,
            501199, 501203],
           dtype='int64', length=682)

In [31]:
# js.iloc[index_image_filter][features_to_show]

At least 2 criteria

In [32]:
lst = []

In [33]:
for i in range(len(js)):
    if i in index_call_filter and i in index_count_filter:
        lst.append(js.iloc[i])
    elif i in index_call_filter and i in index_size_filter:
        lst.append(js.iloc[i])
    elif i in index_call_filter and i in index_image_filter:
        lst.append(js.iloc[i])
    elif i in index_count_filter and i in index_size_filter:
        lst.append(js.iloc[i])
    elif i in index_count_filter and i in index_image_filter:
        lst.append(js.iloc[i])
    elif i in index_image_filter and i in index_size_filter:
        lst.append(js.iloc[i])

In [34]:
df = pd.DataFrame(lst)
len(df)

8656

In [80]:
fp_visit_id = []
for i in set(js.visit_id):
    cond1 = cond2 = cond3 = cond4 = False
    for j in js[js.visit_id==i].index:
        if j in index_size_filter:
            cond1 = True
        if j in index_count_filter:
            cond2 = True
        if j in index_call_filter:
            cond3 = True
        if j in index_image_filter:
            cond4 = True
    if cond1 and cond2 and cond3 and cond4:
        fp_visit_id.append(i)

In [85]:
len(fp_visit_id)

81

Each criterion separately

In [34]:
# lst1 = []
# lst2 = []
# lst3 = []
# lst4 = []

In [35]:
# for i in range(len(js)):
#     if i in index_size_filter:
#         lst1.append(js.iloc[i])
#     elif i in index_count_filter:
#         lst2.append(js.iloc[i])
#     elif i in index_call_filter:
#         lst3.append(js.iloc[i])
#     elif i in index_image_filter:
#         lst4.append(js.iloc[i])

In [36]:
# crt1 = pd.DataFrame(lst1)
# crt2 = pd.DataFrame(lst2)
# crt3 = pd.DataFrame(lst3)
# crt4 = pd.DataFrame(lst4)

In [37]:
# crt1[features_to_show]

All four criteria

In [38]:
# lst = []

In [39]:
# for i in range(len(js)):
#     if i in index_call_filter and i in index_count_filter and i in index_size_filter and i in index_image_filter:
#         lst.append(js.iloc[i])

In [40]:
# df = pd.DataFrame(lst)
# len(df)