In [2]:
import sqlite3
import pandas as pd

In [3]:
import re

In [4]:
pd.set_option("display.max_colwidth",500)
pd.set_option("display.max_rows",500)
features_to_show = ["crawl_id", "func_name", "top_level_url", "symbol", "operation", "value", "arguments"]

In [5]:
DB = 'sample_2018-06_1m_stateless_census_crawl.sqlite'

In [6]:
con = sqlite3.connect(DB)

In [7]:
con.row_factory = sqlite3.Row
cur = con.cursor()
js = pd.read_sql_query("SELECT * FROM javascript", con)

In [34]:
# js

In [8]:
js[js.operation == "call"].symbol.value_counts().head(15)

window.Storage.getItem                  46851
window.Storage.setItem                  18104
window.Storage.removeItem               13812
CanvasRenderingContext2D.fill            7258
CanvasRenderingContext2D.save            7074
CanvasRenderingContext2D.restore         7070
HTMLCanvasElement.getContext             4208
window.Storage.key                       3689
CanvasRenderingContext2D.measureText     3103
CanvasRenderingContext2D.stroke          2393
CanvasRenderingContext2D.arc             1558
CanvasRenderingContext2D.fillRect        1371
CanvasRenderingContext2D.clip            1327
CanvasRenderingContext2D.fillText         858
CanvasRenderingContext2D.rotate           537
Name: symbol, dtype: int64

In [9]:
js[(js.operation == "call") &
   (js.symbol == "CanvasRenderingContext2D.fillText")
  ].arguments.value_counts().head(10)

{"0":"Cwm fjordbank glyphs vext quiz, 😃","1":2,"2":15}    74
{"0":"Cwm fjordbank glyphs vext quiz, 😃","1":4,"2":45}    74
{"0":"!image!","1":4,"2":17}                              39
{"0":"!image!","1":2,"2":15}                              39
{"0":"!H71JCaj)]# 1@#","1":4,"2":8}                       19
{"0":"Soft Ruddy Foothold 2","1":2,"2":2}                 19
{"0":"🇺​🇳","1":0,"2":0}                                   18
{"0":"🇺🇳","1":0,"2":0}                                    18
{"0":"🕴​♀️","1":0,"2":0}                                  14
{"0":"09:30","1":5,"2":130}                               14
Name: arguments, dtype: int64

<b> Criteria 1 from Englehardt & Narayanan, 2016: </b>

To be a fingerprinting candidate: "1.The canvas element’s height and width properties must
not be set below 16 px."

In [10]:
def width_height_finder(data):
    w = re.compile('[^(line)]width', re.IGNORECASE)
    h = re.compile('[^(line)]height', re.IGNORECASE)
    ww = w.search(data)
    hh = h.search(data)
    if ww!= None:
        return data
    elif hh!=None:
        return data

In [11]:
l = list(map(width_height_finder, list(js[js.operation == "set"].symbol)))
l = [x for x in l if x is not None]

In [12]:
set(l)

{'HTMLCanvasElement.height', 'HTMLCanvasElement.width'}

In [13]:
MIN_CANVAS_IMAGE_WIDTH = 16
MIN_CANVAS_IMAGE_HEIGHT = 16


canvas_size = js[js.symbol.isin(["HTMLCanvasElement.width", "HTMLCanvasElement.height"])].value.astype(float)
index_size_filter = canvas_size.where(canvas_size > MIN_CANVAS_IMAGE_HEIGHT).dropna().index

In [14]:
index_size_filter

Int64Index([   638,    639,   1138,   1139,   1142,   1143,   1192,   1193,
              1241,   1242,
            ...
            501091, 501092, 501186, 501187, 501190, 501191, 501196, 501197,
            501200, 501201],
           dtype='int64', length=7559)

In [15]:
# js.iloc[index_size_filter][features_to_show]

Unnamed: 0,crawl_id,func_name,top_level_url,symbol,operation,value,arguments
638,14,[87]/i</e.prototype.getCanvasFp,https://vk.com/,HTMLCanvasElement.width,set,2000,
639,14,[87]/i</e.prototype.getCanvasFp,https://vk.com/,HTMLCanvasElement.height,set,200,
1138,3,h.createCanvasAndSolve,https://www.facebook.com/,HTMLCanvasElement.width,set,200,
1139,3,h.createCanvasAndSolve,https://www.facebook.com/,HTMLCanvasElement.height,set,200,
1142,3,g,https://www.facebook.com/,HTMLCanvasElement.width,get,200,
1143,3,g,https://www.facebook.com/,HTMLCanvasElement.height,get,200,
1192,3,g,https://www.facebook.com/,HTMLCanvasElement.width,get,200,
1193,3,g,https://www.facebook.com/,HTMLCanvasElement.height,get,200,
1241,3,h.createCanvasAndSolve,https://www.facebook.com/,HTMLCanvasElement.width,set,200,
1242,3,h.createCanvasAndSolve,https://www.facebook.com/,HTMLCanvasElement.height,set,200,


<b> Criteria 2 from Englehardt & Narayanan, 2016: </b> 

To be a fingerprinting candidate: "2.Text must be written to canvas with least two colors or
at least 10 distinct characters."

In [43]:
# js.symbol.value_counts()

In [44]:
# js[js.symbol == "CanvasRenderingContext2D.strokeStyle"]

In [17]:
import json 

def count_distinct_letters(text):
    if text!=None:
        t = json.loads(text, encoding='utf-8')['0']
        if not isinstance(t, int):
            return len(set(t))

In [18]:
CANVAS_WRITE_FUNCS = [
    "CanvasRenderingContext2D.fillText",
    "CanvasRenderingContext2D.strokeText",
    "CanvasRenderingContext2D.fill"
    ]

In [19]:
text = js[js.symbol.isin(CANVAS_WRITE_FUNCS)].arguments.dropna()
indexx = text.index
text = map(count_distinct_letters, text) 
res = pd.Series(text, index= indexx)
index_count_filter = res.where(res>=10).dropna().index

In [20]:
# js.iloc[index_count_filter][features_to_show]

Unnamed: 0,crawl_id,func_name,top_level_url,symbol,operation,value,arguments
650,14,[87]/i</e.prototype.getCanvasFp,https://vk.com/,CanvasRenderingContext2D.fillText,call,,"{""0"":""Cwm fjordbank glyphs vext quiz"",""1"":2,""2"":15}"
653,14,[87]/i</e.prototype.getCanvasFp,https://vk.com/,CanvasRenderingContext2D.fillText,call,,"{""0"":""Cwm fjordbank glyphs vext quiz"",""1"":4,""2"":45}"
2241,4,r.canvasFingerprint/<,https://www.linkedin.com/,CanvasRenderingContext2D.fillText,call,,"{""0"":""92UV<v=Xd&N@Ig_P#1iqrWHBoclz>FZkyYu4xf(O^A8TJh)mbnGs$S]3-k!%j0Q{+w[RCKEat?L56}M~`D7e*"",""1"":2,""2"":15}"
2245,4,r.canvasFingerprint/<,https://www.linkedin.com/,CanvasRenderingContext2D.fillText,call,,"{""0"":""92UV<v=Xd&N@Ig_P#1iqrWHBoclz>FZkyYu4xf(O^A8TJh)mbnGs$S]3-k!%j0Q{+w[RCKEat?L56}M~`D7e*"",""1"":4,""2"":17}"
2719,6,e.prototype.getCanvasFp,https://www.reddit.com/,CanvasRenderingContext2D.fillText,call,,"{""0"":""Cwm fjordbank glyphs vext quiz, 😃"",""1"":2,""2"":15}"
2725,6,e.prototype.getCanvasFp,https://www.reddit.com/,CanvasRenderingContext2D.fillText,call,,"{""0"":""Cwm fjordbank glyphs vext quiz, 😃"",""1"":4,""2"":45}"
5057,11,v,http://www.sohu.com/,CanvasRenderingContext2D.fillText,call,,"{""0"":""Cwm fjordbank glyphs vext quiz, 😃"",""1"":2,""2"":15}"
5060,11,v,http://www.sohu.com/,CanvasRenderingContext2D.fillText,call,,"{""0"":""Cwm fjordbank glyphs vext quiz, 😃"",""1"":4,""2"":45}"
5097,11,v,http://www.sohu.com/,CanvasRenderingContext2D.fillText,call,,"{""0"":""Cwm fjordbank glyphs vext quiz, 😃"",""1"":2,""2"":15}"
5102,11,v,http://www.sohu.com/,CanvasRenderingContext2D.fillText,call,,"{""0"":""Cwm fjordbank glyphs vext quiz, 😃"",""1"":4,""2"":45}"


<b> Criteria 3 from Englehardt & Narayanan, 2016 </b>

To be a fingerprinting candidate: "3. The script should not call the save, restore, or addEventListener
methods of the rendering context."

In [21]:
CANVAS_FP_DO_NOT_CALL_LIST = ["CanvasRenderingContext2D.save",
                              "CanvasRenderingContext2D.restore",
                              "HTMLCanvasElement.addEventListener"]

In [22]:
# js[js.operation == "call"][js.arguments!=None].arguments.value_counts().head(15)
index_call_filter = js[~js.symbol.isin(CANVAS_FP_DO_NOT_CALL_LIST)].index
index_call_filter

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            501197, 501198, 501199, 501200, 501201, 501202, 501203, 501204,
            501205, 501206],
           dtype='int64', length=486982)

In [23]:
# js.iloc[index_call_filter][features_to_show]

Unnamed: 0,crawl_id,func_name,top_level_url,symbol,operation,value,arguments
0,7,,https://www.google.co.in/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
1,7,,https://www.google.co.in/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
2,7,,https://www.google.co.in/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
3,11,,https://www.google.co.jp/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
4,7,,https://www.google.co.in/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
5,7,,https://www.google.co.in/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
6,7,,https://www.google.co.in/?gws_rd=ssl,window.navigator.platform,get,Linux x86_64,
7,7,mp,https://www.google.co.in/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
8,11,,https://www.google.co.jp/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
9,11,,https://www.google.co.jp/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,


In [52]:
# js[js.crawl_id==14].symbol.isin(CANVAS_FP_DO_NOT_CALL_LIST)

<b>Criteria 4 from Englehardt & Narayanan, 2016: </b>

To be a fingerprinting candidate: "4. The script extracts an image with toDataURL or with a
single call to getImageData that specifies an area with a
minimum size of 16px × 16px"

In [53]:
# js.symbol.value_counts()

In [24]:
CANVAS_READ_FUNCS = [
    "HTMLCanvasElement.toDataURL",
    "CanvasRenderingContext2D.getImageData"
    ]

In [25]:
# js[js.symbol.isin(CANVAS_READ_FUNCS)].arguments.dropna()

In [33]:
def check_dimensions(image):
    if image!=None:
        img = json.loads(image, encoding='utf-8')
        if len(img)>3:
            if img['2']>=16 and img['3']>=16:
                return image
        else: 
            return image

In [34]:
image = js[js.symbol.isin(CANVAS_READ_FUNCS)].arguments
indexx = image.index
image = map(check_dimensions, image)
res = pd.Series(image, index = indexx).dropna()
index_image_filter = res.index
index_image_filter

Int64Index([  3221,   6203,   6208,   6401,  39591,  39594,  39597,  39600,
             39603,  39606,
            ...
            339197, 339202, 339207, 345340, 415883, 422995, 443435, 460390,
            463353, 467632],
           dtype='int64', length=186)

In [55]:
# js

Unnamed: 0,id,crawl_id,visit_id,script_url,script_line,script_col,func_name,script_loc_eval,document_url,top_level_url,call_stack,symbol,operation,value,arguments,time_stamp
0,1,7,7,https://www.google.co.in/?gws_rd=ssl,1,3641,,,https://www.google.co.in/?gws_rd=ssl,https://www.google.co.in/?gws_rd=ssl,,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,,2018-06-27T14:19:39.880Z
1,2,7,7,https://www.google.co.in/?gws_rd=ssl,1,3731,,,https://www.google.co.in/?gws_rd=ssl,https://www.google.co.in/?gws_rd=ssl,,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,,2018-06-27T14:19:39.880Z
2,3,7,7,https://www.google.co.in/?gws_rd=ssl,1,3732,,,https://www.google.co.in/?gws_rd=ssl,https://www.google.co.in/?gws_rd=ssl,,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,,2018-06-27T14:19:39.882Z
3,4,11,11,https://www.google.co.jp/?gws_rd=ssl,1,3641,,,https://www.google.co.jp/?gws_rd=ssl,https://www.google.co.jp/?gws_rd=ssl,,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,,2018-06-27T14:19:39.950Z
4,5,7,7,https://www.google.co.in/?gws_rd=ssl,1,5173,,,https://www.google.co.in/?gws_rd=ssl,https://www.google.co.in/?gws_rd=ssl,,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,,2018-06-27T14:19:39.882Z
5,6,7,7,https://www.google.co.in/?gws_rd=ssl,39,36,,,https://www.google.co.in/?gws_rd=ssl,https://www.google.co.in/?gws_rd=ssl,,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,,2018-06-27T14:19:39.894Z
6,7,7,7,https://www.google.co.in/?gws_rd=ssl,39,776,,,https://www.google.co.in/?gws_rd=ssl,https://www.google.co.in/?gws_rd=ssl,,window.navigator.platform,get,Linux x86_64,,2018-06-27T14:19:39.894Z
7,8,7,7,https://www.google.co.in/?gws_rd=ssl,335,92,mp,,https://www.google.co.in/?gws_rd=ssl,https://www.google.co.in/?gws_rd=ssl,,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,,2018-06-27T14:19:39.932Z
8,9,11,11,https://www.google.co.jp/?gws_rd=ssl,1,3731,,,https://www.google.co.jp/?gws_rd=ssl,https://www.google.co.jp/?gws_rd=ssl,,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,,2018-06-27T14:19:39.950Z
9,10,11,11,https://www.google.co.jp/?gws_rd=ssl,1,3732,,,https://www.google.co.jp/?gws_rd=ssl,https://www.google.co.jp/?gws_rd=ssl,,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,,2018-06-27T14:19:39.950Z


In [54]:
# js.operation.value_counts()

get            329145
call           122271
set             49788
set(failed)         3
Name: operation, dtype: int64

At least 2 criteria

In [59]:
lst = []

In [60]:
for i in range(len(js)):
    if i in index_call_filter and i in index_count_filter:
        lst.append(js.iloc[i])
    elif i in index_call_filter and i in index_size_filter:
        lst.append(js.iloc[i])
    elif i in index_call_filter and i in index_image_filter:
        lst.append(js.iloc[i])
    elif i in index_count_filter and i in index_size_filter:
        lst.append(js.iloc[i])
    elif i in index_count_filter and i in index_image_filter:
        lst.append(js.iloc[i])
    elif i in index_image_filter and i in index_size_filter:
        lst.append(js.iloc[i])

In [61]:
df = pd.DataFrame(lst)
len(df)

8122

In [44]:
fp_visit_id = []
for i in set(js.visit_id):
    cond1 = cond2 = cond3 = cond4 = False
    for j in js[js.visit_id==i].index:
        if j in index_size_filter:
            cond1 = True
        if j in index_count_filter:
            cond2 = True
        if j in index_call_filter:
            cond3 = True
        if j in index_image_filter:
            cond4 = True
    if cond1 and cond2 and cond3 and cond4:
        fp_visit_id.append(i)

In [51]:
fp_visit_id

[18, 19, 139, 201, 254, 382, 390, 407, 465, 512, 697, 874, 939]

In [52]:
# js[js.visit_id.isin(fp_visit_id)]

Unnamed: 0,id,crawl_id,visit_id,script_url,script_line,script_col,func_name,script_loc_eval,document_url,top_level_url,call_stack,symbol,operation,value,arguments,time_stamp
2249,2250,5,19,http://www.sina.com.cn/,23,414,,,http://www.sina.com.cn/,http://www.sina.com.cn/,,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,,2018-06-27T14:20:01.218Z
3218,3219,11,18,http://39d0825d09f05.cdn.sohucs.com/sdk/passport-4.0.3.js,1,6062,browser<,,http://www.sohu.com/,http://www.sohu.com/,,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,,2018-06-27T14:20:04.584Z
3219,3220,11,18,http://39d0825d09f05.cdn.sohucs.com/sdk/passport-4.0.3.js,1,6316,browser<,,http://www.sohu.com/,http://www.sohu.com/,,window.navigator.vendor,get,,,2018-06-27T14:20:04.586Z
3220,3221,11,18,http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js,1,4865,i.detectWebP,,http://www.sohu.com/,http://www.sohu.com/,,HTMLCanvasElement.getContext,call,,"{""0"":""2d""}",2018-06-27T14:20:04.606Z
3221,3222,11,18,http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js,1,4889,i.detectWebP,,http://www.sohu.com/,http://www.sohu.com/,,HTMLCanvasElement.toDataURL,call,,"{""0"":""image/webp""}",2018-06-27T14:20:04.606Z
3222,3223,11,18,http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js,1,4979,i.detectStorage,,http://www.sohu.com/,http://www.sohu.com/,,window.localStorage,get,{},,2018-06-27T14:20:04.608Z
3223,3224,11,18,http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js,1,4979,i.detectStorage,,http://www.sohu.com/,http://www.sohu.com/,,window.Storage.setItem,call,,"{""0"":""test"",""1"":""test""}",2018-06-27T14:20:04.610Z
3224,3225,11,18,http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js,1,5015,i.detectStorage,,http://www.sohu.com/,http://www.sohu.com/,,window.localStorage,get,"{""test"":""test""}",,2018-06-27T14:20:04.610Z
3225,3226,11,18,http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js,1,5015,i.detectStorage,,http://www.sohu.com/,http://www.sohu.com/,,window.Storage.removeItem,call,,"{""0"":""test""}",2018-06-27T14:20:04.610Z
3226,3227,11,18,http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js,1,658,,,http://www.sohu.com/,http://www.sohu.com/,@http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js:1:658\nt@http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js:1:102\n@http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js:2:179\nt@http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js:1:102\n@http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js:1:7016\nt@http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js:1:102\n@http://statics.itc.cn/web/v3/static/js/main-f895b2f9d0.js:1:376\n@http://statics.itc.cn/web/v3/...,window.document.cookie,get,,,2018-06-27T14:20:04.610Z


Each criterion separately

In [34]:
# lst1 = []
# lst2 = []
# lst3 = []
# lst4 = []

In [35]:
# for i in range(len(js)):
#     if i in index_size_filter:
#         lst1.append(js.iloc[i])
#     elif i in index_count_filter:
#         lst2.append(js.iloc[i])
#     elif i in index_call_filter:
#         lst3.append(js.iloc[i])
#     elif i in index_image_filter:
#         lst4.append(js.iloc[i])

In [36]:
# crt1 = pd.DataFrame(lst1)
# crt2 = pd.DataFrame(lst2)
# crt3 = pd.DataFrame(lst3)
# crt4 = pd.DataFrame(lst4)

In [37]:
# crt1[features_to_show]

All four criteria

In [38]:
# lst = []

In [39]:
# for i in range(len(js)):
#     if i in index_call_filter and i in index_count_filter and i in index_size_filter and i in index_image_filter:
#         lst.append(js.iloc[i])

In [40]:
# df = pd.DataFrame(lst)
# len(df)