In [188]:
import sqlite3
import pandas as pd

In [189]:
import re

In [248]:
pd.set_option("display.max_colwidth",500)
pd.set_option("display.max_rows",500)
features_to_show = ["crawl_id", "func_name", "top_level_url", "symbol", "operation", "value", "arguments"]

In [247]:
DB = 'sample_2018-06_1m_stateless_census_crawl.sqlite'

In [192]:
con = sqlite3.connect(DB)

In [193]:
con.row_factory = sqlite3.Row
cur = con.cursor()
js = pd.read_sql_query("SELECT * FROM javascript", con)

In [194]:
js[js.operation == "call"].symbol.value_counts().head(15)

window.Storage.getItem                  46851
window.Storage.setItem                  18104
window.Storage.removeItem               13812
CanvasRenderingContext2D.fill            7258
CanvasRenderingContext2D.save            7074
CanvasRenderingContext2D.restore         7070
HTMLCanvasElement.getContext             4208
window.Storage.key                       3689
CanvasRenderingContext2D.measureText     3103
CanvasRenderingContext2D.stroke          2393
CanvasRenderingContext2D.arc             1558
CanvasRenderingContext2D.fillRect        1371
CanvasRenderingContext2D.clip            1327
CanvasRenderingContext2D.fillText         858
CanvasRenderingContext2D.rotate           537
Name: symbol, dtype: int64

In [195]:
js[(js.operation == "call") &
   (js.symbol == "CanvasRenderingContext2D.fillText")
  ].arguments.value_counts().head(10)

{"0":"Cwm fjordbank glyphs vext quiz, üòÉ","1":2,"2":15}    74
{"0":"Cwm fjordbank glyphs vext quiz, üòÉ","1":4,"2":45}    74
{"0":"!image!","1":4,"2":17}                              39
{"0":"!image!","1":2,"2":15}                              39
{"0":"Soft Ruddy Foothold 2","1":2,"2":2}                 19
{"0":"!H71JCaj)]# 1@#","1":4,"2":8}                       19
{"0":"üá∫‚Äãüá≥","1":0,"2":0}                                   18
{"0":"üá∫üá≥","1":0,"2":0}                                    18
{"0":"09:30","1":5,"2":130}                               14
{"0":"üï¥‚Äã‚ôÄÔ∏è","1":0,"2":0}                                  14
Name: arguments, dtype: int64

<b> Criteria 1 from Englehardt & Narayanan, 2016: </b>

To be a fingerprinting candidate: "1.The canvas element‚Äôs height and width properties must
not be set below 16 px."

In [196]:
def width_height_finder(data):
    w = re.compile('[^(line)]width', re.IGNORECASE)
    h = re.compile('[^(line)]height', re.IGNORECASE)
    ww = w.search(data)
    hh = h.search(data)
    if ww!= None:
        return data
    elif hh!=None:
        return data

In [197]:
l = list(map(width_height_finder, list(js[js.operation == "set"].symbol)))
l = [x for x in l if x is not None]

In [198]:
set(l)

{'HTMLCanvasElement.height', 'HTMLCanvasElement.width'}

In [199]:
MIN_CANVAS_IMAGE_WIDTH = 16
MIN_CANVAS_IMAGE_HEIGHT = 16


canvas_size = js[js.symbol.isin(["HTMLCanvasElement.width", "HTMLCanvasElement.height"])].value.astype(float)
index_size_filter = canvas_size.where(canvas_size > MIN_CANVAS_IMAGE_HEIGHT).dropna().index

In [200]:
index_size_filter

Int64Index([   638,    639,   1138,   1139,   1142,   1143,   1192,   1193,
              1241,   1242,
            ...
            501091, 501092, 501186, 501187, 501190, 501191, 501196, 501197,
            501200, 501201],
           dtype='int64', length=7559)

In [249]:
js.iloc[index_size_filter][features_to_show]

Unnamed: 0,crawl_id,func_name,top_level_url,symbol,operation,value,arguments
638,14,[87]/i</e.prototype.getCanvasFp,https://vk.com/,HTMLCanvasElement.width,set,2000,
639,14,[87]/i</e.prototype.getCanvasFp,https://vk.com/,HTMLCanvasElement.height,set,200,
1138,3,h.createCanvasAndSolve,https://www.facebook.com/,HTMLCanvasElement.width,set,200,
1139,3,h.createCanvasAndSolve,https://www.facebook.com/,HTMLCanvasElement.height,set,200,
1142,3,g,https://www.facebook.com/,HTMLCanvasElement.width,get,200,
1143,3,g,https://www.facebook.com/,HTMLCanvasElement.height,get,200,
1192,3,g,https://www.facebook.com/,HTMLCanvasElement.width,get,200,
1193,3,g,https://www.facebook.com/,HTMLCanvasElement.height,get,200,
1241,3,h.createCanvasAndSolve,https://www.facebook.com/,HTMLCanvasElement.width,set,200,
1242,3,h.createCanvasAndSolve,https://www.facebook.com/,HTMLCanvasElement.height,set,200,


<b> Criteria 2 from Englehardt & Narayanan, 2016: </b> 

To be a fingerprinting candidate: "2.Text must be written to canvas with least two colors or
at least 10 distinct characters."

In [202]:
# js.symbol.value_counts()

window.document.cookie                               136037
window.navigator.userAgent                            85408
window.localStorage                                   48576
window.Storage.getItem                                46851
window.Storage.setItem                                18104
window.Storage.removeItem                             13812
window.name                                           13089
window.navigator.appName                              12283
window.sessionStorage                                 10357
CanvasRenderingContext2D.fillStyle                     8632
window.screen.colorDepth                               7806
CanvasRenderingContext2D.fill                          7258
CanvasRenderingContext2D.save                          7074
CanvasRenderingContext2D.restore                       7070
window.navigator.platform                              5187
window.navigator.appVersion                            4860
window.navigator.language               

In [203]:
# js[js.symbol == "CanvasRenderingContext2D.strokeStyle"]

Unnamed: 0,id,crawl_id,visit_id,script_url,script_line,script_col,func_name,script_loc_eval,document_url,top_level_url,call_stack,symbol,operation,value,arguments,time_stamp
1162,1163,3,3,https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/yu/l/en_US/aArLzhwqJVj.js,19,2584,g.prototype.drawCircle,,https://www.facebook.com/,https://www.facebook.com/,,CanvasRenderingContext2D.strokeStyle,set,{},,2018-06-27T14:19:44.150Z
1212,1213,3,3,https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/yu/l/en_US/aArLzhwqJVj.js,19,2584,g.prototype.drawCircle,,https://www.facebook.com/,https://www.facebook.com/,,CanvasRenderingContext2D.strokeStyle,set,{},,2018-06-27T14:19:44.194Z
1268,1269,3,3,https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/yu/l/en_US/aArLzhwqJVj.js,19,2584,g.prototype.drawCircle,,https://www.facebook.com/,https://www.facebook.com/,,CanvasRenderingContext2D.strokeStyle,set,{},,2018-06-27T14:19:44.270Z
1320,1321,3,3,https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/yu/l/en_US/aArLzhwqJVj.js,19,2584,g.prototype.drawCircle,,https://www.facebook.com/,https://www.facebook.com/,,CanvasRenderingContext2D.strokeStyle,set,{},,2018-06-27T14:19:44.316Z
20930,20931,4,72,https://code.createjs.com/createjs-2015.11.26.min.js,12,28940,b.exec,,https://s0.2mdn.net/2276943/1512572759808/index.html,https://www.cnn.com/,,CanvasRenderingContext2D.strokeStyle,set,#FFFFFF,,2018-06-27T14:21:39.168Z
22016,22017,4,72,https://code.createjs.com/createjs-2015.11.26.min.js,12,28940,b.exec,,https://s0.2mdn.net/2276943/1512572759808/index.html,https://www.cnn.com/,,CanvasRenderingContext2D.strokeStyle,set,#FFFFFF,,2018-06-27T14:21:40.144Z
23072,23073,4,72,https://code.createjs.com/createjs-2015.11.26.min.js,12,28940,b.exec,,https://s0.2mdn.net/2276943/1512572759808/index.html,https://www.cnn.com/,,CanvasRenderingContext2D.strokeStyle,set,#FFFFFF,,2018-06-27T14:21:40.612Z
23641,23642,4,72,https://code.createjs.com/createjs-2015.11.26.min.js,12,28940,b.exec,,https://s0.2mdn.net/2276943/1512572759808/index.html,https://www.cnn.com/,,CanvasRenderingContext2D.strokeStyle,set,#FFFFFF,,2018-06-27T14:21:40.794Z
23651,23652,4,72,https://code.createjs.com/createjs-2015.11.26.min.js,12,28940,b.exec,,https://s0.2mdn.net/2276943/1512572759808/index.html,https://www.cnn.com/,,CanvasRenderingContext2D.strokeStyle,set,#FFFFFF,,2018-06-27T14:21:41.054Z
23906,23907,4,72,https://code.createjs.com/createjs-2015.11.26.min.js,12,28940,b.exec,,https://s0.2mdn.net/2276943/1512572759808/index.html,https://www.cnn.com/,,CanvasRenderingContext2D.strokeStyle,set,#FFFFFF,,2018-06-27T14:21:45.126Z


In [204]:
import json 

def count_distinct_letters(text):
    if text!=None:
        t = json.loads(text, encoding='utf-8')['0']
        if not isinstance(t, int):
            return len(set(t))

In [205]:
CANVAS_WRITE_FUNCS = [
    "CanvasRenderingContext2D.fillText",
    "CanvasRenderingContext2D.strokeText",
    "CanvasRenderingContext2D.fill"
    ]

In [206]:
text = js[js.symbol.isin(CANVAS_WRITE_FUNCS)].arguments.dropna()
indexx = text.index
text = map(count_distinct_letters, text) 
res = pd.Series(text, index= indexx)
index_count_filter = res.where(res>=10).dropna().index

In [252]:
js.iloc[index_count_filter][features_to_show]

Unnamed: 0,crawl_id,top_level_url,arguments
650,14,https://vk.com/,"{""0"":""Cwm fjordbank glyphs vext quiz"",""1"":2,""2"":15}"
653,14,https://vk.com/,"{""0"":""Cwm fjordbank glyphs vext quiz"",""1"":4,""2"":45}"
2241,4,https://www.linkedin.com/,"{""0"":""92UV<v=Xd&N@Ig_P#1iqrWHBoclz>FZkyYu4xf(O^A8TJh)mbnGs$S]3-k!%j0Q{+w[RCKEat?L56}M~`D7e*"",""1"":2,""2"":15}"
2245,4,https://www.linkedin.com/,"{""0"":""92UV<v=Xd&N@Ig_P#1iqrWHBoclz>FZkyYu4xf(O^A8TJh)mbnGs$S]3-k!%j0Q{+w[RCKEat?L56}M~`D7e*"",""1"":4,""2"":17}"
2719,6,https://www.reddit.com/,"{""0"":""Cwm fjordbank glyphs vext quiz, üòÉ"",""1"":2,""2"":15}"
2725,6,https://www.reddit.com/,"{""0"":""Cwm fjordbank glyphs vext quiz, üòÉ"",""1"":4,""2"":45}"
5057,11,http://www.sohu.com/,"{""0"":""Cwm fjordbank glyphs vext quiz, üòÉ"",""1"":2,""2"":15}"
5060,11,http://www.sohu.com/,"{""0"":""Cwm fjordbank glyphs vext quiz, üòÉ"",""1"":4,""2"":45}"
5097,11,http://www.sohu.com/,"{""0"":""Cwm fjordbank glyphs vext quiz, üòÉ"",""1"":2,""2"":15}"
5102,11,http://www.sohu.com/,"{""0"":""Cwm fjordbank glyphs vext quiz, üòÉ"",""1"":4,""2"":45}"


<b> Criteria 3 from Englehardt & Narayanan, 2016 </b>

To be a fingerprinting candidate: "3. The script should not call the save, restore, or addEventListener
methods of the rendering context."

In [208]:
CANVAS_FP_DO_NOT_CALL_LIST = ["CanvasRenderingContext2D.save",
                              "CanvasRenderingContext2D.restore",
                              "HTMLCanvasElement.addEventListener"]

In [209]:
# js[js.operation == "call"][js.arguments!=None].arguments.value_counts().head(15)
index_call_filter = js[~js.symbol.isin(CANVAS_FP_DO_NOT_CALL_LIST)].index
index_call_filter

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            501197, 501198, 501199, 501200, 501201, 501202, 501203, 501204,
            501205, 501206],
           dtype='int64', length=486982)

In [272]:
js.iloc[index_call_filter][features_to_show]

Unnamed: 0,crawl_id,func_name,top_level_url,symbol,operation,value,arguments
0,7,,https://www.google.co.in/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
1,7,,https://www.google.co.in/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
2,7,,https://www.google.co.in/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
3,11,,https://www.google.co.jp/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
4,7,,https://www.google.co.in/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
5,7,,https://www.google.co.in/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
6,7,,https://www.google.co.in/?gws_rd=ssl,window.navigator.platform,get,Linux x86_64,
7,7,mp,https://www.google.co.in/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
8,11,,https://www.google.co.jp/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,
9,11,,https://www.google.co.jp/?gws_rd=ssl,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,


In [296]:
js[js.crawl_id==14].symbol.isin(CANVAS_FP_DO_NOT_CALL_LIST)

621       False
622       False
623       False
624       False
625       False
626       False
627       False
628       False
629       False
630       False
631       False
632       False
633       False
634       False
635       False
636       False
637       False
638       False
639       False
640       False
641       False
642       False
643       False
644       False
645       False
646       False
647       False
648       False
649       False
650       False
651       False
652       False
653       False
654       False
655       False
656       False
657       False
658       False
659       False
660       False
661       False
662       False
663       False
664       False
665       False
666       False
667       False
668       False
669       False
670       False
671       False
672       False
673       False
674       False
675       False
676       False
677       False
678       False
679       False
680       False
681       False
682       False
683     

<b>Criteria 4 from Englehardt & Narayanan, 2016: </b>

To be a fingerprinting candidate: "4. The script extracts an image with toDataURL or with a
single call to getImageData that specifies an area with a
minimum size of 16px √ó 16px"

In [232]:
# js.symbol.value_counts()

In [212]:
CANVAS_READ_FUNCS = [
    "HTMLCanvasElement.toDataURL",
    "CanvasRenderingContext2D.getImageData"
    ]

In [233]:
# js[js.symbol.isin(CANVAS_READ_FUNCS)].arguments.dropna()

In [283]:
def check_dimensions(image):
    if image!=None:
        img = json.loads(image, encoding='utf-8')
        if len(img)>3 and img['2']>=16 and img['3']>=16:
            return image

In [290]:
image = js[js.symbol.isin(CANVAS_READ_FUNCS)].arguments
indexx = image.index
image = map(check_dimensions, image)
res = pd.Series(image, index = indexx)
index_image_filter = res.index
index_image_filter

Int64Index([   668,    672,   1190,   1240,   1298,   1349,   2246,   2750,
              2754,   3221,
            ...
            500611, 500615, 501080, 501084, 501090, 501094, 501189, 501193,
            501199, 501203],
           dtype='int64', length=682)

In [294]:
js.iloc[index_image_filter][features_to_show]

Unnamed: 0,crawl_id,func_name,top_level_url,symbol,operation,value,arguments
668,14,[87]/i</e.prototype.getCanvasFp,https://vk.com/,HTMLCanvasElement.toDataURL,call,,
672,14,[87]/i</e.prototype.getWebglFp,https://vk.com/,HTMLCanvasElement.toDataURL,call,,
1190,3,h.solve,https://www.facebook.com/,HTMLCanvasElement.toDataURL,call,,
1240,3,h.solve,https://www.facebook.com/,HTMLCanvasElement.toDataURL,call,,
1298,3,h.solve,https://www.facebook.com/,HTMLCanvasElement.toDataURL,call,,
1349,3,h.solve,https://www.facebook.com/,HTMLCanvasElement.toDataURL,call,,
2246,4,r.canvasFingerprint,https://www.linkedin.com/,HTMLCanvasElement.toDataURL,call,,
2750,6,e.prototype.getCanvasFp,https://www.reddit.com/,HTMLCanvasElement.toDataURL,call,,
2754,6,e.prototype.getWebglFp,https://www.reddit.com/,HTMLCanvasElement.toDataURL,call,,
3221,11,i.detectWebP,http://www.sohu.com/,HTMLCanvasElement.toDataURL,call,,"{""0"":""image/webp""}"


At least 2 criteria

In [217]:
lst = []

In [218]:
for i in range(len(js)):
    if i in index_call_filter and i in index_count_filter:
        lst.append(js.iloc[i])
    elif i in index_call_filter and i in index_size_filter:
        lst.append(js.iloc[i])
    elif i in index_call_filter and i in index_image_filter:
        lst.append(js.iloc[i])
    elif i in index_count_filter and i in index_size_filter:
        lst.append(js.iloc[i])
    elif i in index_count_filter and i in index_image_filter:
        lst.append(js.iloc[i])
    elif i in index_image_filter and i in index_size_filter:
        lst.append(js.iloc[i])

In [219]:
df = pd.DataFrame(lst)
len(df)

8122

Each criterion separately

In [220]:
# lst1 = []
# lst2 = []
# lst3 = []
# lst4 = []

In [221]:
# for i in range(len(js)):
#     if i in index_size_filter:
#         lst1.append(js.iloc[i])
#     elif i in index_count_filter:
#         lst2.append(js.iloc[i])
#     elif i in index_call_filter:
#         lst3.append(js.iloc[i])
#     elif i in index_image_filter:
#         lst4.append(js.iloc[i])

In [222]:
# crt1 = pd.DataFrame(lst1)
# crt2 = pd.DataFrame(lst2)
# crt3 = pd.DataFrame(lst3)
# crt4 = pd.DataFrame(lst4)

In [238]:
# crt1[features_to_show]

All four criteria

In [224]:
# lst = []

In [225]:
# for i in range(len(js)):
#     if i in index_call_filter and i in index_count_filter and i in index_size_filter and i in index_image_filter:
#         lst.append(js.iloc[i])

In [226]:
# df = pd.DataFrame(lst)
# len(df)

0