In [1]:
import pandas as pd
import re

In [2]:
js = pd.read_csv('10k.csv')

In [3]:
len(js.visit_id.value_counts())

8503

In [5]:
js[js.operation == "call"].symbol.value_counts().head(15)

CanvasRenderingContext2D.setTransform            1361759
CanvasRenderingContext2D.closePath                387245
CanvasRenderingContext2D.beginPath                274705
CanvasRenderingContext2D.clearRect                255852
window.Storage.getItem                            245579
CanvasRenderingContext2D.fill                     239270
CanvasRenderingContext2D.save                     223513
CanvasRenderingContext2D.restore                  221344
window.Storage.setItem                            212565
HTMLCanvasElement.getContext                      151449
CanvasRenderingContext2D.fillRect                 129560
CanvasRenderingContext2D.createLinearGradient      87621
CanvasRenderingContext2D.fillText                  75467
CanvasRenderingContext2D.arc                       67729
CanvasRenderingContext2D.bezierCurveTo             64692
Name: symbol, dtype: int64

<b>Criteria 1 from Englehardt & Narayanan, 2016:</b>

To be a fingerprinting candidate: "1.The canvas element’s height and width properties must not be set below 16 px."

In [4]:
def width_height_finder(data):
    w = re.compile('[^(line)]width', re.IGNORECASE)
    h = re.compile('[^(line)]height', re.IGNORECASE)
    ww = w.search(data)
    hh = h.search(data)
    if ww!= None:
        return data
    elif hh!=None:
        return data

In [5]:
l = list(map(width_height_finder, list(js[js.operation == "set"].symbol)))
l = [x for x in l if x is not None]

In [6]:
set(l)

{'HTMLCanvasElement.height', 'HTMLCanvasElement.width'}

In [7]:
MIN_CANVAS_IMAGE_WIDTH = 16
MIN_CANVAS_IMAGE_HEIGHT = 16


canvas_size_px = js[js.symbol.isin(["HTMLCanvasElement.width", "HTMLCanvasElement.height"])].value.astype(str)
canvas_size_px = canvas_size_px.apply(lambda x: re.sub("px", "", x))
canvas_size = pd.Series(canvas_size_px).astype(float)
index_size_filter = canvas_size.where(canvas_size > MIN_CANVAS_IMAGE_HEIGHT).dropna().index

In [8]:
index_size_filter

Int64Index([    841,     842,    1097,    1098,    1607,    1608,    1665,
               1666,    1706,    1707,
            ...
            7326167, 7326168, 7326179, 7326180, 7326195, 7326196, 7326210,
            7326211, 7326225, 7326226],
           dtype='int64', length=275219)

In [9]:
# js.iloc[index_size_filter]

<b> Criteria 2 from Englehardt & Narayanan, 2016: </b> 

To be a fingerprinting candidate: "2.Text must be written to canvas with least two colors or
at least 10 distinct characters."

In [10]:
#TODO: pindex, pvalue to arguments

In [11]:
def count_distinct_letters(text):
    if text!=None:
        if not isinstance(text, int):
            return len(set(text))

In [12]:
CANVAS_WRITE_FUNCS = [
    "CanvasRenderingContext2D.fillText",
    "CanvasRenderingContext2D.strokeText",
    "CanvasRenderingContext2D.fill"
    ]

In [13]:
text = js[js.symbol.isin(CANVAS_WRITE_FUNCS) & (js.pindex == 0)].pvalue.dropna()
indexx = text.index
text = map(count_distinct_letters, text) 
res = pd.Series(text, index= indexx)
index_count_filter = res.where(res>=10).dropna().index

In [14]:
js.iloc[index_count_filter]

Unnamed: 0,id,operation,symbol,pindex,pvalue,value,visit_id,crawl_id,top_url,public_suffix,url
1120,1121,call,CanvasRenderingContext2D.fillText,0.0,Cwm fjordbank glyphs vext quiz,,11,1,http://taobao.com,,http://g.alicdn.com/alilog/mlog/aplus_v2.js
1125,1126,call,CanvasRenderingContext2D.fillText,0.0,Cwm fjordbank glyphs vext quiz,,11,1,http://taobao.com,,http://g.alicdn.com/alilog/mlog/aplus_v2.js
2998,2999,call,CanvasRenderingContext2D.fillText,0.0,Cwm fjordbank glyphs vext quiz,,27,1,http://tmall.com,,https://g.alicdn.com/alilog/mlog/aplus_v2.js
3003,3004,call,CanvasRenderingContext2D.fillText,0.0,Cwm fjordbank glyphs vext quiz,,27,1,http://tmall.com,,https://g.alicdn.com/alilog/mlog/aplus_v2.js
36242,36243,call,CanvasRenderingContext2D.fillText,0.0,fpcf canvasing text,,97,1,http://dropbox.com,,https://cf.dropboxstatic.com/static/javascript...
64743,64744,call,CanvasRenderingContext2D.fillText,0.0,Cwm fjordbank glyphs vext quiz,,135,1,http://cnzz.com,,http://a.tbcdn.cn/s/aplus_v2.js
64748,64749,call,CanvasRenderingContext2D.fillText,0.0,Cwm fjordbank glyphs vext quiz,,135,1,http://cnzz.com,,http://a.tbcdn.cn/s/aplus_v2.js
71113,71114,call,CanvasRenderingContext2D.fillText,0.0,Cwm fjordbank glyphs vext quiz,,152,1,http://detail.tmall.com,,http://g.alicdn.com/alilog/mlog/aplus_v2.js
71118,71119,call,CanvasRenderingContext2D.fillText,0.0,Cwm fjordbank glyphs vext quiz,,152,1,http://detail.tmall.com,,http://g.alicdn.com/alilog/mlog/aplus_v2.js
113102,113103,call,CanvasRenderingContext2D.fillText,0.0,http://valve.github.io,,210,1,http://groupon.com,,https://assets4.grouponcdn.com/layout/assets/s...


<b> Criteria 3 from Englehardt & Narayanan, 2016 </b>

To be a fingerprinting candidate: "3. The script should not call the save, restore, or addEventListener
methods of the rendering context."

In [15]:
CANVAS_FP_DO_NOT_CALL_LIST = ["CanvasRenderingContext2D.save",
                              "CanvasRenderingContext2D.restore",
                              "HTMLCanvasElement.addEventListener"]

In [16]:
index_call_filter = js[~js.symbol.isin(CANVAS_FP_DO_NOT_CALL_LIST)].index
index_call_filter

Int64Index([      0,       1,       2,       3,       4,       5,       6,
                  7,       8,       9,
            ...
            7328705, 7328706, 7328707, 7328708, 7328709, 7328710, 7328711,
            7328712, 7328713, 7328714],
           dtype='int64', length=6881420)

In [17]:
js.iloc[index_call_filter]

Unnamed: 0,id,operation,symbol,pindex,pvalue,value,visit_id,crawl_id,top_url,public_suffix,url
0,1,get,window.navigator.userAgent,,,Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko...,1,1,http://google.com,,https://www.google.com/?gws_rd=ssl
1,2,get,window.navigator.userAgent,,,Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko...,1,1,http://google.com,,https://www.google.com/?gws_rd=ssl
2,3,get,window.navigator.userAgent,,,Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko...,1,1,http://google.com,,https://www.google.com/?gws_rd=ssl
3,4,get,window.navigator.userAgent,,,Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko...,1,1,http://google.com,,https://www.google.com/?gws_rd=ssl
4,5,get,window.navigator.userAgent,,,Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko...,1,1,http://google.com,,https://www.google.com/?gws_rd=ssl
5,6,get,window.navigator.platform,,,Linux x86_64,1,1,http://google.com,,https://www.google.com/?gws_rd=ssl
6,7,get,window.navigator.plugins[0].description,,,Shockwave Flash 11.2 r202,1,1,http://google.com,,https://www.google.com/?gws_rd=ssl
7,8,get,window.navigator.plugins[0].description,,,Shockwave Flash 11.2 r202,1,1,http://google.com,,https://www.google.com/?gws_rd=ssl
8,9,get,window.navigator.userAgent,,,Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko...,1,1,http://google.com,,https://www.google.com/?gws_rd=ssl
9,10,get,window.navigator.userAgent,,,Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko...,1,1,http://google.com,,https://www.google.com/xjs/_/js/k=xjs.s.en_US....


<b>Criteria 4 from Englehardt & Narayanan, 2016: </b>

To be a fingerprinting candidate: "4. The script extracts an image with toDataURL or with a
single call to getImageData that specifies an area with a
minimum size of 16px × 16px"

In [18]:
CANVAS_READ_FUNCS = [
    "HTMLCanvasElement.toDataURL",
    "CanvasRenderingContext2D.getImageData"
    ]

In [19]:
def check_dimensions(image):
    if image!=None:
        img = re.search(r'\d+', image).group(0)
        if float(img)>=16:
            return img

In [20]:
image = js[(js.symbol.isin(CANVAS_READ_FUNCS)) & (js.pindex.isin([2, 3]))].pvalue.astype(str)
indexx = image.index
image = map(check_dimensions, image)
res = pd.Series(image, index = indexx).dropna()
index_image_filter = res.index
index_image_filter

Int64Index([   8343,    8344,    8481,    8482,    8555,    8556,    9209,
               9210,    9246,    9530,
            ...
            7315660, 7315661, 7315703, 7315704, 7325435, 7325436, 7325529,
            7325530, 7325622, 7325623],
           dtype='int64', length=14637)

In [21]:
len(js.iloc[index_image_filter]["visit_id"].value_counts())

370

In [22]:
fp_visit_id = []
for i in set(js.visit_id):
    cond1 = cond2 = cond3 = cond4 = False
    for j in js[js.visit_id==i].index:
        if j in index_size_filter:
            cond1 = True
        if j in index_count_filter:
            cond2 = True
        if j in index_call_filter:
            cond3 = True
        if j in index_image_filter:
            cond4 = True
    if cond1 and cond2 and cond3 and cond4:
        fp_visit_id.append(i)

In [23]:
len(fp_visit_id) #out of 8503

27

In [28]:
js[js.visit_id.isin(fp_visit_id)]

Unnamed: 0,id,operation,symbol,pindex,pvalue,value,visit_id,crawl_id,top_url,public_suffix,url
36223,36224,get,window.navigator.platform,,,Linux x86_64,97,1,http://dropbox.com,,https://cf.dropboxstatic.com/static/javascript...
36224,36225,get,window.sessionStorage,,,{},97,1,http://dropbox.com,,https://cf.dropboxstatic.com/static/javascript...
36225,36226,get,window.localStorage,,,{},97,1,http://dropbox.com,,https://cf.dropboxstatic.com/static/javascript...
36226,36227,get,window.navigator.userAgent,,,Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko...,97,1,http://dropbox.com,,https://cf.dropboxstatic.com/static/javascript...
36227,36228,get,window.navigator.userAgent,,,Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko...,97,1,http://dropbox.com,,https://cf.dropboxstatic.com/static/javascript...
36228,36229,get,window.navigator.userAgent,,,Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko...,97,1,http://dropbox.com,,https://cf.dropboxstatic.com/static/javascript...
36229,36230,get,window.navigator.userAgent,,,Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko...,97,1,http://dropbox.com,,https://cf.dropboxstatic.com/static/javascript...
36230,36231,set,HTMLCanvasElement.width,,,280,97,1,http://dropbox.com,,https://cf.dropboxstatic.com/static/javascript...
36231,36232,set,HTMLCanvasElement.height,,,60,97,1,http://dropbox.com,,https://cf.dropboxstatic.com/static/javascript...
36232,36233,get,HTMLCanvasElement.style,,,{},97,1,http://dropbox.com,,https://cf.dropboxstatic.com/static/javascript...


In [24]:
lst = []

In [25]:
for i in range(len(js)):
    if i in index_call_filter and i in index_count_filter:
        lst.append(js.iloc[i])
    elif i in index_call_filter and i in index_size_filter:
        lst.append(js.iloc[i])
    elif i in index_call_filter and i in index_image_filter:
        lst.append(js.iloc[i])
    elif i in index_count_filter and i in index_size_filter:
        lst.append(js.iloc[i])
    elif i in index_count_filter and i in index_image_filter:
        lst.append(js.iloc[i])
    elif i in index_image_filter and i in index_size_filter:
        lst.append(js.iloc[i])

In [26]:
df = pd.DataFrame(lst)

Unnamed: 0,id,operation,symbol,pindex,pvalue,value,visit_id,crawl_id,top_url,public_suffix,url
841,842,set,HTMLCanvasElement.height,,,60,8,1,http://qq.com,,http://cdn.tanx.com/t/acookie/acbeacon2.html#m...
842,843,set,HTMLCanvasElement.width,,,400,8,1,http://qq.com,,http://cdn.tanx.com/t/acookie/acbeacon2.html#m...
1097,1098,set,HTMLCanvasElement.width,,,2000,11,1,http://taobao.com,,http://g.alicdn.com/alilog/mlog/aplus_v2.js
1098,1099,set,HTMLCanvasElement.height,,,200,11,1,http://taobao.com,,http://g.alicdn.com/alilog/mlog/aplus_v2.js
1120,1121,call,CanvasRenderingContext2D.fillText,0.0,Cwm fjordbank glyphs vext quiz,,11,1,http://taobao.com,,http://g.alicdn.com/alilog/mlog/aplus_v2.js
1125,1126,call,CanvasRenderingContext2D.fillText,0.0,Cwm fjordbank glyphs vext quiz,,11,1,http://taobao.com,,http://g.alicdn.com/alilog/mlog/aplus_v2.js
1607,1608,set,HTMLCanvasElement.height,,,60,13,1,http://sina.com.cn,,http://cdn.tanx.com/t/acookie/acbeacon2.html#m...
1608,1609,set,HTMLCanvasElement.width,,,400,13,1,http://sina.com.cn,,http://cdn.tanx.com/t/acookie/acbeacon2.html#m...
1665,1666,set,HTMLCanvasElement.height,,,60,13,1,http://sina.com.cn,,https://assets.alicdn.com/g/security/umscript/...
1666,1667,set,HTMLCanvasElement.width,,,400,13,1,http://sina.com.cn,,https://assets.alicdn.com/g/security/umscript/...


In [27]:
len(df.visit_id.value_counts())

1125