# Canvas-Font Fingerprinting detection
- Based on https://github.com/mozilla/openwpm-crawler/blob/master/analysis/Sample%20Analysis.ipynb
- See our 2016 study for background on Canvas-Font Fingerprinting: https://webtransparency.cs.princeton.edu/webcensus/#canvas-font-fp

In [1]:
import re
import json
import sqlite3
import pandas as pd

In [2]:
# import some analysis utilities from https://github.com/englehardt/crawl_utils
import sys
sys.path.append('./crawl_utils/')
import domain_utils as du
import analysis_utils as au

In [3]:
# use the sample sqlite for the 2018-06 stateless crawl
DB = '/home/marleensteinhoff/UNi/Projektseminar/Datenanalyse/sample_2018-06_1m_stateless_census_crawl.sqlite'
con = sqlite3.connect(DB)
con.row_factory = sqlite3.Row

### Load JavaScript Calls

In [4]:
js = pd.read_sql_query("SELECT * FROM javascript", con)
print("Number of javascript calls", len(js))

Number of javascript calls 501207


In [5]:
# Add the public suffix + 1 of a bunch of the URL columns
js['script_ps1'] = js['script_url'].apply(lambda x: du.get_ps_plus_1(x) if x is not None else None)
js['top_ps1'] = js['top_level_url'].apply(lambda x: du.get_ps_plus_1(x) if x is not None else None)
js['document_ps1'] = js['document_url'].apply(lambda x: du.get_ps_plus_1(x) if x is not None else None)

#### Canvas function calls

Filter JS calls and inspect for the use of canvas font fingerprinting

In [6]:
js[
    (js.symbol == 'CanvasRenderingContext2D.measureText') &
    (js.script_ps1 != js.top_ps1)
].groupby('script_ps1').top_ps1.count().sort_values(ascending=False)

script_ps1
mathtag.com                      2000
cloudflare.com                    295
musthird.com                      174
walmartimages.com                 102
targetimg1.com                     95
jrjimg.cn                          88
d309knd7es5f10.cloudfront.net      30
cdn-apple.com                      27
js                                 26
sndcdn.com                          2
boxcdn.net                          1
cdn-net.com                         1
cdnst.net                           1
Name: top_ps1, dtype: int64

In [7]:
font_shorthand = re.compile(r"^\s*(?=(?:(?:[-a-z]+\s*){0,2}(italic|oblique))?)(?=(?:(?:[-a-z]+\s*){0,2}(small-caps))?)(?=(?:(?:[-a-z]+\s*){0,2}(bold(?:er)?|lighter|[1-9]00))?)(?:(?:normal|\1|\2|\3)\s*){0,3}((?:xx?-)?(?:small|large)|medium|smaller|larger|[.\d]+(?:\%|in|[cem]m|ex|p[ctx]))(?:\s*\/\s*(normal|[.\d]+(?:\%|in|[cem]m|ex|p[ctx])))?\s*([-_\{\}\(\)\&!\',\*\.\"\sa-zA-Z0-9]+?)\s*$")

In [8]:
js[
    (js.symbol == 'CanvasRenderingContext2D.measureText') &
    (js.script_ps1 != js.top_ps1) & 
    (js.script_ps1 == 'mathtag.com')
].arguments.apply(lambda x: json.loads(x)["0"]).unique()

array(['mmmmmmmmmmlli'], dtype=object)

In [9]:
js[
    (js.symbol == 'CanvasRenderingContext2D.font') &
    (js.script_ps1 != js.top_ps1) & 
    (js.script_ps1 == 'mathtag.com')
].value.apply(lambda x: re.match(font_shorthand, x).group(6)).unique()

array(['monospace', 'sans-serif', 'serif', 'AR DARLING,monospace',
       'AR DARLING,sans-serif', 'AR DARLING,serif',
       'Earwig Factory,monospace', 'Earwig Factory,sans-serif',
       'Earwig Factory,serif', 'Minya Nouvelle,monospace',
       'Minya Nouvelle,sans-serif', 'Minya Nouvelle,serif',
       'Burnstown Dam,monospace', 'Burnstown Dam,sans-serif',
       'Burnstown Dam,serif', 'Sybil Green,monospace',
       'Sybil Green,sans-serif', 'Sybil Green,serif',
       'Stereofidelic,monospace', 'Stereofidelic,sans-serif',
       'Stereofidelic,serif', 'Urdu Typesetting,monospace',
       'Urdu Typesetting,sans-serif', 'Urdu Typesetting,serif',
       'Blue Highway Linocut,monospace',
       'Blue Highway Linocut,sans-serif', 'Blue Highway Linocut,serif',
       'Credit Valley,monospace', 'Credit Valley,sans-serif',
       'Credit Valley,serif', 'Velvenda Cooler,monospace',
       'Velvenda Cooler,sans-serif', 'Velvenda Cooler,serif',
       'Mufferaw,monospace', 'Mufferaw,sans-s

In [83]:
print(js[(js.symbol == 'CanvasRenderingContext2D.font') & (js.script_ps1 != js.top_ps1)])
test=js[(js.symbol == 'CanvasRenderingContext2D.font') & (js.script_ps1 != js.top_ps1)]

            id  crawl_id  visit_id  \
1144      1145         3         3   
1174      1175         3         3   
1182      1183         3         3   
1194      1195         3         3   
1224      1225         3         3   
...        ...       ...       ...   
492372  492373         2       971   
492404  492405         2       971   
495009  495010         4       985   
495012  495013         4       985   
497272  497273        13       997   

                                               script_url script_line  \
1144    https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/y...          19   
1174    https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/y...          19   
1182    https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/y...          19   
1194    https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/y...          19   
1224    https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/y...          19   
...                                                   ...         ...   
492372  https://cdn.doubleverify

In [87]:
test.groupby(['script_ps1', 'top_ps1', 'value']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,crawl_id,visit_id,script_url,script_line,script_col,func_name,script_loc_eval,document_url,top_level_url,call_stack,symbol,operation,arguments,time_stamp,document_ps1
script_ps1,top_ps1,value,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
,fanpage.gr,600 32px Arial,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2
360buyimg.com,jd.com,11pt no-real-font-123,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2
360buyimg.com,jd.com,18pt Arial,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2
adsco.re,tamilrockers.gy,14px 'Arial',1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1
adtima.vn,zing.vn,11pt no-real-font-123,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
walmartimages.com,walmart.com,"400 normal 18px BogleWeb,Helvetica Neue,Helvetica,Arial,sans-serif",1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1
walmartimages.com,walmart.com,"400 normal 19px BogleWeb,Helvetica Neue,Helvetica,Arial,sans-serif",2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2
walmartimages.com,walmart.com,"400 normal 36px BogleWeb,Helvetica Neue,Helvetica,Arial,sans-serif",1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1
walmartimages.com,walmart.com,"400 normal 44px BogleWeb,Helvetica Neue,Helvetica,Arial,sans-serif",2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2


In [85]:
test.value.apply(lambda x: re.match(font_shorthand, x))

1144      <re.Match object; span=(0, 11), match='123px A...
1174      <re.Match object; span=(0, 11), match='123px A...
1182      <re.Match object; span=(0, 11), match='106px A...
1194      <re.Match object; span=(0, 11), match='118px A...
1224      <re.Match object; span=(0, 11), match='114px A...
                                ...                        
492372    <re.Match object; span=(0, 10), match='14px Ar...
492404    <re.Match object; span=(0, 10), match='14px Ar...
495009    <re.Match object; span=(0, 21), match='11pt no...
495012    <re.Match object; span=(0, 10), match='18pt Ar...
497272    <re.Match object; span=(0, 12), match="14px 'A...
Name: value, Length: 2723, dtype: object

In [88]:
def regex_value(row):
    s=re.match(font_shorthand, row['value'])
    print(s)
    return s

In [90]:

test['value_b'] = test.apply(lambda row: regex_value(row), axis=1)

<re.Match object; span=(0, 11), match='123px Arial'>
<re.Match object; span=(0, 11), match='123px Arial'>
<re.Match object; span=(0, 11), match='106px Arial'>
<re.Match object; span=(0, 11), match='118px Arial'>
<re.Match object; span=(0, 11), match='114px Arial'>
<re.Match object; span=(0, 11), match='108px Arial'>
<re.Match object; span=(0, 11), match='123px Arial'>
<re.Match object; span=(0, 11), match='123px Arial'>
<re.Match object; span=(0, 11), match='106px Arial'>
<re.Match object; span=(0, 11), match='118px Arial'>
<re.Match object; span=(0, 11), match='114px Arial'>
<re.Match object; span=(0, 11), match='108px Arial'>
<re.Match object; span=(0, 10), match='15px Arial'>
<re.Match object; span=(0, 10), match='15px Arial'>
<re.Match object; span=(0, 21), match='11pt no-real-font-123'>
<re.Match object; span=(0, 10), match='18pt Arial'>
<re.Match object; span=(0, 21), match='11pt no-real-font-123'>
<re.Match object; span=(0, 10), match='18pt Arial'>
<re.Match object; span=(0, 21)

<re.Match object; span=(0, 30), match='72px Architects Daughter,serif'>
<re.Match object; span=(0, 20), match='72px Taffy,monospace'>
<re.Match object; span=(0, 21), match='72px Taffy,sans-serif'>
<re.Match object; span=(0, 16), match='72px Taffy,serif'>
<re.Match object; span=(0, 28), match='72px Weltron Urban,monospace'>
<re.Match object; span=(0, 29), match='72px Weltron Urban,sans-serif'>
<re.Match object; span=(0, 24), match='72px Weltron Urban,serif'>
<re.Match object; span=(0, 26), match='72px 1942 report,monospace'>
<re.Match object; span=(0, 27), match='72px 1942 report,sans-serif'>
<re.Match object; span=(0, 22), match='72px 1942 report,serif'>
<re.Match object; span=(0, 18), match='72px Fat,monospace'>
<re.Match object; span=(0, 19), match='72px Fat,sans-serif'>
<re.Match object; span=(0, 14), match='72px Fat,serif'>
<re.Match object; span=(0, 30), match='72px Rod Transparent,monospace'>
<re.Match object; span=(0, 31), match='72px Rod Transparent,sans-serif'>
<re.Match objec

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['value_b'] = test.apply(lambda row: regex_value(row), axis=1)


In [91]:
test

Unnamed: 0,id,crawl_id,visit_id,script_url,script_line,script_col,func_name,script_loc_eval,document_url,top_level_url,call_stack,symbol,operation,value,arguments,time_stamp,script_ps1,top_ps1,document_ps1,value_b
1144,1145,3,3,https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/y...,19,1402,g.prototype.drawEmoji,,https://www.facebook.com/,https://www.facebook.com/,,CanvasRenderingContext2D.font,set,123px Arial,,2018-06-27T14:19:44.144Z,fbcdn.net,facebook.com,facebook.com,"<re.Match object; span=(0, 11), match='123px A..."
1174,1175,3,3,https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/y...,19,1086,g.prototype.drawText,,https://www.facebook.com/,https://www.facebook.com/,,CanvasRenderingContext2D.font,set,123px Arial,,2018-06-27T14:19:44.154Z,fbcdn.net,facebook.com,facebook.com,"<re.Match object; span=(0, 11), match='123px A..."
1182,1183,3,3,https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/y...,19,1402,g.prototype.drawEmoji,,https://www.facebook.com/,https://www.facebook.com/,,CanvasRenderingContext2D.font,set,106px Arial,,2018-06-27T14:19:44.158Z,fbcdn.net,facebook.com,facebook.com,"<re.Match object; span=(0, 11), match='106px A..."
1194,1195,3,3,https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/y...,19,1402,g.prototype.drawEmoji,,https://www.facebook.com/,https://www.facebook.com/,,CanvasRenderingContext2D.font,set,118px Arial,,2018-06-27T14:19:44.188Z,fbcdn.net,facebook.com,facebook.com,"<re.Match object; span=(0, 11), match='118px A..."
1224,1225,3,3,https://static.xx.fbcdn.net/rsrc.php/v3iYXl4/y...,19,1086,g.prototype.drawText,,https://www.facebook.com/,https://www.facebook.com/,,CanvasRenderingContext2D.font,set,114px Arial,,2018-06-27T14:19:44.210Z,fbcdn.net,facebook.com,facebook.com,"<re.Match object; span=(0, 11), match='114px A..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492372,492373,2,971,https://cdn.doubleverify.com/dv-measurements23...,353,385,g</a.prototype.Qs,,https://www.zerohedge.com/,https://www.zerohedge.com/,,CanvasRenderingContext2D.font,set,14px Arial,,2018-06-27T14:49:39.438Z,doubleverify.com,zerohedge.com,zerohedge.com,"<re.Match object; span=(0, 10), match='14px Ar..."
492404,492405,2,971,https://cdn.doubleverify.com/dv-measurements23...,353,385,g</a.prototype.Qs,,https://www.zerohedge.com/,https://www.zerohedge.com/,,CanvasRenderingContext2D.font,set,14px Arial,,2018-06-27T14:49:34.488Z,doubleverify.com,zerohedge.com,zerohedge.com,"<re.Match object; span=(0, 10), match='14px Ar..."
495009,495010,4,985,https://cdn.pulpix.com/static/pulpix.js,8,1958,pxLoadReq/</e.prototype.getCanvasFp,,https://www.lequipe.fr/,https://www.lequipe.fr/,,CanvasRenderingContext2D.font,set,11pt no-real-font-123,,2018-06-27T14:49:54.158Z,pulpix.com,lequipe.fr,lequipe.fr,"<re.Match object; span=(0, 21), match='11pt no..."
495012,495013,4,985,https://cdn.pulpix.com/static/pulpix.js,8,2137,pxLoadReq/</e.prototype.getCanvasFp,,https://www.lequipe.fr/,https://www.lequipe.fr/,,CanvasRenderingContext2D.font,set,18pt Arial,,2018-06-27T14:49:54.160Z,pulpix.com,lequipe.fr,lequipe.fr,"<re.Match object; span=(0, 10), match='18pt Ar..."


In [113]:
test.groupby(['script_ps1']).count().rename({"id": "count"}, axis='columns')[["count", "value_b"]]


Unnamed: 0_level_0,count,value_b
script_ps1,Unnamed: 1_level_1,Unnamed: 2_level_1
,2,2
360buyimg.com,4,4
adsco.re,1,1
adtima.vn,2,2
alicdn.com,56,56
answerscloud.com,2,2
areyouahuman.com,3,3
atlassbx.com,4,4
azureedge.net,2,2
boxcdn.net,1,1


In [112]:
test.groupby('script_ps1').top_ps1.count().sort_values(ascending=False)

script_ps1
mathtag.com                      2000
musthird.com                      177
jrjimg.cn                         106
targetimg1.com                     96
alicdn.com                         56
doubleverify.com                   39
d309knd7es5f10.cloudfront.net      33
tiqcdn.com                         32
cdn-apple.com                      27
fbcdn.net                          24
perimeterx.net                     10
static6.com                        10
consumable.com                      8
libertystmedia.com                  8
fqtag.com                           8
walmartimages.com                   8
d2fbkzyicji7c4.cloudfront.net       8
tokopedia.net                       6
foresee.com                         4
atlassbx.com                        4
360buyimg.com                       4
imedia.cz                           4
viafoura.net                        4
sndcdn.com                          3
guoshipartners.com                  3
areyouahuman.com                    3
r

In [80]:
import numpy as np
np.sum([(test["top_ps1"]!=test["document_ps1"]) & (test["symbol"]=="CanvasRenderingContext2D.font")])


213

In [82]:
np.sum(test["symbol"]=="CanvasRenderingContext2D.font")

2723