# Canvas-Font Fingerprinting detection
- Based on https://github.com/mozilla/openwpm-crawler/blob/master/analysis/Sample%20Analysis.ipynb
- See our 2016 study for background on Canvas-Font Fingerprinting: https://webtransparency.cs.princeton.edu/webcensus/#canvas-font-fp

In [1]:
import re
import json
import sqlite3
import pandas as pd

In [2]:
# import some analysis utilities from https://github.com/englehardt/crawl_utils
import sys
sys.path.append('./crawl_utils/')
import domain_utils as du
import analysis_utils as au

In [3]:
# use the sample sqlite for the 2018-06 stateless crawl
DB = 'sample_2018-06_1m_stateless_census_crawl.sqlite'
# Open a connection to the crawl database. With sqlite3.Row as the row factory,
# rows fetched directly through this connection can be indexed by column name
# as well as by position.
con = sqlite3.connect(DB)
con.row_factory = sqlite3.Row

### Load JavaScript Calls

In [4]:
# Load every row of the `javascript` table into a DataFrame.
js = pd.read_sql_query("SELECT * FROM javascript", con)
# Format the message into a single string first so the line prints identically
# whether `print` is the Python 2 statement or the Python 3 function.
print("Number of javascript calls %d" % len(js))

Number of javascript calls 501207


In [5]:
# Derive a public-suffix-plus-one (PS+1) column from each URL column.
# Rows whose URL is None keep None instead of being passed to the helper.
for ps1_col, url_col in [
    ('script_ps1', 'script_url'),
    ('top_ps1', 'top_level_url'),
    ('document_ps1', 'document_url'),
]:
    js[ps1_col] = js[url_col].apply(
        lambda url: None if url is None else du.get_ps_plus_1(url)
    )

#### Canvas function calls

Filter the JavaScript calls down to `measureText` calls made by third-party scripts, then inspect them for signs of canvas-font fingerprinting.

In [6]:
# Rank script domains by the number of measureText calls they made on pages
# whose top-level PS+1 differs from the script's own PS+1 (third-party use).
is_measure_text = js['symbol'] == 'CanvasRenderingContext2D.measureText'
is_third_party = js['script_ps1'] != js['top_ps1']
(
    js[is_measure_text & is_third_party]
    .groupby('script_ps1')['top_ps1']
    .count()
    .sort_values(ascending=False)
)

script_ps1
mathtag.com                      2000
cloudflare.com                    295
musthird.com                      174
walmartimages.com                 102
targetimg1.com                     95
jrjimg.cn                          88
d309knd7es5f10.cloudfront.net      30
cdn-apple.com                      27
sndcdn.com                          2
cdnst.net                           1
cdn-net.com                         1
boxcdn.net                          1
Name: top_ps1, dtype: int64

In [7]:
# Regular expression for a CSS `font` shorthand string. Capture groups:
#   1: style       -- italic | oblique
#   2: variant     -- small-caps
#   3: weight      -- bold, bolder, lighter, or 100..900
#   4: size        -- a keyword (e.g. x-small, medium, larger) or a number
#                     followed by one of: % in cm em mm ex pc pt px
#   5: line-height -- the value after "/", when present (normal or number + unit)
#   6: family      -- the rest of the string, i.e. the font-family list
# Groups 1-3 are captured inside lookaheads, so those keywords may appear in
# any order among the leading tokens; the following non-capturing group then
# consumes up to three of them (or `normal`) before the size.
font_shorthand = re.compile(r"^\s*(?=(?:(?:[-a-z]+\s*){0,2}(italic|oblique))?)(?=(?:(?:[-a-z]+\s*){0,2}(small-caps))?)(?=(?:(?:[-a-z]+\s*){0,2}(bold(?:er)?|lighter|[1-9]00))?)(?:(?:normal|\1|\2|\3)\s*){0,3}((?:xx?-)?(?:small|large)|medium|smaller|larger|[.\d]+(?:\%|in|[cem]m|ex|p[ctx]))(?:\s*\/\s*(normal|[.\d]+(?:\%|in|[cem]m|ex|p[ctx])))?\s*([-_\{\}\(\)\&!\',\*\.\"\sa-zA-Z0-9]+?)\s*$")

In [8]:
# Distinct strings that mathtag.com scripts pass to measureText when running
# as a third party. `arguments` is stored as a JSON object; key "0" is
# presumably the first positional argument of the call.
mathtag_measure_calls = js[
    (js['symbol'] == 'CanvasRenderingContext2D.measureText')
    & (js['script_ps1'] != js['top_ps1'])
    & (js['script_ps1'] == 'mathtag.com')
]
mathtag_measure_calls['arguments'].apply(
    lambda raw_args: json.loads(raw_args)["0"]
).unique()

array([u'mmmmmmmmmmlli'], dtype=object)

In [9]:
def _font_family(font_value):
    """Return the font-family part of a CSS font shorthand string.

    Returns None when `font_value` is None or does not match `font_shorthand`
    (for example a bare keyword such as `inherit`), so a single unparseable
    value cannot abort the whole cell with an AttributeError/TypeError.
    """
    if font_value is None:
        return None
    parsed = font_shorthand.match(font_value)
    # Group 6 is the last capture group of `font_shorthand`: the family list.
    return parsed.group(6) if parsed else None


# Distinct font-family lists that mathtag.com scripts assign to the canvas
# `font` property when running as a third party.
js[
    (js.symbol == 'CanvasRenderingContext2D.font') &
    (js.script_ps1 != js.top_ps1) &
    (js.script_ps1 == 'mathtag.com')
].value.apply(_font_family).unique()

array([u'monospace', u'sans-serif', u'serif', u'AR DARLING,monospace',
       u'AR DARLING,sans-serif', u'AR DARLING,serif',
       u'Earwig Factory,monospace', u'Earwig Factory,sans-serif',
       u'Earwig Factory,serif', u'Minya Nouvelle,monospace',
       u'Minya Nouvelle,sans-serif', u'Minya Nouvelle,serif',
       u'Burnstown Dam,monospace', u'Burnstown Dam,sans-serif',
       u'Burnstown Dam,serif', u'Sybil Green,monospace',
       u'Sybil Green,sans-serif', u'Sybil Green,serif',
       u'Stereofidelic,monospace', u'Stereofidelic,sans-serif',
       u'Stereofidelic,serif', u'Urdu Typesetting,monospace',
       u'Urdu Typesetting,sans-serif', u'Urdu Typesetting,serif',
       u'Blue Highway Linocut,monospace',
       u'Blue Highway Linocut,sans-serif', u'Blue Highway Linocut,serif',
       u'Credit Valley,monospace', u'Credit Valley,sans-serif',
       u'Credit Valley,serif', u'Velvenda Cooler,monospace',
       u'Velvenda Cooler,sans-serif', u'Velvenda Cooler,serif',
       u'Muf