# Measuring Third Party Resource Inclusion Using WebXRay Domain Ownership List
- This notebook demonstrates basic use of the `http_requests` table in combination with the [WebXRay Domain Ownership List](https://github.com/timlib/webXray_Domain_Owner_List)
- Based on https://github.com/mozilla/openwpm-crawler/blob/master/analysis/Sample%20Analysis.ipynb and https://github.com/dreisman/WebCensusNotebook/blob/master/demo.ipynb


In [1]:
import re
import json
import sqlite3
import pandas as pd

In [2]:
# import some analysis utilities from https://github.com/englehardt/crawl_utils
import sys
sys.path.append('./crawl_utils/')
import domain_utils as du
import analysis_utils as au

In [3]:
def load_webxray_domain_ownership_list(webxray_json_path):
    """Map each domain in the WebXRay ownership list to its top-level owner.

    Args:
        webxray_json_path: Path to the WebXRay `domain_owners.json` file. The
            file must hold a JSON list of records with the keys `id`,
            `parent_id`, `owner_name` and `domains`.

    Returns:
        dict mapping each domain name to the `owner_name` of the topmost
        parent organization of the record that lists the domain.

    Raises:
        KeyError: if a record lacks one of the keys above, or a `parent_id`
            refers to an `id` that is not present in the list.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(webxray_json_path) as webxray_file:
        webxray_list = json.load(webxray_file)

    domain_orgs = {}  # `domain name` to (org) `id` mapping
    parent_orgs = {}  # (org) `id` to `parent_id` mapping
    org_names = {}  # (org) `id` to `owner_name` mapping
    domain_owners = {}  # domain to topmost parent organization name mapping

    for data in webxray_list:
        org_names[data["id"]] = data["owner_name"]
        if data["parent_id"]:  # a falsy `parent_id` marks a root organization
            parent_orgs[data["id"]] = data["parent_id"]
        for domain in data["domains"]:
            domain_orgs[domain] = data["id"]

    # `.items()` (unlike the Python 2-only `.iteritems()`) exists on both
    # Python 2 and Python 3 dicts.
    for domain, org_id in domain_orgs.items():
        domain_owners[domain] = org_names[get_topmost_parent(org_id, parent_orgs)]
    return domain_owners

In [4]:
def get_topmost_parent(org_id, parent_orgs):
    """Walk up the parent organizations dict and return the root organization id.

    Args:
        org_id: id of the organization to start from.
        parent_orgs: dict mapping an organization id to its parent's id; an id
            without an entry is treated as a root.

    Returns:
        The id of the topmost ancestor, or `org_id` itself if it has no parent.
        If the parent links form a cycle, the walk stops at the last id reached
        before an already-visited id would be revisited.
    """
    visited = {org_id}
    while org_id in parent_orgs:
        parent_id = parent_orgs[org_id]  # get the parent's id
        if parent_id in visited:
            # Cyclic (e.g. self-referential) parent links would otherwise
            # keep this loop running forever.
            break
        visited.add(parent_id)
        org_id = parent_id
    return org_id

### Load domain ownership mapping

In [5]:
# A local copy of `domain_owners.json` is required; get it from
# https://github.com/timlib/webXray_Domain_Owner_List/blob/master/domain_owners.json
domain_owners = load_webxray_domain_ownership_list(
    webxray_json_path="domain_owners.json"
)

In [6]:
# Override the owner recorded for the Atlas domains so that requests to them
# are attributed to Facebook.
ATLAS_DOMAINS = ["atdmt.com", "atlassbx.com"]
domain_owners.update(dict.fromkeys(ATLAS_DOMAINS, "Facebook"))

In [7]:
# use the sample sqlite for the 2018-06 stateless crawl
DB = 'sample_2018-06_1m_stateless_census_crawl.sqlite'
# Load the data
con = sqlite3.connect(DB)
# The row factory is set before the cursor is created; sqlite3.Row allows
# fetched rows to be indexed by column name as well as by position.
con.row_factory = sqlite3.Row
# NOTE(review): `cur` is not used by any cell visible in this section, and the
# connection is never closed here - confirm whether later cells rely on them.
cur = con.cursor()
# Read every column of every row of `http_requests` into a DataFrame.
reqs = pd.read_sql_query("SELECT * FROM http_requests", con)

In [8]:
# Derive the PS+1 (public suffix plus one label) for each of the URL columns.
# None values in `top_level_url` and `loading_href` are passed through as None.
reqs['url_ps1'] = reqs['url'].apply(du.get_ps_plus_1)
reqs['top_ps1'] = reqs['top_level_url'].apply(
    lambda url: None if url is None else du.get_ps_plus_1(url))
reqs['loading_ps1'] = reqs['loading_href'].apply(
    lambda url: None if url is None else du.get_ps_plus_1(url))

In [9]:
# Label every request with the owner of its PS+1; domains that are missing
# from the ownership mapping get an empty string.
reqs['domain_owner'] = reqs['url_ps1'].map(lambda ps1: domain_owners.get(ps1, ""))

### How many sites are Facebook resources loaded on?

In [10]:
# Count the distinct top-level URLs overall, then those with at least one
# request to a domain owned by Facebook.
total_sites = reqs['top_level_url'].nunique()

n_sites = reqs[reqs['domain_owner'] == 'Facebook'].top_level_url.nunique()
# A parenthesized single-argument print runs on both Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("%d sites (%0.1f%%)" % (n_sites, 100 * n_sites / float(total_sites)))

411 sites (43.5%)


### How many sites are Google resources loaded on?

In [11]:
# Google's domains are attributed to its parent organization, "Alphabet", in
# the `domain_owner` column, so that is the owner name to filter on.
n_sites = reqs[reqs['domain_owner'] == 'Alphabet'].top_level_url.nunique()
# A parenthesized single-argument print runs on both Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("%d sites (%0.1f%%)" % (n_sites, 100 * n_sites / float(total_sites)))

771 sites (81.7%)
