In [7]:
import os
import glob
import json
from tld import get_fld

In [13]:
# Constants: relative paths, resolved against the working directory at time of use
SRC_DIR = '../crawler_src'  # holds disconnectmeblocklist.json and domain_map.json
DATA_DIR = '../crawl_data'  # holds the *_desktop.json / *_mobile.json crawl files

In [10]:
# Load the datasets
# The working directory is switched to DATA_DIR so that the glob patterns
# below (and every later relative path) are resolved inside the data folder.
# NOTE(review): this is a process-wide side effect and is not idempotent --
# DATA_DIR is relative, so running this cell a second time resolves it against
# the already-changed directory. Restart the kernel before re-running.
os.chdir(DATA_DIR)

# Get desktop json files (file names relative to DATA_DIR, in glob's order)
data_json_desktop = glob.glob('*_desktop.json')

# Get mobile json files (file names relative to DATA_DIR, in glob's order)
data_json_mobile = glob.glob('*_mobile.json')

In [11]:
def init_data_object():
    """Build a fresh, empty accumulator for the crawl statistics.

    Returns:
        dict: A new dictionary on every call, containing an empty list for
        each per-site series and a ``failures`` sub-dictionary whose three
        counters start at zero.
    """
    failure_counters = dict.fromkeys(
        ('timeout_failures', 'TLS_failures', 'consent_failures'), 0
    )

    # Keys are inserted in the same order as they are displayed in the output.
    stats = {'longest_lifespan_cookies': []}
    stats['failures'] = failure_counters
    for series_name in (
        'page_load_times',
        'num_requests',
        'distinct_third_parties',
        'num_distinct_tracker_domains',
        'num_distinct_tracker_entities',
    ):
        stats[series_name] = []

    return stats

In [103]:
def parse_stupid_blocklist_to_something_readable(file_path):
    """Flatten a category-based blocklist into a per-entity list of domains.

    The JSON file is read as ``{'categories': {category: [{entity: {url:
    aliases}}, ...]}}``. For every entity, the URL key and each of its aliases
    are reduced to their first-level domain with ``get_fld`` and collected
    across all categories.

    Args:
        file_path: Path to the blocklist JSON file.

    Returns:
        dict: Maps each entity name to a sorted list of its unique first-level
        domains. An entity whose entries yield no parsable domain maps to an
        empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the top-level ``'categories'`` key is missing.
    """
    domains_by_entity = {}

    with open(file_path) as blocklist_file:
        blocklist = json.load(blocklist_file)

    for entities in blocklist['categories'].values():
        for entity_list in entities:
            for entity, url_objects in entity_list.items():
                for url, aliases in url_objects.items():
                    # A set removes duplicates as they arrive, across categories too.
                    entity_domains = domains_by_entity.setdefault(entity, set())

                    candidates = [url]
                    # Only real alias lists are expanded; a scalar value (for
                    # example a flag string) carries no domains and is skipped
                    # rather than being iterated character by character.
                    if isinstance(aliases, (list, tuple)):
                        candidates.extend(aliases)

                    for candidate in candidates:
                        try:
                            entity_domains.add(get_fld(candidate, fix_protocol=True))
                        except Exception:
                            # Best effort: entries that are not parsable as a
                            # domain are ignored. KeyboardInterrupt/SystemExit
                            # are deliberately no longer swallowed.
                            continue

    # Sorted lists make the result deterministic between runs.
    return {
        entity: sorted(entity_domains)
        for entity, entity_domains in domains_by_entity.items()
    }

In [113]:
def create_stats_object(json_files):
    """Aggregate crawl statistics over a collection of crawl result files.

    Every file is read as a JSON document with a ``failure_status`` object
    (keys ``timeout``, ``TLS`` and ``consent``), a ``load_time`` value and a
    ``requests`` list whose items carry a ``request_url``.

    Failure counters are updated for every file. The per-site lists receive
    one entry for every crawl that did not time out. The
    ``longest_lifespan_cookies`` list is not filled in by this function.

    Args:
        json_files: Iterable of paths to crawl result JSON files.

    Returns:
        dict: The structure created by ``init_data_object()``, populated as
        described above.

    Raises:
        OSError: If a crawl file or one of the reference files below SRC_DIR
            cannot be opened.
        KeyError: If a crawl file lacks one of the keys listed above.
    """
    data_object = init_data_object()
    failures = data_object['failures']

    # The reference data is the same for every crawl file, so it is loaded at
    # most once -- and only when the first crawl without a timeout needs it.
    tracker_domains = None
    domain_map_json = None

    for json_file in json_files:
        # Keep the file open only for as long as it takes to parse it.
        with open(json_file, 'r') as data_file:
            data = json.load(data_file)

        failure_status = data['failure_status']

        # Update failure counts
        if failure_status['timeout']:
            failures['timeout_failures'] += 1
        # A JSON null is loaded as None; the literal string 'null' is accepted
        # as "no TLS failure" as well, in case the crawler writes it that way.
        if failure_status['TLS'] not in (None, 'null'):
            failures['TLS_failures'] += 1
        if failure_status['consent']:
            failures['consent_failures'] += 1

        # Only proceed if there is no timeout
        if failure_status['timeout']:
            continue

        # Append page load time
        data_object['page_load_times'].append(data['load_time'])

        # Append number of requests
        data_object['num_requests'].append(len(data['requests']))

        # Append distinct third parties
        # NOTE(review): every requested domain is counted; no first-party
        # filtering is applied here -- confirm that this is intended.
        distinct_third_parties = {
            get_fld(request['request_url'], fix_protocol=True)
            for request in data['requests']
        }
        data_object['distinct_third_parties'].append(len(distinct_third_parties))

        # Append number of distinct tracker domains
        if tracker_domains is None:
            tracker_dict = parse_stupid_blocklist_to_something_readable(
                SRC_DIR + '/disconnectmeblocklist.json'
            )
            # One flat set: a domain listed under several entities must only
            # be counted once.
            tracker_domains = set()
            for domains in tracker_dict.values():
                tracker_domains.update(domains)
        data_object['num_distinct_tracker_domains'].append(
            len(distinct_third_parties & tracker_domains)
        )

        # Append number of distinct tracker entities/companies
        if domain_map_json is None:
            with open(SRC_DIR + '/domain_map.json') as domain_map_json_file:
                domain_map_json = json.load(domain_map_json_file)

        entities = set()
        for third_party_domain in distinct_third_parties:
            if third_party_domain in domain_map_json:
                entities.add(domain_map_json[third_party_domain]['entityName'])
            else:
                print('DOMAIN NOT FOUND IN ENTITY LIST, IMPLEMENT THIS WHEN YOU SEE THIS')
        data_object['num_distinct_tracker_entities'].append(len(entities))

    return data_object
                
# TODO: Remove -- temporary check: runs the aggregation on the desktop crawl
# files so that the resulting stats object is shown as this cell's output.
create_stats_object(data_json_desktop)

{'longest_lifespan_cookies': [],
 'failures': {'timeout_failures': 0, 'TLS_failures': 1, 'consent_failures': 0},
 'page_load_times': [0.0],
 'num_requests': [8],
 'distinct_third_parties': [2],
 'num_distinct_tracker_domains': [2],
 'num_distinct_tracker_entities': [1]}

In [123]:
# Desktop analysis