# Import packages

In [1]:
import pandas as pd
import numpy as np
import glob
import warnings
from bs4 import BeautifulSoup
from urllib.request import urlopen
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
%matplotlib inline
import os
import json
import re

# Define file paths

In [2]:
# INPUT

# root folder
data_root_dpath = os.path.join("..", "data", "OPP-115")

# annotations folder
annot_dpath = os.path.join(data_root_dpath, "annotations")

# policy collection metadata file
policy_collect_metadata_fpath = os.path.join(data_root_dpath, "documentation", "policies_opp115.csv")

# Website metadata file
site_metadata_fpath = os.path.join(data_root_dpath, "documentation", "websites_opp115.csv")

# Sanitized policies folder
sanitized_pol_dpath = os.path.join(data_root_dpath, "sanitized_policies")

# OUTPUT

# processed data folder
processed_data_dpath = os.path.join(data_root_dpath, "processed_data")

# processed annotation folder
op_annotations_dpath = os.path.join(processed_data_dpath, "processed_annotations")

# processed segments folder
op_segments_dpath = os.path.join(processed_data_dpath, "processed_segments")

# master annotations file
master_annotations_115_fpath = os.path.join(processed_data_dpath, "master_annotations_115.csv")

# master data for categorical models
master_catmodel_fpath = os.path.join(processed_data_dpath, "master_catmodel_fpath.csv")

# categy-wise split annotations folder (w/o parsed JSON attr)
catsplit_annotations_115_unparsed_dpath = os.path.join(processed_data_dpath, "catsplit_annotations_115_unparsed")

# categy-wise split annotations folder (w/ parsed JSON attr)
catsplit_annotations_115_parsed_dpath = os.path.join(processed_data_dpath, "catsplit_annotations_115_parsed")

# Process annotations

In [3]:
df_list = []

for fname in glob.glob(r"{}/*.csv".format(annot_dpath)):   
    
    #Extract path basename
    basename = os.path.basename(fname)

    #Create directories if they don't exist
    os.makedirs(op_annotations_dpath, exist_ok = True)
    os.makedirs(op_segments_dpath, exist_ok = True)
    
    #Extract policyID from basename
    policy_id = basename.split('_')[0]
    policy_df = pd.read_csv(fname, header=None, usecols=[0, 4, 5, 6], names=['annotation_ID', 'segment_ID', 'category', 'attr_val'])
    
    #Set policyID in each table
    policy_df.loc[:,"policy_ID"] = policy_id
    
    #Replace extension
    santized_policy_fpath = os.path.splitext(basename)[0]+'.html'
    
    # Parse html text
    html = open(os.path.join(sanitized_pol_dpath, santized_policy_fpath), "r").read()
    soup = BeautifulSoup(html, features="html.parser")
    soup_text = soup.get_text()
    
    #Match segments with their segment IDs for each policy
    segments = soup_text.split("|||")
    segments_df = pd.DataFrame(segments, columns = ["segment_text"])
    segments_df.index.name = "segment_ID"
    segments_df.reset_index(inplace = True)
    
    #Save processed segments
    segments_df.to_csv(os.path.join(op_segments_dpath, basename), index = False)
    
    policy_df_merged = policy_df.merge(segments_df, on='segment_ID', how = "inner")
    
    #Save processed policies
    policy_df_merged.to_csv(os.path.join(op_annotations_dpath, basename), index = False)
    
    df_list.append(policy_df_merged)
    
master_annotations_df = pd.concat(df_list, axis=0, ignore_index=True)
master_annotations_df.to_csv(master_annotations_115_fpath, index = False)
master_annotations_df[["segment_ID", "segment_text", "category"]].to_csv(master_annotations_115_fpath, index = False)

# Check for missing values

In [4]:
#Check for null values
assert(master_annotations_df.isnull().any(axis=1).any() == False)
#missing_data_rows = master_annotations_df.index[master_annotations_df.isnull().any(axis=1)]

# Split the master dataframe wrt the categories

In [5]:
# Groupby category and split
cat_dfs_list = [df for _, df in master_annotations_df.groupby('category')]

#Directory exist check
os.makedirs(catsplit_annotations_115_unparsed_dpath, exist_ok = True)

#Save them into corresponding .CSV files
for df in cat_dfs_list:
    assert(len(set(df.category)) == 1)
    category = '_'.join(next(iter(set(df.category))).replace("/", "-").split(" "))
    df.to_csv(os.path.join(catsplit_annotations_115_unparsed_dpath, "{}.csv".format(category)), index = False)

# Parse Attribute-value JSON data

In [6]:
for policy_df in cat_dfs_list:
    assert(len(set(policy_df.category)) == 1)
    category = '_'.join(next(iter(set(policy_df.category))).replace("/", "-").split(" "))
    os.makedirs(catsplit_annotations_115_parsed_dpath, exist_ok = True)

    cat_list_of_dict = []
    for index, row in policy_df.iterrows():
        attr_dict = json.loads(row["attr_val"])
        cat_list_of_dict.append({ k:v['value'] for k,v in attr_dict.items() })
    cat_df = pd.DataFrame(cat_list_of_dict)
    assert(cat_df.isnull().any(axis=1).any() == False)
    pd.concat((policy_df, cat_df), axis = 1).to_csv(os.path.join(catsplit_annotations_115_parsed_dpath, "{}.csv".format(category)), index = False)

# Pre-process site metadata file

In [7]:
site_metadata_df = pd.read_csv(site_metadata_fpath)
# manually added a us rank of 0 to a missing value for policy UID 745 

alexa_rank_global = []
alexa_rank_us = []
sectors = []

for index, row in site_metadata_df.iterrows():
    sector_lst = []
    alexa_rank_global.append(re.findall(r'\d+', row["Comments"])[0])
    alexa_rank_us.append(re.findall(r'\d+', row["Comments"])[1])
            
    sector_lst = list(set([row.iloc[i].split(":")[0] for i in range(7, site_metadata_df.shape[1]) if not row.iloc[i] != row.iloc[i]]))
    sectors.append(sector_lst)


pd.DataFrame({ 'site_name': site_metadata_df["Site Human-Readable Name"].values,
              'policy_ID': site_metadata_df["Policy UID"].values,
              'alexa_rank_global': alexa_rank_global,
              'alexa_rank_us': alexa_rank_us,
              'sectors': sectors
}).to_csv(os.path.join(processed_data_dpath, "site_metadata_115.csv"), index = False)


# Majority rule

In [79]:
df1 = pd.read_csv(os.path.join(processed_data_dpath, "master_catmodel_dataset_union.csv"))
df1

Unnamed: 0,segment_text,category
0,"Effective Date: May 7, 2015 Kraft Site Pr...",Other
1,"Effective Date: May 7, 2015 Kraft Site Pr...",Policy Change
2,Information We Collect Personally-Identif...,First Party Collection/Use
3,Registration To take full advantage of our...,First Party Collection/Use
4,"Contests, Sweepstakes and Games We may pro...",First Party Collection/Use
...,...,...
6353,We do not participate in tracking networks and...,Third Party Sharing/Collection
6354,We do not participate in tracking networks and...,First Party Collection/Use
6355,Does this Policy apply in other countries? ...,Other
6356,Does this Policy apply in other countries? ...,International and Specific Audiences


In [73]:
df = pd.read_csv(os.path.join(processed_data_dpath, "master_catmodel_dataset_majority.csv"))
df

Unnamed: 0,policy_ID,segment_ID,annotator_ID,segment_text,category
0,746,0,121,"Effective Date: May 7, 2015 Kraft Site Pr...",Other
1,746,0,116,"Effective Date: May 7, 2015 Kraft Site Pr...",Other
2,746,0,116,"Effective Date: May 7, 2015 Kraft Site Pr...",Policy Change
3,746,0,118,"Effective Date: May 7, 2015 Kraft Site Pr...",Other
4,746,1,121,Information We Collect Personally-Identif...,First Party Collection/Use
...,...,...,...,...,...
13934,1545,27,82,Does this Policy apply in other countries? ...,International and Specific Audiences
13935,1545,27,88,Does this Policy apply in other countries? ...,International and Specific Audiences
13936,1545,28,84,How can you contact us with questions about t...,Other
13937,1545,28,82,How can you contact us with questions about t...,Other


In [74]:
df.segment_ID = df.segment_ID.astype('str')
df.policy_ID = df.policy_ID.astype('str')
df['polID_segID_cat'] = df[['policy_ID','segment_ID', 'category']].agg('-'.join, axis=1)
df

Unnamed: 0,policy_ID,segment_ID,annotator_ID,segment_text,category,polID_segID_cat
0,746,0,121,"Effective Date: May 7, 2015 Kraft Site Pr...",Other,746-0-Other
1,746,0,116,"Effective Date: May 7, 2015 Kraft Site Pr...",Other,746-0-Other
2,746,0,116,"Effective Date: May 7, 2015 Kraft Site Pr...",Policy Change,746-0-Policy Change
3,746,0,118,"Effective Date: May 7, 2015 Kraft Site Pr...",Other,746-0-Other
4,746,1,121,Information We Collect Personally-Identif...,First Party Collection/Use,746-1-First Party Collection/Use
...,...,...,...,...,...,...
13934,1545,27,82,Does this Policy apply in other countries? ...,International and Specific Audiences,1545-27-International and Specific Audiences
13935,1545,27,88,Does this Policy apply in other countries? ...,International and Specific Audiences,1545-27-International and Specific Audiences
13936,1545,28,84,How can you contact us with questions about t...,Other,1545-28-Other
13937,1545,28,82,How can you contact us with questions about t...,Other,1545-28-Other


In [75]:
df_ct = df.groupby('polID_segID_cat').nunique()['annotator_ID']
lst = df_ct[df_ct >= 2].index
lst

Index(['1017-0-Other', '1017-1-First Party Collection/Use', '1017-10-Other',
       '1017-11-Other', '1017-11-User Access, Edit and Deletion',
       '1017-12-Other', '1017-2-Data Retention',
       '1017-3-First Party Collection/Use',
       '1017-4-International and Specific Audiences', '1017-4-Other',
       ...
       '995-0-Other', '995-0-Third Party Sharing/Collection',
       '995-1-First Party Collection/Use',
       '995-1-Third Party Sharing/Collection',
       '995-2-First Party Collection/Use',
       '995-2-Third Party Sharing/Collection',
       '995-3-User Access, Edit and Deletion',
       '995-4-International and Specific Audiences', '995-5-Other',
       '995-6-Other'],
      dtype='object', name='polID_segID_cat', length=4458)

In [78]:
df[df.polID_segID_cat.isin(lst)][["segment_text", "category"]].drop_duplicates()

Unnamed: 0,segment_text
0,"Effective Date: May 7, 2015 Kraft Site Pr..."
4,Information We Collect Personally-Identif...
7,Registration To take full advantage of our...
10,"Contests, Sweepstakes and Games We may pro..."
13,E-mail When you ask us to send you recipes...
...,...
13921,If you no longer wish to receive our announcem...
13926,What is our policy on tracking? You may be...
13930,We do not participate in tracking networks and...
13934,Does this Policy apply in other countries? ...


In [37]:
d = {}
for i,r in df.iterrows():
    if ((r['segment_ID'], r['category']) in d.keys()) and not (r['annotator_ID'] in d[(r['segment_ID'], r['category'])]):
        d[(r['segment_ID'], r['category'])].append(r['annotator_ID'])
    else:
        d[(r['segment_ID'], r['category'])] = [r['annotator_ID']]
d_len = {k:v for k,v in d.items() if len(set(v)) >= 2}

In [39]:
d

{(0, 'Other'): [84, 82, 88],
 (1, 'Other'): [84, 82, 88],
 (2, 'Policy Change'): [84, 82, 88],
 (2, 'Other'): [84, 82],
 (3, 'First Party Collection/Use'): [84, 82, 88],
 (4, 'Other'): [84],
 (4, 'First Party Collection/Use'): [82, 88],
 (5, 'First Party Collection/Use'): [84, 82, 88],
 (6, 'First Party Collection/Use'): [84, 82, 88],
 (6, 'Third Party Sharing/Collection'): [84, 82, 88],
 (7, 'First Party Collection/Use'): [84, 82, 88],
 (8, 'First Party Collection/Use'): [84, 82, 88],
 (9, 'First Party Collection/Use'): [84, 82, 88],
 (9, 'Third Party Sharing/Collection'): [84, 82, 88],
 (10, 'Third Party Sharing/Collection'): [84, 82, 88],
 (11, 'Third Party Sharing/Collection'): [84, 82, 88],
 (11, 'Other'): [88],
 (12, 'Do Not Track'): [84, 82, 88],
 (13, 'First Party Collection/Use'): [84, 82, 88],
 (14, 'Third Party Sharing/Collection'): [84],
 (14, 'First Party Collection/Use'): [82, 88],
 (15, 'Third Party Sharing/Collection'): [84, 82],
 (16, 'Third Party Sharing/Collection'):

# Elevate Other attr

In [81]:
df = pd.read_csv(master_annotations_115_fpath)
df

Unnamed: 0,annotation_ID,annotator_ID,segment_ID,category,attr_val,policy_ID,segment_text
0,13160,121,0,Other,"{""Other Type"": {""endIndexInSegment"": 575, ""sta...",746,"Effective Date: May 7, 2015 Kraft Site Pr..."
1,12377,116,0,Other,"{""Other Type"": {""endIndexInSegment"": 575, ""sta...",746,"Effective Date: May 7, 2015 Kraft Site Pr..."
2,12378,116,0,Policy Change,"{""Change Type"": {""endIndexInSegment"": 36, ""sta...",746,"Effective Date: May 7, 2015 Kraft Site Pr..."
3,11991,118,0,Other,"{""Other Type"": {""endIndexInSegment"": 575, ""sta...",746,"Effective Date: May 7, 2015 Kraft Site Pr..."
4,13161,121,1,First Party Collection/Use,"{""Collection Mode"": {""endIndexInSegment"": -1, ...",746,Information We Collect Personally-Identif...
...,...,...,...,...,...,...,...
23189,2953,82,27,International and Specific Audiences,"{""Audience Type"": {""endIndexInSegment"": 740, ""...",1545,Does this Policy apply in other countries? ...
23190,3119,88,27,International and Specific Audiences,"{""Audience Type"": {""endIndexInSegment"": 740, ""...",1545,Does this Policy apply in other countries? ...
23191,2393,84,28,Other,"{""Other Type"": {""endIndexInSegment"": 303, ""sta...",1545,How can you contact us with questions about t...
23192,2952,82,28,Other,"{""Other Type"": {""endIndexInSegment"": 303, ""sta...",1545,How can you contact us with questions about t...


In [83]:
import json
for i,r in df.iterrows():
    if r['category'] == "Other":
        attr_dict = json.loads(r["attr_val"])
        df.iloc[i, 3] = attr_dict["Other Type"]["value"]
df

Unnamed: 0,annotation_ID,annotator_ID,segment_ID,category,attr_val,policy_ID,segment_text
0,13160,121,0,Introductory/Generic,"{""Other Type"": {""endIndexInSegment"": 575, ""sta...",746,"Effective Date: May 7, 2015 Kraft Site Pr..."
1,12377,116,0,Introductory/Generic,"{""Other Type"": {""endIndexInSegment"": 575, ""sta...",746,"Effective Date: May 7, 2015 Kraft Site Pr..."
2,12378,116,0,Policy Change,"{""Change Type"": {""endIndexInSegment"": 36, ""sta...",746,"Effective Date: May 7, 2015 Kraft Site Pr..."
3,11991,118,0,Introductory/Generic,"{""Other Type"": {""endIndexInSegment"": 575, ""sta...",746,"Effective Date: May 7, 2015 Kraft Site Pr..."
4,13161,121,1,First Party Collection/Use,"{""Collection Mode"": {""endIndexInSegment"": -1, ...",746,Information We Collect Personally-Identif...
...,...,...,...,...,...,...,...
23189,2953,82,27,International and Specific Audiences,"{""Audience Type"": {""endIndexInSegment"": 740, ""...",1545,Does this Policy apply in other countries? ...
23190,3119,88,27,International and Specific Audiences,"{""Audience Type"": {""endIndexInSegment"": 740, ""...",1545,Does this Policy apply in other countries? ...
23191,2393,84,28,Privacy contact information,"{""Other Type"": {""endIndexInSegment"": 303, ""sta...",1545,How can you contact us with questions about t...
23192,2952,82,28,Privacy contact information,"{""Other Type"": {""endIndexInSegment"": 303, ""sta...",1545,How can you contact us with questions about t...


In [84]:
df[["category"]].drop_duplicates()

Unnamed: 0,category
0,Introductory/Generic
2,Policy Change
4,First Party Collection/Use
47,User Choice/Control
59,Third Party Sharing/Collection
67,Practice not covered
85,Privacy contact information
98,Data Retention
158,Do Not Track
209,International and Specific Audiences
