# Import packages

In [1]:
import pandas as pd
import numpy as np
import glob
import warnings
from bs4 import BeautifulSoup
from urllib.request import urlopen
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
%matplotlib inline
import os
import json
import re

# Define file paths

In [4]:
# ---------------------------------------------------------------------------
# INPUT locations (all relative to this notebook)
# ---------------------------------------------------------------------------

# Root folder of the OPP-115 dataset
data_root_dpath = os.path.join("..", "..", "data", "OPP-115")

# Folder with the raw per-policy annotation CSV files
annot_dpath = os.path.join(data_root_dpath, "annotations")

# Folder with the sanitized policy HTML files
sanitized_pol_dpath = os.path.join(data_root_dpath, "sanitized_policies")

# Both metadata CSV files live in the dataset's documentation folder
_documentation_dpath = os.path.join(data_root_dpath, "documentation")

# Policy collection metadata file
policy_collect_metadata_fpath = os.path.join(_documentation_dpath, "policies_opp115.csv")

# Website metadata file
site_metadata_fpath = os.path.join(_documentation_dpath, "websites_opp115.csv")

# ---------------------------------------------------------------------------
# OUTPUT locations (everything is written below processed_data)
# ---------------------------------------------------------------------------

# Root folder for all processed data
processed_data_dpath = os.path.join(data_root_dpath, "processed_data")

# Per-policy annotations joined with their segment text
op_annotations_dpath = os.path.join(processed_data_dpath, "processed_annotations")

# Per-policy segment tables (segment_ID, segment_text)
op_segments_dpath = os.path.join(processed_data_dpath, "processed_segments")

# Single CSV holding the annotations of all policies
master_annotations_115_fpath = os.path.join(processed_data_dpath, "master_annotations_115.csv")

# Category-wise split annotations folder (without parsed JSON attributes)
catsplit_annotations_115_unparsed_dpath = os.path.join(
    processed_data_dpath, "catsplit_annotations_115_unparsed"
)

# Category-wise split annotations folder (with parsed JSON attributes)
catsplit_annotations_115_parsed_dpath = os.path.join(
    processed_data_dpath, "catsplit_annotations_115_parsed"
)

# Process annotations

In [5]:
# Create the output directories once, up front, instead of on every iteration
os.makedirs(op_annotations_dpath, exist_ok=True)
os.makedirs(op_segments_dpath, exist_ok=True)

# For every raw annotation CSV: attach the policy ID, split the matching
# sanitized HTML policy into segments, save the segment table, and save the
# annotations joined with their segment text.
# Sorted so files are processed in a reproducible order (glob order is arbitrary).
for fname in sorted(glob.glob(os.path.join(annot_dpath, "*.csv"))):

    # File name without its directory, e.g. "<policyID>_<rest>.csv"
    basename = os.path.basename(fname)

    # The policy ID is the part of the file name before the first underscore
    policy_id = basename.split('_')[0]

    # The raw files have no header row; only columns 0, 4, 5 and 6 are kept
    policy_df = pd.read_csv(
        fname,
        header=None,
        usecols=[0, 4, 5, 6],
        names=['annotation_ID', 'segment_ID', 'category', 'attr_val'],
    )

    # Tag every annotation row with the policy it belongs to
    policy_df["policy_ID"] = policy_id

    # The sanitized policy shares the annotation file's name, with an .html extension
    sanitized_policy_fname = os.path.splitext(basename)[0] + '.html'

    # Read the policy HTML; the context manager closes the file handle promptly.
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used - confirm the policy files' encoding and pin it if needed.
    with open(os.path.join(sanitized_pol_dpath, sanitized_policy_fname), "r") as html_file:
        html = html_file.read()

    # Strip the markup and keep only the text
    soup = BeautifulSoup(html, features="html.parser")
    soup_text = soup.get_text()

    # Segments are delimited by "|||"; a segment's ID is its position in the split
    segments = soup_text.split("|||")
    segments_df = pd.DataFrame(segments, columns=["segment_text"])
    segments_df.index.name = "segment_ID"
    segments_df = segments_df.reset_index()

    # Save processed segments
    segments_df.to_csv(os.path.join(op_segments_dpath, basename), index=False)

    # Attach the segment text to each annotation (inner join on segment_ID)
    policy_df_merged = policy_df.merge(segments_df, on='segment_ID', how="inner")

    # Save processed policies
    policy_df_merged.to_csv(os.path.join(op_annotations_dpath, basename), index=False)

# Merge all site-wise annotations into a master dataframe

In [None]:
# Collect the processed per-policy annotation files. glob returns paths in
# arbitrary order, so sort them to make the master file's row order reproducible.
annotation_fpaths = sorted(glob.glob(os.path.join(op_annotations_dpath, "*.csv")))

# Load each per-policy table
df_list = [pd.read_csv(fpath) for fpath in annotation_fpaths]

# Stack all policies into one dataframe with a fresh 0..n-1 index and save it
master_annotations_df = pd.concat(df_list, axis=0, ignore_index=True)
master_annotations_df.to_csv(master_annotations_115_fpath, index=False)

# Check for missing values

In [None]:
# Boolean mask of the rows that have a missing value in any column
rows_with_nulls = master_annotations_df.isnull().any(axis=1)

# The master dataframe is expected to be complete; report how many rows are not
assert not rows_with_nulls.any(), (
    "{} row(s) of master_annotations_df contain missing values".format(rows_with_nulls.sum())
)

# Split the master dataframe by category

In [None]:
# One dataframe per annotation category
cat_dfs_list = [group for _key, group in master_annotations_df.groupby('category')]

# Make sure the destination folder exists before writing into it
os.makedirs(catsplit_annotations_115_unparsed_dpath, exist_ok=True)

# Write each category's rows to "<category>.csv". "/" becomes "-" and spaces
# become "_" so the category name can be used as a file name.
for df in cat_dfs_list:
    labels = df.category.unique()
    assert len(labels) == 1
    category = labels[0].replace("/", "-").replace(" ", "_")
    out_fpath = os.path.join(catsplit_annotations_115_unparsed_dpath, "{}.csv".format(category))
    df.to_csv(out_fpath, index=False)

# Parse Attribute-value JSON data

In [None]:
# Create the output folder once rather than on every iteration
os.makedirs(catsplit_annotations_115_parsed_dpath, exist_ok=True)

# For every category file: expand the JSON in "attr_val" into one column per
# attribute and save the widened table under the same category name.
for fname in sorted(glob.glob(os.path.join(catsplit_annotations_115_unparsed_dpath, "*.csv"))):
    policy_df = pd.read_csv(fname)

    # Each file must hold exactly one category; its name is made file-safe
    # ("/" -> "-", " " -> "_") to build the output file name
    categories = policy_df.category.unique()
    assert len(categories) == 1
    category = categories[0].replace("/", "-").replace(" ", "_")

    # Each attr_val cell is a JSON object mapping attribute name -> details;
    # only the 'value' entry of each attribute is kept
    cat_list_of_dict = [
        {attr: details['value'] for attr, details in json.loads(raw_attr).items()}
        for raw_attr in policy_df["attr_val"]
    ]
    cat_df = pd.DataFrame(cat_list_of_dict)

    # Every row of a category is expected to carry the same set of attributes
    assert not cat_df.isnull().any(axis=1).any(), (
        "missing attribute values after parsing category '{}'".format(category)
    )

    # Append the attribute columns to the original columns; rows line up by position
    pd.concat((policy_df, cat_df), axis=1).to_csv(
        os.path.join(catsplit_annotations_115_parsed_dpath, "{}.csv".format(category)),
        index=False,
    )

# Pre-process site metadata file

In [None]:
# Load the website metadata before previewing it, so this cell also works on a
# fresh kernel (Restart & Run All) instead of relying on leftover state
site_metadata_df = pd.read_csv(site_metadata_fpath)
site_metadata_df.head()

In [None]:
# Build a compact site metadata table (name, policy ID, Alexa ranks, sectors)
# from the raw website metadata file and save it as site_metadata_115.csv.
site_metadata_df = pd.read_csv(site_metadata_fpath)
# NOTE: a US rank of 0 was manually added to the input file for policy UID 745,
# whose value was missing. Without it, the rank extraction below raises
# IndexError for that row.

# Position of the first sector column.
# NOTE(review): presumably columns 0-6 are fixed metadata and every later
# column is a sector label - confirm against the CSV header.
first_sector_col = 7

alexa_rank_global = []
alexa_rank_us = []
sectors = []

for _, row in site_metadata_df.iterrows():
    # The first two integers in "Comments" are taken as the global and the
    # US Alexa rank, in that order; they are kept as strings
    rank_tokens = re.findall(r'\d+', row["Comments"])
    alexa_rank_global.append(rank_tokens[0])
    alexa_rank_us.append(rank_tokens[1])

    # Keep the text before the first ":" of every non-missing sector cell,
    # de-duplicated. Sorted so the saved list is identical on every run
    # (plain set iteration order for strings varies between interpreter runs).
    sector_cells = row.iloc[first_sector_col:]
    sector_lst = sorted({cell.split(":")[0] for cell in sector_cells if pd.notna(cell)})
    sectors.append(sector_lst)


pd.DataFrame({ 'site_name': site_metadata_df["Site Human-Readable Name"].values,
              'policy_ID': site_metadata_df["Policy UID"].values,
              'alexa_rank_global': alexa_rank_global,
              'alexa_rank_us': alexa_rank_us,
              'sectors': sectors
}).to_csv(os.path.join(processed_data_dpath, "site_metadata_115.csv"), index = False)
