In [1]:
from bs4 import BeautifulSoup, SoupStrainer
from bs4.element import Comment
import requests
from datetime import datetime
import time
import json
import os
import unicodedata
import urllib.parse
import re
import pandas as pd
import tqdm.notebook as tq
import pickle

keyword_list = {'terms of use', 'privacy policy', 'privacy', 'terms', 'policy & safety',
                'policy and safety', 'cookie and privacy policy', 'cookie & privacy policy',
                'privacy policy and cookies', 'privacy policy & cookies', 'privacy and cookies policy',
                'privacy & cookies policy', 'privacy notice',
                'legal', 'cookie policy', 'terms of service & honor code',
                'terms of service and honor code', 'terms (updated)', 'privacy (updated)',
                'privacy and cookies', 'privacy & cookies', 'terms and conditions',
                'terms & conditions', 'ts&cs', 't&cs', 't&c' 'policies', 'our policies', 'cookies',
                'policy', 'legal notices', 'user agreement', 'site usage agreement', 'cookies statement'}

In [3]:
print(keyword_list)

{'cookie policy', 'terms of service and honor code', 'legal', 'user agreement', 'privacy policy', 'privacy (updated)', 'our policies', 'policy & safety', 'privacy & cookies', 'privacy policy & cookies', 'terms (updated)', 'cookie and privacy policy', 'terms & conditions', 'ts&cs', 'privacy and cookies', 'privacy', 'privacy policy and cookies', 'legal notices', 'policy and safety', 'privacy & cookies policy', 'terms of service & honor code', 'cookies statement', 'privacy notice', 'policy', 't&cs', 'terms and conditions', 'terms of use', 'terms', 't&cpolicies', 'cookies', 'cookie & privacy policy', 'site usage agreement', 'privacy and cookies policy'}


In [3]:
websites = pd.read_csv('/project/Data/majestic_million.csv',
                      usecols=['GlobalRank','Domain','TLD'])
websites.head()

Unnamed: 0,GlobalRank,Domain,TLD
0,1,google.com,com
1,2,facebook.com,com
2,3,youtube.com,com
3,4,twitter.com,com
4,5,instagram.com,com


In [4]:
websites = websites[(websites.GlobalRank < 10000) & (websites.TLD == 'com')]['Domain']
websites = list("https://" + websites)

In [6]:
import multiprocessing
from joblib import Parallel, delayed

num_cores = multiprocessing.cpu_count()

if __name__ == "__main__":
    corpus = Parallel(n_jobs=num_cores)(delayed(get_terms_from_website)(website)
                                         for website in tq.tqdm(websites))

corpus = list(set([p for terms in corpus for p in terms if p]))

pickle.dump(corpus, open("/project/Data/unlabelled_terms_medium.p", "wb"))

  0%|          | 0/5162 [00:00<?, ?it/s]

In [7]:
len(corpus)

32425

In [None]:
# corpus = [terms for website in tq.tqdm(websites) \
#           for terms in get_terms_from_website(website)]

# # remove duplicates
# corpus = list(set([paragraph for terms in corpus for paragraph in terms]))

# pickle.dump(corpus, open("/project/Data/unlabelled_terms_medium.p", "wb"))

  0%|          | 0/5162 [00:00<?, ?it/s]

In [5]:
def get_terms_from_website(website):
    '''
    Searches website for pages containing terms and conditions.
    Returns a list of the paragraphs on all such pages.
    '''
    try:
        html_text = requests.get(website, timeout=30).content
    except:
        return []
    
    soup = BeautifulSoup(html_text, 'lxml', parse_only=SoupStrainer('a'))
    terms_links = soup.find_all(lambda tag: tag.text.lower().strip() in keyword_list)
    terms_links = extract_hrefs(terms_links, website)
    
    filtered_pages = [p for link in terms_links
                      for p in filter_page(paragraphs_from_html(link))]
    
    #filtered_pages = [filter_page(paragraphs_from_html(link)) for link in terms_links]
    
    return [filtered_page for filtered_page in filtered_pages if filtered_page]
    

def paragraphs_from_html(link):
    '''
    Retrieves paragraphs from a html page.
    '''
    try:
        body = requests.get(link, timeout=30).content
    except:
        return []
    
    soup = BeautifulSoup(body, 'lxml')
    texts = soup.findAll('p', text=True) 
    return [unicodedata.normalize("NFKD", x.string).strip() for x in texts]

def filter_text(text):
    '''
    Function for ensuring quality texts are used. Returns True if text satisfies criteria,
    else returns False.
    '''
    
    # must have more than 10 words
#     if len(text.split(' ')) <= 10:
#         return False
    
    # must have at least 3 sentences
    if len([x for x in text.replace('?', '.').replace('!', '.').split('.') if x]) < 3:
        return False
    
    # must end in terminal punctuation mark
    if text.strip()[-1] not in '.!?':
        return False
    
    return True

def filter_page(page):
    '''
    Iterates over a page of paragraphs and removes paragraphs that do not
    satisfy the criteria in filter_text()
    '''
    return [p for p in page if filter_text(p)]
    
def extract_hrefs(anchors, domain):
    '''
    Returns a list of href attributes from a list of anchor tags.
    
    Ensures hrefs are absolute paths by checking for the absence of
    "http" in the href, since this suggests the href is only a relative
    path, e.g. "/legal/terms-of-use" as opposed to "https://example.com/legal/terms-of-use"
    If a relative path is detected, it is made absolute by prepending with `domain`, which
    should take form "https://example.com".
    '''
    hrefs = []
    for anchor in anchors:
        anchor = anchor.get('href')
        
        if not anchor:
            continue
            
        if "http" not in anchor:
            anchor = urllib.parse.urljoin(domain, anchor)
        hrefs.append(anchor)
    return hrefs