In [7]:
from policy_processing import *
import sqlite3
import pandas as pd
import difflib
from cleantext import clean
import nltk
from tqdm import trange

# Helper functions to create the HTML files

In [42]:
def create_html_collection(documents, site_nr, domain, wc=80):
    """Write HTML diffs of consecutive documents into one collection file.

    Every document is split into sentences with ``nltk.sent_tokenize``; each
    adjacent pair ``(documents[i], documents[i + 1])`` is rendered with
    ``difflib.HtmlDiff.make_file`` and preceded by an ``<h1>`` holding
    ``domain``. All sections are written, in order, to
    ``HTML_DIFFS/policy_{site_nr}_collection.html``.

    The file is rewritten from scratch on every call, so re-running the
    notebook does not pile duplicate diffs onto an existing file.

    Parameters
    ----------
    documents : sequence of str
        Texts to compare, in the order in which they should be diffed.
    site_nr
        Value interpolated into the output file name.
    domain
        Text placed in the heading above every diff.
    wc : int, default 80
        Wrap column handed to ``difflib.HtmlDiff``.

    Returns
    -------
    str
        The HTML section of the last compared pair, or ``""`` when there are
        fewer than two documents (in which case no file is written).
    """
    import os  # local import: keeps this block independent of the file-level imports

    # Nothing to compare: leave the file system untouched.
    if len(documents) < 2:
        return ""

    file_name = f"HTML_DIFFS/policy_{site_nr}_collection.html"

    # Tokenize each document once; interior versions take part in two diffs.
    sentence_lists = [nltk.sent_tokenize(document) for document in documents]

    differ = difflib.HtmlDiff(wrapcolumn=wc)
    sections = []
    for version, (older, newer) in enumerate(zip(sentence_lists, sentence_lists[1:])):
        # NOTE: make_file returns a complete HTML document per comparison, so
        # the collection is a concatenation of such documents (as before).
        sections.append(
            f"<h1>{domain}</h1>"
            + differ.make_file(fromlines=older,
                               tolines=newer,
                               fromdesc=f"version {version}",
                               todesc=f"version {version + 1}")
        )

    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    # "w" rather than "a": one write per call keeps repeated runs idempotent.
    # utf-8 matches the charset make_file declares in its <meta> tag by default.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write("".join(sections))

    return sections[-1]

# Read in and clean the Policy Data

In [3]:
def cleaning_func(text):
    """Run ``clean`` on ``text`` with the settings used for the policy texts.

    Only Unicode repair and ASCII transliteration are switched on. Casing,
    URLs, e-mails, phone numbers, numbers, digits, currency symbols and
    punctuation are kept, and line breaks are normalized rather than stripped.
    The ``replace_with_*`` tokens are still passed explicitly; presumably they
    are unused while the matching ``no_*`` flag is False.
    """
    return clean(
        text,
        # --- repairs that are enabled ---
        fix_unicode=True,            # fix various unicode errors
        to_ascii=True,               # transliterate to closest ASCII representation
        # --- transformations that stay disabled ---
        lower=False,                 # keep the original casing
        no_line_breaks=False,        # only normalize line breaks, do not strip them
        no_urls=False,               # keep URLs
        no_emails=False,             # keep email addresses
        no_phone_numbers=False,      # keep phone numbers
        no_numbers=False,            # keep numbers
        no_digits=False,             # keep digits
        no_currency_symbols=False,   # keep currency symbols
        no_punct=False,              # keep punctuation
        # --- replacement tokens ---
        replace_with_punct="",
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en",
    )

In [10]:
# Load the whole small10k table into memory. The connection is opened by this
# cell, so it is closed again as soon as the frame has been read.
conn = sqlite3.connect('../datasets/small10k.sqlite')
try:
    df = pd.read_sql("SELECT * FROM small10k", con=conn)
finally:
    conn.close()

# Distinct site ids in order of first appearance. Going through set() gave an
# arbitrary order, so "the first N sites" could differ between kernel sessions.
all_site_ids = df.site_id.unique().tolist()

# How many sites get an HTML diff collection in the loop below.
number_of_files_to_create = 20

In [45]:
# Never ask for more sites than the dataset contains; indexing past the end of
# all_site_ids would raise IndexError.
files_to_create = min(number_of_files_to_create, len(all_site_ids))

for i in trange(files_to_create):

    site_id = all_site_ids[i]

    # sort first by year, then by phase
    # NOTE(review): the sort is loop-invariant and could be hoisted, provided
    # create_data does not modify the frame it receives — confirm before moving.
    data = create_data(df.sort_values(by=['year', 'phase']), site_id)

    # get the actual strings
    policy_texts = get_policy_texts(data)

    # cleaned documents using the cleaning function defined above
    documents = clean_text(policy_texts, cleaning_func)

    # the first row's domain is used as the heading of every diff in the file
    create_html_collection(documents, site_id, data["domain"].values[0], wc=90)

100%|██████████| 20/20 [00:05<00:00,  3.60it/s]


In [38]:
# NOTE(review): `h` is not assigned in any cell visible in this notebook, so
# this cell raises NameError after Restart & Run All. It looks like a leftover
# debugging check — confirm and remove.
h

''