# Contrastive Keyword Extraction

This notebook implements the proposed Contrastive Keyword Extraction approach, which is depicted in the image below.

<img src="pipeline.png" alt="Overview of the Contrastive Keyword Extraction pipeline" />

In [1]:
import difflib
import os
import sqlite3
import pandas as pd
from news_processing import *
from keyword_extraction import *
import sentence_importance
from sentence_comparision import calculate_change_importances, match_sentences, detect_changes
import util
from sklearn.metrics.pairwise import cosine_similarity
from difflib import *
import nltk
import torch
import string
from tqdm import trange
import markdown
from cleantext import clean
from yake.highlight import *
import numpy as np
from sumy_text_rank import *



# Princeton-Leuven Longitudinal Corpus of Privacy Policies

Website: https://privacypolicies.cs.princeton.edu/

Used columns:
* site_id - index to find different versions of a policy
* year - publication year of this version
* phase - distinguishes multiple versions that were released in the same year
* policy_title
* policy_text - the actual policy
* length
* categories - set of categories that describe the site
* domain

In [5]:
# Read the whole `small10k` table into memory, then release the database handle.
conn = sqlite3.connect('../datasets/small10k.sqlite')
try:
    df = pd.read_sql("SELECT * FROM small10k", con=conn)
finally:
    # Using the connection as a context manager would only manage the
    # transaction, not close the handle, so close it explicitly.
    conn.close()

# site whose policy history is analysed in the single-site cells below
site_id = 98325

In [6]:
def create_data(df, sid):
    """Return the rows of ``df`` whose ``site_id`` equals ``sid``.

    The row order and index labels of ``df`` are kept as they are.
    """
    belongs_to_site = df.site_id == sid
    return df[belongs_to_site]

In [7]:
# sort first by year, then by phase, and keep only the rows of `site_id`;
# the last line displays the selected rows
data = create_data(df.sort_values(by=['year', 'phase']), site_id)
data

Unnamed: 0,site_id,year,phase,policy_title,policy_text,length,categories,domain
4263,98325,2009,B,Privacy Policy - Pay Per Click (PPC) & Bid Opt...,"SearchIgnite, Inc. (“Searchignite”), a wholly ...",12116,business,searchignite.com
4261,98325,2011,A,Privacy Policy - Pay Per Click (PPC) & Bid Opt...,"LAST UPDATED: June 9, 2010\n\nWe, SearchIgnite...",14308,business,searchignite.com
4259,98325,2011,B,Privacy Policy,"LAST UPDATED: July 14, 2011\n\nWe, IgnitionOne...",14368,business,searchignite.com
4260,98325,2012,A,Privacy Policy,"LAST UPDATED: April 4, 2012\n\nIgnitionOne, In...",14924,business,searchignite.com
4262,98325,2012,B,Privacy Policy,"LAST UPDATED: July 3, 2012\n\nIgnitionOne, Inc...",18403,business,searchignite.com


In [8]:
def get_policy_texts(data):
    """Collect the ``policy_text`` column of ``data`` into a plain list."""
    text_column = data['policy_text']
    return [text for text in text_column]

In [9]:
# raw policy strings, one per row of `data` and in the same order
policy_texts = get_policy_texts(data)

# Clean the Policy Text

In [10]:
def cleaning_func(text):
    """Clean one policy text with ``cleantext.clean``.

    Only unicode fixing and ASCII transliteration are switched on; every
    other flag is passed as ``False``. The ``replace_with_*`` tokens are
    still passed along for the options they belong to.

    Args:
        text: the raw policy text.

    Returns:
        Whatever ``clean`` returns for ``text`` with the options below.
    """
    return clean(
        text,
        fix_unicode=True,               # fix various unicode errors
        to_ascii=True,                  # transliterate to closest ASCII representation
        lower=False,                    # do NOT lowercase the text
        no_line_breaks=False,           # only normalize line breaks, do not strip them
        no_urls=False,                  # do not replace URLs with a special token
        no_emails=False,                # do not replace email addresses with a special token
        no_phone_numbers=False,         # do not replace phone numbers with a special token
        no_numbers=False,               # do not replace numbers with a special token
        no_digits=False,                # do not replace digits with a special token
        no_currency_symbols=False,      # do not replace currency symbols with a special token
        no_punct=False,                 # do not remove punctuation
        replace_with_punct="",          # replacement used if punctuation were removed
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en"
    )

In [11]:
def clean_text(documents, cleaning_function):
    """Apply ``cleaning_function`` to every document and return the results.

    The returned list has one entry per input document, in input order.
    """
    return [cleaning_function(raw_document) for raw_document in documents]

In [12]:
# cleaned documents in chronological order (same order as `policy_texts`)
documents = clean_text(policy_texts, cleaning_func)

# Combine $I_S$ and $I_C$ into Final Score -> Keywords

In [13]:
def final_score(document, version,changed_indices, new_indices, matched_dict, ranking, I_c, max_ngram, 
                combinator=util.alpha_combination, k=0, alpha_gamma=0.5, min_ngram = 1):
    """Score the n-grams of all changed sentences of one document version.

    For every changed sentence the change importance ``I_c[i]`` and the
    sentence importance of its match are merged by ``combinator``; the n-gram
    frequencies of the matched sentence are weighted with that value and
    summed up per n-gram.

    Args:
        document: text of the version whose sentences are scored.
        version: key into ``ranking`` for this version.
        changed_indices: keys of ``matched_dict`` / ``I_c`` that were changed.
        new_indices: accepted for interface compatibility but currently
            unused; scoring of newly added sentences is not implemented here.
        matched_dict: maps a changed index to a sequence of
            ``(matched sentence index, score)`` pairs.
        ranking: per-version sentence importances, indexed by sentence index.
        I_c: change importance per changed index.
        max_ngram: passed to ``util.build_sentence_freqs_max_ngram`` as
            ``higher_ngram``.
        combinator: called as ``combinator(I_ci, I_si, alpha_gamma)``.
        k: which entry of ``matched_dict[i]`` to use.
        alpha_gamma: third argument handed to ``combinator``.
        min_ngram: passed on as ``lower_ngram``.

    Returns:
        dict mapping n-gram to its share of the total score, ordered from the
        highest to the lowest score. If the total score is zero the
        unnormalised scores are returned instead of dividing by zero.
    """
    sentences = nltk.sent_tokenize(document)

    I_s = ranking[version]

    keywords = {}

    # changed sentences
    for i in changed_indices:
        matched_idx, _ = matched_dict[i][k]
        # the index may not be a plain int, so cast it once and use the cast
        # value for every lookup (the sentence list needs a real integer)
        matched_idx = int(matched_idx)

        I_ci = I_c[i]
        I_si = I_s[matched_idx]

        s_c = combinator(I_ci, I_si, alpha_gamma)

        current_freqs = util.build_sentence_freqs_max_ngram(sentences[matched_idx],
                                                       higher_ngram=max_ngram, lower_ngram=min_ngram)

        for word, freq in current_freqs.items():
            keywords[word] = keywords.get(word, 0) + (freq * s_c)

    # total "importance count" used for normalisation
    total_count = sum(keywords.values())

    # highest score first
    ranked = sorted(keywords.items(), key=lambda item: item[1], reverse=True)

    if not total_count:
        # nothing to normalise by (no keywords at all, or all scores cancel out)
        return dict(ranked)

    return {word: weight / total_count for word, weight in ranked}

In [14]:
def contrastive_extraction(documents, max_ngram, min_ngram=1, 
                           importance_estimator= sentence_importance.text_rank_importance,
                           combinator=util.alpha_combination, threshold=0.6, top_k=1, alpha_gamma=0.5, 
                           matching_model='all-MiniLM-L6-v2', w0 = 3, w1 = 1, w2 = 1, show_changes=False):
    """Extract contrastive keywords for every pair of subsequent versions.

    Entry ``n`` of each returned dict belongs to the step from version ``n``
    to version ``n + 1``.

    Returns:
        A 5-tuple of dicts keyed by that step index:
        ``(keywords, matched sentences, changed sentence indices,
        additions, deletions)``.
    """
    version_count = len(documents)

    # rank all sentences in their respective version;
    # available estimators: text_rank_importance, yake_weighted_importance,
    # yake_unweighted_importance
    ranking = importance_estimator(documents)

    keyword_collection = {}
    changed_sentences = {}
    matched_dicts = {}
    additions = {}
    deletions = {}

    for older in trange(version_count - 1):
        newer = older + 1
        older_document = documents[older]
        newer_document = documents[newer]

        # match the sentences of both versions
        pair_matches = match_sentences(older_document, newer_document, k = top_k, model=matching_model)
        matched_dicts[older] = pair_matches

        # determine WHAT has changed
        detected = detect_changes(pair_matches, older_document, newer_document,
                                  important_indices=[], show_output=show_changes)
        changed_indices, new_indices, adds, delet = detected

        additions[older] = adds
        deletions[older] = delet
        changed_sentences[older] = changed_indices

        # determine HOW important the change was
        I_c = calculate_change_importances(changed_indices, pair_matches, ranking ,threshold,
                                           version=older, w0 = w0, w1 = w1, w2=w2)

        # keywords between the two subsequent versions
        keyword_collection[older] = final_score(newer_document, newer, changed_indices, new_indices,
                                                pair_matches, ranking, I_c, max_ngram, combinator,
                                                alpha_gamma=alpha_gamma, min_ngram= min_ngram)

    return keyword_collection, matched_dicts, changed_sentences, additions, deletions

In [15]:
# Run the extraction with unigrams and bigrams. Every returned dict is keyed by
# the step index n, i.e. entry n belongs to versions n and n+1.
keywords, matched_dicts, changed_sentences, added, deleted = contrastive_extraction(documents, max_ngram=2, min_ngram=1, 
                                                                    show_changes=False)

100%|██████████| 4/4 [00:25<00:00,  6.41s/it]


In [20]:
# one keyword dict per pair of subsequent versions (the output is long)
keywords.values()

dict_values([{'information': tensor(0.0302), 'personal': tensor(0.0243), 'personal information': tensor(0.0243), 'searchignite': tensor(0.0193), 'privacy': tensor(0.0180), 'use': tensor(0.0176), 'inc': tensor(0.0173), 'searchignite inc': tensor(0.0173), 'collect': tensor(0.0131), 'disclose': tensor(0.0113), 'disclose personal': tensor(0.0113), 'may': tensor(0.0112), 'last': tensor(0.0101), 'updated': tensor(0.0100), 'last updated': tensor(0.0100), 'june': tensor(0.0099), '9': tensor(0.0099), '2010': tensor(0.0099), 'concerned': tensor(0.0099), 'issues': tensor(0.0099), 'want': tensor(0.0099), 'familiar': tensor(0.0099), 'defined': tensor(0.0099), 'updated june': tensor(0.0099), 'june 9': tensor(0.0099), '9 2010': tensor(0.0099), '2010 searchignite': tensor(0.0099), 'inc concerned': tensor(0.0099), 'concerned privacy': tensor(0.0099), 'privacy issues': tensor(0.0099), 'issues want': tensor(0.0099), 'want familiar': tensor(0.0099), 'familiar collect': tensor(0.0099), 'collect use': tenso

In [23]:
def combine_keywords(keywords):
    """Average the per-step keyword scores into one overall ranking.

    Every score is divided by the number of entries in ``keywords`` and the
    shares of equal keywords are added up. The result is ordered from the
    highest to the lowest combined score.
    """
    step_count = len(keywords)
    combined = {}

    for step in keywords:
        for term, weight in keywords[step].items():
            share = weight / step_count
            combined[term] = combined.get(term, 0) + share

    by_score_descending = sorted(combined.items(), key=lambda pair: pair[1], reverse=True)
    return dict(by_score_descending)

In [None]:
# merge the per-step keyword dicts into one ranking, highest score first
total_keywords = combine_keywords(keywords)

In [None]:
# two-column table (keyword, score) in the order of `total_keywords`
kw_frame = pd.DataFrame({'keyword' : total_keywords.keys(), 'score': total_keywords.values()})

In [22]:
# one row per pair of subsequent versions: its keyword dict and the indices of
# the changed sentences
k = pd.DataFrame({'keywords': keywords.values(), 'changed': changed_sentences.values()})
k

Unnamed: 0,keywords,changed
0,"{'information': tensor(0.0302), 'personal': te...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,"{'services': tensor(0.0195), 'information': te...","[0, 1, 2, 28, 29, 30, 31, 32, 33, 34, 35, 37, ..."
2,"{'information': tensor(0.0302), 'site': tensor...","[0, 1, 2, 15, 37]"
3,"{'information': tensor(0.0275), 'privacy': ten...","[0, 13, 17, 28, 69, 74, 81, 91, 98]"


# Save Keywords to csv file

In [None]:
# written to the current working directory; the row index is not stored
kw_frame.to_csv(f"keywords_{site_id}.csv", index=False)

In [None]:
# read the file back and show its first ten rows
pd.read_csv(f"keywords_{site_id}.csv").head(10)

In [None]:
# Kept under its own name: `df` is the policy corpus, and the cells below still
# read `df.site_id`, `year` and `phase` from it.
keywords_31160 = pd.read_csv("keywords_31160.csv")

# create collection

In [24]:
# sort first by year, then by phase; the sort is the same for every site, so do it once
policies_sorted = df.sort_values(by=['year', 'phase'])

all_site_ids = list(set(df.site_id))

# never ask for more sites than the corpus contains
number_of_files_to_create = min(20, len(all_site_ids))

# make sure the target directory exists before the first file is written
os.makedirs("dataframes", exist_ok=True)

for i in trange(number_of_files_to_create):

    # Loop-local names on purpose: the single-site variables (`site_id`,
    # `documents`, `keywords`, `changed_sentences`, ...) are still used by the
    # cells further down and must not be overwritten here.
    current_site_id = all_site_ids[i]

    site_data = create_data(policies_sorted, current_site_id)

    # get the actual strings
    site_policy_texts = get_policy_texts(site_data)

    # cleaned documents using above function
    site_documents = clean_text(site_policy_texts, cleaning_func)

    (site_keywords, site_matched_dicts, site_changed_sentences,
     site_added, site_deleted) = contrastive_extraction(site_documents, max_ngram=2,
                                                        min_ngram=1, show_changes=False)

    # combine keywords
    site_total_keywords = combine_keywords(site_keywords)

    total_frame = pd.DataFrame({'keyword' : site_total_keywords.keys(),
                                'score': site_total_keywords.values()})

    # save total
    total_frame.to_csv(f"dataframes/keywords_{current_site_id}.csv", index=False)

    # save intermediate
    intermediate_frame = pd.DataFrame({'keywords': site_keywords.values(),
                                       'changed': site_changed_sentences.values(),
                                       'added': site_added.values(),
                                       'deleted': site_deleted.values()})

    intermediate_frame.to_csv(f"dataframes/inter_keywords_{current_site_id}.csv", index=False)

  0%|          | 0/20 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
  5%|▌         | 1/20 [00:00<00:04,  4.62it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:03<00:07,  3.52s/it][A
 67%|██████▋   | 2/3 [00:07<00:03,  3.63s/it][A
100%|██████████| 3/3 [00:11<00:00,  3.78s/it][A
 10%|█         | 2/20 [00:11<02:03,  6.88s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:04<00:04,  4.80s/it][A
100%|██████████| 2/2 [00:09<00:00,  4.62s/it][A
 15%|█▌        | 3/20 [00:21<02:19,  8.21s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:07<00:22,  7.35s/it][A
 50%|█████     | 2/4 [00:14<00:14,  7.05s/it][A
 75%|███████▌  | 3/4 [00:20<00:06,  6.71s/it][A
100%|██████████| 4/4 [00:26<00:00,  6.68s/it][A
 20%|██        | 4/20 [00:50<04:24, 16.55s/it]
0it [00:00, ?it/s][A
 25%|██▌       | 5/20 [00:51<02:42, 10.82s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:04<00:00,  4.64s/it][A
 30%|███       | 6/20 [00:56

# Show Keywords in Context

In [None]:
import markdown
from IPython.core.display import HTML

In [None]:
def top_keywords(keywords, n):
    """Return the first ``n`` entries of ``keywords`` in their stored order.

    If ``keywords`` has fewer than ``n`` entries, all of them are returned.
    """
    limit = min(len(keywords), n)

    selected = {}
    for key in list(keywords)[:limit]:
        selected[key] = keywords[key]

    return selected

In [None]:
def colour_map(keywords, n):
    """Assign every keyword one of ``n`` evenly spaced values in [100, 255].

    The keyword at position ``p`` (in the dict's order) receives the ``p``-th
    value; the scores themselves are not used. ``n`` has to be at least the
    number of keywords.
    """
    shades = np.linspace(100, 255, n, endpoint=True)

    assigned = {}
    for position, keyword in enumerate(keywords):
        assigned[keyword] = shades[position]

    return assigned

In [None]:
def create_stylesheet(keywords, colouring):
    """Build a ``<style>`` element with one background rule per keyword.

    Each rule targets ``<b>`` elements whose ``class`` attribute equals the
    keyword exactly. An attribute selector is used instead of ``b.<label>``
    because a class selector is not valid for labels that contain spaces
    (bigrams such as "personal information") or start with a digit ("2010").

    Args:
        keywords: mapping whose keys are the keyword labels.
        colouring: maps each label to the green channel value of its colour.

    Returns:
        The stylesheet as a string, starting with ``<style>`` and ending with
        `` </style>``.
    """
    rules = []
    for label in keywords:
        # inside a CSS string, backslashes and double quotes must be escaped
        css_label = str(label).replace("\\", "\\\\").replace('"', '\\"')
        rules.append(f' b[class="{css_label}"] {{background-color: rgb(0,{colouring[label]},0);}}')

    return "<style>" + "".join(rules) + " </style>"

In [None]:
def highlight_keywords(document, intermediate_keywords, changed_indices, matched_dict, ngram):
    """Display ``document`` with the keywords of its changed sentences highlighted.

    For every changed index the first match in ``matched_dict`` selects the
    sentence of ``document`` to process. That sentence is tokenised and every
    run of ``ngram`` tokens whose lower-cased text is a key of
    ``intermediate_keywords`` is wrapped in ``<b class="...">``. Matches are
    taken greedily from left to right and do not overlap, so no token is
    emitted twice.

    Args:
        document: text of the version to display.
        intermediate_keywords: mapping keyword -> score.
        changed_indices: keys of ``matched_dict`` to process.
        matched_dict: maps a changed index to a sequence of
            ``(matched sentence index, score)`` pairs.
        ngram: number of tokens per candidate keyword (at least 1).

    Raises:
        ValueError: if ``ngram`` is smaller than 1.
    """
    if ngram < 1:
        raise ValueError("ngram must be at least 1")

    sentences = nltk.sent_tokenize(document)

    g_values = colour_map(intermediate_keywords, len(intermediate_keywords))

    # a sentence matched by several changed sentences is highlighted only once;
    # otherwise the markup inserted by the first pass would be tokenised again
    highlighted = set()

    for i in changed_indices:

        matched_idx, _ = matched_dict[i][0]
        matched_idx = int(matched_idx)

        if matched_idx in highlighted:
            continue
        highlighted.add(matched_idx)

        words = nltk.word_tokenize(sentences[matched_idx])

        marked_up = []
        position = 0
        while position < len(words):
            window = words[position:position + ngram]
            candidate = " ".join(window)

            if len(window) == ngram and candidate.lower() in intermediate_keywords:
                marked_up.append(f"<b class=\"{candidate.lower()}\">" + candidate + "</b>")
                # skip all tokens of the match so they are not repeated
                position += ngram
            else:
                marked_up.append(words[position])
                position += 1

        sentences[matched_idx] = " ".join(marked_up)

    highlighted_string = " ".join(sentences)

    highlighted_string += create_stylesheet(intermediate_keywords, g_values)

    display(HTML(markdown.markdown(highlighted_string)))
    

In [None]:

# entry 3 of the result dicts belongs to versions 3 and 4, so the keywords are
# shown in the text of version 4
highlight_keywords(documents[4], keywords[3], changed_sentences[3], matched_dicts[3], ngram=2)

In [None]:
def mark_sentences(document, changed_sentences, matched_dict):
    """Display ``document`` with the matches of all changed sentences marked.

    For every changed index the first match in ``matched_dict`` selects a
    sentence of ``document``; that sentence is wrapped in
    ``<b class='changed'>`` in place.

    Args:
        document: text whose sentences the matched indices refer to.
        changed_sentences: keys of ``matched_dict`` to process.
        matched_dict: maps a changed index to a sequence of
            ``(matched sentence index, score)`` pairs.
    """
    sentences = nltk.sent_tokenize(document)

    marked = set()
    for i in changed_sentences:
        matched_idx, _ = matched_dict[i][0]
        matched_idx = int(matched_idx)

        # do not wrap a sentence twice when several changes map to it
        if matched_idx in marked:
            continue
        marked.add(matched_idx)

        # mark the matched sentence where it stands; writing it to position
        # `i` would overwrite an unrelated sentence of this document
        sentences[matched_idx] = "<b class='changed'>" + sentences[matched_idx] + "</b>"

    string = " ".join(sentences)
    string += "<style>b.changed {background-color: #ffff99;} b.new {background-color: #A3FF66;} </style>"

    display(HTML(markdown.markdown(string)))

In [None]:
def mark_sentences_orig(document, changed_sentences):
    """Display ``document`` with the sentences at ``changed_sentences`` marked.

    Every listed sentence index is wrapped in ``<b class='old'>``; the
    indices refer to the sentences of ``document`` itself.
    """
    sentences = nltk.sent_tokenize(document)

    for i in changed_sentences:
        sentences[i] = "<b class='old'>" + sentences[i] + "</b>"

    string = " ".join(sentences)
    # the stylesheet has to be closed with a proper </style> tag
    string += "<style>b.old {background-color: rgb(0, 200, 0);} b.new {background-color: #A3FF66;} </style>"

    display(HTML(markdown.markdown(string)))

In [None]:
# mark_sentences renders the HTML itself and returns nothing, so printing its
# result would only add a stray "None" to the output
mark_sentences(documents[2], changed_sentences[1], matched_dicts[1])

In [None]:
# mark_sentences_orig renders the HTML itself and returns nothing, so printing
# its result would only add a stray "None" to the output
mark_sentences_orig(documents[1], changed_sentences[1])

In [None]:
def create_html(documents, version_x, version_y, site_nr, wc = 80):
    """Write a side-by-side HTML diff of two versions and return the markup.

    Both documents are split into sentences and compared with
    ``difflib.HtmlDiff``. The page is written to
    ``policy_<site_nr>_versions_<version_x>_<version_y>.html`` in the current
    working directory; an existing file is overwritten.

    Args:
        documents: sequence of document texts.
        version_x: index of the "from" version.
        version_y: index of the "to" version.
        site_nr: identifier used in the file name.
        wc: column at which ``HtmlDiff`` wraps long lines.

    Returns:
        The generated HTML page as a string.
    """
    a = nltk.sent_tokenize(documents[version_x])
    b = nltk.sent_tokenize(documents[version_y])
    difference = difflib.HtmlDiff(wrapcolumn=wc)

    file_name = f"policy_{site_nr}_versions_{version_x}_{version_y}.html"

    html = difference.make_file(fromlines=a,
                                tolines=b,
                                fromdesc=f"version {version_x}",
                                todesc=f"version {version_y}")

    # make_file declares utf-8 in the page by default, so write it as utf-8
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(html)

    return html

In [None]:
def create_html_collection(documents, site_nr, wc = 80):
    """Write the HTML diffs of all subsequent version pairs into one file.

    For every pair ``(n, n + 1)`` a diff page is generated with
    ``difflib.HtmlDiff`` and written, one after the other, to
    ``policy_<site_nr>_collection.html``. The file is opened once and
    overwritten, so running the function again does not append a second copy
    of every diff.

    Args:
        documents: sequence of document texts in version order.
        site_nr: identifier used in the file name.
        wc: column at which ``HtmlDiff`` wraps long lines.

    Returns:
        The HTML page of the last pair, or an empty string if there are
        fewer than two documents.
    """
    versions = len(documents)
    file_name = f"policy_{site_nr}_collection.html"
    difference = difflib.HtmlDiff(wrapcolumn=wc)

    html = ""

    # make_file declares utf-8 in the page by default, so write it as utf-8
    with open(file_name, "w", encoding="utf-8") as file:
        for version in range(versions - 1):
            a = nltk.sent_tokenize(documents[version])
            b = nltk.sent_tokenize(documents[version + 1])

            html = difference.make_file(fromlines=a,
                                        tolines=b,
                                        fromdesc=f"version {version}",
                                        todesc=f"version {version + 1}")

            file.write(html)

    return html

In [None]:
# writes policy_<site_id>_collection.html; the returned HTML string is shown as
# the cell output
create_html_collection(documents, site_id)

In [None]:
import difflib
version_x = 1
version_y = 2

wrap_column = 80

# the fourth argument is the site identifier used in the output file name
html = create_html(documents, version_x, version_y, site_id, wrap_column)

In [None]:
# IPython.display is the public import location; importing `display` from
# IPython.core.display is deprecated
from IPython.display import display, HTML
display(HTML(html))

In [None]:
# deletions detected between versions 0 and 1
deleted[0]

In [None]:
# additions detected between versions 0 and 1
added[0]

In [None]:
# contrastive keywords for versions 1 and 2
keywords[1]

In [None]:

# deletions for every pair of subsequent versions, keyed by the step index
deleted

In [None]:
# indices of the changed sentences for every pair of subsequent versions
changed_sentences