In [1]:
import sqlite3
import pandas as pd
from news_processing import *
from keyword_extraction import *
import sentence_importance
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
from difflib import *
import nltk
import torch
import string
#import util

In [2]:
conn = sqlite3.connect('../datasets/small10k.sqlite')
df = pd.read_sql("SELECT * FROM small10k", con=conn)

In [3]:
def create_data(df, sid):
    data = df[df.site_id == sid]
    return data

In [4]:
data = create_data(df, 31160)

In [5]:
def get_policy_texts(data):
    return list(data['policy_text']) 

In [41]:
documents = get_policy_texts(data)
print(len(documents))

31


In [7]:
def drop_unimportant_indices(indices, important_indices):
    
    # of some version!
    return list(set(indices) & set(important_indices))
    

In [8]:
def match_sentences(document_a, document_b, k = 1, model='all-MiniLM-L6-v2', threshold=0.6):
    
    # Model to be used to create Embeddings which we will use for semantic search
    embedder = SentenceTransformer(model)
    
    # Use the sentences in A as queries
    queries = nltk.sent_tokenize(document_a)
    
    # Use the sentences in B as our corpus
    corpus = nltk.sent_tokenize(document_b)
    
    # Create embeddings using B
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
    
    # matched_sentences dict:
    # key = query_idx
    # value = list of matched sentences and score pairs = [(matched_sentence, similarity_score)]
    matched_sentences = {i:[] for i in range(len(queries))}
    
    # Find the closest k most similar sentences using cosine similarity
    top_k = min(k, len(corpus))
    for query_idx in range(len(queries)):
        query = queries[query_idx]
        query_embedding = embedder.encode(query, convert_to_tensor=True)

        # We use cosine-similarity and torch.topk to find the highest k scores
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=top_k)
        
        # loop over top results
        for score, idx in zip(top_results[0], top_results[1]):
            
            # fill the matched sentences dictonary
            #if score > threshold:
            matched_sentences[query_idx].append((idx, score))
            
    
    return matched_sentences

In [9]:
matched_dict_01 = match_sentences(documents[0], documents[1], k = 1, model='all-MiniLM-L6-v2')
matched_dict_01

{0: [(tensor(0), tensor(0.9889))],
 1: [(tensor(1), tensor(1.0000))],
 2: [(tensor(2), tensor(1.0000))],
 3: [(tensor(3), tensor(1.0000))],
 4: [(tensor(4), tensor(1.0000))],
 5: [(tensor(5), tensor(1.))],
 6: [(tensor(6), tensor(1.0000))],
 7: [(tensor(7), tensor(1.0000))],
 8: [(tensor(8), tensor(1.0000))],
 9: [(tensor(9), tensor(1.0000))],
 10: [(tensor(10), tensor(1.0000))],
 11: [(tensor(11), tensor(1.0000))],
 12: [(tensor(12), tensor(1.))],
 13: [(tensor(13), tensor(1.0000))],
 14: [(tensor(14), tensor(1.0000))],
 15: [(tensor(15), tensor(1.0000))],
 16: [(tensor(16), tensor(1.0000))],
 17: [(tensor(17), tensor(1.0000))],
 18: [(tensor(18), tensor(1.0000))],
 19: [(tensor(19), tensor(1.0000))],
 20: [(tensor(20), tensor(1.))],
 21: [(tensor(21), tensor(1.0000))],
 22: [(tensor(22), tensor(1.))],
 23: [(tensor(23), tensor(1.0000))],
 24: [(tensor(24), tensor(1.0000))],
 25: [(tensor(25), tensor(1.0000))],
 26: [(tensor(26), tensor(1.0000))],
 27: [(tensor(27), tensor(1.0000))],


In [10]:
def get_all_matched_indices(matched_dict, top_k=1):
    return [int(index[k][0]) for index in matched_dict.values() for k in range(top_k)]

In [11]:
matched_indices = get_all_matched_indices(matched_dict_01)
matched_indices

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 38,
 39,
 40,
 41,
 42,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 87,
 88,
 89,
 90,
 93,
 94,
 95,
 96,
 97,
 100,
 101,
 102]

In [12]:
def find_added_indices(matched_indices, corpus_length):
    corpus_indices = list(range(corpus_length))
    
    return list(set(corpus_indices) - set(matched_indices))

# Syntactic change

In [13]:
def syntactic_ratio(a, b):
    # match sentence a and b
    s = SequenceMatcher(lambda x: x == " ", a, b)
    
    # return their ratio
    # rule-of-thumb: ratio > 0.6 -> similar
    return s.ratio()

In [14]:
def find_additions_deletions(a, b):
    """
    finds the differnce inform of
    two strings addition/ deletions
    between two strings
    """
    
    # init differ
    d = Differ()
    
    # compare the two 
    diff = d.compare(a, b)
    changes = [change for change in diff if change.startswith('-') or  change.startswith('+')]
    
    # output:
    additions = ""
    deletions = ""
    
    for change in changes:
        type_of_change  = 'addition' if change[0] == '+' else 'deletion'
        
        # remove unwanted symbols
        actual_change = change[2:]
        
        if type_of_change == 'addition':
            additions += actual_change
            
        else:
            deletions += actual_change
    
    return additions, deletions

In [15]:
#sentences, indices = sentence_importance.lex_rank(documents, 5)

In [16]:
keyword_counts = sentence_importance.yake_weighted_keyword_frequency(documents, ngram_size=3)
sentences, indices, ranking = sentence_importance.rank_yake(documents, keyword_counts)
ranking

{0: {5: 23.42513856901186,
  35: 21.713192685780896,
  15: 19.711432676381282,
  36: 18.58886781998731,
  3: 16.10585299883709,
  76: 15.044364413512588,
  11: 14.060624979737687,
  38: 12.591718268498543,
  13: 12.285475145085005,
  42: 12.167829355993273,
  56: 11.854654874632967,
  14: 11.74246237715976,
  49: 11.23901957405886,
  70: 10.839272963532085,
  69: 10.31057672140857,
  77: 9.582104629235463,
  39: 9.290948254456074,
  81: 9.040976924562106,
  55: 8.583591788380739,
  86: 8.529544542125088,
  52: 8.515324676645985,
  26: 8.28241117584913,
  41: 8.182147978924219,
  10: 7.995133199469406,
  0: 7.946938559907765,
  68: 7.6607275497831315,
  71: 7.35677010733142,
  37: 7.306415018234889,
  66: 6.951221559513532,
  6: 6.753273851864608,
  34: 6.65757709571262,
  17: 6.079099945577344,
  67: 6.039466365396448,
  75: 5.791310213393855,
  50: 5.6879871962146575,
  82: 5.4668531755743714,
  7: 5.378250807353537,
  33: 5.1756074809562405,
  62: 5.003279829322501,
  21: 4.944077512

In [17]:
def detect_changes(matched_dict, document_a, document_b, important_indices, top_k=1, semantic_lower_bound = 0.7
                   ,show_output=False):
    
    # Use the sentences in A as queries
    queries = nltk.sent_tokenize(document_a)
    
    changed_sentences = []
    
    deleted_sentences = []
    
    corpus = nltk.sent_tokenize(document_b)
    
    matched_indices = []
    
    for query_idx in range(len(queries)):
        
        # current query
        query = queries[query_idx]
        
        # give lower bound on number of matched sentences
        top_k = min(top_k, len(matched_dict[query_idx]))
        
        
        
        
        for k in range(top_k):
            
            # get current matched_sentence + score
            matched_idx, score = matched_dict[query_idx][k]
            
            matched_indices.append(int(matched_idx))
            
            # get the actual sentence
            matched_sentence = corpus[int(matched_idx)]
            
            
            # extract addtions and deletions
            additions, deletions = find_additions_deletions(query, matched_sentence)
            
            # get syntactic ratio
            ratio = syntactic_ratio(query, matched_sentence)
            
            if show_output:
                print(f"query: {query}\nmatched: {matched_sentence}\nSemantic Resemblence: {score:.4f}\n"
                      f"Syntactic Resemblence: {ratio:.4f}\n")

                print(f"added in newer version:{additions}\ndeleted from older version: {deletions}")
                
                print("------------------------------------------------------------------------------\n")

            #
            if ratio < 1.0:
                changed_sentences.append(query_idx)
            
            if score < semantic_lower_bound:
                deleted_sentences.append(query_idx)
                
                
    #drop_unimportant_indices(changed_sentences, important_indices=important_indices[version])
    
    
    new_sentences = find_added_indices(matched_indices, len(corpus))

    return changed_sentences, new_sentences, deleted_sentences      
            
            
            

In [18]:
changed_sentences, new_sentences, deleted = detect_changes(matched_dict_01, documents[0], documents[1],important_indices=indices,
                                      semantic_lower_bound=0.7 ,show_output=True)
changed_sentences

query: DocuSign Express Privacy Policy (11/03)

The privacy practices of this statement apply to our services available under the domain and sub domains of  www.docusign.com (the "Site") (including docusign.net, docusignexpress.com, and docusign-inc.com).
matched: DocuSign Express™ Privacy Policy (11/03)

Overview

The privacy practices of this statement apply to our services available under the domain and sub domains of  www.docusign.com (the "Site") (including docusign.net, docusignexpress.com, and docusign-inc.com).
Semantic Resemblence: 0.9889
Syntactic Resemblence: 0.9783

added in newer version:™Overview


deleted from older version: 
------------------------------------------------------------------------------

query: By visiting this website you agree to be bound by the terms and conditions of this Privacy Policy.
matched: By visiting this website you agree to be bound by the terms and conditions of this Privacy Policy.
Semantic Resemblence: 1.0000
Syntactic Resemblence: 1.000


added in newer version:
deleted from older version: 
------------------------------------------------------------------------------

query: However, such personal information will be deactivated from member viewing and will only be available to select DocuSign personal.
matched: However, such personal information will be deactivated from member viewing and will only be available to select DocuSign personal.
Semantic Resemblence: 1.0000
Syntactic Resemblence: 1.0000

added in newer version:
deleted from older version: 
------------------------------------------------------------------------------

query: Except as otherwise expressly included in this Privacy Policy, this document only addresses the use and disclosure of information we collect from you.
matched: Except as otherwise expressly included in this Privacy Policy, this document only addresses the use and disclosure of information we collect from you.
Semantic Resemblence: 1.0000
Syntactic Resemblence: 1.0000

added in newer ve

[0, 29, 51]

In [19]:
new_sentences

[66, 67, 36, 37, 98, 99, 43, 44, 85, 86, 55, 56, 91, 92]

In [20]:
def I_c(changed_idx, new_indices, matched_dict, ranking, threshold, version,w0, w1, w2):
    
    matched_idx, score = matched_dict[changed_idx][0]
    
    I_s = ranking[version][changed_idx]
    
    next_I_s = ranking[version + 1][int(matched_idx)]
    
    if score < threshold:
        # Hypothesis 2
        I_c = next_I_s * (w2/ w1 * score) 
    else:    
        # Hypothesis 1
        I_c = I_s * (w0/ w1 * score)
    
    return I_c

In [21]:
def Importance_added_sentence(new_idx, ranking, next_version):
    
    next_I_s = ranking[next_version][new_idx]
    
    return next_I_s
    

In [24]:
changed_sentences

[0, 29, 51]

In [25]:
import markdown
from IPython.core.display import HTML

In [73]:
def mark_sentences(document, changed_sentences, new_sentences):
    sentences = nltk.sent_tokenize(document)
    
    for i in changed_sentences:
        sentences[i] = "<b class='changed'>" + sentences[i] + "</b>"
    
    for i in new_sentences:
        sentences[i] = "<b class='new'>" + sentences[i] + "</b>"
    
    string = " ".join(sentences)
    string += "<style>b.changed {background-color: rgb(255,255,0);} b.new {background-color: #A3FF66;} </sytle>"

    display(HTML(markdown.markdown(string)))
    

In [74]:
mark_sentences(cleand_docs[1], changed_sentences, new_sentences)

In [None]:
new_sentences

In [None]:
!pip install clean-text[gpl]

In [28]:
from cleantext import clean

In [54]:
cleaning_function = lambda text : clean(text,
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=False,                     # lowercase text
    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
    no_urls=False,                  # replace all URLs with a special token
    no_emails=False,                # replace all email addresses with a special token
    no_phone_numbers=False,         # replace all phone numbers with a special token
    no_numbers=False,               # replace all numbers with a special token
    no_digits=False,                # replace all digits with a special token
    no_currency_symbols=False,      # replace all currency symbols with a special token
    no_punct=False,                 # remove punctuations
    replace_with_punct="",          # instead of removing punctuations you may replace them
    replace_with_url="<URL>",
    replace_with_email="<EMAIL>",
    replace_with_phone_number="<PHONE>",
    replace_with_number="<NUMBER>",
    replace_with_digit="0",
    replace_with_currency_symbol="<CUR>",
    lang="en"              
)

In [55]:
def clean_text(documents, cleaning_function):
    cleaned_documents = []
    for document in documents:
        cleaned_documents.append(cleaning_function(document))
    
    return cleaned_documents

In [57]:
cleand_docs = clean_text(documents, cleaning_function)

In [70]:
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import matplotlib as mpl
import matplotlib.pyplot as plt

viridis = mpl.colormaps['viridis'].resampled(8)

AttributeError: module 'matplotlib' has no attribute 'colormaps'