<a href="https://colab.research.google.com/github/LukasEder1/CKE-Demo/blob/main/CKE_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Demonstration: Contrastive Keyword Extraction from Versioned Documents

In [2]:
!git clone "https://github.com/LukasEder1/CKE-Demo"
%cd "/content/CKE-Demo"

fatal: destination path 'CKE-Demo' already exists and is not an empty directory.
/content/CKE-Demo


In [3]:
%pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/boudinfl/pke.git (from -r requirements.txt (line 15))
  Cloning https://github.com/boudinfl/pke.git to /tmp/pip-req-build-bax3h_l2
  Running command git clone --filter=blob:none --quiet https://github.com/boudinfl/pke.git /tmp/pip-req-build-bax3h_l2
  Resolved https://github.com/boudinfl/pke.git to commit ebd6e5754b4156a61a4ec6c4c283e821d11a36be
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [4]:
import pickle

from contrastive_keyword_extraction import extract_contrastive_keywords

import string
import sentence_comparision
import sentence_importance
import utilities
import keyword_extraction
import nltk
import pandas as pd

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Load the pickled `documents` mapping; "docs.pkl" is resolved relative to the
# current working directory.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load a
# docs.pkl that comes from a source you trust.
with open("docs.pkl", "rb") as file:
  documents = pickle.load(file)

In [6]:
stopwords = nltk.corpus.stopwords.words("english")

### Select one of the preset documents, or provide a list containing two versions of a document

In [28]:
documents.keys()

dict_keys([17313, 16159, 17736, 17748, 3299, 90232, 98445, 98447, 106601, 106604, 99880, 0, 1])

In [31]:
documents[0]

['In this paper, we introduce TextRank - a graph-based ranking model for text processing, and show how this model can be successfully used in natural language applications. In particular, we propose two innovative unsupervised methods for keyword and sentence extraction, and show that the results obtained compare favorably with previously published results on established benchmark.',
 'TextRank, a graph-based ranking system, is introduced in this paper. Ranking model for text processing, and demonstrate how this model can be used successfully in natural language processing applications. We propose two novel unsupervised methods for keyword and sentence extraction in particular, and demonstrate that the results obtained compare favorably with previously published results on established benchmarks.']

In [29]:
versioned_document = documents[17313]

## Extract Contrastive Keywords

In [41]:
def display_keywords(keywords, k=10):
  """Show the first ``k`` keyword/score pairs as a two-column table.

  Args:
      keywords (dict): Maps each keyword to its score. Rows appear in the
                       dict's iteration order; no sorting is applied here.
      k (int): Number of leading rows to show.
  """
  keyword_table = pd.DataFrame({"Keyword": keywords.keys(), "Score": keywords.values()})
  leading_rows = keyword_table.head(k)
  display(leading_rows)

In [36]:
# Settings forwarded to extract_contrastive_keywords below.
threshold = 0.6  # forwarded as `threshold` (matching threshold)
# Choose any model: https://www.sbert.net/examples/applications/semantic-search/README.html
model = 'all-MiniLM-L6-v2' 
num_splits = 1  # forwarded as `top_k`
max_ngram = 2  # forwarded as `max_ngram`


# The call returns three values, unpacked here as combined/former/latter keywords.
# NOTE(review): presumably these are keyword->score mappings for the combined,
# older and newer version respectively -- confirm against extract_contrastive_keywords.
combined_kws, former_kws, latter_kws = extract_contrastive_keywords(versioned_document, 
                                                      max_ngram=max_ngram, # Maximum n-gram size of keywords
                                                      min_ngram=1, 
                                                      extra_stopwords=stopwords, # Remove English stopwords ([] = do not consider any stopwords)
                                                      importance_estimator= sentence_importance.text_rank_importance,  # alt: sentence_importance.yake_weighted_importance
                                                      match_sentences=sentence_comparision.match_sentences_semantic_search, # alt: sentence_comparision.match_sentences_tfidf_weighted
                                                      threshold=threshold, # Matching threshold
                                                      symbols_to_remove=string.punctuation, # Symbols to remove (here: all ASCII punctuation)
                                                      matching_model=model, # Matching model: only relevant for semantic search
                                                      top_k=num_splits, # Max number of sentences a sentence can possibly split into
                                                      )        

In [43]:
display_keywords(former_kws, k=10)

Unnamed: 0,Keyword,Score
0,attorneyclient privilege,0.031013
1,fbi agents,0.024171
2,fire mueller,0.022812
3,dead,0.019156
4,furious president,0.019156
5,president blasted,0.019156
6,blasted displeasure,0.019156
7,displeasure early,0.019156
8,early tuesday,0.019156
9,tuesday saying,0.019156


## Sentence Matching


In [9]:
def get_matched_indices(matched_dict):
    """ Get the indices of Document A sentences that have at least one match

    Args:
        matched_dict (dict):Keys: Indices of Document A,
                            Values: List of Pairs <Index of Document B| semantic similarity>

    Returns:
        List of the keys of matched_dict (indices of Document A) whose list of
        matches is non-empty, in the dict's iteration order
    """
    # len() instead of truthiness: the match containers may be array-like, for
    # which a bare truth test is ambiguous.
    return [index for index, matches in matched_dict.items() if len(matches) > 0]

In [10]:
def display_matches(matched_dict):
    """Build a table with one row per matched sentence pair.

    Args:
        matched_dict (dict): Maps a source sentence index to an iterable of
                             (matched index, similarity) pairs.

    Returns:
        pandas.DataFrame with the columns "source sentence position",
        "matched sentence position" (cast to int) and "semantic similarity"
        (cast to float). Sources without any match contribute no rows.
    """
    # Flatten the mapping into (source, target, similarity) triples first.
    pairs = [
        (source_pos, int(target_pos), float(similarity))
        for source_pos in get_matched_indices(matched_dict)
        for target_pos, similarity in matched_dict[source_pos]
    ]

    columns = {
        "source sentence position": [pair[0] for pair in pairs],
        "matched sentence position": [pair[1] for pair in pairs],
        "semantic similarity": [pair[2] for pair in pairs],
    }
    match_table = pd.DataFrame(columns)
    return match_table.reset_index(drop=True)

In [56]:
# Reuse the settings defined for the keyword-extraction call instead of
# repeating the literals, so both sections stay in sync when a value is tuned.
matched_dict, removed = sentence_comparision.match_sentences_semantic_search(
    document_a=versioned_document[0],
    document_b=versioned_document[1],
    threshold=threshold,
    k=num_splits,  # assumes `k` is the counterpart of `top_k` in the extraction call -- TODO confirm
    model=model,
)

In [57]:
display_matches(matched_dict)

Unnamed: 0,source sentence position,matched sentence position,semantic similarity
0,0,0,1.0
1,1,1,0.655811
2,2,1,0.893363
3,3,2,1.0
4,4,3,0.98448
5,5,4,0.993359
6,6,5,0.998991
7,7,9,0.942158
8,8,10,1.0
9,9,11,1.0


## Extract Added and Deleted Content between Matched Sentence Pairs

In [13]:
# Run change detection on the matched sentence pairs of the two versions.
# With show_output=True the call prints a report per pair (query, matched
# sentence, resemblance scores, additions, deletions), as seen in the output below.
# The six return values are unpacked in order; the summary cell further down labels
# changed_sentences as indices in the older version, and new_sentences and
# matched_indices as indices in the newer version.
# NOTE: "unified_delitions" is a misspelling of "deletions"; the name is kept
# because other cells may reference it.
changed_sentences, new_sentences, additions, deletions, matched_indices, unified_delitions = sentence_comparision.detect_changes(matched_dict, 
                                                                                                        versioned_document[0],
                                                                                                        versioned_document[1],
                                                                                                        max_ngram=2,
                                                                                                        show_output=True)

query: WASHINGTON (AP) -- Federal agents who raided the office of President Donald Trump's personal attorney, Michael Cohen, were looking for information about payments to a former Playboy playmate and a porn actress who claim to have had affairs with Trump, two people familiar with the investigation said.
 
matched: WASHINGTON (AP) -- Federal agents who raided the office of President Donald Trump's personal attorney, Michael Cohen, were looking for information about payments to a former Playboy Playmate and a porn actress who claim to have had affairs with Trump, two people familiar with the investigation said.
 
Semantic Resemblence: 1.0000
Syntactic Resemblence: 0.9371

added in newer version:[]
deleted from older version: []
------------------------------------------------------------------------------

query: Public corruption prosecutors in the U.S.

matched: Public corruption prosecutors in the U.S. attorney's office in Manhattan are trying to determine if there was any fraud re

In [63]:
# Summarise how every sentence of the two versions was classified.
print("Classification of All Sentences:")
print(f"New (Index in Newer Version): {new_sentences}")
print(f"Deleted (Index in Older Version): {removed}")
print(f"Changed (Index in Older Version): {changed_sentences}")
# De-duplicate, then sort: the iteration order of a set is not guaranteed, so
# sorted() keeps the printed list stable between runs.
print(f"Changed (Index in Newer Version): {sorted(set(matched_indices))}")

Classification of All Sentences:
New (Index in Newer Version): [34, 37, 6, 7, 8, 38, 39, 40, 41, 42, 43]
Deleted (Index in Older Version): [14, 15, 16, 17, 27, 36]
Changed (Index in Older Version): [0, 1, 2, 3, 4, 5, 6, 7, 13, 18, 22, 25, 29, 31, 35, 37, 42]
Changed (Index in Newer Version): [0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 44, 45]


## Sentence Importance

In [53]:
from IPython.display import display_html 

def display_importance(ranking):
  """Show the sentence-importance tables of both document versions.

  Args:
      ranking: Indexable by 0 (older version) and 1 (newer version); each
               entry maps a sentence position to its importance score.
  """
  def _to_table(scores):
    # One row per sentence, in the mapping's iteration order.
    table = pd.DataFrame({"Position": scores.keys(), "Importance": scores.values()})
    return table.reset_index(drop=True)

  # Build both tables before emitting anything, then print and display them in turn.
  older_table = _to_table(ranking[0])
  newer_table = _to_table(ranking[1])

  sections = (
    ("Importance Older Version", older_table),
    ("\nImportance Newer Version", newer_table),
  )
  for heading, table in sections:
    print(heading)
    display(table)

In [54]:
ranking = sentence_importance.text_rank_importance(versioned_document)

In [55]:
display_importance(ranking)

Importance Older Version


Unnamed: 0,Position,Importance
0,0,0.038304
1,5,0.037126
2,7,0.036473
3,32,0.034177
4,2,0.034013
5,35,0.032337
6,3,0.032077
7,19,0.030949
8,25,0.029293
9,29,0.028767


Importance Newer Version


Unnamed: 0,Position,Importance
0,4,0.03694
1,1,0.036623
2,0,0.036061
3,6,0.03485
4,28,0.033566
5,31,0.031276
6,39,0.031263
7,2,0.030227
8,16,0.029883
9,22,0.028438


In [64]:
ranking_yake = sentence_importance.yake_weighted_importance(versioned_document)

In [65]:
ranking_yake

{0: {27: 0.11010884786037167,
  0: 0.09242128672799052,
  13: 0.08615269267902681,
  31: 0.0824452803037995,
  8: 0.07869700012991117,
  1: 0.06316372825465688,
  34: 0.05788149468010807,
  21: 0.04715654000303769,
  20: 0.0416764298659606,
  33: 0.03961765623218134,
  26: 0.0333733583356287,
  12: 0.03321665111423752,
  11: 0.028349024570938398,
  23: 0.027629475681651416,
  3: 0.024877987695592584,
  10: 0.024515090587269552,
  18: 0.020665299252474633,
  29: 0.019618792060338328,
  9: 0.018157998858689964,
  22: 0.017026291378611783,
  32: 0.012384205693188773,
  6: 0.011892171533816722,
  36: 0.010279604883261313,
  24: 0.005921924332469099,
  7: 0.005361043905466949,
  4: 0.0038170543276644987,
  2: 0.0031121676283810975,
  5: 2.828831901621779e-05,
  14: 2.828831901621779e-05,
  15: 2.828831901621779e-05,
  16: 2.828831901621779e-05,
  17: 2.828831901621779e-05,
  19: 2.828831901621779e-05,
  25: 2.828831901621779e-05,
  28: 2.828831901621779e-05,
  30: 2.828831901621779e-05,
  3