In [1]:
import yake
import sqlite3
import pandas as pd
from news_processing import *
from keyword_extraction import *
from summa import summarizer
import difflib
import nltk
from sentence_transformers import SentenceTransformer, util
import numpy as np
from LexRank import degree_centrality_scores
import re
from yake.highlight import *

In [2]:
conn = sqlite3.connect('../datasets/ap-matched-sentences.db')
connection = sqlite3.connect('../datasets/ap.db')
pd.options.display.max_colwidth = 500

In [3]:
article_id = 2
data = create_data(article_id, conn)
versions = get_versions(data)
print(f"Available versions: {versions}\n")
documents = get_documents(data)

Available versions: {0.0, 1.0, 2.0, 3.0}



In [4]:
def find_important_indices(important_sentences, document):
    
    # split into sentences
    sentences = nltk.sent_tokenize(document)
    
    # list of indices corresponding to the important sentences 
    important_indices = []
    
    for important_sentence in important_sentences:
        for idx in range(len(sentences)):
            if important_sentence == sentences[idx]:
                important_indices.append(idx)
                
    return important_indices

## TextRank Importance Ranking

In [5]:
def text_rank(documents):
    number_of_documents = len(documents)
    important_sentences = {version:[] for version in range(number_of_documents)}
    important_indices = {version:[] for version in range(number_of_documents)}
    for current in range(number_of_documents):
        
        # current document
        document = documents[current]
        
        # find most important sentences
        summary = summarizer.summarize(document).strip()
        
        # tokenize into seprate sentences
        important_sentences[current] = nltk.sent_tokenize(summary)
            
        # find corresponding indices in original corpus
        important_indices[current] = find_important_indices(important_sentences[current], document)
        
    return important_sentences, important_indices

In [6]:
sentences, indices = text_rank(documents)
indices

{0: [4, 5, 6, 17, 18, 19, 20, 21, 24],
 1: [4, 5, 6, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, 26],
 2: [4, 5, 6, 8, 9, 10, 11, 19, 20, 21, 22, 23, 24, 27],
 3: [1, 6, 10, 15, 20, 29, 34, 39, 47, 51, 62, 63]}

## LexRank importance Ranking

In [7]:
"""
This example uses LexRank (https://www.aaai.org/Papers/JAIR/Vol22/JAIR-2214.pdf)
to create an extractive summarization of a long document.
The document is splitted into sentences using NLTK, then the sentence embeddings are computed. We
then compute the cosine-similarity across all possible sentence pairs.
We then use LexRank to find the most central sentences in the document, which form our summary.
"""

def lex_rank(documents, n=10, model='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model)

    # Our input document we want to summarize
    # As example, we take the first section from Wikipedia
    
    number_of_documents = len(documents)
    important_sentences = {version:[] for version in range(number_of_documents)}
    important_indices = {version:[] for version in range(number_of_documents)}
    
    for current in range(number_of_documents):
    #Split the document into sentences
        sentences = nltk.sent_tokenize(documents[current])

        #Compute the sentence embeddings
        embeddings = model.encode(sentences, convert_to_tensor=True)

        #Compute the pair-wise cosine similarities
        cos_scores = util.cos_sim(embeddings, embeddings).numpy()

        #Compute the centrality for each sentence
        centrality_scores = degree_centrality_scores(cos_scores, threshold=None)

        #We argsort so that the first element is the sentence with the highest score
        most_central_sentence_indices = np.argsort(-centrality_scores)


        #use the n sentences with the highest scores
        top_sentences = []
        top_indices = []
        for idx in most_central_sentence_indices[0:n]:
            top_sentences.append(sentences[idx].strip())
            top_indices.append(idx)
            
        important_sentences[current] = top_sentences
        important_indices[current] = top_indices
        
    return important_sentences, important_indices

In [8]:
sentences, indices = lex_rank(documents, 5)

In [9]:
print(indices)

{0: [0, 7, 20, 11, 3], 1: [0, 22, 13, 24, 3], 2: [0, 13, 23, 28, 25], 3: [59, 1, 32, 61, 39]}


## Yake! Keyword Highlighting and Ranking

In [10]:
def yake_keyword_frequency(documents, ngram_size=3):
    
    number_of_documents = len(documents)
    
    # dictonary of form: version -> keywords
    keywords = extract_yake(documents) 
    
    # init highlighter object
    highlighter = TextHighlighter(max_ngram_size = ngram_size)
    
    # save the sentence specific keyword counts for all documents
    keyword_counts = []
    
    for current in range(number_of_documents):
        
        # current number of sentences
        number_of_sentences = len(nltk.sent_tokenize(documents[current]))
        
        # dictonary to count number of keywords in a sentence
        keyword_count = {sentence:0 for sentence in range(number_of_sentences)}
        
        # highlight keywords in text
        highlightet_text = highlighter.highlight(documents[current], keywords[current])
        
        # split document into sentences
        highlightet_sentences = nltk.sent_tokenize(highlightet_text)
        
        # used to determine the current position in the corpus
        sentence_position = 0
        
        for sentence in highlightet_sentences:
            
            # find all keywords in current sentence
            current_keywords = re.findall(r"<kw>(.*?)</kw>", sentence)
            
            keyword_count[sentence_position] = len(current_keywords)
            
            sentence_position += 1
        
        keyword_counts.append(keyword_count)
        
    return keyword_counts

In [11]:
def yake_weighted_keyword_frequency(documents, ngram_size=3):
    
    number_of_documents = len(documents)
    
    # dictonary of form: version -> keywords
    keywords = extract_yake(documents)
    
    # init highlighter object
    highlighter = TextHighlighter(max_ngram_size = ngram_size)
    
    # save the sentence specific keyword counts for all documents
    keyword_counts = []
    
    for current in range(number_of_documents):
        
        # current number of sentences
        number_of_sentences = len(nltk.sent_tokenize(documents[current]))
        
        # dictonary to count number of keywords in a sentence
        keyword_count = {sentence:0 for sentence in range(number_of_sentences)}
        
        # highlight keywords in text
        highlightet_text = highlighter.highlight(documents[current], keywords[current])
        
        # split document into sentences
        highlightet_sentences = nltk.sent_tokenize(highlightet_text)
        
        # used to determine the current position in the corpus
        sentence_position = 0
        
        for sentence in highlightet_sentences:
            
            # find all keywords in current sentence
            current_keywords = re.findall(r"<kw>(.*?)</kw>", sentence)
            
            # combine the score of the keywords
            # take the inverse since YAKE! goes from lowest -> highest
            combined_keyword_scores = sum([1/score for keyword, score in keywords[current] 
                                           if keyword in current_keywords])
            
            # weighted count
            keyword_count[sentence_position] = len(current_keywords) * combined_keyword_scores
            
            sentence_position += 1
        
        keyword_counts.append(keyword_count)
        
    return keyword_counts

In [12]:
def rank_yake(documents, keyword_counts, top_n=5):
    number_of_documents = len(documents)
    
    important_sentences = {version:[] for version in range(number_of_documents)}
    important_indices = {version:[] for version in range(number_of_documents)}
    ranking = {version:{} for version in range(number_of_documents)}
    
    eps = 0.01
    # document index
    current = 0
    
    
    for counts in keyword_counts:
        
        
        sentences  = nltk.sent_tokenize(documents[current])
        
        # get the length of each sentence
        sentence_lengths = [len(sentence) for sentence in sentences]
        
        # normalize keyword counts by sentence length
        # In order to not give longer sentences more importance
        normalized_counts = {k: (v/sentence_lengths[i]) + eps for i, (k, v) in enumerate(counts.items())}
    
        # sort counts by number of keyword frequency (highest -> lowest)
        sorted_counts = {k: v for k, v in sorted(normalized_counts.items(), key=lambda item: item[1], 
                                                 reverse=True)}
        
        # sorted sentence idices, by highest keyword frequencies
        top_keys = list(sorted_counts.keys())
        
        # return the top_n most important sentences
        important_sentences[current] = [sentences[idx] for idx in top_keys[:top_n]]
        
        # return the top_n most important indices
        important_indices[current] = top_keys[:top_n]
        
        ranking[current] = sorted_counts
        
        current += 1
    
    return important_sentences, important_indices, ranking

In [13]:
keyword_weighted_counts = yake_weighted_keyword_frequency(documents, ngram_size=3)
sentencesw, indicesw, rankingw = rank_yake(documents, keyword_weighted_counts)
rankingw

{0: {0: 6.02022619263948,
  1: 5.712770200464182,
  11: 4.343472071951212,
  6: 3.4422221341927437,
  24: 3.200820555933037,
  16: 2.816171006197526,
  8: 2.0408555102275625,
  3: 1.6733713377295332,
  2: 1.6706501034595735,
  20: 1.5210146827600213,
  22: 1.3945596093333295,
  14: 1.1772215448464776,
  15: 0.9974404063783755,
  26: 0.9699398996430829,
  27: 0.9483120366679418,
  4: 0.678659578454766,
  10: 0.4988691549753754,
  13: 0.41379083931679345,
  9: 0.1578269639822564,
  7: 0.14161905955894438,
  28: 0.09045076413888356,
  5: 0.08857981613565372,
  21: 0.07575117515982185,
  17: 0.07264298754984397,
  29: 0.05257290416008814,
  12: 0.01,
  18: 0.01,
  19: 0.01,
  23: 0.01,
  25: 0.01},
 1: {0: 5.92351382464146,
  1: 5.618470637479485,
  13: 5.573170662197833,
  26: 4.581179338114108,
  6: 3.3738759571430443,
  18: 2.859242129898881,
  10: 2.0958297452273604,
  2: 1.698715729872559,
  3: 1.6420221874518608,
  22: 1.49707648407027,
  24: 1.3727942452057185,
  16: 1.2501489030492

In [14]:
kw_counts = yake_keyword_frequency(documents, ngram_size=3)
sentences, indices, ranking = rank_yake(documents, kw_counts)
ranking

{0: {6: 0.051666666666666666,
  1: 0.05040404040404041,
  0: 0.05,
  11: 0.03702702702702703,
  24: 0.03142857142857143,
  3: 0.028181818181818183,
  16: 0.027777777777777776,
  14: 0.027543859649122808,
  8: 0.027341040462427746,
  20: 0.024705882352941175,
  22: 0.023452914798206277,
  10: 0.021235955056179777,
  2: 0.021152416356877323,
  15: 0.020526315789473684,
  27: 0.020256410256410257,
  4: 0.020050251256281407,
  26: 0.01819672131147541,
  21: 0.01735294117647059,
  17: 0.017142857142857144,
  7: 0.016514657980456027,
  28: 0.015952380952380954,
  5: 0.01581395348837209,
  13: 0.014901960784313726,
  29: 0.014854368932038835,
  9: 0.014255319148936171,
  12: 0.01,
  18: 0.01,
  19: 0.01,
  23: 0.01,
  25: 0.01},
 1: {6: 0.051666666666666666,
  1: 0.05040404040404041,
  0: 0.05,
  13: 0.042432432432432436,
  26: 0.03857142857142857,
  3: 0.028181818181818183,
  18: 0.027777777777777776,
  16: 0.027543859649122808,
  10: 0.027341040462427746,
  22: 0.024705882352941175,
  24: 0

In [15]:
ranking

{0: {6: 0.051666666666666666,
  1: 0.05040404040404041,
  0: 0.05,
  11: 0.03702702702702703,
  24: 0.03142857142857143,
  3: 0.028181818181818183,
  16: 0.027777777777777776,
  14: 0.027543859649122808,
  8: 0.027341040462427746,
  20: 0.024705882352941175,
  22: 0.023452914798206277,
  10: 0.021235955056179777,
  2: 0.021152416356877323,
  15: 0.020526315789473684,
  27: 0.020256410256410257,
  4: 0.020050251256281407,
  26: 0.01819672131147541,
  21: 0.01735294117647059,
  17: 0.017142857142857144,
  7: 0.016514657980456027,
  28: 0.015952380952380954,
  5: 0.01581395348837209,
  13: 0.014901960784313726,
  29: 0.014854368932038835,
  9: 0.014255319148936171,
  12: 0.01,
  18: 0.01,
  19: 0.01,
  23: 0.01,
  25: 0.01},
 1: {6: 0.051666666666666666,
  1: 0.05040404040404041,
  0: 0.05,
  13: 0.042432432432432436,
  26: 0.03857142857142857,
  3: 0.028181818181818183,
  18: 0.027777777777777776,
  16: 0.027543859649122808,
  10: 0.027341040462427746,
  22: 0.024705882352941175,
  24: 0

In [16]:

ranking

{0: {6: 0.051666666666666666,
  1: 0.05040404040404041,
  0: 0.05,
  11: 0.03702702702702703,
  24: 0.03142857142857143,
  3: 0.028181818181818183,
  16: 0.027777777777777776,
  14: 0.027543859649122808,
  8: 0.027341040462427746,
  20: 0.024705882352941175,
  22: 0.023452914798206277,
  10: 0.021235955056179777,
  2: 0.021152416356877323,
  15: 0.020526315789473684,
  27: 0.020256410256410257,
  4: 0.020050251256281407,
  26: 0.01819672131147541,
  21: 0.01735294117647059,
  17: 0.017142857142857144,
  7: 0.016514657980456027,
  28: 0.015952380952380954,
  5: 0.01581395348837209,
  13: 0.014901960784313726,
  29: 0.014854368932038835,
  9: 0.014255319148936171,
  12: 0.01,
  18: 0.01,
  19: 0.01,
  23: 0.01,
  25: 0.01},
 1: {6: 0.051666666666666666,
  1: 0.05040404040404041,
  0: 0.05,
  13: 0.042432432432432436,
  26: 0.03857142857142857,
  3: 0.028181818181818183,
  18: 0.027777777777777776,
  16: 0.027543859649122808,
  10: 0.027341040462427746,
  22: 0.024705882352941175,
  24: 0

In [19]:
# Import the necessary modules
from gensim.summarization.summarizer import summarize, sentences, rank_sentences

# Define the text that you want to summarize
text = "Insert the text that you want to summarize here"

# Use the sentences function to extract the individual sentences
sentences = sentences(text)

# Use the rank_sentences function to assign a score to each sentence
scores = rank_sentences(sentences, summary)

# Print the score for each sentence
for sentence, score in zip(sentences, scores):
    print(f"Sentence: {sentence}")
    print(f"Score: {score}")

ModuleNotFoundError: No module named 'gensim.summarization'

In [18]:
!pip install gensim



In [24]:
g = summarizer.get_graph(documents[0])

In [29]:
g.edges()

[('tradit suggest time donald trump set asid say speak style rise inaugur moment',
  'buck tradit ignor altogeth got donald trump inaugur moment'),
 ('buck tradit ignor altogeth got donald trump inaugur moment',
  'tradit suggest time donald trump set asid say speak style rise inaugur moment'),
 ('tradit suggest time donald trump set asid say speak style rise inaugur moment',
  'trump stand west capitol friday deliv inaugur address side wait come bear unifi messag divid nation decid play persona disrupt establish order'),
 ('trump stand west capitol friday deliv inaugur address side wait come bear unifi messag divid nation decid play persona disrupt establish order',
  'tradit suggest time donald trump set asid say speak style rise inaugur moment'),
 ('tradit suggest time donald trump set asid say speak style rise inaugur moment',
  'trump tend balanc act style content tell launch presid'),
 ('trump tend balanc act style content tell launch presid',
  'tradit suggest time donald trump 

In [37]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [41]:
!pip uninstall pytextrank

Found existing installation: pytextrank 2.1.0
Uninstalling pytextrank-2.1.0:
  Would remove:
    /home/lukas/ml/lib/python3.6/site-packages/pytextrank-2.1.0.dist-info/*
    /home/lukas/ml/lib/python3.6/site-packages/pytextrank/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m


In [39]:


import pytextrank
nlp.add_pipe("textrank", last=True)
doc = nlp(documents[0])

ValueError: [E002] Can't find factory for 'textrank' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, ner, beam_ner, entity_ruler, tagger, morphologizer, senter, sentencizer, textcat, spancat, future_entity_ruler, span_ruler, textcat_multilabel, en.lemmatizer

In [60]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
     |████████████████████████████████| 97 kB 1.2 MB/s            
Collecting pycountry>=18.2.23
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
     |████████████████████████████████| 10.1 MB 644 kB/s            
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting breadability>=0.1.20
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting chardet
  Downloading chardet-5.0.0-py3-none-any.whl (193 kB)
     |████████████████████████████████| 193 kB 8.7 MB/s            
[?25hCollecting lxml>=2.0
  Downloading lxml-4.9.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)
     |████████████████████████████████| 6.4 MB 4.8 MB/s            
Building wheels for collected packages: breadability, 

In [92]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

def text_rank_importance(documents):  
    # Summarize using sumy TextRank
    summarizer = TextRankSummarizer()
    
    versions = len(documents)
    
    ranking = {version:{} for version in range(versions)}
    
    for version in range(versions):
        importance = {}
        
          # Creating text parser using tokenization
        parser = PlaintextParser.from_string(documents[version], Tokenizer("english"))
        ranked = summarizer.rate_sentences(parser.document)
        
        for i, score in enumerate(ranked.values()):
            importance[i] = score
        
        # sort indices by importance
        sorted_importance = {k: v for k, v in sorted(importance.items(), key=lambda item: item[1], 
                                                 reverse=True)}
        
        
        ranking[version] = sorted_importance
        
    return ranking

In [97]:
text_rank_importance(documents)[2]

{2: 0.05557051236740845,
 4: 0.04545005150808341,
 15: 0.04456532930859665,
 11: 0.04249849243123687,
 29: 0.041269843359644,
 25: 0.0391743341358215,
 28: 0.03841579130460456,
 30: 0.0373086443841249,
 32: 0.034960333747662636,
 23: 0.0343488998515863,
 7: 0.034094211310151855,
 31: 0.03386563518247975,
 27: 0.033211295675658,
 26: 0.031937807100405995,
 0: 0.03156691443014241,
 16: 0.03130058070524169,
 13: 0.030556022122641437,
 10: 0.029231831624076263,
 24: 0.02903318780274637,
 19: 0.02850592981580828,
 5: 0.027637006361198417,
 12: 0.02762347378096177,
 18: 0.027579514818786028,
 3: 0.02592389463510852,
 20: 0.02358671362145832,
 9: 0.021646686532713396,
 14: 0.021405218588119865,
 8: 0.020036516077635295,
 17: 0.01979878668006709,
 1: 0.019065102825902045,
 6: 0.014407807046615853,
 22: 0.0138315354710631,
 21: 0.010592079376824987}