In [1]:
import sqlite3
import pandas as pd
from news_processing import *
from keyword_extraction import *
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [2]:
# Database handles shared by the cells below:
#   conn       -> ap-matched-sentences.db, passed to create_data()
#   connection -> bbc.db, passed to get_url()
conn = sqlite3.connect('ap-matched-sentences.db')
connection = sqlite3.connect('bbc.db')

# Allow up to 500 characters per column when pandas renders text.
pd.set_option("display.max_colwidth", 500)

In [3]:
# Identifier of the article whose data is loaded in the next cell.
article_id = 2
# Look up that id's URL through the bbc.db connection. As the last expression
# of the cell, the result (a pandas Series named `url`) is displayed below.
get_url(article_id, connection)

0    http://www.bbc.co.uk/news/uk-38659120
Name: url, dtype: object

In [4]:
# Load the data for the selected article from ap-matched-sentences.db (`conn`).
data = create_data(article_id, conn)
# Version identifiers found in `data`; shown below as a set of floats.
versions = get_versions(data)
print(f"Available versions: {versions}\n")
# Texts handed to the keyword extractors.
# Presumably one document per version — TODO confirm in news_processing.
documents = get_documents(data)
# NOTE(review): this looks up id 1 in bbc.db although `article_id` is 2 and
# `data` comes from the other database. The URL returned for id 1 is a
# US/Canada story, which fits the inauguration keywords extracted below, so
# the two databases may number articles differently. Confirm before replacing
# the literal with `article_id`.
get_url(1, connection)
# Disabled debug aid: print the text of version 0.
#print(f"Article version 0: {documents[0]}")

Available versions: {0.0, 1.0, 2.0, 3.0}



0    http://www.bbc.co.uk/news/world-us-canada-38659068
Name: url, dtype: object

## Available Extraction Methods in PKE
The following unsupervised extractors are available under `pke.unsupervised`:

- `FirstPhrases`
- `KPMiner`
- `MultipartiteRank`
- `PositionRank`
- `SingleRank`
- `TextRank`
- `TfIdf`
- `TopicRank`
- `TopicalPageRank`
- `YAKE`

In [5]:
#keywords_pke = extract(documents, extractor=pke.unsupervised.YAKE) # pke, version

## Visualize Keywords using YAKE!

In [6]:
#print_keywords(keywords_pke)

In [7]:
#keyword_summary(keywords_pke)

## KeyBERT Extraction

In [8]:
# Extract keywords from every document with the project's `keybert` helper.
# NOTE(review): the keyword arguments look like KeyBERT's MMR settings
# (diversity in [0, 1]) and an n-gram range of unigrams to bigrams. Confirm
# how keyword_extraction.keybert forwards them.
keywords_bert = keybert(
    documents,
    use_mmr=True,
    diversity=0.7,
    ngram_range=(1, 2),
)

## Visualize Keywords using KeyBERT

In [9]:
# Print the extracted (keyword, score) pairs, grouped by article version.
print_keywords(keywords_bert)

Version: 0
('trump tone', 0.58)
('bucking tradition', 0.3895)
('inaugural address', 0.2492)
('shut down', 0.1668)
('coming due', 0.1406)


Version: 1
('trump tone', 0.58)
('bucking tradition', 0.3895)
('inaugural address', 0.2492)
('shut down', 0.1668)
('wednesday estimating', 0.018)


Version: 2
('trump tone', 0.5721)
('bucking tradition', 0.3863)
('first inaugural', 0.3272)
('shut down', 0.16)
('wednesday estimating', 0.022)


Version: 3
('bless america', 0.5844)
('justice roberts', 0.3665)
('form new', 0.1844)
('decay we', 0.1752)
('transferring power', 0.1656)




In [10]:
# Compare the keyword sets of each pair of versions. Per the output below, it
# lists keywords found only in one version and those shared, with their scores.
keyword_summary(keywords_bert)

Comparision between keywords of version 0 and version 1
Only in version 0: ['coming due']
In Both versions:
Version 0: [('trump tone', 0.58), ('bucking tradition', 0.3895), ('inaugural address', 0.2492), ('shut down', 0.1668)]
Version 1: [('trump tone', 0.58), ('bucking tradition', 0.3895), ('inaugural address', 0.2492), ('shut down', 0.1668)]
Only in version 1: ['wednesday estimating']


Comparision between keywords of version 0 and version 2
Only in version 0: ['coming due', 'inaugural address']
In Both versions:
Version 0: [('trump tone', 0.58), ('bucking tradition', 0.3895), ('shut down', 0.1668)]
Version 2: [('trump tone', 0.5721), ('bucking tradition', 0.3863), ('shut down', 0.16)]
Only in version 2: ['first inaugural', 'wednesday estimating']


Comparision between keywords of version 0 and version 3
Only in version 0: ['shut down', 'coming due', 'inaugural address', 'trump tone', 'bucking tradition']
In Both versions:
Version 0: []
Version 3: []
Only in version 3: ['form new', '

## Use BERT Embeddings to compare Keywords

In [11]:
# Build embeddings for the extracted keywords; the next cell uses them to
# compute cosine similarities.
# NOTE(review): presumably backed by the SentenceTransformer imported at the
# top of the notebook — confirm in keyword_extraction.
bert_embeddings = create_embeddings(keywords_bert)

In [12]:
# Show keyword-to-keyword cosine similarities. Judging by the output, there is
# one table per pair of versions, with the earlier version's keywords as rows
# and the later version's as columns.
show_cosine_similarities(keywords_bert, bert_embeddings)

Unnamed: 0,trump tone,bucking tradition,inaugural address,shut down,wednesday estimating
trump tone,1.0,0.565806,0.513735,0.454375,0.328035
bucking tradition,0.565806,1.0,0.571679,0.490045,0.353853
inaugural address,0.513735,0.571679,1.0,0.30358,0.480078
shut down,0.454375,0.490045,0.30358,1.0,0.424268
coming due,0.563582,0.690563,0.724129,0.517427,0.591712


Unnamed: 0,trump tone,bucking tradition,first inaugural,shut down,wednesday estimating
trump tone,1.0,0.565806,0.360087,0.454375,0.328035
bucking tradition,0.565806,1.0,0.523467,0.490045,0.353853
inaugural address,0.513735,0.571679,0.760976,0.30358,0.480078
shut down,0.454375,0.490045,0.230974,1.0,0.424268
coming due,0.563582,0.690563,0.579163,0.517427,0.591712


Unnamed: 0,bless america,justice roberts,form new,decay we,transferring power
trump tone,0.665505,0.509182,0.445967,0.517913,0.532147
bucking tradition,0.470234,0.532236,0.500318,0.610615,0.687522
inaugural address,0.544169,0.531338,0.629482,0.419439,0.52159
shut down,0.351418,0.44664,0.417822,0.82773,0.556115
coming due,0.552774,0.525605,0.717332,0.65985,0.709775


Unnamed: 0,trump tone,bucking tradition,first inaugural,shut down,wednesday estimating
trump tone,1.0,0.565806,0.360087,0.454375,0.328035
bucking tradition,0.565806,1.0,0.523467,0.490045,0.353853
inaugural address,0.513735,0.571679,0.760976,0.30358,0.480078
shut down,0.454375,0.490045,0.230974,1.0,0.424268
wednesday estimating,0.328035,0.353853,0.395322,0.424268,1.0


Unnamed: 0,bless america,justice roberts,form new,decay we,transferring power
trump tone,0.665505,0.509182,0.445967,0.517913,0.532147
bucking tradition,0.470234,0.532236,0.500318,0.610615,0.687522
inaugural address,0.544169,0.531338,0.629482,0.419439,0.52159
shut down,0.351418,0.44664,0.417822,0.82773,0.556115
wednesday estimating,0.331872,0.331575,0.547308,0.493992,0.379075


Unnamed: 0,bless america,justice roberts,form new,decay we,transferring power
trump tone,0.665505,0.509182,0.445967,0.517913,0.532147
bucking tradition,0.470234,0.532236,0.500318,0.610615,0.687522
first inaugural,0.429571,0.345621,0.639268,0.355252,0.402888
shut down,0.351418,0.44664,0.417822,0.82773,0.556115
wednesday estimating,0.331872,0.331575,0.547308,0.493992,0.379075
