In [1]:
# Run if working locally
%load_ext autoreload
%autoreload 2
%load_ext nb_black

<IPython.core.display.Javascript object>

In [4]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

from src.dataset.dataset import RawData
from src.dataset.wikisection_preprocessing import (
    tokenize,
    clean_sentence,
    preprocess_text_segmentation,
    format_data_for_db_insertion,
)
from src.dataset.utils import truncate_by_token
from db.dbv2 import Table, AugmentedTable, TrainTestTable
import pprint

from utils.metrics import windowdiff, pk

from src.bertkeywords.src.similarities import Embedding, Similarities
from src.bertkeywords.src.keywords import Keywords
from src.encoders.coherence_v2 import Coherence
from src.dataset.utils import flatten, dedupe_list, truncate_string

<IPython.core.display.Javascript object>

In [5]:
# initialize the coherence library
max_words_per_step = 3
coherence = Coherence(max_words_per_step=max_words_per_step)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No sentence-transformers model found with name /Users/amitmaraj/.cache/torch/sentence_transformers/bert-base-uncased. Creating a new one with MEAN pooling.

<IPython.core.display.Javascript object>

In [6]:
# initialize the keywords and embeddings library
pp = pprint.PrettyPrinter(indent=4)
similarities_lib = Similarities("bert-base-uncased")
keywords_lib = Keywords(similarities_lib.model, similarities_lib.tokenizer)
embedding_lib = Embedding(similarities_lib.model, similarities_lib.tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No sentence-transformers model found with name /Users/amitmaraj/.cache/torch/sentence_transformers/bert-base-uncased. Creating a new one with MEAN pooling.

<IPython.core.display.Javascript object>

In [7]:
dataset_type = "city"
table = Table(dataset_type)
augmented_table = AugmentedTable(dataset_type)
train_test_table = TrainTestTable(dataset_type)

<IPython.core.display.Javascript object>

In [8]:
data = table.get_all()

text_data = [x[1] for x in data]
text_labels = [x[2] for x in data]

<IPython.core.display.Javascript object>

In [12]:
text_labels[50:70], text_data[50:70]

([0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 ['The remains of an ancient village prove that the town was used for hydrotherapy in ancient times. There are nine hydrothermal sources. There are plenty of mineral springs.\n',
  'The Banya Palace summerhouse of Boris III with its picturesque yard-garden, called by the locals “The Palace,” is in the town of Banya. In 1927 Tzar Boris III took a cure for rheumatism in the country house of the manufacturer I. Bagarov. Pleased at his stay, he decided to build up an estate. It was in a courtyard with luxurious verdure and was finished in 1929.\n',
  'Harvard was founded in 1871 when the railroad was extended to that point. It was named after Harvard University, in Massachusetts.\n',
  'Harvard is located at (40.620276, -98.096554).\nAccording to the United States Census Bureau, the city has a total area of , all of it land.\n',
  'As of the census of 2010, there were 1,013 people, 372 households, and 248 families residing in t

<IPython.core.display.Javascript object>

In [9]:
all_segments = table.get_all_segments()
text_segments = [[y[1] for y in x] for x in all_segments]
segments_labels = [[1 if i == 0 else 0 for i, y in enumerate(x)] for x in all_segments]

<IPython.core.display.Javascript object>

In [8]:
samples = 5
max_tokens = 400

for i, (segment, labels) in enumerate(
    zip(text_segments[:samples], segments_labels[:samples])
):
    for sentence, label in zip(segment, labels):
        # this is the training case. During inference, we will have no idea
        # when segments start and when they end.
        pass

<IPython.core.display.Javascript object>

In [9]:
text_labels[:25]

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

<IPython.core.display.Javascript object>

In [769]:
pruning = 0  # remove the lowest n important words from coherence map
pruning_min = 10  # only prune after n words in the coherence map


def get_weighted_average(weighted_similarities, weights):
    return sum(weighted_similarities) / sum(weights)


# importance testing
def compare_coherent_words(coherence_map, keywords_current, suppress_errors=False):
    word_comparisons = []
    weights = []
    for i, keywords in enumerate(coherence_map[::-1]):
        for word_tuple in keywords:
            word = word_tuple[0]
            for second_word_tuple in keywords_current:
                second_word = second_word_tuple[0]

                try:
                    word_one_emb = word_tuple[2]
                    word_two_emb = second_word_tuple[2]

                    # this weight is a recipricol function that will grow smaller the further the keywords are away
                    # we want to put more importance on the current words, so we apply twice as much weight.
                    if i == 0:
                        weight = 2 / (i + 1)
                    else:
                        weight = 1 / (i + 1)

                    word_comparisons.append(
                        (
                            word,
                            second_word,
                            weight
                            * embedding_lib.get_similarity(word_one_emb, word_two_emb),
                        )
                    )
                    weights.append(weight)
                except AssertionError as e:
                    if not suppress_errors:
                        print(e, word, second_word)

    return word_comparisons, weights


# TODO: add weighted average: https://www.google.com/search?q=weighted+average&rlz=1C5CHFA_enCA1019CA1024&sxsrf=APwXEdcb6dhJ5L_mvWvrWr4AxQcxOFB01g:1681098698316&tbm=isch&source=iu&ictx=1&vet=1&fir=V-LTDKtCElo89M%252C2WVwd1NrPkHFOM%252C_%253BVGk_lj0HALhXQM%252C2WVwd1NrPkHFOM%252C_%253ByzfbB4i3SpPTFM%252C5e7an03wLAdfhM%252C_%253B47HYmoDH6WlThM%252CsRXbJWfpyOLEOM%252C_%253BOsB4jtfzenfuyM%252CHKcmLkpfJ3xWqM%252C_&usg=AI4_-kRmBXgUWAm_nR3vDsLT17TqM5AvSQ&sa=X&ved=2ahUKEwi6hvvVtJ7-AhXJkIkEHe4JCX4Q_h16BAgoEAE#imgrc=V-LTDKtCElo89M
def coherence_tester(
    text_data, text_labels, max_tokens=400, max_str_length=30, prediction_thresh=0.35
):
    coherence_map = []
    predictions = []
    for i, (row, label) in enumerate(zip(text_data, text_labels)):
        # compare the current sentence to the previous one
        if i == 0:
            predictions.append((0, 0))
        else:
            prev_row = text_data[i - 1]

            row = truncate_by_token(row, max_tokens)
            prev_row = truncate_by_token(prev_row, max_tokens)

            cohesion, keywords_prev, keywords_current = coherence.get_coherence(
                [row, prev_row], coherence_threshold=0.2
            )

            # add the keywords to the coherence map
            coherence_map.append(cohesion)
            if pruning > 0 and len(coherence_map) >= pruning_min:
                print("pruning...", len(coherence_map))
                sorted_map = sorted(
                    coherence_map, key=lambda tup: tup[1]
                )  # sort asc by importance based on keybert
                coherence_map = sorted_map[pruning:][
                    ::-1
                ]  # get the last n - pruning values and reverse the list
                print("done pruning...", len(coherence_map))

            # truncate the strings for printing
            truncated_row = truncate_string(row, max_str_length)
            truncated_prev_row = truncate_string(prev_row, max_str_length)
            print(
                f"Coherence Map: {[[x[0] for x in c] for c in coherence_map]}, KW Curr: {[x[0] for x in keywords_current]}"
            )

            # compute the word comparisons between the previous (with the coherence map)
            # and the current (possibly the first sentence in a new segment)
            word_comparisons_with_coherence, weights = compare_coherent_words(
                [*coherence_map, keywords_prev], keywords_current
            )

            similarities_with_coherence = [
                comparison[2] for comparison in word_comparisons_with_coherence
            ]
            avg_similarity_with_coherence = sum(similarities_with_coherence) / (
                len(similarities_with_coherence) or 1
            )
            weighted_avg_similarity_with_coherence = get_weighted_average(
                similarities_with_coherence, weights
            )
            print(f"weighted: {weighted_avg_similarity_with_coherence}")

            # if the two sentences are similar, create a cohesive prediction
            # otherwise, predict a new segment
            if weighted_avg_similarity_with_coherence > prediction_thresh:
                print(
                    f"Label: {label}, Prediction: {0}, logit: {weighted_avg_similarity_with_coherence}"
                )
                predictions.append((weighted_avg_similarity_with_coherence, 0))
            else:
                # start of a new segment, empty the map
                coherence_map = []
                print(
                    f"Label: {label}, Prediction: {1}, logit: {weighted_avg_similarity_with_coherence}"
                )
                predictions.append((weighted_avg_similarity_with_coherence, 1))

            print("===============================================")

    return predictions

<IPython.core.display.Javascript object>

In [770]:
start = 75
num_samples = 50
max_tokens = 256  # want to keep this under 512
max_str_length = 30

true_labels = text_labels[start : start + num_samples]

predictions = coherence_tester(
    text_data[start : start + num_samples],
    true_labels,
    max_tokens=max_tokens,
    max_str_length=max_str_length,
)

['1963', 'suva', 'sports']
['fiji', 'broadcasting', 'television']
Got the keywords in 0.6738 seconds
Got the embeddings and comparisons in 0.0006 seconds
Coherence Map: [['fiji', 'suva', 'broadcasting']], KW Curr: ['1963', 'suva', 'sports']
weighted: tensor([0.3807])
Label: 0, Prediction: 0, logit: tensor([0.3807])
['fiji', 'broadcasting', 'television']
['suva', 'tv', 'shows']
Got the keywords in 1.0819 seconds
Got the embeddings and comparisons in 0.0006 seconds
Coherence Map: [['fiji', 'suva', 'broadcasting'], ['suva', 'fiji', 'tv']], KW Curr: ['fiji', 'broadcasting', 'television']
weighted: tensor([0.3865])
Label: 0, Prediction: 0, logit: tensor([0.3865])
['suva', 'tv', 'shows']
['nausori', 'ships', 'nadi']
Got the keywords in 1.0789 seconds
Got the embeddings and comparisons in 0.0007 seconds
Coherence Map: [['fiji', 'suva', 'broadcasting'], ['suva', 'fiji', 'tv'], ['nausori', 'suva', 'ships']], KW Curr: ['suva', 'tv', 'shows']
weighted: tensor([0.5235])
Label: 0, Prediction: 0, lo

['total', 'census', 'city']
['population', 'residents', 'racial']
Got the keywords in 0.3228 seconds
Got the embeddings and comparisons in 0.0006 seconds
Coherence Map: [['fiji', 'suva', 'broadcasting'], ['suva', 'fiji', 'tv'], ['nausori', 'suva', 'ships'], ['civoniceva', 'nausori', 'suva'], ['kashiwara', 'civoniceva', 'yamato'], ['kashiwara', 'kashiwara', 'kashiwara'], ['havana', 'kashiwara', 'havana'], ['marvinville', 'havana', 'greenville'], ['population', 'marvinville', 'residing'], ['alvarado', 'population', 'called'], ['total', 'alvarado', 'census'], ['population', 'total', 'residents'], ['population', 'population', 'households'], [], [], ['population', 'total', 'residents']], KW Curr: ['total', 'census', 'city']
weighted: tensor([0.3561])
Label: 0, Prediction: 0, logit: tensor([0.3561])
['population', 'residents', 'racial']
['population', 'householder', 'households']
Got the keywords in 0.9618 seconds
Got the embeddings and comparisons in 0.0006 seconds
Coherence Map: [['fiji', 

['2006', 'crime', 'guilt']
['1885', '1889', 'water']
Got the keywords in 1.1601 seconds
Got the embeddings and comparisons in 0.0011 seconds
Coherence Map: [['oklahoma', 'historic', 'dallas'], ['racial', 'oklahoma', 'population'], ['city', 'racial', 'town'], ['university', 'city', 'institution'], ['schools', 'university', 'school'], ['pontotoc', 'schools', 'tech'], ['crime', 'pontotoc', 'guilt'], ['water', 'crime', 'water']], KW Curr: ['2006', 'crime', 'guilt']
weighted: tensor([0.4488])
Label: 1, Prediction: 0, logit: tensor([0.4488])
['1885', '1889', 'water']
['1907', '1903', '1922']
Got the keywords in 1.2390 seconds
Got the embeddings and comparisons in 0.0000 seconds
Coherence Map: [['oklahoma', 'historic', 'dallas'], ['racial', 'oklahoma', 'population'], ['city', 'racial', 'town'], ['university', 'city', 'institution'], ['schools', 'university', 'school'], ['pontotoc', 'schools', 'tech'], ['crime', 'pontotoc', 'guilt'], ['water', 'crime', 'water'], []], KW Curr: ['1885', '1889', 

['1878', '1894', 'nashville']
['hohenwald', '5520', '5479']
Got the keywords in 0.5714 seconds
Got the embeddings and comparisons in 0.0002 seconds
Coherence Map: [[], []], KW Curr: ['1878', '1894', 'nashville']
weighted: tensor([0.3154])
Label: 0, Prediction: 1, logit: tensor([0.3154])
['hohenwald', '5520', '5479']
['population', 'racial', 'households']
Got the keywords in 0.4993 seconds
Got the embeddings and comparisons in 0.0101 seconds
Coherence Map: [['population', 'hohenwald', 'racial']], KW Curr: ['hohenwald', '5520', '5479']
weighted: tensor([0.3703])
Label: 0, Prediction: 0, logit: tensor([0.3703])
['population', 'racial', 'households']
['279574', 'water', 'states']
Got the keywords in 0.8482 seconds
Got the embeddings and comparisons in 0.0008 seconds
Coherence Map: [['population', 'hohenwald', 'racial'], ['water', 'population', 'states']], KW Curr: ['population', 'racial', 'households']
weighted: tensor([0.4863])
Label: 1, Prediction: 0, logit: tensor([0.4863])
['279574', '

<IPython.core.display.Javascript object>

In [765]:
print([x[1] for x in predictions])
print(true_labels)

[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]
[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0]


<IPython.core.display.Javascript object>

In [766]:
pred_string = "".join(str([x[1] for x in predictions]))
true_string = "".join(str(true_labels))

<IPython.core.display.Javascript object>

In [767]:
avg_k = len(true_labels) // (true_labels.count(1) + 1)  # get avg segment size

<IPython.core.display.Javascript object>

In [768]:
wd_score = windowdiff(pred_string, true_string, avg_k)
pk_score = pk(pred_string, true_string, avg_k)

print(f"k = {avg_k}")
print(f"wd = {wd_score}")
print(f"pk = {pk_score}")

k = 4
wd = 0.3741496598639456
pk = 0.3673469387755102


<IPython.core.display.Javascript object>

## Prediction Tuning

In [755]:
pred_thresh = 0.33

<IPython.core.display.Javascript object>

In [756]:
modified_predictions = [
    1 if x < pred_thresh else 0 for x in [x[0] for x in predictions]
]

pred_string = "".join(str(modified_predictions))
true_string = "".join(str(true_labels))

avg_k = len(true_labels) // (true_labels.count(1) + 1)  # get avg segment size

<IPython.core.display.Javascript object>

In [757]:
print(pred_string)
print(true_string)

[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0]


<IPython.core.display.Javascript object>

In [758]:
wd_score = windowdiff(pred_string, true_string, avg_k)
pk_score = pk(pred_string, true_string, avg_k)

print(f"k = {avg_k}")
print(f"wd = {wd_score}")
print(f"pk = {pk_score}")

k = 4
wd = 0.36054421768707484
pk = 0.35374149659863946


<IPython.core.display.Javascript object>

## KeyBERT Embedding Comparison

In [172]:
curr = 230
prev = curr - 1

<IPython.core.display.Javascript object>

In [205]:
cohesion = coherence.get_coherence(
    [text_data[curr], text_data[prev]], coherence_threshold=0.25
)
print([k[0] for k in cohesion])

Got the keywords in 0.6567 seconds
Got the embeddings and comparisons in 0.0007 seconds
['cantonese', 'languages', 'vietnamese', 'communes']


<IPython.core.display.Javascript object>

In [206]:
# get the keywords for the current sentences
keywords_current = keywords_lib.get_keywords_with_kb_embeddings(text_data[curr])
keywords_prev = keywords_lib.get_keywords_with_kb_embeddings(text_data[prev])

# compute the word comparisons between the previous (with the coherence map)
# and the current (possibly the first sentence in a new segment)
word_comparisons_with_coherence, weights = compare_coherent_words(
    [keywords_prev], keywords_current
)

<IPython.core.display.Javascript object>

In [207]:
[(x[0], x[1]) for x in keywords_current], [(x[0], x[1]) for x in keywords_prev]

([('township', 0.2304),
  ('communes', 0.1857),
  ('hải', 0.1399),
  ('wards', 0.1397),
  ('đông', 0.1224)],
 [('cantonese', 0.5038),
  ('mandarin', 0.464),
  ('languages', 0.3483),
  ('language', 0.343),
  ('vietnamese', 0.3184)])

<IPython.core.display.Javascript object>

# KeyBERT Embedding Testing

In [679]:
docs = [
    "Hi my name is Devarsh",
    "Devarsh likes to play Basketball.",
    "I love to watch Cricket.",
    "I am a strong programmer. And my name is Devarsh",
]

<IPython.core.display.Javascript object>

In [680]:
from keybert import KeyBERT

kw_model = KeyBERT()
doc_embeddings, word_embeddings = kw_model.extract_embeddings(
    docs, min_df=1, stop_words="english"
)
keywords = kw_model.extract_keywords(
    docs,
    min_df=1,
    stop_words="english",
    doc_embeddings=doc_embeddings,
    word_embeddings=word_embeddings,
)

<IPython.core.display.Javascript object>

In [681]:
len(doc_embeddings)

4

<IPython.core.display.Javascript object>

In [682]:
len(word_embeddings)

10

<IPython.core.display.Javascript object>

In [683]:
keywords

[[('devarsh', 0.6267), ('hi', 0.5216)],
 [('devarsh', 0.6549),
  ('basketball', 0.5558),
  ('play', 0.3787),
  ('likes', 0.2284)],
 [('cricket', 0.7118), ('watch', 0.3656), ('love', 0.307)],
 [('programmer', 0.5942), ('devarsh', 0.5528), ('strong', 0.3452)]]

<IPython.core.display.Javascript object>

In [701]:
kw_model = KeyBERT()
import torch


def get_keywords_with_embeddings_test(
    data,
) -> list[tuple[str, float, torch.Tensor]]:
    doc_embeddings, word_embeddings = kw_model.extract_embeddings(data)

    keywords = kw_model.extract_keywords(
        data, doc_embeddings=doc_embeddings, word_embeddings=word_embeddings
    )

    keywords_with_embeddings = []
    count = 0
    print(len(word_embeddings))
    for i, (kw, we) in enumerate(zip(keywords, word_embeddings)):
        for j, words in enumerate(kw):
            keywords_with_embeddings.append((words[0], words[1], torch.tensor(we)))
            count += 1

    return keywords_with_embeddings

<IPython.core.display.Javascript object>

In [702]:
embeddings = get_keywords_with_embeddings_test(docs)

10


<IPython.core.display.Javascript object>

In [703]:
len(embeddings)

12

<IPython.core.display.Javascript object>