In [60]:
# Run if working locally
%load_ext autoreload
%autoreload 2
%load_ext nb_black

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [61]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import torch
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

from src.dataset.dataset import RawData
from src.dataset.wikisection_preprocessing import (
    tokenize,
    clean_sentence,
    preprocess_text_segmentation,
    format_data_for_db_insertion,
)
from src.dataset.utils import truncate_by_token
from db.dbv2 import Table, AugmentedTable, TrainTestTable
import pprint

from utils.metrics import windowdiff, pk

from src.bertkeywords.src.similarities import Embedding, Similarities
from src.bertkeywords.src.keywords import Keywords
from src.encoders.coherence_v2 import Coherence
from src.dataset.utils import flatten, dedupe_list, truncate_string

<IPython.core.display.Javascript object>

In [62]:
dataset_type = "city"
table = Table(dataset_type)
augmented_table = AugmentedTable(dataset_type)
train_test_table = TrainTestTable(dataset_type)

<IPython.core.display.Javascript object>

In [63]:
data = table.get_all()

text_data = [x[1] for x in data]
text_labels = [x[2] for x in data]

<IPython.core.display.Javascript object>

In [64]:
all_segments = table.get_all_segments()
text_segments = [[y[1] for y in x] for x in all_segments]
segments_labels = [[1 if i == 0 else 0 for i, y in enumerate(x)] for x in all_segments]

<IPython.core.display.Javascript object>

In [65]:
all_samples = [item for sublist in all_segments for item in sublist]

<IPython.core.display.Javascript object>

In [66]:
len(all_segments), len(all_samples), (len(all_samples) / len(all_segments))

(13678, 92833, 6.78703026758298)

<IPython.core.display.Javascript object>

In [67]:
samples = 5
max_tokens = 400

for i, (segment, labels) in enumerate(
    zip(text_segments[:samples], segments_labels[:samples])
):
    for sentence, label in zip(segment, labels):
        # this is the training case. During inference, we will have no idea
        # when segments start and when they end.
        pass

<IPython.core.display.Javascript object>

In [68]:
text_labels[:25]

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

<IPython.core.display.Javascript object>

In [69]:
# initialize the coherence library
max_words_per_step = 3
coherence = Coherence(max_words_per_step=max_words_per_step, kb_embeddings=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No sentence-transformers model found with name /Users/amitmaraj/.cache/torch/sentence_transformers/bert-base-uncased. Creating a new one with MEAN pooling.

<IPython.core.display.Javascript object>

In [57]:
def get_weighted_average(weighted_similarities, weights):
    return sum(weighted_similarities) / sum(weights)


# importance testing
def compare_coherent_words(
    coherence_map,
    keywords_current,
    suppress_errors=False,
    same_word_multiplier=2,  # if set to 1, don't amplify the same words found
    no_same_word_penalty=2,  # if set to 1, don't penalize for not finding the same word.
):
    word_comparisons = []
    weights = []
    for i, keywords in enumerate(coherence_map[::-1]):
        for word_tuple in keywords:
            word = word_tuple[0]
            for second_word_tuple in keywords_current:
                second_word = second_word_tuple[0]
                second_word_importance = second_word_tuple[1]

                try:
                    word_one_emb = word_tuple[2]
                    word_two_emb = second_word_tuple[2]

                    if same_word_multiplier > 1:
                        flattened_coherence_words_only = [
                            element[0]
                            for sublist in coherence_map
                            for element in sublist
                        ]

                        num_occurrences = flattened_coherence_words_only.count(
                            second_word
                        )

                        if num_occurrences > 0:
                            # amplify words that are found as duplicates in the coherence map
                            # if the word shows up 1 time, amplify the weight by 2 times
                            weighting_multiplier = flattened_coherence_words_only.count(
                                second_word
                            ) + (same_word_multiplier - 1)
                        else:
                            # no same word penalty
                            weighting_multiplier = (
                                1 / no_same_word_penalty
                            )  # reduce the importance of this word

                    else:
                        weighting_multiplier = 1  # set to 1 in case this is turned off.

                    # this weight is a recipricol function that will grow smaller the further the keywords are away
                    # we want to put more importance on the current words, so we apply twice as much weight.
                    if i == 0:
                        weight = (weighting_multiplier * 2) / (i + 1)
                    else:
                        weight = (weighting_multiplier * 1) / (i + 1)

                    # multiply the weighting factor by the importance of the second word
                    weight *= second_word_importance

                    word_comparisons.append(
                        (
                            word,
                            second_word,
                            weight
                            * coherence.embedding_lib.get_similarity(
                                torch.Tensor(word_one_emb), torch.Tensor(word_two_emb)
                            ),
                        )
                    )
                    weights.append(weight)
                except AssertionError as e:
                    if not suppress_errors:
                        print(e, word, second_word)

    return word_comparisons, weights


# TODO: add weighted average: https://www.google.com/search?q=weighted+average&rlz=1C5CHFA_enCA1019CA1024&sxsrf=APwXEdcb6dhJ5L_mvWvrWr4AxQcxOFB01g:1681098698316&tbm=isch&source=iu&ictx=1&vet=1&fir=V-LTDKtCElo89M%252C2WVwd1NrPkHFOM%252C_%253BVGk_lj0HALhXQM%252C2WVwd1NrPkHFOM%252C_%253ByzfbB4i3SpPTFM%252C5e7an03wLAdfhM%252C_%253B47HYmoDH6WlThM%252CsRXbJWfpyOLEOM%252C_%253BOsB4jtfzenfuyM%252CHKcmLkpfJ3xWqM%252C_&usg=AI4_-kRmBXgUWAm_nR3vDsLT17TqM5AvSQ&sa=X&ved=2ahUKEwi6hvvVtJ7-AhXJkIkEHe4JCX4Q_h16BAgoEAE#imgrc=V-LTDKtCElo89M
def coherence_tester(
    text_data,
    text_labels,
    max_tokens=128,
    max_str_length=30,
    prediction_thresh=0.48,
    coherence_threshold=0.2,
    pruning=1,  # remove one sentence worth of keywords
    pruning_min=6,  # remove the first sentence in the coherence map once it grows passed 6
    dynamic_threshold=False,
    coherence_dump_on_prediction=False,
    threshold_warmup=10,  # number of iterations before using dynamic threshold
    last_n_threshold=5,  # will only consider the last n thresholds for dynamic threshold
    batch_size=10,
):
    coherence_map = []
    predictions = []
    thresholds = []

    prev_sentence = None

    # set up batching
    for batch_num in range(0, len(text_data) // batch_size):
        # create the current batch to iterate over.
        # this method relies on previous sentence as it always keeps track
        curr_batch = text_data[
            batch_num * batch_size : batch_num * batch_size + batch_size
        ]

        curr_batch_labels = text_labels[
            batch_num * batch_size : batch_num * batch_size + batch_size
        ]

        for i, (row, label) in enumerate(zip(curr_batch, curr_batch_labels)):
            threshold = prediction_thresh
            if dynamic_threshold and (i + 1) > threshold_warmup:
                last_n_thresholds = thresholds[(0 - last_n_threshold) :]
                last_n_thresholds.sort()
                mid = len(last_n_thresholds) // 2
                threshold = (last_n_thresholds[mid] + last_n_thresholds[~mid]) / 2
                print(f"median threshold: {threshold}")
            # compare the current sentence to the previous one
            if prev_sentence is None:
                predictions.append(
                    (torch.tensor(0, dtype=torch.int8), 0)
                )  # predict a 0 since it's the start
                print(f"Label: {label}, Prediction: {0}")
                prev_sentence = row
                pass
            else:
                print(f"Sample Number: {i}")

                row = truncate_by_token(row, max_tokens)
                prev_row = truncate_by_token(prev_sentence, max_tokens)

                cohesion, keywords_prev, keywords_current = coherence.get_coherence(
                    [row, prev_row], coherence_threshold=coherence_threshold
                )

                # add the keywords to the coherence map
                coherence_map.append(cohesion)
                if pruning > 0 and len(coherence_map) >= pruning_min:
                    print("pruning...", len(coherence_map))
                    coherence_map = coherence_map[
                        pruning:
                    ]  # remove the pruning amount from the beginning of the list
                    print("done pruning...", len(coherence_map))

                # truncate the strings for printing
                truncated_row = truncate_string(row, max_str_length)
                truncated_prev_row = truncate_string(prev_row, max_str_length)
                print(
                    f"Coherence Map: {[[x[0] for x in c] for c in coherence_map]}, KW Curr: {[x[0] for x in keywords_current]}"
                )

                # compute the word comparisons between the previous (with the coherence map)
                # and the current (possibly the first sentence in a new segment)
                word_comparisons_with_coherence, weights = compare_coherent_words(
                    [*coherence_map, keywords_prev], keywords_current
                )

                similarities_with_coherence = [
                    comparison[2] for comparison in word_comparisons_with_coherence
                ]
                
                weighted_avg_similarity_with_coherence = get_weighted_average(
                    similarities_with_coherence, weights
                )
                print(f"weighted: {weighted_avg_similarity_with_coherence}")

                # if the two sentences are similar, create a cohesive prediction
                # otherwise, predict a new segment
                if weighted_avg_similarity_with_coherence > threshold:
                    print(
                        f"Label: {label}, Prediction: {0}, logit: {weighted_avg_similarity_with_coherence}"
                    )
                    predictions.append((weighted_avg_similarity_with_coherence, 0))
                else:
                    if coherence_dump_on_prediction:
                        # start of a new segment, empty the map
                        coherence_map = []
                    print(
                        f"Label: {label}, Prediction: {1}, logit: {weighted_avg_similarity_with_coherence}"
                    )
                    predictions.append((weighted_avg_similarity_with_coherence, 1))

                thresholds.append(weighted_avg_similarity_with_coherence)
                print("===============================================")

                prev_sentence = row

    return predictions

<IPython.core.display.Javascript object>

In [58]:
start = 100
num_samples = 50
max_tokens = 128  # want to keep this under 512
max_str_length = 30

true_labels = text_labels[start : start + num_samples]

predictions = coherence_tester(
    text_data[start : start + num_samples],
    true_labels,
    max_tokens=max_tokens,
    max_str_length=max_str_length,
)

Label: 0, Prediction: 0
Sample Number: 1
Coherence Map: [['chickasaw', 'ecu', 'ehacoffice', 'legalshield']], KW Curr: ['chickasaw', 'legalshield', 'plasticware', 'wrangler']
weighted: tensor([0.4932])
Label: 0, Prediction: 0, logit: tensor([0.4932])
Sample Number: 2
Coherence Map: [['chickasaw', 'ecu', 'ehacoffice', 'legalshield'], ['glenwood', 'ecu', 'ehacoffice', 'cartography']], KW Curr: ['ecu', 'ehacoffice', 'cartography', 'accreditation']
weighted: tensor([0.5707])
Label: 0, Prediction: 0, logit: tensor([0.5707])
Sample Number: 3
Coherence Map: [['chickasaw', 'ecu', 'ehacoffice', 'legalshield'], ['glenwood', 'ecu', 'ehacoffice', 'cartography'], ['pontotoc', 'glenwood', 'technology', 'area']], KW Curr: ['glenwood', 'schools', 'high', 'secondary']
weighted: tensor([0.6830])
Label: 0, Prediction: 0, logit: tensor([0.6830])
Sample Number: 4
Coherence Map: [['chickasaw', 'ecu', 'ehacoffice', 'legalshield'], ['glenwood', 'ecu', 'ehacoffice', 'cartography'], ['pontotoc', 'glenwood', 'tec

pruning... 6
done pruning... 5
Coherence Map: [['rivadavia', 'borja', 'buoys', 'maciel'], ['puzolanic', 'petroquimica', 'bricklaying', 'borja'], ['puzolanic', 'petroquimica', 'bricklaying', 'rivadavia'], ['universitario', 'ferroviario', 'esgrima', 'nautico'], ['strunkovka', 'zolotonosha', 'universitario', 'ferroviario']], KW Curr: ['universitario', 'ferroviario', 'esgrima', 'nautico']
weighted: tensor([0.4636])
Label: 1, Prediction: 1, logit: tensor([0.4636])
Sample Number: 0
pruning... 6
done pruning... 5
Coherence Map: [['puzolanic', 'petroquimica', 'bricklaying', 'borja'], ['puzolanic', 'petroquimica', 'bricklaying', 'rivadavia'], ['universitario', 'ferroviario', 'esgrima', 'nautico'], ['strunkovka', 'zolotonosha', 'universitario', 'ferroviario'], ['strunkovka', 'zolotonosha', '1576', 'chattanooga']], KW Curr: ['strunkovka', 'zolotonosha', '1576', 'magdeburg']
weighted: tensor([0.5237])
Label: 1, Prediction: 0, logit: tensor([0.5237])
Sample Number: 1
pruning... 6
done pruning... 5


pruning... 6
done pruning... 5
Coherence Map: [['اچمی', 'larestani', 'ajami', 'language'], ['اچمی', 'larestani', 'ajami', 'mcarthur'], ['mcarthur', 'census', 'land', 'city'], ['census', 'land', 'city', 'male'], ['594', '642', 'male', 'census']], KW Curr: ['male', 'census', 'female', 'population']
weighted: tensor([0.7001])
Label: 0, Prediction: 0, logit: tensor([0.7001])
Sample Number: 9
pruning... 6
done pruning... 5
Coherence Map: [['اچمی', 'larestani', 'ajami', 'mcarthur'], ['mcarthur', 'census', 'land', 'city'], ['census', 'land', 'city', 'male'], ['594', '642', 'male', 'census'], ['dryland', 'stateline', 'vansycle', 'canola']], KW Curr: ['594', '642', 'census', 'female']
weighted: tensor([0.6192])
Label: 0, Prediction: 0, logit: tensor([0.6192])
Sample Number: 0
pruning... 6
done pruning... 5
Coherence Map: [['mcarthur', 'census', 'land', 'city'], ['census', 'land', 'city', 'male'], ['594', '642', 'male', 'census'], ['dryland', 'stateline', 'vansycle', 'canola'], ['dryland', 'whea

<IPython.core.display.Javascript object>

In [59]:
print([x[1] for x in predictions])
print(true_labels)

[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]


<IPython.core.display.Javascript object>

In [54]:
pred_string = "".join(str([x[1] for x in predictions]))
true_string = "".join(str(true_labels))

<IPython.core.display.Javascript object>

In [55]:
avg_k = len(true_labels) // (true_labels.count(1) + 1)  # get avg segment size

<IPython.core.display.Javascript object>

In [56]:
wd_score = windowdiff(pred_string, true_string, avg_k)
pk_score = pk(pred_string, true_string, avg_k)

print(f"k = {avg_k}")
print(f"wd = {wd_score}")
print(f"pk = {pk_score}")

k = 5
wd = 0.2876712328767123
pk = 0.273972602739726


<IPython.core.display.Javascript object>

## Prediction Tuning

In [46]:
pred_thresholds = [
    0.2,
    0.21,
    0.22,
    0.23,
    0.24,
    0.25,
    0.26,
    0.27,
    0.28,
    0.29,
    0.3,
]  # bert base uncased
pred_thresholds = [
    0.4,
    0.41,
    0.42,
    0.43,
    0.44,
    0.45,
    0.46,
    0.47,
    0.48,
    0.49,
    0.5,
]  # labse
# pred_thresholds = [
#     0.06,
#     0.07,
#     0.08,
#     0.09,
#     0.1,
#     0.11,
#     0.12,
#     0.13,
#     0.14,
#     0.15,
#     0.16,
#     0.17,
#     0.18,
#     0.19,
#     0.2,
#     0.11,
#     0.06,
# ]  # sentence-transformers
# pred_thresholds = [
#     0.6,
#     0.61,
#     0.62,
#     0.63,
#     0.64,
#     0.65,
#     0.66,
#     0.67,
#     0.68,
#     0.69,
#     0.7,
# ]  # USE
# pred_thresholds = [
#     0.65,
#     0.66,
#     0.67,
#     0.68,
#     0.69,
#     0.7,
#     0.71,
#     0.72,
#     0.73,
#     0.74,
#     0.75,
#     0.76,
#     0.77,
#     0.78,
#     0.79,
#     0.64,
# ]  # Roberta

<IPython.core.display.Javascript object>

In [47]:
for pred_thresh in pred_thresholds:
    modified_predictions = [
        1 if x < pred_thresh else 0 for x in [x[0] for x in predictions]
    ]

    pred_string = "".join(str(modified_predictions))
    true_string = "".join(str(true_labels))

    avg_k = len(true_labels) // (true_labels.count(1) + 1)  # get avg segment size

    wd_score = windowdiff(pred_string, true_string, avg_k)
    pk_score = pk(pred_string, true_string, avg_k)

    print(f"pred_thresh = {pred_thresh}")
    print(f"k = {avg_k}")
    print(f"wd = {wd_score}")
    print(f"pk = {pk_score}")
    print("===========================================")

pred_thresh = 0.4
k = 5
wd = 0.3082191780821918
pk = 0.3082191780821918
pred_thresh = 0.41
k = 5
wd = 0.3082191780821918
pk = 0.3082191780821918
pred_thresh = 0.42
k = 5
wd = 0.3082191780821918
pk = 0.3082191780821918
pred_thresh = 0.43
k = 5
wd = 0.3082191780821918
pk = 0.3082191780821918
pred_thresh = 0.44
k = 5
wd = 0.3082191780821918
pk = 0.3082191780821918
pred_thresh = 0.45
k = 5
wd = 0.3150684931506849
pk = 0.3150684931506849
pred_thresh = 0.46
k = 5
wd = 0.3150684931506849
pk = 0.3150684931506849
pred_thresh = 0.47
k = 5
wd = 0.2945205479452055
pk = 0.2808219178082192
pred_thresh = 0.48
k = 5
wd = 0.2945205479452055
pk = 0.2808219178082192
pred_thresh = 0.49
k = 5
wd = 0.3698630136986301
pk = 0.3424657534246575
pred_thresh = 0.5
k = 5
wd = 0.4315068493150685
pk = 0.3904109589041096


<IPython.core.display.Javascript object>

In [107]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

print(pred_string)
print(true_string)

tn, fp, fn, tp = confusion_matrix(true_labels, modified_predictions).ravel()
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels, modified_predictions, average="macro"
)

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

<IPython.core.display.Javascript object>

In [108]:
wd_score = windowdiff(pred_string, true_string, avg_k)
pk_score = pk(pred_string, true_string, avg_k)

print(f"k = {avg_k}")
print(f"wd = {wd_score}")
print(f"pk = {pk_score}")
print(f"tn = {tn}")
print(f"fp = {fp}")
print(f"fn = {fn}")
print(f"tp = {tp}")
print(f"precision = {precision}")
print(f"recall = {recall}")
print(f"f1 = {f1}")

k = 6
wd = 0.3076751946607342
pk = 0.30567296996662957
tn = 1203
fp = 55
fn = 215
tp = 27
precision = 0.5888231449310262
recall = 0.533925028577435
f1 = 0.5328849028400597


<IPython.core.display.Javascript object>

## KeyBERT Embedding Comparison

In [172]:
curr = 230
prev = curr - 1

<IPython.core.display.Javascript object>

In [None]:
# initialize the keywords and embeddings library
pp = pprint.PrettyPrinter(indent=4)
similarities_lib = Similarities("bert-base-uncased")
keywords_lib = Keywords(similarities_lib.model, similarities_lib.tokenizer)
embedding_lib = Embedding(similarities_lib.model, similarities_lib.tokenizer)

In [205]:
cohesion = coherence.get_coherence(
    [text_data[curr], text_data[prev]], coherence_threshold=0.25
)
print([k[0] for k in cohesion])

Got the keywords in 0.6567 seconds
Got the embeddings and comparisons in 0.0007 seconds
['cantonese', 'languages', 'vietnamese', 'communes']


<IPython.core.display.Javascript object>

In [206]:
# get the keywords for the current sentences
keywords_current = keywords_lib.get_keywords_with_kb_embeddings(text_data[curr])
keywords_prev = keywords_lib.get_keywords_with_kb_embeddings(text_data[prev])

# compute the word comparisons between the previous (with the coherence map)
# and the current (possibly the first sentence in a new segment)
word_comparisons_with_coherence, weights = compare_coherent_words(
    [keywords_prev], keywords_current
)

<IPython.core.display.Javascript object>

In [207]:
[(x[0], x[1]) for x in keywords_current], [(x[0], x[1]) for x in keywords_prev]

([('township', 0.2304),
  ('communes', 0.1857),
  ('hải', 0.1399),
  ('wards', 0.1397),
  ('đông', 0.1224)],
 [('cantonese', 0.5038),
  ('mandarin', 0.464),
  ('languages', 0.3483),
  ('language', 0.343),
  ('vietnamese', 0.3184)])

<IPython.core.display.Javascript object>

# KeyBERT Embedding Testing

In [679]:
docs = [
    "Hi my name is Devarsh",
    "Devarsh likes to play Basketball.",
    "I love to watch Cricket.",
    "I am a strong programmer. And my name is Devarsh",
]

<IPython.core.display.Javascript object>

In [680]:
from keybert import KeyBERT

kw_model = KeyBERT()
doc_embeddings, word_embeddings = kw_model.extract_embeddings(
    docs, min_df=1, stop_words="english"
)
keywords = kw_model.extract_keywords(
    docs,
    min_df=1,
    stop_words="english",
    doc_embeddings=doc_embeddings,
    word_embeddings=word_embeddings,
)

<IPython.core.display.Javascript object>

In [681]:
len(doc_embeddings)

4

<IPython.core.display.Javascript object>

In [682]:
len(word_embeddings)

10

<IPython.core.display.Javascript object>

In [683]:
keywords

[[('devarsh', 0.6267), ('hi', 0.5216)],
 [('devarsh', 0.6549),
  ('basketball', 0.5558),
  ('play', 0.3787),
  ('likes', 0.2284)],
 [('cricket', 0.7118), ('watch', 0.3656), ('love', 0.307)],
 [('programmer', 0.5942), ('devarsh', 0.5528), ('strong', 0.3452)]]

<IPython.core.display.Javascript object>

In [701]:
kw_model = KeyBERT()
import torch


def get_keywords_with_embeddings_test(
    data,
) -> list[tuple[str, float, torch.Tensor]]:
    doc_embeddings, word_embeddings = kw_model.extract_embeddings(data)

    keywords = kw_model.extract_keywords(
        data, doc_embeddings=doc_embeddings, word_embeddings=word_embeddings
    )

    keywords_with_embeddings = []
    count = 0
    print(len(word_embeddings))
    for i, (kw, we) in enumerate(zip(keywords, word_embeddings)):
        for j, words in enumerate(kw):
            keywords_with_embeddings.append((words[0], words[1], torch.tensor(we)))
            count += 1

    return keywords_with_embeddings

<IPython.core.display.Javascript object>

In [702]:
embeddings = get_keywords_with_embeddings_test(docs)

10


<IPython.core.display.Javascript object>

In [703]:
len(embeddings)

12

<IPython.core.display.Javascript object>