In [1]:
# Run if working locally
# autoreload mode 2 re-imports edited modules before each cell executes, so code
# changes are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [11]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

from src.dataset.dataset import RawData
from src.dataset.wikisection_preprocessing import (
    tokenize,
    clean_sentence,
    preprocess_text_segmentation,
    format_data_for_db_insertion,
)

from src.dataset.utils import truncate_by_token, flatten, dedupe_list, truncate_string
from src.bertkeywords.src.similarities import Embedding, Similarities
from src.bertkeywords.src.keywords import Keywords, Keyword
from src.coherencegraph.coherence_graph import CoherenceNode, CoherenceGraph
from db.dbv2 import Table, AugmentedTable, TrainTestTable
import pprint


from utils.metrics import windowdiff, pk
import networkx as nx
import torch
import numpy as np

In [17]:
# --- Dataset selection and evaluation window ---------------------------------
dataset_type = "city"
table = Table(dataset_type)

num_sentences = 100       # number of consecutive entries to evaluate
offset = 300              # index of the first evaluated entry in the flattened data
max_segment_length = 99   # cap on entries kept from any single segment

all_segments = table.get_all_segments()

# Field 1 of every row is used as the text; each segment is capped in length.
segments = [
    [row[1] for row in segment][:max_segment_length] for segment in all_segments
]
# Boundary labels: 1 for the first entry of a segment, 0 for every other entry.
segments_labels = [
    [int(position == 0) for position, _ in enumerate(segment)][:max_segment_length]
    for segment in all_segments
]

flattened_segments = flatten(segments)
flattened_labels = flatten(segments_labels)

# Cut the same window from texts and labels so the two stay aligned.
test_window = slice(offset, offset + num_sentences)
segments_to_test = flattened_segments[test_window]
labels_to_test = flattened_labels[test_window]

Using dataset: wikisection_city


In [18]:
# Initial coherence graph; each prediction cell further down rebinds G to a fresh graph.
G = CoherenceGraph(coherence_threshold=0.6)

### Keywords

In [19]:
# Checkpoint name handed to the Similarities helper.
model_string = "sentence-transformers/LaBSE"
similarities_lib = Similarities(model_string)

# The keyword and embedding helpers reuse the model/tokenizer held by Similarities.
shared_model = similarities_lib.model
shared_tokenizer = similarities_lib.tokenizer
keywords_lib = Keywords(shared_model, shared_tokenizer)
embedding_lib = Embedding(shared_model, shared_tokenizer)

# Keyword-extraction settings consumed by the keyword cell.
max_words_per_step = 4    # keywords kept per sentence
keyword_diversity = 0.5   # forwarded as `diversity` to the extractor

No sentence-transformers model found with name /Users/mac/.cache/torch/sentence_transformers/bert-base-uncased. Creating a new one with MEAN pooling.


In [20]:
# Bound extractor method; it is called on the whole batch in the next cell.
embedding_technique = keywords_lib.get_batch_keywords_with_kb_embeddings

In [21]:
# Run the extractor once over every sentence in the evaluation window.
raw_batch_keywords = embedding_technique(
    segments_to_test,
    diversity=keyword_diversity,
    diverse_keywords=False,
    similar_keywords=True,
    include_numeric_keywords=True,
)

# Keep at most `max_words_per_step` entries per sentence and wrap the first
# three fields of each entry in a Keyword object.
batch_keywords = [
    [
        Keyword(entry[0], entry[1], entry[2])
        for entry in sentence_keywords[:max_words_per_step]
    ]
    for sentence_keywords in raw_batch_keywords
]

In [22]:
# Sanity check: one row per sentence, one column per kept keyword
# (only rectangular when every sentence yields the same number of keywords).
np.array(batch_keywords).shape

(100, 4)

In [23]:
def check_similarity(emb1, emb2, coherence_threshold=0.5):
    """Tell whether two embeddings are at least `coherence_threshold` similar.

    Both inputs are reshaped to a single row before the cosine similarity is
    taken, so each is treated as one flat vector.

    Args:
        emb1: first embedding tensor.
        emb2: second embedding tensor (same number of elements as `emb1`).
        coherence_threshold: minimum cosine similarity counted as coherent.

    Returns:
        True when the cosine similarity is >= `coherence_threshold`, else False.
    """
    row1 = emb1.reshape(1, -1)
    row2 = emb2.reshape(1, -1)
    score = torch.cosine_similarity(row1, row2)[0]
    return bool(score >= coherence_threshold)
    
def get_similarity(emb1, emb2):
    """Return the cosine similarity between two embeddings.

    Each input is flattened to a single row first, so the result is a tensor
    holding one value.

    Args:
        emb1: first embedding tensor.
        emb2: second embedding tensor (same number of elements as `emb1`).

    Returns:
        A tensor of shape (1,) with the cosine similarity.
    """
    flat1, flat2 = emb1.reshape(1, -1), emb2.reshape(1, -1)
    return torch.cosine_similarity(flat1, flat2)

In [24]:
# Spot check: first keyword of sentence 0 against second keyword of sentence 1.
w1, w2 = batch_keywords[0][0], batch_keywords[1][1]

kw1 = torch.Tensor(w1.embedding)
kw2 = torch.Tensor(w2.embedding)

check_similarity(kw1, kw2)

True

---

### Types of predictions

1. Prediction by vector: this technique gets all the chains related to the current word (based on its similarity to the previous words and their associated chains), then takes the similarity between the current word and each related chain vector (the chain vector formula can be found in the coherence graph library code). It then multiplies each similarity by the importance of the current word and sums the results over all the words in the current sentence.

2. Prediction by unique chain count: this is the simplest technique. It counts the chains related to each word in the current sentence and adds the counts together. Once this number starts going down, there are fewer connections and thus less cohesion with the previous chains, indicating that this could be the start of a new segment.

3. Prediction by weighted similarity: this technique takes the similarity between each previous word and the current word if the current word has a strong cohesion with that previous word (e.g., similarity over some coherence threshold). That similarity is then multiplied by the number of chains related to that previous word. All these weighted similarities are then summed to create a cohesion factor.

### Prediction by vector

In [342]:
# Prediction by vector: for every sentence after the first, sum the
# importance-weighted cosine similarity between each keyword and the vectors of
# the most important chains that keyword belongs to.
top_n_chains = 10      # number of highest-importance chains compared per keyword
edge_threshold = 0.7   # minimum cosine similarity for an edge to a previous-sentence word

G = CoherenceGraph()
temp_prev_graph = nx.Graph()
for i, sentence in enumerate(batch_keywords):
    # add 1 distance to each word
    G.balance_graph()
    G.prune_max_depth(max_depth=5)
    # prev_graph holds only the words of the previous sentence, so the current
    # words are compared against that sentence alone; temp_prev_graph collects
    # the current sentence's words for the next iteration.
    prev_graph = temp_prev_graph
    temp_prev_graph = nx.Graph()
    similarities = []
    for word in sentence:
        node = CoherenceNode(word.text, word.embedding, word.importance)
        node_vector = torch.Tensor(node.vector)
        # Collect the new edges in a temp graph and compose it into G afterwards,
        # since the graph size can't fluctuate while it is being iterated.
        temp_graph = nx.Graph()
        for n in prev_graph.nodes():
            # compute the similarity once and reuse it as the edge weight
            edge_similarity = get_similarity(torch.Tensor(n.vector), node_vector)
            if edge_similarity[0] >= edge_threshold:
                temp_graph.add_edge(node, n, weight=edge_similarity)

        G = nx.compose(G, temp_graph)
        temp_prev_graph.add_node(node)
        # add the node. If it was already added, it won't be added again
        G.add_node(node)
        # create the unique chains and memoize
        node.process_unique_chains(G)

        # prediction by vector ----
        if i != 0:
            chains = node.get_unique_chains()
            # pair every chain vector with the importance of its chain
            chains_vectors = [
                (G.get_chain_vector(chain), G.get_chain_importance(chain))
                for chain in chains
            ]
            # Sort by importance, highest first, so the slice below really keeps
            # the top-N chains (an ascending sort would keep the least important).
            chains_vectors.sort(key=lambda x: x[1], reverse=True)
            # the importance is no longer needed once the order is fixed
            chains_vectors = [x[0] for x in chains_vectors]
            for chain_vector in chains_vectors[:top_n_chains]:
                similarity = node.importance * get_similarity(torch.Tensor(chain_vector), node_vector)
                similarities.append(similarity)
    if i == 0:
        # the first sentence has nothing before it and is reported as a boundary
        print(f"label: {labels_to_test[i]} | prediction: {1}")
    else:
        if len(similarities) == 0:
            prediction = 0
        else:
            prediction = torch.sum(torch.Tensor(similarities))
        print(f"label: {labels_to_test[i]} | prediction: {prediction} | ", end="")
        print(f"words: {[str(word) for word in sentence]}")

label: 0 | prediction: 1
label: 1 | prediction: 2.953200101852417 | words: ['jodhpur', 'burhanpur', 'karni', 'junagarh', 'thakurs', 'yuvaraja']
label: 0 | prediction: 0.07586268335580826 | words: ['jabalpur', 'kurukshetra', 'guwahati', 'jodhpur', 'bilaspur', 'ludhiana']
label: 0 | prediction: 2.1345739364624023 | words: ['thar', 'climate', 'winter', 'rainfall', 'summer', 'bikaner']
label: 0 | prediction: -1.3530656099319458 | words: ['maharanas', 'grandiose', 'junagarh', 'preserved', 'museums', 'temples']
label: 0 | prediction: 2.7969272136688232 | words: ['maharajah', 'saracenic', 'bikaner', 'singh', 'laxmi', 'swinton']
label: 0 | prediction: 1.8848999738693237 | words: ['junagarh', 'bika', 'ruins', 'fort', 'years']
label: 0 | prediction: 1.4113411903381348 | words: ['karni', 'jodhpur', 'deshnoke', 'kundanlal', 'verma']
label: 0 | prediction: 2.126499891281128 | words: ['mukam', 'nokha', 'bishnois', 'temple', 'nearby']
label: 0 | prediction: 3.1367998123168945 | words: ['_____________

### Prediction by unique chain count

In [28]:
# best config currently:
# max_depth of 7
# coherence threshold of 0.6
# only similar and numeric words - 4 keywords
# NOTE(review): the graph below is built with coherence_threshold=0.7, not the
# 0.6 listed above - confirm which value is intended.

G = CoherenceGraph(coherence_threshold=0.7)
prev_num_chains = 0
# One character per sentence after the first: '1' = segment start, '0' = continuation.
predictions = ""
ground_truths = ""

for i, sentence in enumerate(batch_keywords):
    # add 1 distance to each word
    G.balance_graph()
    G.prune_max_depth(max_depth=7)
    num_chains = 0
    for word in sentence:
        node = CoherenceNode(word.text, word.embedding, word.importance)
        # add the node. If it was already added, it won't be added again
        G.add_node(node)
        # create the unique chains and memoize
        node.process_unique_chains(G)

        # prediction by chain count ----
        if i != 0:
            num_chains += len(node.get_unique_chains())
    if i == 0:
        # the first sentence has nothing before it and is reported as a boundary
        print(f"label: {labels_to_test[i]} | prediction: {1}")
    else:
        # a boundary is predicted when the chain count drops below half of the
        # count seen for the previous sentence
        prediction = 1 if num_chains < (prev_num_chains // 2) else 0
        if prediction == 1:
            # start the new segment from a shallow graph and reset the baseline
            G.prune_max_depth(max_depth=1)
            prev_num_chains = 0
        else:
            prev_num_chains = num_chains

        predictions += str(prediction)
        ground_truths += str(labels_to_test[i])

        print(f"label: {labels_to_test[i]} | prediction: {prediction} | total chains: {num_chains} | ", end="")
        print(f"words: {[str(word) for word in sentence]}")


label: 1 | prediction: 1
label: 0 | prediction: 0 | total chains: 4 | words: ['joliet', 'kankakee', 'southwest', 'area']
label: 0 | prediction: 0 | total chains: 5 | words: ['648', '966', '659', 'census']
label: 0 | prediction: 0 | total chains: 7 | words: ['trashistan', 'confusion', 'song', 'songs']
label: 1 | prediction: 0 | total chains: 21 | words: ['gulag', '1937', 'settlement', 'town']
label: 0 | prediction: 0 | total chains: 29 | words: ['vymsky', 'administrative', 'divisions', 'mikun']
label: 1 | prediction: 1 | total chains: 4 | words: ['mamuretülaziz', 'elazığ', 'azık', 'atatürk']
label: 0 | prediction: 0 | total chains: 4 | words: ['khartabirt', 'carcathio', 'hurrian', 'ziyād']
label: 0 | prediction: 0 | total chains: 4 | words: ['çubukoğulları', 'safavids', 'ilkhanate', 'artuqids']
label: 0 | prediction: 0 | total chains: 4 | words: ['uninterrupted', 'elazığ', '465', '635']
label: 0 | prediction: 0 | total chains: 4 | words: ['kilisesi', 'merymana', 'ziyād', 'kharput']
labe

In [30]:
# Boundary strings, predicted first and ground truth second
# (one character per sentence after the first; '1' marks a segment start).
predictions, ground_truths

('000001000000000001000000000010000000000100100000000000100000010000000000000000000000100000001000000',
 '000101000000000001000000000000101100010000100010001000010100000010000000010000000000100000000000000')

In [29]:
# Score the predicted boundary string against the ground truth with k=7.
# NOTE(review): windowdiff/pk come from utils.metrics; presumably lower is better - confirm.
windowdiff(ground_truths, predictions, k=7), pk(ground_truths, predictions, k=7)

(0.5161290322580645, 0.3010752688172043)

### Prediction by weighted similarity

In [74]:
# Prediction by weighted similarity: a boundary is predicted when the summed,
# chain-weighted similarity to the distance-1 nodes falls below a fixed share of
# the total recorded for the last sentence that had any similarities.
coherence_threshold = 0.675
boundary_drop_ratio = 2 / 4  # share of the previous total below which a boundary is predicted

G = CoherenceGraph(coherence_threshold=coherence_threshold)
# One character per sentence after the first: '1' = segment start, '0' = continuation.
# ground_truths is rebuilt here so the evaluation does not depend on another cell.
predictions = ""
ground_truths = ""
prev_similarity = 0
for i, sentence in enumerate(batch_keywords):
    # add 1 distance to each word
    G.balance_graph()
    G.prune_max_depth(max_depth=5)
    similarities = []
    for word in sentence:
        node = CoherenceNode(word.text, word.embedding, word.importance)
        # add the node. If it was already added, it won't be added again
        G.add_node(node)
        # create the unique chains and memoize
        node.process_unique_chains(G)

        # prediction by weighted similarity ----
        if i != 0:
            node_vector = torch.Tensor(node.vector)
            for prev_node in G.get_nodes_at_distance(distance=1):
                # similarity between the current node and the distance-1 node,
                # scaled by the importance of the current node
                similarity = node.importance * get_similarity(torch.Tensor(prev_node.vector), node_vector)
                # multiply by the number of chains into the previous node (+1),
                # emphasizing the importance of that chain (theme).
                weighted_similarity = (len(prev_node.get_unique_chains()) + 1) * similarity
                similarities.append(weighted_similarity)

    if i == 0:
        # the first sentence has nothing before it and is reported as a boundary
        print(f"label: {labels_to_test[i]} | prediction: {1}")
        continue

    if len(similarities) == 0:
        # Nothing to compare against: no boundary. total_similarity is set
        # explicitly so the report below never prints a stale or undefined value;
        # prev_similarity is deliberately left untouched.
        total_similarity = 0.0
        prediction = 0
    else:
        total_similarity = torch.sum(torch.Tensor(similarities))
        prediction = 1 if total_similarity < (prev_similarity * boundary_drop_ratio) else 0
        prev_similarity = total_similarity
        if prediction == 1:
            # start the new segment from a shallow graph
            G.prune_max_depth(max_depth=1)

    predictions += str(prediction)
    ground_truths += str(labels_to_test[i])
    print(f"label: {labels_to_test[i]} | prediction: {prediction} | similarity: {total_similarity} | ", end="")
    print(f"words: {[str(word) for word in sentence]}")

label: 1 | prediction: 1
label: 0 | prediction: 0 | similarity: 6.008994102478027 | words: ['joliet', 'kankakee', 'southwest', 'area']
label: 0 | prediction: 0 | similarity: 4.844229698181152 | words: ['648', '966', '659', 'census']
label: 0 | prediction: 0 | similarity: 7.514538764953613 | words: ['trashistan', 'confusion', 'song', 'songs']
label: 1 | prediction: 0 | similarity: 8.677441596984863 | words: ['gulag', '1937', 'settlement', 'town']
label: 0 | prediction: 0 | similarity: 15.849745750427246 | words: ['vymsky', 'administrative', 'divisions', 'mikun']
label: 1 | prediction: 0 | similarity: 35.92249298095703 | words: ['mamuretülaziz', 'elazığ', 'azık', 'atatürk']
label: 0 | prediction: 1 | similarity: 9.169516563415527 | words: ['khartabirt', 'carcathio', 'hurrian', 'ziyād']
label: 0 | prediction: 0 | similarity: 10.206045150756836 | words: ['çubukoğulları', 'safavids', 'ilkhanate', 'artuqids']
label: 0 | prediction: 0 | similarity: 6.824479103088379 | words: ['uninterrupted',

In [75]:
# Boundary strings, ground truth first and predicted second
# (note: the earlier display cell shows them in the opposite order).
ground_truths, predictions

('000101000000000001000000000000101100010000100010001000010100000010000000010000000000100000000000000',
 '000000100000000001100000000001000100000110000000000000010000001000000000010000000000010000001100000')

In [76]:
# Score the weighted-similarity predictions against the ground truth with k=7.
# NOTE(review): windowdiff/pk come from utils.metrics; presumably lower is better - confirm.
windowdiff(ground_truths, predictions, k=7), pk(ground_truths, predictions, k=7)

(0.5483870967741935, 0.23655913978494625)

---

### Testing Graph Functionality

In [23]:
# Fetch the nodes at distance 1 in the current graph and show their string forms.
nodes = G.get_nodes_at_distance(distance=1)
print(list(map(str, nodes)))

["Node('zurriola')", "Node('haussmannian')", "Node('cortazar')"]


In [24]:
# for n in nodes:
#     print([[str(y) for y in x] for x in G.get_all_paths_to_node(n)])

In [25]:
# Linear paths for the first of the distance-1 nodes (requires `nodes` to be non-empty).
linear_paths = G.get_linear_paths_to_node(nodes[0])

In [26]:
# Print every linear path on one line as: node -- edge weight --> node ...
for path in linear_paths:
    last_node = None
    for node in path:
        if last_node is not None:
            # emit the previous node together with the weight of the edge to this one
            print(str(last_node), "--", G.get_edge(last_node, node)["weight"], "--> ", end="")
        last_node = node
    # Close the line with the final node; an empty path prints nothing instead of
    # reusing a node left over from an earlier path.
    if last_node is not None:
        print(str(last_node))

Node('zurriola') -- tensor([0.7485]) --> Node('alkartasuna') -- tensor([0.7556]) --> Node('berria') -- tensor([0.5859]) --> Node('1265') -- tensor([0.5713]) --> Node('oiasso') -- tensor([0.8044]) --> Node('ametzagaña') -- tensor([0.5119]) --> Node('overcast') -- tensor([0.5847]) --> Node('igeldo') -- tensor([0.6393]) --> Node('sebastiae')
Node('zurriola') -- tensor([0.7485]) --> Node('alkartasuna') -- tensor([0.7556]) --> Node('berria') -- tensor([0.5859]) --> Node('1265') -- tensor([0.5713]) --> Node('oiasso') -- tensor([0.8044]) --> Node('ametzagaña') -- tensor([0.5119]) --> Node('overcast') -- tensor([0.6868]) --> Node('seashore') -- tensor([0.5354]) --> Node('sebastiae')
Node('zurriola') -- tensor([0.7485]) --> Node('alkartasuna') -- tensor([0.7556]) --> Node('berria') -- tensor([0.5859]) --> Node('1265') -- tensor([0.5713]) --> Node('oiasso') -- tensor([0.8044]) --> Node('ametzagaña') -- tensor([0.5119]) --> Node('overcast') -- tensor([0.5863]) --> Node('urgull') -- tensor([0.5674

---

## Testing

In [273]:
# Scratch check: element-wise mean of two equally shaped nested lists.
# numpy is already imported as `np` in the imports cell at the top.
# Weights of 0.6 / 0.4 were sketched for these inputs but are not applied here.
tensor1 = [[0.5, 0.6], [0.6, 0.5]]
tensor2 = [[0.4, 0.5], [0.7, 0.8]]

np.mean([tensor1, tensor2], axis=0)

array([[0.45, 0.55],
       [0.65, 0.65]])