In [36]:
# Run if working locally
%load_ext autoreload
%autoreload 2
%load_ext nb_black

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [90]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

from src.dataset.dataset import RawData
from src.dataset.wikisection_preprocessing import (
    tokenize,
    clean_sentence,
    preprocess_text_segmentation,
    format_data_for_db_insertion,
)
from src.dataset.utils import truncate_by_token
from db.dbv2 import Table, AugmentedTable, TrainTestTable
import pprint


from utils.metrics import windowdiff, pk

from src.bertkeywords.src.similarities import Embedding, Similarities
from src.bertkeywords.src.keywords import Keywords
from src.encoders.coherence_v2 import Coherence
from src.dataset.utils import flatten, dedupe_list, truncate_string
from src.experimentation.coherence_v2 import SimpleExperiment, CoherenceExperiment

<IPython.core.display.Javascript object>

In [84]:
dataset_type = "city"
table = Table(dataset_type)
augmented_table = AugmentedTable(dataset_type)
train_test_table = TrainTestTable(dataset_type)

<IPython.core.display.Javascript object>

In [85]:
data = table.get_all()

text_data = [x[1] for x in data]
text_labels = [x[2] for x in data]

all_segments = table.get_all_segments()

segments = [[y[1] for y in x] for x in all_segments]
segments_labels = [[1 if i == 0 else 0 for i, y in enumerate(x)] for x in all_segments]

flattened_segments = flatten(segments)
flattened_labels = flatten(segments_labels)

<IPython.core.display.Javascript object>

In [150]:
experimentation = SimpleExperiment()

experimentation.queue_experiment(
    CoherenceExperiment(
        dataset_type="city",
        model_string="bert-base-uncased",
        max_words_per_step=3,
        start=100,
        num_samples=250,
        same_word_multiplier=2,
        no_same_word_penalty=2,
        kb_embeddings=True,
        coherence_dump_on_prediction=True,
        coherence_threshold=0.3,
        prediction_threshold=0.47,
        pruning=0,
        pruning_min=0,
        batch_size=10,
        print_metrics_summary=True,
        print_predictions_summary=True,
    )
)

<IPython.core.display.Javascript object>

In [151]:
# TODO: something wrong with the batching - FIX!!

experimentation.run()

Running experiment set: eIpdb
Running experiment: CoherenceExperiment(num_samples=250, start=100, dataset_type='city', model_string='bert-base-uncased', max_words_per_step=3, same_word_multiplier=1, no_same_word_penalty=1, prediction_threshold=0.47, coherence_threshold=0.3, coherence_dump_on_prediction=True, pruning=0, pruning_min=0, dynamic_threshold=False, threshold_warmup=10, last_n_threshold=5, kb_embeddings=True, experiment_hash='RPE3I', batch_size=10)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No sentence-transformers model found with name /Users/amitmaraj/.cache/torch/sentence_transformers/bert-base-uncased. Creating a new one with MEAN pooling.

.........1..........2..........3..........4..........5..........6..........7..........8..........9..........10..........11..........12..........13..........14..........15..........16..........17..........18..........19..........20..........21..........22..........23..........24..........25prediction threshold: 0.45
pk score: 0.2667560321715818
wd score: 0.2667560321715818
prediction threshold: 0.46
pk score: 0.26005361930294907
wd score: 0.26005361930294907
prediction threshold: 0.47
pk score: 0.23994638069705093
wd score: 0.24262734584450402
prediction threshold: 0.48
pk score: 0.21983914209115282
wd score: 0.225201072386059
prediction threshold: 0.49
pk score: 0.2225201072386059
wd score: 0.23324396782841822
prediction threshold: 0.5
pk score: 0.23726541554959785
wd score: 0.2479892761394102
prediction threshold: 0.51
pk score: 0.2587131367292225
wd score: 0.2801608579088472
prediction threshold: 0.52
pk score: 0.2734584450402145
wd score: 0.30294906166219837
prediction threshold: 0.

<IPython.core.display.Javascript object>

In [15]:
import numpy as np

<IPython.core.display.Javascript object>

In [105]:
some_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

batch_size = 3
batch_num = 0

for i in range(0, len(some_list) // batch_size):
    print(some_list[i * batch_size : i * batch_size + batch_size])

[1, 2, 3]
[4, 5, 6]
[7, 8, 9]


<IPython.core.display.Javascript object>

In [109]:
some_list[2 : batch_size * (len(some_list) // batch_size)]

[3, 4, 5, 6, 7, 8, 9]

<IPython.core.display.Javascript object>