In [36]:
# Run if working locally
%load_ext autoreload
%autoreload 2
%load_ext nb_black

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [181]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

# Treat the parent of the current working directory as the project root and
# publish it on the shared `config` module.
config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Keep the root at the front of sys.path so the `src`, `db` and `utils`
# imports below resolve against it. Drop a copy left by an earlier run of
# this cell first, so repeated executions do not pile up duplicate entries.
if config.root_path in sys.path:
    sys.path.remove(config.root_path)
sys.path.insert(0, config.root_path)

from src.dataset.dataset import RawData
from src.dataset.wikisection_preprocessing import (
    tokenize,
    clean_sentence,
    preprocess_text_segmentation,
    format_data_for_db_insertion,
)
from src.dataset.utils import truncate_by_token
from db.dbv2 import Table, AugmentedTable, TrainTestTable
import pprint


from utils.metrics import windowdiff, pk

from src.bertkeywords.src.similarities import Embedding, Similarities
from src.bertkeywords.src.keywords import Keywords
from src.encoders.coherence_v2 import Coherence
from src.dataset.utils import flatten, dedupe_list, truncate_string
from src.experimentation.coherence_v2 import SimpleExperiment, CoherenceExperiment

<IPython.core.display.Javascript object>

In [165]:
# Dataset identifier handed to each of the table wrappers below.
dataset_type = "city"

# One handle per table class, all built for the same dataset.
table, augmented_table, train_test_table = (
    Table(dataset_type),
    AugmentedTable(dataset_type),
    TrainTestTable(dataset_type),
)

<IPython.core.display.Javascript object>

In [85]:
# Fetch every row; index 1 is collected as the text and index 2 as its label
# (column meaning presumed from the variable names - confirm against the
# Table schema).
data = table.get_all()

text_data = [row[1] for row in data]
text_labels = [row[2] for row in data]

# Rows grouped per segment: each element of all_segments is itself iterated
# as a sequence of rows.
all_segments = table.get_all_segments()

segments = [[item[1] for item in segment] for segment in all_segments]
# Mark the first item of every segment with 1 and all following items with 0.
segments_labels = [
    [int(position == 0) for position, _ in enumerate(segment)]
    for segment in all_segments
]

# Collapse the per-segment nesting with the project's `flatten` helper.
flattened_segments = flatten(segments)
flattened_labels = flatten(segments_labels)

<IPython.core.display.Javascript object>

In [186]:
# Build an experiment runner and queue a single coherence experiment on it.
experimentation = SimpleExperiment()

experimentation.queue_experiment(
    CoherenceExperiment(
        # Reuse the dataset selected earlier in the notebook instead of
        # repeating the "city" literal, so the tables and the experiment
        # cannot silently point at different datasets.
        dataset_type=dataset_type,
        model_string="bert-base-uncased",
        max_words_per_step=4,
        start=100,
        num_samples=200,
        same_word_multiplier=1,
        no_same_word_penalty=1,
        kb_embeddings=True,
        coherence_dump_on_prediction=False,
        coherence_threshold=0.2,
        prediction_threshold=0.47,
        pruning=1,
        pruning_min=6,
        batch_size=10,
        print_metrics_summary=True,
        print_predictions_summary=True,
        keyword_diversity=0.4,
    )
)

<IPython.core.display.Javascript object>

In [187]:
# Run everything queued on `experimentation`; the captured output below
# reports pk and wd scores for a range of prediction thresholds.
experimentation.run()

Running experiment set: PWrLn
Running experiment: CoherenceExperiment(num_samples=200, start=100, dataset_type='city', model_string='bert-base-uncased', max_words_per_step=4, same_word_multiplier=1, no_same_word_penalty=1, prediction_threshold=0.47, coherence_threshold=0.2, coherence_dump_on_prediction=False, pruning=1, pruning_min=6, dynamic_threshold=False, threshold_warmup=10, last_n_threshold=5, kb_embeddings=True, experiment_hash='BTRbj', batch_size=10, keyword_diversity=0.4, print_metrics_summary=True, print_predictions_summary=True)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at /Users/amitmaraj/.cache/torch/sentence_transformers/bert-base-uncased were not used when initializing BertModel: ['

.........1..........2..........3..........4..........5..........6..........7..........8..........9..........10..........11..........12..........13..........14..........15..........16..........17..........18..........19..........20
prediction threshold: 0.4
pk score: 0.2701342281879195
wd score: 0.2701342281879195
prediction threshold: 0.41
pk score: 0.2701342281879195
wd score: 0.2701342281879195
prediction threshold: 0.42
pk score: 0.2701342281879195
wd score: 0.2701342281879195
prediction threshold: 0.43
pk score: 0.2701342281879195
wd score: 0.2701342281879195
prediction threshold: 0.44
pk score: 0.2701342281879195
wd score: 0.2701342281879195
prediction threshold: 0.45
pk score: 0.2634228187919463
wd score: 0.2634228187919463
prediction threshold: 0.46
pk score: 0.25
wd score: 0.2533557046979866
prediction threshold: 0.47
pk score: 0.2332214765100671
wd score: 0.24328859060402686
prediction threshold: 0.48
pk score: 0.24161073825503357
wd score: 0.2516778523489933
prediction thresh

<IPython.core.display.Javascript object>