In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads all imported
# modules before each cell runs, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import config
import sys
import os

# Treat the parent of the current working directory as the project root and
# record it on the shared `config` module.
config.root_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Put the project root first on the import path (presumably so the `db` and
# `src` packages imported later in this notebook resolve from there -- confirm
# against the repository layout).
sys.path.insert(0, config.root_path)

In [3]:
from db.dbv2 import Table, AugmentedTable, TrainTestTable
from src.dataset.utils import truncate_by_token, flatten, dedupe_list, truncate_string

### === Prepare Dataset ===

In [4]:
# Number of consecutive sentences get_data() takes from the flattened dataset.
num_sentences = 200
# Index in the flattened dataset at which that window of sentences starts.
offset = 500
# Maximum number of sentences get_data() keeps from any single segment.
max_segment_length = 99

In [5]:
def get_data(dataset_type: str):
    """Return a fixed window of sentences and boundary labels for one dataset.

    Every segment of the dataset is truncated to at most
    ``max_segment_length`` entries, all segments are concatenated with
    ``flatten``, and the slice ``[offset:offset + num_sentences]`` of the
    result is returned. ``max_segment_length``, ``offset`` and
    ``num_sentences`` are read from the notebook's module-level settings.

    Args:
        dataset_type: Dataset identifier passed to ``Table``.

    Returns:
        A ``(segments_to_test, labels_to_test)`` pair of equally indexed
        sequences. Each label is 1 when the entry is the first one of its
        segment and 0 otherwise.
    """
    sentences_per_segment = []
    labels_per_segment = []

    for segment in Table(dataset_type).get_all_segments():
        # Element 1 of each row is kept (presumably the sentence text --
        # confirm against the Table schema).
        kept = [row[1] for row in segment][:max_segment_length]
        sentences_per_segment.append(kept)
        # Only the first kept entry of a segment is marked as a boundary.
        labels_per_segment.append(
            [int(position == 0) for position in range(len(kept))]
        )

    all_sentences = flatten(sentences_per_segment)
    all_labels = flatten(labels_per_segment)

    window = slice(offset, offset + num_sentences)
    return all_sentences[window], all_labels[window]

### === Testing ===

In [6]:
from src.determinor import Determinor
from nltk.metrics.segmentation import pk, windowdiff

In [7]:
# Other dataset groups that this cell has been pointed at; swap the tuple in
# the `for` statement to evaluate a different group:
#   ["choi_3_5", "choi_3_11", "choi_9_11", "choi_6_8", "city", "disease", "manifesto"]
#   ["choi_3_5", "choi_3_11", "choi_9_11", "choi_6_8"]
#   ["city", "disease"]
#   ["wiki50k"]
#   ["manifesto"]
for dataset_type in ("committee", "academic", "product"):
    segments, labels = get_data(dataset_type)

    print(f"evaluating {dataset_type}")
    # A fresh Determinor is built for every dataset.
    determinor = Determinor(max_context_window=7)
    predictions = determinor.query_batch_data(segments)

    # Put predictions on the same 0/1 alphabet as the labels:
    # a prediction equal to True becomes 0, anything else becomes 1.
    preds = [int(not (p == True)) for p in predictions]

    # pk/windowdiff take the segmentations as strings; "1" is NLTK's default
    # boundary marker.
    str_labels = "".join(map(str, labels))
    str_predictions = "".join(map(str, preds))
    print()
    print(f"L: {str_labels}")
    print(f"P: {str_predictions}")

    # Report both metrics for window sizes 2 through 7.
    for k in range(2, 8):
        pk_score = pk(str_labels, str_predictions, k=k)
        wd_score = windowdiff(str_labels, str_predictions, k=k)
        print(f"k: {k}, pk: {pk_score}, wd: {wd_score}")

Using dataset: qmsum_committee
evaluating committee
..|.....|...|........|.||........|.....|....|........|...||..|..........|..|.|......|..|...............|..|..|...|....|...|.||........|.....|.....||...........|......|..|.|.....||..|...|.|.......|.|..
L: 00000000000000000000000000000000000000000100000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000010000000000000000000000000000
P: 00100000100010000000010110000000010000010000100000000100011001000000000010010100000010010000000000000001001001000100001000101100000000100000100000110000000000010000001001010000011001000101000000010100
k: 2, pk: 0.3768844221105528, wd: 0.3768844221105528
k: 3, pk: 0.51010101010101, wd: 0.5151515151515151
k: 4, pk: 0.5888324873096447, wd: 0.6040609137055838
k: 5, pk: 0.6428571428571429, wd: 0.6683673469387755
k: 6, pk: 0.6974358974358974, wd: 0.7384615384615385
k: 7, pk: 0.7164948453608248, wd: 0.7783505154639175
Using dataset: qmsum_ac

AssertionError: Sentences are required.

In [33]:
# Other dataset groups that this cell has been pointed at; swap the tuple in
# the `for` statement to evaluate a different group:
#   ["choi_3_5", "choi_3_11", "choi_9_11", "choi_6_8", "city", "disease", "manifesto"]
#   ["committee", "academic", "product"]
#   ["choi_3_5", "choi_3_11", "choi_9_11", "choi_6_8"]
#   ["wiki50k"]
#   ["manifesto"]
for dataset_type in ("city", "disease"):
    segments, labels = get_data(dataset_type)

    print(f"evaluating {dataset_type}")
    # A fresh Determinor is built for every dataset, here with the context
    # window set to 1.
    determinor = Determinor(max_context_window=1)
    predictions = determinor.query_batch_data(segments)

    # Put predictions on the same 0/1 alphabet as the labels:
    # a prediction equal to True becomes 0, anything else becomes 1.
    preds = [int(not (p == True)) for p in predictions]

    # pk/windowdiff take the segmentations as strings; "1" is NLTK's default
    # boundary marker.
    str_labels = "".join(map(str, labels))
    str_predictions = "".join(map(str, preds))
    print()
    print(f"L: {str_labels}")
    print(f"P: {str_predictions}")

    # Report both metrics for window sizes 2 through 7.
    for k in range(2, 8):
        pk_score = pk(str_labels, str_predictions, k=k)
        wd_score = windowdiff(str_labels, str_predictions, k=k)
        print(f"k: {k}, pk: {pk_score}, wd: {wd_score}")

Using dataset: wikisection_city
evaluating city
.|....|..||..|....||.|......|.|....||.|.||.|||..|.|..|.......|.|.||..||||||........|.||||.|.|..||||..|.|..|.|..||.....||...||.|.|..|.|||.....||...|.|||...|..|||.|..|..||||||.|.||||.|.|....||||...........|...|....||.|..|...|||......|...|.....||.||...|...||.|.||............|.......|||.|||...|..||.||||
L: 000000000000010100000010000000100000100001001100000000000000000001000001001000000001000000100000010000000010100010001001000000101000010100000011000000100000000000000000010010100010000100000001000000000000010000000101000000101000000000000000000000000000001000010000000000001000001000001110000000000000
P: 010000100110010000110100000010100001101011011100101001000000010101100111111000000001011110101001111001010010100110000011000110101001011100000110001011100010011101001001111110101111010100001111000000000001000100001101001000111000000100010000011011000100011010110000000000001000000011101110001001101111
k: 2, pk: 0.4180602006688963, wd: 0.5083612

## CHOI

Testing with a maximum context window equal to the average segment length of each dataset (e.g. 4 for `choi_3_5`).

In [11]:
# Each Choi dataset is paired with the context window size used for it.
for dataset_type, max_context_window in (
    ("choi_3_5", 4),
    ("choi_3_11", 7),
    ("choi_9_11", 10),
    ("choi_6_8", 7),
):
    segments, labels = get_data(dataset_type)

    print(f"evaluating {dataset_type}")
    # A fresh Determinor is built for every dataset with its own window size.
    determinor = Determinor(max_context_window=max_context_window)
    predictions = determinor.query_batch_data(segments)

    # Put predictions on the same 0/1 alphabet as the labels:
    # a prediction equal to True becomes 0, anything else becomes 1.
    preds = [int(not (p == True)) for p in predictions]

    # pk/windowdiff take the segmentations as strings; "1" is NLTK's default
    # boundary marker.
    str_labels = "".join(map(str, labels))
    str_predictions = "".join(map(str, preds))
    print()
    print(f"L: {str_labels}")
    print(f"P: {str_predictions}")

    # Report both metrics for window sizes 2 through 7.
    for k in range(2, 8):
        pk_score = pk(str_labels, str_predictions, k=k)
        wd_score = windowdiff(str_labels, str_predictions, k=k)
        print(f"k: {k}, pk: {pk_score}, wd: {wd_score}")

Using dataset: choi_3_5
evaluating choi_3_5
....|...|.||..|....|..|...|||..|..|....|..|..|....|...|.||..|....|..|....|....|..|...|...|...|....|..|..||.|.||....|.|..|.|||..|....|....|....|....|.||...|..|....|....|..|..|.|..|..|....|...|....|..|.
L: 00001000100100100001001000100001001000010010010000100010010010000100100001000010010001000100010000100100100100100001000010001001000010000100001000010010001001000010000100100001001001000010001000010010
P: 00001000101100100001001000111001001000010010010000100010110010000100100001000010010001000100010000100100110101100001010010111001000010000100001000010110001001000010000100100101001001000010001000010010
k: 2, pk: 0.06532663316582915, wd: 0.10050251256281408
k: 3, pk: 0.03535353535353535, wd: 0.14646464646464646
k: 4, pk: 0.015228426395939087, wd: 0.19289340101522842
k: 5, pk: 0.0, wd: 0.23469387755102042
k: 6, pk: 0.0, wd: 0.2717948717948718
k: 7, pk: 0.0, wd: 0.30927835051546393
Using dataset: choi_3_11
evaluating choi_3_11
...|...|......|.|..

Testing with no context window history: every dataset is run with `max_context_window=1`.

In [12]:
# The per-dataset window size is still unpacked here, but this run ignores it
# and always builds the Determinor with a context window of 1.
for dataset_type, max_context_window in (
    ("choi_3_5", 4),
    ("choi_3_11", 7),
    ("choi_9_11", 10),
    ("choi_6_8", 7),
):
    segments, labels = get_data(dataset_type)

    print(f"evaluating {dataset_type}")
    # A fresh Determinor is built for every dataset.
    determinor = Determinor(max_context_window=1)
    predictions = determinor.query_batch_data(segments)

    # Put predictions on the same 0/1 alphabet as the labels:
    # a prediction equal to True becomes 0, anything else becomes 1.
    preds = [int(not (p == True)) for p in predictions]

    # pk/windowdiff take the segmentations as strings; "1" is NLTK's default
    # boundary marker.
    str_labels = "".join(map(str, labels))
    str_predictions = "".join(map(str, preds))
    print()
    print(f"L: {str_labels}")
    print(f"P: {str_predictions}")

    # Report both metrics for window sizes 2 through 7.
    for k in range(2, 8):
        pk_score = pk(str_labels, str_predictions, k=k)
        wd_score = windowdiff(str_labels, str_predictions, k=k)
        print(f"k: {k}, pk: {pk_score}, wd: {wd_score}")

Using dataset: choi_3_5
evaluating choi_3_5
..|.|...|.||..|...||..|...||||.|.......|..|..|....|...|.....|...||.|||.|.|.|..|..||.||...|...|....|.||..||.|.||..|.|....|.|.|..|....|....|....|..|.|.||..||..|.|..|....|..|..|.|..|..|....|...|....|..|.
L: 00001000100100100001001000100001001000010010010000100010010010000100100001000010010001000100010000100100100100100001000010001001000010000100001000010010001001000010000100100001001001000010001000010010
P: 00101000101100100011001000111101000000010010010000100010000010001101110101010010011011000100010000101100110101100101000010101001000010000100001001010110011001010010000100100101001001000010001000010010
k: 2, pk: 0.17587939698492464, wd: 0.24120603015075376
k: 3, pk: 0.13636363636363635, wd: 0.3434343434343434
k: 4, pk: 0.07614213197969544, wd: 0.4365482233502538
k: 5, pk: 0.02040816326530612, wd: 0.5051020408163265
k: 6, pk: 0.010256410256410256, wd: 0.558974358974359
k: 7, pk: 0.005154639175257732, wd: 0.6134020618556701
Using dataset: choi_3