In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import config
import sys
import os

# Point config.root_path at the parent of the current working directory and
# put that directory first on sys.path. The project-local imports further
# down (`db.*`, `src.*`) presumably depend on this — TODO confirm.
# NOTE(review): the result depends on where the kernel was started, and
# re-running this cell inserts the same path again.
config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

In [3]:
from db.dbv2 import Table, AugmentedTable, TrainTestTable
from src.dataset.utils import truncate_by_token, flatten, dedupe_list, truncate_string

### === Prepare Dataset ===

In [4]:
# Evaluation-window settings; get_data() reads these as globals.
num_sentences = 300  # how many flattened entries get_data() returns
offset = 500  # index of the first flattened entry in the returned window
max_segment_length = 99  # per-segment cap applied before flattening

In [5]:
def get_data(dataset_type: str):
    """Return a fixed window of dataset entries with their segment-start labels.

    Every segment of ``dataset_type`` is loaded through ``Table``. For each
    segment, the element at index 1 of every row is kept (presumably the
    sentence text — TODO confirm against ``Table.get_all_segments``) and the
    segment is capped at ``max_segment_length`` entries. Segments and labels
    are then passed through ``flatten`` and sliced to the ``num_sentences``
    entries starting at ``offset``.

    ``max_segment_length``, ``offset`` and ``num_sentences`` are read from
    notebook-level globals at call time.

    Args:
        dataset_type: Dataset name handed to ``Table``.

    Returns:
        A ``(segments_to_test, labels_to_test)`` pair. A label is 1 for the
        first entry of a segment and 0 for every other entry.
    """
    all_segments = Table(dataset_type).get_all_segments()

    segments = []
    segments_labels = []
    for segment in all_segments:
        segments.append([row[1] for row in segment][:max_segment_length])
        # Only position 0 of a segment is flagged as a boundary.
        segments_labels.append(
            [int(position == 0) for position, _ in enumerate(segment)][:max_segment_length]
        )

    # One slice object so texts and labels are cut at exactly the same indices.
    window = slice(offset, offset + num_sentences)
    return flatten(segments)[window], flatten(segments_labels)[window]

### === Testing ===

In [6]:
from src.determinor import Determinor
from nltk.metrics.segmentation import pk, windowdiff

Testing Ollama

In [8]:
# Evaluate the Determinor built without any OpenAI flag on each dataset.
# Other dataset groups this cell has been run with:
#   ["choi_3_5", "choi_3_11", "choi_9_11", "choi_6_8", "city", "disease", "manifesto"]
#   ["committee", "academic", "product"]
#   ["choi_3_5", "choi_3_11", "choi_9_11", "choi_6_8"]
#   ["wiki50k"]
#   ["manifesto"]
for dataset_type in ("city", "disease"):
    segments, labels = get_data(dataset_type)

    print(f"evaluating {dataset_type}")
    determinor = Determinor(max_context_window=5, meeting_dataset=False)
    predictions = determinor.query_batch_data(segments)

    # Encode predictions like the labels (1 = segment start): a prediction
    # equal to True becomes 0, anything else becomes 1. The explicit
    # `== True` comparison is kept on purpose so the mapping is unchanged.
    preds = [int(not (p == True)) for p in predictions]

    # pk / windowdiff accept the segmentations as strings of "0"/"1".
    str_labels = "".join(map(str, labels))
    str_predictions = "".join(map(str, preds))
    print()
    print(f"L: {str_labels}")
    print(f"P: {str_predictions}")

    # Report both metrics for a range of window sizes k.
    for k in (2, 3, 4, 5, 6, 7, 10, 14, 20):
        print(f"k: {k}, pk: {pk(str_labels, str_predictions, k=k)}, wd: {windowdiff(str_labels, str_predictions, k=k)}")

Using dataset: wikisection_city
evaluating city
.............|...||..|...|....|..||||...||..||...................||....||||.|......||||||.|....|.||.||....|.|..||...|..|......|.|....|||....||||....|||................||||||||....|.|.........|............|.......||.|......|||................||...........|.|.||............|.|...|..||.|||...|........|
L: 000000000000010100000010000000100000100001001100000000000000000001000001001000000001000000100000010000000010100010001001000000101000010100000011000000100000000000000000010010100010000100000001000000000000010000000101000000101000000000000000000000000000001000010000000000001000001000001110000000000000
P: 000000000000010001100100010000100111100011001100000000000000000001100001111010000001111110100001011011000010100110001001000000101000011100001111000011100000000000000001111111100001010000000001000000000000100000001101000000111000000000000000011000000000001010110000000000001010001001101110001000000001
k: 2, pk: 0.19732441471571907, wd: 0.267558

Testing GPT-4o

In [9]:
# Evaluate the Determinor with `openai_4o=True` on each dataset.
for dataset_type in ("city", "disease"):
    segments, labels = get_data(dataset_type)

    print(f"evaluating {dataset_type}")
    determinor = Determinor(max_context_window=5, meeting_dataset=False, openai_4o=True)
    predictions = determinor.query_batch_data(segments)

    # Encode predictions like the labels (1 = segment start): a prediction
    # equal to True becomes 0, anything else becomes 1. The explicit
    # `== True` comparison is kept on purpose so the mapping is unchanged.
    preds = [int(not (p == True)) for p in predictions]

    # pk / windowdiff accept the segmentations as strings of "0"/"1".
    str_labels = "".join(map(str, labels))
    str_predictions = "".join(map(str, preds))
    print()
    print(f"L: {str_labels}")
    print(f"P: {str_predictions}")

    # Report both metrics for a range of window sizes k.
    for k in (2, 3, 4, 5, 6, 7, 10, 14, 20):
        print(f"k: {k}, pk: {pk(str_labels, str_predictions, k=k)}, wd: {windowdiff(str_labels, str_predictions, k=k)}")

Using dataset: wikisection_city
evaluating city
.|...........|.|...|.|||..|||.|...|||...||..||....|..............|..|||||||........||.||..|......|..||.||.|.|..||...|..||.....|.|....|||......||......||......||....|..||||||||...|....|.......||.........||||.......|.|......|||.............................|....|............|.|...|.....|||............|
L: 000000000000010100000010000000100000100001001100000000000000000001000001001000000001000000100000010000000010100010001001000000101000010100000011000000100000000000000000010010100010000100000001000000000000010000000101000000101000000000000000000000000000001000010000000000001000001000001110000000000000
P: 010000000000010100010111001110100011100011001100001000000000000001001111111000000001101100100000010011011010100110001001100000101000011100000011000000110000001100001001111111100010000100000001100000000011110000000101000000111000000000000000000000000000001000010000000000001010001000001110000000000001
k: 2, pk: 0.15719063545150502, wd: 0.230769

Testing OpenAI o1

In [10]:
# Evaluate the Determinor with `openai_o1=True` on each dataset.
for dataset_type in ("city", "disease"):
    segments, labels = get_data(dataset_type)

    print(f"evaluating {dataset_type}")
    determinor = Determinor(max_context_window=5, meeting_dataset=False, openai_o1=True)
    predictions = determinor.query_batch_data(segments)

    # Encode predictions like the labels (1 = segment start): a prediction
    # equal to True becomes 0, anything else becomes 1. The explicit
    # `== True` comparison is kept on purpose so the mapping is unchanged.
    preds = [int(not (p == True)) for p in predictions]

    # pk / windowdiff accept the segmentations as strings of "0"/"1".
    str_labels = "".join(map(str, labels))
    str_predictions = "".join(map(str, preds))
    print()
    print(f"L: {str_labels}")
    print(f"P: {str_predictions}")

    # Report both metrics for a range of window sizes k.
    for k in (2, 3, 4, 5, 6, 7, 10, 14, 20):
        print(f"k: {k}, pk: {pk(str_labels, str_predictions, k=k)}, wd: {windowdiff(str_labels, str_predictions, k=k)}")

Using dataset: wikisection_city
evaluating city
.||||||...||||||||||||||.||||||..||||||.||||||..||||.||....|.|||||....||..||||.....|||||..|....|||..|||||||||..||..||||||.|||.||||.|||||...||||||.||..|||||..|.|.||||.||.||||||.|||||||||..|||||..........||||....||||.||||...|.|.....|||.......|||....|.||.||||||||.|..........|.|..|||.||.|.||.||...|..|||
L: 000000000000010100000010000000100000100001001100000000000000000001000001001000000001000000100000010000000010100010001001000000101000010100000011000000100000000000000000010010100010000100000001000000000000010000000101000000101000000000000000000000000000001000010000000000001000001000001110000000000000
P: 011111100011111111111111011111100111111011111100111101100001011111000011001111000001111100100001110011111111100110011111101110111101111100011111101100111110010101111011011111101111111110011111000000000011110000111101111000101000001110000000111000010110111111110100000000001010011101101011011000100111
k: 2, pk: 0.49498327759197325, wd: 0.675585

In [None]:
# Print, per dataset, the total number of entries divided by the number of
# segment starts (integer division), i.e. the floored mean segment length.
# Unlike get_data(), no per-segment cap or window is applied here.
for dataset_type in ("city", "disease"):
    table = Table(dataset_type)
    all_segments = table.get_all_segments()

    # Index 1 of each row is the value kept per entry.
    segments = [[y[1] for y in x] for x in all_segments]
    # 1 marks the first entry of a segment, 0 every other entry.
    segments_labels = [[int(i == 0) for i, _ in enumerate(x)] for x in all_segments]

    flattened_segments = flatten(segments)
    flattened_labels = flatten(segments_labels)

    print(len(flattened_labels) // flattened_labels.count(1))