In [2]:
# Run if working locally
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

from src.dataset.dataset import RawData
from src.dataset.wikisection_preprocessing import (
    tokenize,
    clean_sentence,
    preprocess_text_segmentation,
    format_data_for_db_insertion,
    remove_blank_sentence,
    flatten_list,
)
from db.dbv2 import DB, Table, TestTable, ValidationTable

In [6]:
dataset_type = "academic"

In [11]:
dataset = RawData(dataset_type)

data = dataset.get_qmsum_data(split="test", dataset="Academic")
# cleaned_data = preprocess_text_segmentation(data)

In [12]:
data

[{'topic_list': [{'topic': 'DARPA meeting',
    'relevant_text_span': [['0', '96']]},
   {'topic': 'PLP results', 'relevant_text_span': [['521', '744']]},
   {'topic': 'Classifier segmentation',
    'relevant_text_span': [['97', '156']]}],
  'general_query_list': [{'query': 'What was the general discussion of the meeting?',
    'answer': "A pressing concern for the group is the DARPA meeting in July, which is only a short time away, and for which they would like to have some progress. PLP results for the front-end look good, with the group also reporting progress in segmentation: Thilo's segmenter will now be used and ways of improving performance investigated; The classifier segmentation is progressing well, especially in the use of prosody for identifying interruption. Work on the front end continues, with improvements of 3-5% being made."}],
  'specific_query_list': [{'query': 'What was decided on DARPA?',
    'answer': 'A pressing concern for the group is the DARPA meeting in July,

In [5]:
print("length before blank sentence removal:", len(flatten_list(cleaned_data)))
non_blank_sentences = [remove_blank_sentence(segment) for segment in cleaned_data]
print("length after blank sentence removal:", len(flatten_list(non_blank_sentences)))

length before blank sentence removal: 92839
length after blank sentence removal: 92839


<IPython.core.display.Javascript object>

## Format the Data

In [6]:
data_to_insert = format_data_for_db_insertion(non_blank_sentences)

<IPython.core.display.Javascript object>

In [7]:
data_to_insert[:5]

[('In spite of appearances, both the Basque form Donostia and the Spanish form San Sebastián have the same meaning of Saint Sebastian. The dona/done/doni element in Basque place-names signifies "saint" and is derived from Latin domine; the second part of Donostia contains a shortened form of the saint\'s name. There are two hypotheses regarding the evolution of the Basque name: one says it was *Done Sebastiáne > Donasa(b)astiai > Donasastia > Donastia > Donostia, the other one says it was *Done Sebastiane > *Done Sebastiae > *Done Sebastie > *Donesebastia > *Donasastia > *Donastia > Donostia.\n',
  1,
  None,
  0),
 ("The city is in the north of the Basque Autonomous Community, on the southern coast of the Bay of Biscay. San Sebastián's picturesque shoreline makes it a popular beach resort. The seaside environment is enhanced by hilly surroundings that are easily accessible, i.e., Urgull (at the heart of the city by the seashore), romantic Mount Ulia extending east to Pasaia, Mount Ada

<IPython.core.display.Javascript object>

## Iterative Import
Because I'm going to sleep...

In [8]:
for dataset_type in ["city", "disease"]:
    database = r"{}/db/{}.db".format(config.root_path, dataset_type)

    for split in ["test", "train", "validation"]:
        dataset = RawData(dataset_type)

        data = dataset.get_data(split=split)
        cleaned_data = preprocess_text_segmentation(data)
        non_blank_sentences = [
            remove_blank_sentence(segment) for segment in cleaned_data
        ]

        data_to_insert = format_data_for_db_insertion(non_blank_sentences)

        if split == "test":
            table = TestTable(dataset_type)
        elif split == "validation":
            table = ValidationTable(dataset_type)
        else:
            table = Table(dataset_type)

        for i, sentence in enumerate(data_to_insert):
            sentence_id = table.create_sentence(sentence)
            if i % 500 == 0:
                print(f"Entered {i} sentences...")

Entered 0 sentences...
Entered 500 sentences...
Entered 1000 sentences...
Entered 1500 sentences...
Entered 2000 sentences...
Entered 2500 sentences...
Entered 3000 sentences...
Entered 3500 sentences...
Entered 4000 sentences...
Entered 4500 sentences...
Entered 5000 sentences...
Entered 5500 sentences...
Entered 6000 sentences...
Entered 6500 sentences...
Entered 7000 sentences...
Entered 7500 sentences...
Entered 8000 sentences...
Entered 8500 sentences...
Entered 9000 sentences...
Entered 9500 sentences...
Entered 10000 sentences...
Entered 10500 sentences...
Entered 11000 sentences...
Entered 11500 sentences...
Entered 12000 sentences...
Entered 12500 sentences...
Entered 13000 sentences...
Entered 13500 sentences...
Entered 14000 sentences...
Entered 14500 sentences...
Entered 15000 sentences...
Entered 15500 sentences...
Entered 16000 sentences...
Entered 16500 sentences...
Entered 17000 sentences...
Entered 17500 sentences...
Entered 18000 sentences...
Entered 18500 sentences..

Entered 13500 sentences...
Entered 14000 sentences...
Entered 14500 sentences...
Entered 15000 sentences...
Entered 15500 sentences...
Entered 16000 sentences...
Entered 16500 sentences...
Entered 17000 sentences...
Entered 17500 sentences...
Entered 18000 sentences...
Entered 18500 sentences...
Entered 19000 sentences...
Entered 19500 sentences...
Entered 0 sentences...
Entered 500 sentences...
Entered 1000 sentences...
Entered 1500 sentences...
Entered 2000 sentences...
Entered 2500 sentences...


<IPython.core.display.Javascript object>

In [31]:
"something something {disfmarker}".replace("{disfmarker}", "")

'something something '

<IPython.core.display.Javascript object>