In [1]:
from pathlib import Path
import pandas as pd
import torch
import numpy as np

In [2]:
# Novellas: collect the plain-text files of the corpus.
# The paths are sorted so that the row order of everything derived from this
# list (the corpus table, positional lookups, seeded splits) does not depend
# on the order in which the filesystem happens to enumerate the directory.

files = sorted(Path('/mnt/data/corpora/novellenschatz/novellenschatz3/Novellen im txt-Format/').glob('*.txt'))

In [3]:
from typing import List
import re
# Splits a file stem into three groups: everything up to an underscore
# (optionally preceded by commas, which are not captured), then the shortest
# run up to the first '-', then the rest. The groups are used as
# lastname, firstname and title respectively.
filename_splitter_regex = r'(.+[^,]),*_(.+?)-(.+)'

def build_dataframe(files: List[Path]) -> pd.DataFrame:
    """
    Builds the corpus table with one row per text file.

    The author's last name, first name and the title are parsed from the file
    stem with `filename_splitter_regex`; the file content becomes the `text`
    column.

    Args:
        files: Paths of the text files to load.
    Returns:
        DataFrame with the columns `title`, `author_firstname`,
        `author_lastname`, `filename` and `text`. Files whose name does not
        match the expected pattern are reported and skipped, so the frame may
        have fewer rows than `files` (and is empty, but still has all columns,
        if nothing matched).
    """
    columns = ['title', 'author_firstname', 'author_lastname', 'filename', 'text']
    data = []
    for file in files:
        filename = file.stem
        match = re.search(filename_splitter_regex, filename)
        if match is None:
            # Without this `continue` the code fell through to match.group(...)
            # and died with an AttributeError right after printing the name.
            print(f'Skipping file with unexpected name: {filename}')
            continue
        lastname, firstname, title = match.groups()
        # NOTE: read_text() without an explicit encoding uses the platform default.
        text = file.read_text()
        data.append({'title': title, 'author_firstname': firstname, 'author_lastname': lastname, 'filename': filename, 'text': text})
    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(data, columns=columns)

In [4]:
# Parse every corpus file into one table row (title, author names, filename, full text).
df = build_dataframe(files)

In [5]:
# Persist the corpus table; index=False keeps the row index out of the CSV
# (`index` is a boolean option, so pass an explicit False rather than None).
df.to_csv('novellenschatz.csv', index=False)

In [6]:
# Distribution of the text lengths, counted in whitespace-separated tokens.
df['text'].str.split().map(len).describe()


count       86.000000
mean     18649.186047
std      11330.957496
min       2933.000000
25%      12478.750000
50%      16604.000000
75%      22906.250000
max      65327.000000
Name: text, dtype: float64

In [7]:
from nltk import sent_tokenize

In [8]:
# Sentence-split every text once and derive both statistics from that single
# result instead of running the sentence tokenizer twice per text.
_sents_per_text = df.text.apply(lambda text: sent_tokenize(text, language='german'))
df['n_sents'] = _sents_per_text.apply(len)
# Sentence length is measured in characters (len of each sentence string).
df['mean_sent_length'] = _sents_per_text.apply(lambda sents: np.mean([len(s) for s in sents]))

In [9]:
# Distribution of the number of sentences per text.
df['n_sents'].describe()

count      86.000000
mean      956.197674
std       619.520855
min       101.000000
25%       534.000000
50%       833.000000
75%      1279.750000
max      3709.000000
Name: n_sents, dtype: float64

In [10]:
# Distribution of the per-text mean sentence length, measured in characters.
df['mean_sent_length'].describe()

count     86.000000
mean     128.497126
std       34.126777
min       64.378049
25%      101.693678
50%      124.939084
75%      143.892011
max      248.660422
Name: mean_sent_length, dtype: float64

In [11]:
# One list of sentence strings per text, produced by the German sentence tokenizer.
sentences = df.text.apply(lambda text: sent_tokenize(text, language='german'))

In [12]:
# Pick the sentence list stored under index label 1 as a sample.
# NOTE(review): despite its name, `text` holds a list of sentences here, not a string.
text = sentences[1]

In [13]:
import numpy as np
from typing import Tuple, List

def shuffle_sentences(sents: List[str], window_size_mean: float = 10, window_size_std: int = 3, random_state: int = 42) -> List[Tuple[np.ndarray, np.ndarray]]:
    """
    Splits a text (represented as list of sentences) into consecutive chunks of
    random size and shuffles the sentences within each chunk.

    Chunk sizes are drawn from a normal distribution (absolute value, rounded),
    raised to at least 2 and capped by the number of sentences still left, so
    only the final chunk can be shorter than 2.

    Args:
        sents: Sentences in their original order. The list is not modified.
        window_size_mean: Mean of the chunk-size distribution.
        window_size_std: Standard deviation of the chunk-size distribution.
        random_state: Seed; the same seed always yields the same result.
    Returns:
        List of tuples, one per chunk: the shuffled sentences (array of
        strings) and their indices in the original text order (array of ints).
        Empty if `sents` is empty.
    """
    # A private generator produces the same stream as seeding the global legacy
    # RNG with `random_state`, but leaves the caller's global random state alone.
    rng = np.random.RandomState(random_state)
    orig_sents = np.array(sents)
    n_total = len(sents)

    chunks = []
    start = 0
    # Walk over the text with a cursor instead of popping from the front of
    # Python lists, which was quadratic in the number of sentences.
    while start < n_total:
        n_sents = int(round(np.abs(rng.normal(loc=window_size_mean, scale=window_size_std))))
        n_sents = max(n_sents, 2)
        n_sents = min(n_sents, n_total - start)
        selected_idx = np.arange(start, start + n_sents)
        rng.shuffle(selected_idx)
        # Fancy indexing copies, so each chunk owns its sentence array.
        chunks.append((orig_sents[selected_idx], selected_idx))
        start += n_sents
    return chunks


def make_prepare_function(tokenizer):
    """
    Creates the batch-wise mapping function that turns texts into
    sentence-ordering training instances.

    Args:
        tokenizer: Object providing `cls_token`; that token is placed in front
            of every sentence of a training text.
    Returns:
        A function taking a column-oriented batch (dict mapping column name to
        a list of values, including a `text` column) and returning a batch in
        the same format with one row per shuffled sentence window.
    """
    def prepare_sentence_ordering_dataset(entries):
        """
        Expands every text of the batch into its shuffled sentence windows.

        Each output row keeps the other columns of its source row and gets
        `text` (the shuffled sentences, each preceded by the CLS token),
        `orig_idx` (original positions of the shuffled sentences) and
        `so_targets` (argsort of `orig_idx`).
        """
        # Convert to list format [{k0: value00, k1: value10}, {k0: value01, k1: value11}]
        entries_as_dicts = [dict(zip(entries, values)) for values in zip(*entries.values())]

        cls_token = tokenizer.cls_token
        converted_entries = []
        for entry in entries_as_dicts:
            sents = sent_tokenize(entry['text'], language='german')
            for shuffled_sents, shuffled_orig_idx in shuffle_sentences(sents):
                train_instance = entry.copy()
                train_instance.pop('text')
                train_instance['orig_idx'] = shuffled_orig_idx
                train_text = f'{cls_token} ' + f' {cls_token} '.join(shuffled_sents)
                # Sanity check: exactly one CLS marker per target position. This
                # trips if a sentence itself contains the CLS token string.
                assert train_text.count(cls_token) == len(shuffled_orig_idx)
                train_instance['text'] = train_text
                train_instance['so_targets'] = shuffled_orig_idx.argsort()
                converted_entries.append(train_instance)

        if not converted_entries:
            # Nothing to emit (empty batch, or no text produced a sentence):
            # return empty columns instead of failing on converted_entries[0].
            empty_batch = {key: [] for key in entries if key != 'text'}
            empty_batch.update({'orig_idx': [], 'text': [], 'so_targets': []})
            return empty_batch

        # Convert the rows back to column format {k0: [...], k1: [...]}.
        new_entry = {key: [entry[key] for entry in converted_entries] for key in converted_entries[0]}
        return new_entry

    return prepare_sentence_ordering_dataset

In [14]:
#def make_prepare_function(tokenizer):
#    def prepare_sentence_ordering_dataset(entries):
#        
#        text = entry['text']
#        sents = sent_tokenize(text, language='german')
#        shuffled = shuffle_sentences(sents)
#        train_instances = []
#        for shuffled_sents, shuffled_orig_idx in shuffled:
#            train_instance = entry.copy()
#            train_instance.pop('text')
#            train_instance['orig_idx'] = shuffled_orig_idx
#            train_text = tokenizer.cls_token.join(shuffled_sents)
#            train_instance['text'] = train_text
#            #train_instances['so_targets'] = 1 / (shuffled_orig_idx.argsort() + 1) use later
#            
#            train_instance['so_targets'] = shuffled_orig_idx.argsort()
#            train_instances.append(train_instance)
#        return train_instances
#    return prepare_sentence_ordering_dataset

In [15]:
from transformers import BertTokenizerFast

# In the cells shown here only `tokenizer.cls_token` is read (it marks the start of each sentence).
# NOTE(review): 'bert-base-cased' is presumably an English checkpoint while the corpus is
# sentence-split as German — confirm this is the intended model.
# NOTE(review): `return_dict` looks like a model option rather than a tokenizer option —
# confirm it is needed here.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', return_dict=True)

In [16]:
# Bind the tokenizer into the batch-wise mapping function applied to the dataset.
prepare_sentence_ordering_dataset = make_prepare_function(tokenizer)

In [17]:
from datasets import Dataset, DatasetDict

# Reload the corpus from the CSV and expand every text into its shuffled
# sentence windows. Batched mode is needed because each input row produces
# several output rows.
dataset = Dataset.from_csv('novellenschatz.csv').map(
    prepare_sentence_ordering_dataset,
    batched=True,
)

Using custom data configuration default-aa38f69bc6e823bc


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /mnt/data/users/keller/.cache/csv/default-aa38f69bc6e823bc/0.0.0...


0 tables [00:00, ? tables/s]



  for obj in iterable:


  for obj in iterable:


Dataset csv downloaded and prepared to /mnt/data/users/keller/.cache/csv/default-aa38f69bc6e823bc/0.0.0. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?ba/s]

In [18]:
# Hold out 20% of the rows; the remaining 80% become the training split.
train_test = dataset.train_test_split(test_size=0.2, seed=42)

# Split the held-out 20% once more: 30% of it is split off, 70% of it is kept,
# i.e. roughly 6% and 14% of all rows.
test_validation = train_test['test'].train_test_split(test_size=0.3, seed=42)

# Mind the key mapping: the 'train' part of the second split is the final test
# set, its 'test' part is the validation set.
# NOTE(review): rows are sentence windows, so windows of the same text can end
# up in different splits — confirm this is intended.
dataset = DatasetDict({
    'train': train_test['train'],
    'test': test_validation['train'],
    'val': test_validation['test']})

In [19]:
# Show the splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['author_firstname', 'author_lastname', 'filename', 'orig_idx', 'so_targets', 'text', 'title'],
        num_rows: 6432
    })
    test: Dataset({
        features: ['author_firstname', 'author_lastname', 'filename', 'orig_idx', 'so_targets', 'text', 'title'],
        num_rows: 1125
    })
    val: Dataset({
        features: ['author_firstname', 'author_lastname', 'filename', 'orig_idx', 'so_targets', 'text', 'title'],
        num_rows: 483
    })
})

In [22]:
import shutil

# Clear any previous export, then write the splits to disk. This replaces the
# former trailing `! rm -r novellenschatz4so/` cell: it sat after the save, so a
# top-to-bottom run deleted the dataset that had just been written.
shutil.rmtree('novellenschatz4so', ignore_errors=True)
dataset.save_to_disk('novellenschatz4so')