# Dataset Pre-Processing Notebook
The purpose of this notebook is to preprocess each file of the datasets we collected.
We want to merge all datasets into a single file (written as plain text to `datasets\corpus.txt`) with one story per entry.

In [38]:
import csv
import os
import json

from time import time
import logging as log
import functools

from nltk.tokenize import sent_tokenize
import re

### Settings

In [39]:
# Root logger: timestamped, level-aligned messages at INFO and above.
log.basicConfig(
    level=log.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
)

# Switch read by log_dec: emit start/finish messages for decorated functions.
log_enabled = True
# Switch read by run_notebook: actually execute the decorated functions.
show_notebook_results = True

### General functions

#### Decorators

In [40]:
def log_dec(func):
    """Decorate ``func`` so that its start and finish are logged at INFO level.

    Logging only happens while the module-level ``log_enabled`` flag is truthy.
    The flag is read once per call, so the start and finish messages always
    come as a pair even if the flag changes while ``func`` is running. The
    finish message includes the elapsed wall-clock time and is emitted from a
    ``finally`` clause, i.e. also when ``func`` raises; the exception itself
    propagates unchanged.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards all arguments to ``func`` and returns its
        result.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Decide once per call: re-checking the flag after the call could
        # reach the timing code without a recorded start time.
        if not log_enabled:
            return func(*args, **kwargs)

        start_time = time()
        log.info('{} started'.format(func.__name__))
        try:
            return func(*args, **kwargs)
        finally:
            duration = time() - start_time
            log.info('{} finished in {:.3f}s'.format(func.__name__, duration))
    return wrapper

def run_notebook(func):
    """Make ``func`` a no-op unless ``show_notebook_results`` is truthy.

    The module-level flag is checked on every call. When it is falsy the
    wrapped function is not executed and the wrapper returns ``None``;
    otherwise the call is forwarded and its result returned.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Guard clause: skip the (potentially expensive) call entirely.
        if not show_notebook_results:
            return None
        return func(*args, **kwargs)
    return wrapper

def save_and_load_to_path(path):
    """Build a decorator that caches a function's result as JSON at ``path``.

    On each call of the decorated function:

    * if ``path`` holds readable JSON, that content is returned and the
      function is not executed (the call arguments are ignored);
    * otherwise the function is executed exactly once, its result is written
      to ``path`` as JSON and then returned.

    Args:
        path: Location of the JSON cache file.

    Returns:
        A decorator to apply to the function whose result should be cached.

    Raises:
        TypeError: From the decorated call if the result is not JSON
            serializable. Nothing is written to ``path`` in that case.

    Note:
        A value loaded from the cache went through JSON, so e.g. tuples come
        back as lists.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                with open(path, 'r') as file:
                    return json.load(file)
            except (OSError, ValueError):
                # Cache file missing, unreadable or not valid JSON
                # (json.JSONDecodeError is a ValueError): recompute below.
                pass

            result = func(*args, **kwargs)
            # Serialize before opening the file so that a failing dump cannot
            # leave a truncated cache file behind.
            payload = json.dumps(result)
            with open(path, 'w') as file:
                file.write(payload)
            return result
        return wrapper
    return decorator

#### Utils

In [41]:

@log_dec
def read_in_text(file_path):
    """Return the complete content of the text file at ``file_path``.

    The file is opened in text mode with the platform's default encoding and
    read in one go.
    """
    with open(file_path, mode='r') as handle:
        return handle.read()

@log_dec
def read_in_csv(file_path, data_range=None):
    """Flatten a CSV file into a single string, dropping its header row.

    Every data row becomes one line of the result: the selected cells are
    joined with a comma followed by a space, and the rows are joined with a
    newline character.

    Args:
        file_path: Path of the comma-separated file to read.
        data_range: Iterable of column indices to keep, in the given order.
            ``None`` (the default) keeps every column.

    Returns:
        The merged text. An empty string if the file has no data rows,
        including the case of a completely empty file.

    Raises:
        IndexError: If a row has fewer cells than an index in ``data_range``.
    """
    line_delim = '\n'
    column_delim = ', '

    with open(file_path, 'r') as file:
        rows = csv.reader(file, delimiter=',')
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(rows, None)
        if data_range is not None:
            # Materialize once so a one-shot iterator works for every row.
            columns = list(data_range)
            rows = ([row[i] for i in columns] for row in rows)
        # The join consumes the reader, so it has to happen while the file
        # is still open.
        merged_data = line_delim.join(column_delim.join(row) for row in rows)
    return merged_data

## Children stories text corpus


* This corpus has each sentence in a new row.
* There is no \<eos\>. Stories are mostly separated by "\n\n Title \n\n".
* We will use "\n\n" as an indicator for a new story.
* We will use "\n" as indicator for \<eos\>.

In [42]:
@run_notebook
@log_dec
def tokenize_children_stories():
    """Load the children-stories corpus and split it into sentences per story.

    The raw text is cut into stories at every double newline; each story is
    then split into sentences with NLTK's ``sent_tokenize``.

    Returns:
        A list of stories, each being a list of sentence strings.
    """
    raw_text = read_in_text('datasets_raw_data\\children_stories_text_corpus\\cleaned_merged_fairy_tales_without_eos.txt')
    return [sent_tokenize(story) for story in raw_text.split('\n\n')]

## Poe Short Stories


* This corpus is composed of a csv-file. The structure is: title, text, wikipedia_title, publication_date, first_published_in, classification, notes, normalized_date.
* We can thus easily export the text and apply story separation.
* We need to use ". " as \<eos\>.

In [43]:
@run_notebook
@log_dec
def tokenize_poe_short_stories():
    """Load the Poe short stories and split them into sentences per story.

    Only CSV column 1 is read. ``read_in_csv`` returns one line per CSV row,
    so the merged text is cut at every newline and each piece is split into
    sentences with NLTK's ``sent_tokenize``.

    Returns:
        A list of stories, each being a list of sentence strings.
    """
    text_column = read_in_csv('datasets_raw_data\\poe_short_stories\\preprocessed_data.csv', data_range=[1])
    return [sent_tokenize(story) for story in text_column.split('\n')]

## Reddit Short Stories


* Each line of [reddit_short_stories.txt](https://github.com/tdude92/reddit-short-stories/blob/main/reddit_short_stories.txt) is one full short story.
* Each short story begins with an "\<sos>" token and ends with an "\<eos>" token (eg. "\<sos> once upon a time, the end \<eos>").
* Newline characters in a story are replaced with the "\<nl>" token (eg. "\<sos> line 1 \<nl> line 2 \<eos>")

In [44]:
@run_notebook
@log_dec
def tokenize_reddit_short_stories():
    """Load the Reddit short stories and split them into sentences per story.

    The ``<sos>``, ``<eos>`` and ``<nl>`` marker tokens are removed from the
    raw text. The corpus stores one story per line, so the text is cut at
    every single newline; blank lines are skipped so that no empty stories
    are produced. Each story is split into sentences with NLTK's
    ``sent_tokenize``.

    Returns:
        A list of stories, each being a list of sentence strings.
    """
    data = read_in_text('datasets_raw_data\\popular_reddit_short_stories\\reddit_short_stories.txt')
    data_strip = re.sub(r'\<sos\>|\<eos\>|\<nl\>', '', data)
    # One story per line: splitting on a double newline would merge stories.
    return [sent_tokenize(line) for line in data_strip.split('\n') if line.strip()]

## Scifi Stories Text Corpus


A really big corpus; we do not use it for now.

## Sherlock Holmes


* Each story is stored in a separate file.
* At the end of each story is this block:
    *      ----------
        This text is provided to you "as-is" without any warranty. No
        warranties of any kind, expressed or implied, are made to you as to
        the text or any medium it may be on, including but not limited to
        warranties of merchantablity or fitness for a particular purpose.

        This text was formatted from various free ASCII and HTML variants.
        See http://sherlock-holm.es for an electronic form of this text and
        additional information about it.

        This text comes from the collection's version 3.1.
* Sentences are not well separated into lines. No obvious solution at hand.

In [45]:
@log_dec
def tokenize_single_file_sherlock_holmes(file):
    """Split a single Sherlock Holmes story file into sentences.

    Every newline and every run of two or more whitespace characters is
    replaced by one space, everything from the first ``---`` onwards is
    dropped, and the remaining text is split into sentences with NLTK's
    ``sent_tokenize``.

    Args:
        file: Path of the story file.

    Returns:
        A list of sentence strings.
    """
    raw_text = read_in_text(file)
    flattened = re.sub(r'\n|\s{2,}', ' ', raw_text)
    # Only the part in front of the first separator is kept.
    story_body = re.split(r'---', flattened, maxsplit=1)[0]
    return sent_tokenize(story_body)

@run_notebook
@log_dec
def tokenize_sherlock_holmes():
    """Tokenize every file of the Sherlock Holmes directory into sentences.

    Each directory entry is passed to
    ``tokenize_single_file_sherlock_holmes``. Entries are processed in sorted
    name order so that the resulting story order is reproducible.

    Returns:
        A list of stories, each being a list of sentence strings.
    """
    story_dir = 'datasets_raw_data\\sherlock_holmes'
    # os.listdir() returns entries in arbitrary order, hence the sort.
    return [tokenize_single_file_sherlock_holmes(story_dir + '\\' + file_name)
            for file_name in sorted(os.listdir(story_dir))]

## The Works of Charles Dickens


* This dataset is really messy.
* Leave for later.

# Combine datasets


We want to combine our different datasets such that we have one large corpus of appropriately sized batches.

In [None]:
@log_dec
def tokenize_all():
    """Tokenize all datasets and write them to one combined corpus file.

    The sentences of a story are joined with a single newline and the stories
    are joined with a double newline. The resulting text is written to
    ``datasets\\corpus.txt`` and also returned.

    Returns:
        The combined corpus text. An empty string if no tokenizer produced
        any story.
    """
    story_delim = '\n\n'
    sentence_delim = '\n'
    story_collections = [
        tokenize_children_stories(),
        tokenize_poe_short_stories(),
        tokenize_reddit_short_stories(),
        tokenize_sherlock_holmes(),
    ]
    # Tokenizers wrapped in run_notebook return None while
    # show_notebook_results is off; treat that as "no stories". Stories
    # without sentences are skipped so they cannot produce back-to-back
    # story delimiters in the output.
    corpus = [story
              for collection in story_collections if collection is not None
              for story in collection if story]
    # Build the text before opening the file so that a failure here does not
    # truncate an existing corpus file.
    text_data = story_delim.join(sentence_delim.join(story) for story in corpus)
    with open('datasets\\corpus.txt', 'w') as file:
        file.write(text_data)
    return text_data

# Run the whole pipeline: tokenize every dataset and write the combined corpus file.
tokenize_all()