# Dataset Pre-Processing Notebook
The purpose of this notebook is to preprocess each file of the datasets we collected.
We want all the datasets merged into a single text file (`datasets\\corpus.txt`) with one story per entry.

In [2]:
import csv
import os
import json

from time import time
import logging as log
import functools

from nltk.tokenize import sent_tokenize
import re

import tensorflow as tf

### Settings

In [3]:
# Configure the root logger: INFO level, each message prefixed with a
# timestamp and the left-aligned level name.
log.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=log.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')

# Read by log_dec: when True, decorated functions log when they start and finish.
log_enabled = True
# Read by run_notebook: when True, decorated functions are executed; when
# False, they are skipped and return None.
show_notebook_results = True

### General functions

#### Decorators

1. `log_dec(func)`:

    This is a decorator function that logs the start and end time of the function it decorates if logging is enabled. 

    Parameters:
    
    `func`: The function to be logged.

    How it works: 

    - The `wrapper` function is defined to wrap around the `func`.
    - If `log_enabled` is `True`, the start time of the function is logged.
    - The `func` is then executed with its arguments (`*args` and `**kwargs`).
    - If there's any exception, it's raised; otherwise, the function returns the output of `func`.
    - Finally, if `log_enabled` is `True`, the duration of the function execution is calculated and logged.

2. `run_notebook(func)`:

    This is a decorator function that controls the display of the function it decorates based on the `show_notebook_results` flag.

    Parameters:

    `func`: The function whose results are to be controlled.

    How it works:

    - The `wrapper` function is defined to wrap around `func`.
    - If `show_notebook_results` is `True`, the function is executed and its result is returned.
    - If `show_notebook_results` is `False`, the function does not execute and no result is returned.

3. `save_and_load_to_path(path)`:

    This is a decorator factory that generates a decorator for saving the output of a function to a JSON file and loading it back the next time the function is called. 

    Parameters:

    `path`: The path where the JSON file is saved.

    How it works:

    - A decorator `decorator(func)` is defined that takes a function `func` to be decorated.
    - Inside this decorator, a `wrapper` function is defined to wrap around `func`.
    - Initially, `save_data` is set to `True`.
    - It then tries to open and load the JSON file at the given `path`. If it succeeds, it sets `save_data` to `False` and returns the loaded data.
    - If it fails to load the data, it runs `func` and returns its output.
    - If there's any exception, it's raised.
    - Finally, if `save_data` is still `True` (meaning the function was run and its output wasn't already saved), it saves the output of `func` to the JSON file at `path`.
    
    Note: These decorators are higher-order functions that use `functools.wraps()` to preserve the metadata of `func`.

In [40]:
def log_dec(func):
    """Decorate ``func`` so that its start, finish and run time are logged.

    Logging only happens while the module-level ``log_enabled`` flag is
    truthy; otherwise the wrapper simply forwards the call.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same metadata as ``func`` that forwards all
        positional and keyword arguments and returns ``func``'s result.
        Exceptions raised by ``func`` propagate unchanged; the "finished"
        message is still emitted in that case.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Read the flag once per call: the start and finish messages stay
        # paired, and start_time is guaranteed to exist whenever it is used,
        # even if log_enabled is toggled while func is running.
        enabled = log_enabled
        if enabled:
            start_time = time()
            log.info('{} started'.format(func.__name__))
        try:
            return func(*args, **kwargs)
        finally:
            if enabled:
                duration = time() - start_time
                # Report the measured run time instead of discarding it.
                log.info('{} finished in {:.3f}s'.format(func.__name__, duration))
    return wrapper

def run_notebook(func):
    """Decorate ``func`` so that it only runs when notebook results are shown.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same metadata as ``func``. While the module-level
        ``show_notebook_results`` flag is truthy it forwards the call and
        returns ``func``'s result; otherwise ``func`` is not called and the
        wrapper returns ``None``.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Guard clause: skip the wrapped function entirely when disabled.
        if not show_notebook_results:
            return None
        return func(*args, **kwargs)
    return wrapper

def save_and_load_to_path(path):
    """Build a decorator that caches a function's result as JSON at ``path``.

    On each call the wrapper first tries to load ``path`` as JSON and returns
    the loaded data if that succeeds. Otherwise it calls the wrapped function
    exactly once, writes the result to ``path`` as JSON and returns it.

    The cache is keyed by ``path`` only: the call arguments are not part of
    the key, so once the file exists its content is returned for any
    arguments.

    Args:
        path: Location of the JSON cache file.

    Returns:
        A decorator that wraps a function with the load-or-compute-and-save
        behaviour described above.

    Raises:
        TypeError: From the wrapper, if the function's result is not JSON
            serialisable (nothing is written in that case).
        OSError: From the wrapper, if the cache file cannot be written.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                with open(path, 'r') as file:
                    return json.load(file)
            except (OSError, ValueError):
                # Cache miss: the file is missing/unreadable (OSError) or does
                # not contain valid JSON (JSONDecodeError is a ValueError).
                # Fall through and recompute.
                pass

            result = func(*args, **kwargs)
            # Serialise before opening the file so that a non-serialisable
            # result cannot leave a truncated cache file behind.
            payload = json.dumps(result)
            with open(path, 'w') as file:
                file.write(payload)
            return result
        return wrapper
    return decorator

#### Utils

1. `read_in_text(file_path)`:

    This function reads in a text file and returns its contents as a string. 

    Parameters:
    
    `file_path`: A string specifying the path to the text file.

    How it works:

    - The function opens the file at `file_path` in read mode.
    - It then reads the entire content of the file into a string.
    - Finally, it returns this string.

    Note: This function is decorated with `@log_dec`, which will log the start and end time of its execution if logging is enabled.

2. `read_in_csv(file_path, data_range=None)`:

    This function reads in a CSV file and returns its content as a string.
    
    Parameters:

    `file_path`: A string specifying the path to the CSV file.
    
    `data_range` (optional): A range of column indices. If provided, only these columns will be included in the output string. Default is `None`, in which case all columns are included.

    How it works:

    - The function opens the file at `file_path` in read mode.
    - It then reads the content of the file into a CSV reader object, with a comma as the delimiter.
    - The header row of the CSV file is skipped.
    - If `data_range` is not `None`, it creates a string `merged_data` that contains only the columns specified by `data_range` from each row of the CSV file, with the columns of a row joined by a comma followed by a space (`', '`) and the rows joined by a newline.
    - If `data_range` is `None`, it creates a string `merged_data` that contains all columns from each row of the CSV file, with the columns of a row joined by a comma followed by a space (`', '`) and the rows joined by a newline.
    - Finally, it returns `merged_data`.

    Note: This function is decorated with `@log_dec`, which will log the start and end time of its execution if logging is enabled.

In [41]:

@log_dec
def read_in_text(file_path):
    """Return the whole content of the text file at ``file_path``.

    Args:
        file_path: Path of the file to read; it is opened in text read mode.

    Returns:
        The file content as a single string.
    """
    with open(file_path, 'r') as handle:
        return handle.read()

@log_dec
def read_in_csv(file_path, data_range=None):
    """Read a comma-delimited CSV file and flatten it into one string.

    The first row is treated as a header and skipped. Within a row the
    selected cells are joined with ``', '``; rows are joined with ``'\\n'``.

    Args:
        file_path: Path of the CSV file to read.
        data_range: Optional iterable of column indices. When given, only
            these columns are kept, in the given order. When ``None`` all
            columns of every row are kept.

    Returns:
        The merged text. An empty file, or one that only contains the header
        row, yields an empty string.

    Raises:
        IndexError: If ``data_range`` names a column that a row does not have.
    """
    line_delim = '\n'
    column_delim = ', '

    with open(file_path, 'r') as file:
        rows = csv.reader(file, delimiter=',')
        # Skip the header row. The default keeps an empty file from raising
        # StopIteration.
        next(rows, None)
        if data_range is not None:
            # Materialise the indices so a one-shot iterable is not exhausted
            # after the first row.
            columns = tuple(data_range)
            lines = (column_delim.join(row[i] for i in columns) for row in rows)
        else:
            lines = (column_delim.join(row) for row in rows)
        # The generator reads from the open file, so consume it before the
        # with-block closes the handle.
        return line_delim.join(lines)

# Dataset tokenization

1. `tokenize_children_stories()`:

    This function reads in a text file containing children's stories, splits the text into paragraphs, and then tokenizes each paragraph into sentences. It returns a list of lists, where each inner list contains the sentences of a paragraph.

    How it works:

    - The function reads in a text file using the `read_in_text()` function. The file path is 'datasets_raw_data\\children_stories_text_corpus\\cleaned_merged_fairy_tales_without_eos.txt'.
    - It then splits the read text into paragraphs at double newline ('\n\n') characters.
    - Next, it tokenizes each paragraph into sentences using the `sent_tokenize()` function from the NLTK library. This results in a list of lists, where each inner list contains the sentences of a paragraph.
    - Finally, it returns this list of lists.

2. `tokenize_poe_short_stories()`:

    This function reads in a CSV file containing short stories by Edgar Allan Poe, splits the text into paragraphs, and then tokenizes each paragraph into sentences. It returns a list of lists, where each inner list contains the sentences of a paragraph.

    How it works:

    - The function reads in a CSV file using the `read_in_csv()` function. The file path is 'datasets_raw_data\\poe_short_stories\\preprocessed_data.csv', and the `data_range` parameter is set to `[1]`, meaning that only the second column of the CSV file will be read in.
    - It then splits the read text into paragraphs at newline ('\n') characters.
    - Next, it tokenizes each paragraph into sentences using the `sent_tokenize()` function from the NLTK library. This results in a list of lists, where each inner list contains the sentences of a paragraph.
    - Finally, it returns this list of lists.

    The remaining tokenization functions are documented below:

3. `tokenize_reddit_short_stories()`:

    This function reads a text file containing popular Reddit short stories, cleans the data by removing specific tags (`<sos>`, `<eos>`, `<nl>`), and then tokenizes the data into sentences.

    How it works:

    - It reads the text file using `read_in_text()` function from the specified path.
    - It removes the specific tags from the data using `re.sub()`.
    - It splits the cleaned data into paragraphs at double newline (`\n\n`) characters.
    - It tokenizes each paragraph into sentences using `sent_tokenize()` function, resulting in a list of lists, where each inner list contains the sentences of a paragraph.
    - Finally, it returns this list of lists.

    Note: This function is decorated with `@run_notebook` and `@log_dec`, implying that its execution can be toggled and its execution time will be logged if logging is enabled.

4. `tokenize_single_file_sherlock_holmes(file)`:

    This function reads a single text file of Sherlock Holmes stories, cleans the data by removing newline characters and extra white spaces, then tokenizes the data into sentences.

    Parameters:
    
    `file`: The path of the text file to be read and tokenized.

    How it works:

    - It reads the text file using `read_in_text()` function from the specified path.
    - It replaces every newline character and every run of two or more whitespace characters with a single space using `re.sub()`.
    - It splits the cleaned data at `---` characters and takes the first part.
    - It tokenizes the cleaned data into sentences using `sent_tokenize()` function.
    - Finally, it returns the list of tokenized sentences.

    Note: This function is decorated with `@log_dec`, meaning that its execution time will be logged if logging is enabled.

5. `tokenize_sherlock_holmes()`:

    This function reads and tokenizes all the text files in the Sherlock Holmes dataset directory.

    How it works:

    - It iterates over all the files in the 'datasets_raw_data\\sherlock_holmes' directory.
    - For each file, it applies the `tokenize_single_file_sherlock_holmes()` function, which returns a list of tokenized sentences.
    - Finally, it returns a list of these lists.

    Note: This function is decorated with `@run_notebook` and `@log_dec`, meaning that its execution can be toggled and its execution time will be logged if logging is enabled.

6. `tokenize_all()`:

    This function tokenizes all the stories from different sources (children stories, Poe short stories, Reddit short stories, Sherlock Holmes stories), and writes the tokenized stories into a text file.

    How it works:

    - It calls the functions `tokenize_children_stories()`, `tokenize_poe_short_stories()`, `tokenize_reddit_short_stories()`, and `tokenize_sherlock_holmes()`.
    - It concatenates all the tokenized stories into one list.
    - It writes the concatenated list into a text file at 'datasets\\corpus.txt'.
    - Finally, it returns the text data that was written into the file.

    Note: This function is decorated with `@log_dec`, meaning that its execution time will be logged if logging is enabled.

In [42]:
@run_notebook
@log_dec
def tokenize_children_stories():
    """Tokenize the children stories corpus into sentences.

    Returns:
        A list with one entry per ``'\\n\\n'``-separated chunk of the source
        file; each entry is the list of sentences ``sent_tokenize`` found in
        that chunk.
    """
    raw_text = read_in_text('datasets_raw_data\\children_stories_text_corpus\\cleaned_merged_fairy_tales_without_eos.txt')
    # Chunks are delimited by a blank line in the raw file.
    return [sent_tokenize(chunk) for chunk in raw_text.split(sep='\n\n')]

In [43]:
@run_notebook
@log_dec
def tokenize_poe_short_stories():
    """Tokenize the Poe short stories CSV into sentences.

    Only the column with index 1 of the CSV file is read.

    Returns:
        A list with one entry per line of the merged CSV text; each entry is
        the list of sentences ``sent_tokenize`` found in that line.
    """
    merged_rows = read_in_csv('datasets_raw_data\\poe_short_stories\\preprocessed_data.csv', data_range=[1])
    # read_in_csv joins the rows with '\n', so splitting on it yields the rows again
    # (assuming the cells themselves contain no newline -- not verifiable from here).
    return [sent_tokenize(line) for line in merged_rows.split(sep='\n')]

In [44]:
@run_notebook
@log_dec
def tokenize_reddit_short_stories():
    """Tokenize the Reddit short stories corpus into sentences.

    The markers ``<sos>``, ``<eos>`` and ``<nl>`` are removed from the raw
    text before it is split.

    Returns:
        A list with one entry per ``'\\n\\n'``-separated chunk of the cleaned
        text; each entry is the list of sentences ``sent_tokenize`` found in
        that chunk.
    """
    raw_text = read_in_text('datasets_raw_data\\popular_reddit_short_stories\\reddit_short_stories.txt')
    # Drop the markers before tokenizing.
    without_tags = re.sub(r'\<sos\>|\<eos\>|\<nl\>', '', raw_text)
    return [sent_tokenize(chunk) for chunk in without_tags.split(sep='\n\n')]

In [45]:
@log_dec
def tokenize_single_file_sherlock_holmes(file):
    """Tokenize one Sherlock Holmes text file into sentences.

    Args:
        file: Path of the text file to read.

    Returns:
        The list of sentences ``sent_tokenize`` found in the part of the file
        that precedes the first ``---``.
    """
    raw_text = read_in_text(file)
    # Flatten the text: every newline and every run of 2+ whitespace
    # characters becomes a single space.
    flattened = re.sub(r'\n|\s{2,}', ' ', raw_text)
    # Keep only what precedes the first '---'
    # (presumably a trailing section follows it -- confirm against the data).
    story_text = re.split(r'---', flattened)[0]
    return sent_tokenize(story_text)

@run_notebook
@log_dec
def tokenize_sherlock_holmes():
    """Tokenize every file of the Sherlock Holmes dataset directory.

    Returns:
        A list with one entry per directory entry, in the order returned by
        ``os.listdir``; each entry is the sentence list produced by
        ``tokenize_single_file_sherlock_holmes`` for that file.
    """
    directory = 'datasets_raw_data\\sherlock_holmes'
    tokenized_files = []
    for file_name in os.listdir(directory):
        file_path = directory + '\\' + file_name
        tokenized_files.append(tokenize_single_file_sherlock_holmes(file_path))
    return tokenized_files

In [None]:
@log_dec
def tokenize_all():
    """Tokenize all datasets and write them to ``datasets\\corpus.txt``.

    The stories of all four sources are concatenated in the order children
    stories, Poe, Reddit, Sherlock Holmes. In the output, the sentences of a
    story are separated by ``'\\n'`` and stories by ``'\\n\\n'``.

    Returns:
        The text that was written to the corpus file.

    NOTE: the per-dataset tokenizers are wrapped with ``run_notebook`` and
    return ``None`` while ``show_notebook_results`` is False; flattening such
    a result raises ``TypeError``.
    """
    story_delim = '\n\n'
    sentence_delim = '\n'
    collections = [
        tokenize_children_stories(),
        tokenize_poe_short_stories(),
        tokenize_reddit_short_stories(),
        tokenize_sherlock_holmes(),
    ]
    # Flatten the per-dataset story lists into one list of stories.
    corpus = []
    for collection in collections:
        corpus.extend(collection)
    with open('datasets\\corpus.txt', 'w') as file:
        text_data = story_delim.join(sentence_delim.join(story) for story in corpus)
        file.write(text_data)
    return text_data

tokenize_all()

## Scifi Stories Text Corpus


A really big corpus, don't use for now.

## The Works of Charles Dickens


* This dataset is really messy.
* Leave for later.

## bookcorpusopen

# Experimental

In [10]:
def read_txt_line(file_path):
    """Return the lines of the text file at ``file_path``.

    Args:
        file_path: Path of the file to read; it is opened in text read mode.

    Returns:
        A list with one string per line; line endings are kept.
    """
    with open(file_path, 'r') as handle:
        # Iterating a text file yields its lines including the newline.
        return list(handle)

def fit_text_data_to_length(sentences, length):
    """Greedily pack consecutive sentences into chunks of limited word count.

    Sentences are stripped of surrounding whitespace and appended to the
    current chunk, separated by a single space, until adding the next one
    would push the chunk above ``length`` whitespace-separated words; then a
    new chunk is started. Blank sentences are skipped. A single sentence that
    is longer than ``length`` words is not split and forms a chunk of its own.

    Args:
        sentences: Iterable of strings (e.g. the lines of a text file).
        length: Maximum number of words per chunk.

    Returns:
        A tuple ``(dataset, combined_sentences)`` where ``combined_sentences``
        is the list of chunk strings and ``dataset`` is the result of
        ``tf.data.Dataset.from_tensor_slices(combined_sentences)``.
    """
    combined_sentences = []
    current_parts = []       # sentences of the chunk being built
    current_word_count = 0   # running word count, avoids re-splitting the chunk

    for sentence in sentences:
        sentence = sentence.strip()  # Remove leading/trailing whitespace
        if not sentence:
            # Blank lines carry no words; skipping them avoids stray spaces.
            continue
        word_count = len(sentence.split())

        # Close the current chunk if this sentence would exceed the limit.
        # The current_parts check prevents emitting an empty chunk when the
        # very first sentence is already longer than the limit.
        if current_parts and current_word_count + word_count > length:
            combined_sentences.append(" ".join(current_parts))
            current_parts = []
            current_word_count = 0

        current_parts.append(sentence)
        current_word_count += word_count

    # Add the last, still open chunk.
    if current_parts:
        combined_sentences.append(" ".join(current_parts))

    dataset = tf.data.Dataset.from_tensor_slices(combined_sentences)

    return dataset, combined_sentences

def save_combined_sentences(file_path, combined_sentences):
    """Write each entry of ``combined_sentences`` on its own line.

    Args:
        file_path: Path of the output file; an existing file is overwritten.
        combined_sentences: Iterable of strings to write. A ``'\\n'`` is
            appended to every entry.
    """
    with open(file_path, 'w') as handle:
        handle.writelines(entry + '\n' for entry in combined_sentences)

# Re-pack the lines of the corpus file into chunks using a 512-word limit.
# The tf.data.Dataset that is returned as first element is discarded here.
_, sentences = fit_text_data_to_length(read_txt_line('datasets\\corpus.txt'), 512)

# Write the chunks to disk, one chunk per line.
save_combined_sentences('datasets\\tight_fit_dataset_512.txt', sentences)
