# Preprocessing and embedding
This file includes code which classifies text chunks by author (Austen, Kafka, Sturluson, God and Dostoyevsky).
The training data consists of text chunks from their respective works: 'Pride and Prejudice', 'The Trial', 'Heimskringla', the 'King James Bible' and 'Crime and Punishment'. We obtain the texts from Project Gutenberg.

## Importing the data

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
# Load the sentence-embedding model
from sentence_transformers import SentenceTransformer

# Change the default model here.
model_name = "mixedbread-ai/mxbai-embed-large-v1"

# To save the model locally, replace the path below with your own path and run this cell.
# At the end of your path, add the name of the folder you want to create for the model, e.g. ".../local_model".
# When you have downloaded the model, uncomment `model_name = path` to use the stored model.
# path = "C:/Users/jonas/OneDrive/Dokumenter/Python Scripts/embed/local_model_sentence_transformers" # (example path / for my convenience)
# model_name = path # Uncomment this line once you have downloaded the model.

model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [24]:
# Helpers
def preprocessing(filepath, text, end=False):
    """
    Cuts the body of a book out of a raw text and splits it into words.

    The book is recognised by an author key occurring in `filepath`. Each book has a
    start marker and, for some books, its own end marker; everything outside
    [start marker, end marker) is dropped.

    filepath: path of the text file; must contain one of the author keys below
    text:     full content of the file
    end:      optional end index for the body. It is only used when the book has no end
              marker of its own; a falsy value means "cut at the '*** END' marker".

    Returns the list of whitespace-separated words of the body.
    Raises ValueError if the book is unknown or its start marker is not in `text`.
    """
    # (author key, start marker, author-specific end marker or None), checked in this order
    library = (
        ('austen', "Chapter I.]", None),
        ('dostoyevsky', "CHAPTER I", None),
        ('god', "1:1", "in the sight of all Israel."), # Only old testament
        ('kafka', "Chapter One", None),
        ('shelley', "_To", None),
        ('tolstoy', "Chapter 1", None),
        ('sturluson', "PREFACE OF SNORRE STURLASON.", "SAGA OF HARALD HARDRADE."), # Only Heimskringla
        ('cervantes', "Idle reader:", "Forse altro cantera con miglior plettro."), # Only Volume I
    )

    for key, start_marker, end_marker in library:
        if key in filepath:
            break
    else:
        raise ValueError("This book is not in our library!")

    # str.find returns -1 when the marker is absent; used as a slice bound that would
    # silently produce an empty or wrong body, so fail loudly instead.
    start = text.find(start_marker)
    if start == -1:
        raise ValueError(f"Start marker {start_marker!r} not found in {filepath!r}.")

    if end_marker is not None:
        # Search after the start only, so an earlier mention of the marker cannot end the body early.
        found = text.find(end_marker, start)
        end = found if found != -1 else False # fall back to the generic marker below

    if not end:
        found = text.find("*** END", start)
        end = found if found != -1 else len(text) # no end marker at all: keep the rest of the text

    return text[start:end].split()


In [10]:
def read_chunks(filepath, chunksize=50, max_len=0, Feedback=True):
    """
    Reads a text file and cuts its preprocessed words into chunks of equal length.

    filepath:  path of the text file (read as UTF-8); also passed to preprocessing() to select the book
    chunksize: number of words per chunk
    max_len:   if non-zero, only the first max_len words are used (must be larger than chunksize)
    Feedback:  if True, prints the number of words and chunks

    Returns a numpy object array with one space-joined string per chunk. Words left over
    after the last full chunk are discarded to ensure all chunks have the same length.
    """
    with open(filepath, encoding='utf-8') as infile:
        words = preprocessing(filepath, infile.read()) # list of words, preprocessed

    if max_len:
        assert max_len > chunksize, "max_len must be larger than chunksize"
        words = words[:max_len]

    length_words = len(words)
    n_chunks = length_words // chunksize
    chunks = np.empty(n_chunks, dtype=object)
    # Loop over the full chunks only: every slot of the array gets filled, also when the
    # number of words is an exact multiple of chunksize.
    for i in range(n_chunks):
        start = i * chunksize
        chunks[i] = ' '.join(words[start:start+chunksize])

    if Feedback:
        print(f"Length: {length_words:,} words, on {n_chunks:,} chunks of length {chunksize}.")

    return chunks


In [25]:
# Quick check: cut the Sturluson text into 150-word chunks and display the returned array
read_chunks('../Texts/sturluson.txt', chunksize=150)

Length: 201,265 words, on 1,341 chunks of length 150.


array(['PREFACE OF SNORRE STURLASON. In this book I have had old stories written down, as I have heard them told by intelligent people, concerning chiefs who have have held dominion in the northern countries, and who spoke the Danish tongue; and also concerning some of their family branches, according to what has been told me. Some of this is found in ancient family registers, in which the pedigrees of kings and other personages of high birth are reckoned up, and part is written down after old songs and ballads which our forefathers had for their amusement. Now, although we cannot just say what truth there may be in these, yet we have the certainty that old and wise men held them to be true. Thjodolf of Hvin was the skald of Harald Harfager, and he composed a poem for King Rognvald the Mountain-high, which is called "Ynglingatal." This Rognvald was',
       'a son of Olaf Geirstadalf, the brother of King Halfdan the Black. In this poem thirty of his forefathers are reckoned up, and the

In [5]:
def embed(chunks, batch_size=32):
    """
    Embeds text chunks with the sentence transformer `model` loaded above.

    chunks:     iterable of strings (e.g. the array returned by read_chunks)
    batch_size: number of chunks the model encodes per batch

    Returns a numpy array with one embedding per chunk, in the order of `chunks`.
    """
    # Passing the whole list in one call lets the model encode the chunks in batches
    # instead of running a separate encode call for every single chunk.
    # NOTE(review): batched results may differ from per-chunk results at floating-point rounding level.
    return np.asarray( model.encode(list(chunks), batch_size=batch_size) )

In [32]:
# Embedding the full dataset might take days; run it on a cluster (pass max_len to read_chunks for a quick trial).
# Once the embeddings are saved, this step never has to be repeated.
filenames = ['austen', 'dostoyevsky', 'god', 'cervantes', 'sturluson']
folder, subfolder = "..", "Texts"
# Paths without extension: '.txt' is appended for reading the texts, '.npy' for the stored embeddings
filepaths = [os.path.join(folder, subfolder, stem) for stem in filenames]

# Uncomment to embed
# for filepath in filepaths:
#     print(filepath)
#     chunks = read_chunks(filepath+'.txt', chunksize=150) # remove max_len to run on entire text
#     embeddings = embed(chunks)
#     np.save(filepath, embeddings)

In [33]:
# Class encoding: label i belongs to the i-th entry of `filepaths` (list order, not alphabetical)
data = [np.load(filepath+'.npy') for filepath in filepaths]
lengths = [len(author) for author in data]

X = np.vstack( data )
# One label per embedding row; the number of classes follows the number of loaded texts
y = np.repeat(np.arange(len(data)), lengths)

np.save('text_data', X)
np.save('labels', y)

In [34]:
# Number of embedding rows loaded for each text, in the order of `filepaths`
lengths

[816, 1351, 1084, 1326, 1341]