# Preprocessing and embedding
This file contains the preprocessing and embedding code used to classify text chunks by author (Austen, Cervantes, Sturluson, God and Dostoevsky).
The training data consists of text chunks from their respective works: _Pride and Prejudice_, _Don Quixote_ (Volume I), _Heimskringla_, the _King James Bible_ (Genesis through Deuteronomy) and _Crime and Punishment_. We obtain the texts from Project Gutenberg.

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

We retrieve the embedding model from Hugging Face.

In [None]:
# Importing the transformer
from sentence_transformers import SentenceTransformer
# Hugging Face identifier of the sentence-embedding model to load.
model_name = "mixedbread-ai/mxbai-embed-large-v1" 

# Instantiated once at module level; `embed` reads this global `model`.
model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


Function for reading in the desired parts of the books

In [None]:
# Helpers
# (filename keyword, start marker, end marker or None), checked in this order:
# the first keyword contained in the file path wins. An end marker of None
# means "read up to the Project Gutenberg footer".
_BOOK_MARKERS = (
    ('austen', "Chapter I.]", None),
    ('dostoyevsky', "CHAPTER I", None),
    ('god', "1:1", "in the sight of all Israel."),  # Only Genesis through Deuteronomy
    ('kafka', "Chapter One", None),
    ('shelley', "_To", None),
    ('tolstoy', "Chapter 1", None),
    ('sturluson', "PREFACE OF SNORRE STURLASON.", "SAGA OF HARALD HARDRADE."),  # Only Heimskringla
    ('cervantes', "Idle reader:", "Forse altro cantera con miglior plettro."),  # Only Volume I
    ('brother_karamazov', "Fyodor Pavlovitch Karamazov", None),
    ('sense_and_sensibility', "CHAPTER I.", None),
    ('wells', "Introduction", None),
)
_GUTENBERG_FOOTER = "*** END"


def preprocessing(filepath, text, end=False):
    """
    Cut the desired part out of a book and return it as a list of words.

    The book is identified by a keyword contained in `filepath`; each known
    book has a start marker and, optionally, an end marker.

    Parameters:
        filepath: path of the book; only used to recognise which book it is.
        text: full contents of the file.
        end: optional index at which to stop. Ignored for books that define
            their own end marker. When falsy, the text is cut at the
            "*** END" footer, or at the end of `text` if there is no footer.

    Returns:
        The selected text split on whitespace (list of str).

    Raises:
        Exception: if `filepath` matches none of the known books.
        ValueError: if the start marker, or a book-specific end marker, does
            not occur in `text`.
    """
    for keyword, start_marker, end_marker in _BOOK_MARKERS:
        if keyword in filepath:
            break
    else:
        raise Exception("This book is not in our library!")

    start = text.find(start_marker)
    if start == -1:
        # str.find signals "not found" with -1, which would otherwise be used
        # as a (wrong) slice index and silently produce garbage.
        raise ValueError(f"Start marker {start_marker!r} not found in {filepath!r}")

    if end_marker is not None:
        # Search from `start` so that an earlier occurrence (e.g. in a table
        # of contents) cannot produce an end that lies before the start.
        end = text.find(end_marker, start)
        if end == -1:
            raise ValueError(f"End marker {end_marker!r} not found in {filepath!r}")

    if not end:
        footer = text.find(_GUTENBERG_FOOTER, start)
        # Without a footer, keep everything up to the end of the text instead
        # of slicing with -1 (which would drop the last character).
        end = footer if footer != -1 else len(text)

    return text[start:end].split()


Reads in the contents of the books and chunks the data into the desired length

In [4]:
def read_chunks(filepath, chunksize=50, max_len=0, Feedback=True):
    """
    Read a book and split it into chunks of exactly `chunksize` words.

    The file is read as UTF-8, trimmed with `preprocessing`, optionally
    truncated to `max_len` words and then cut into consecutive chunks. A
    trailing remainder shorter than `chunksize` is discarded so that all
    chunks have the same length.

    Parameters:
        filepath: path of the text file.
        chunksize: number of words per chunk.
        max_len: if non-zero, only the first `max_len` words are used; must
            be larger than `chunksize`.
        Feedback: if true, print the number of words and chunks.

    Returns:
        numpy object array of length `len(words) // chunksize`, each entry a
        string of `chunksize` space-joined words.
    """
    # Only hold the file open while reading; all processing happens afterwards.
    with open(filepath, encoding='utf-8') as infile:
        raw_text = infile.read()

    words = preprocessing(filepath, raw_text)  # list of words, preprocessed
    if max_len:
        assert max_len > chunksize
        words = words[:max_len]

    length_words = len(words)
    n_chunks = length_words // chunksize
    chunks = np.empty(n_chunks, dtype=object)
    # Iterate over full chunks only. This keeps the last chunk when the word
    # count is an exact multiple of `chunksize` (previously left as None) and
    # drops only a genuinely shorter remainder.
    for i in range(n_chunks):
        start = i * chunksize
        chunks[i] = ' '.join(words[start:start + chunksize])

    if Feedback:
        print(f"Length: {length_words:,} words, on {n_chunks:,} chunks of length {chunksize}.")

    return chunks


Function for embedding the data

In [5]:
def embed(chunks):
    """
    Encode each chunk with the global `model` and collect the results.

    Every element of `chunks` is passed to `model.encode` on its own, in
    order; the per-chunk results are combined with `np.asarray`.
    """
    vectors = [model.encode(piece) for piece in chunks]
    return np.asarray(vectors)

Embedding all the data and creating a dataframe with the sentences and embeddings

In [None]:
folder = ".."
subfolder = "Texts"
filenames = ['austen', 'dostoyevsky', 'god', 'cervantes', 'sturluson']
filepaths = [os.path.join(folder, subfolder, filename) for filename in filenames]

# One DataFrame per book; they are concatenated once after the loop instead of
# growing `df` with a concat per iteration (which re-copies all earlier rows).
frames = []
for author_idx, filepath in enumerate(filepaths):
    chunks = read_chunks(filepath+'.txt', chunksize=150, max_len=301) # remove max_len to run on entire text
    embeddings = embed(chunks)
    # Integer class label: the position of the book in `filenames`.
    author = np.full(len(chunks), author_idx, dtype=int)

    frames.append(pd.DataFrame({
        'author': author,
        'text': chunks,
        # A DataFrame column must be 1-dimensional; `embed` presumably returns
        # one vector per chunk (2-D array), so store one vector per row.
        'embedding': list(embeddings),
    }))

# Columns: 'author' (int label), 'text' (chunk string), 'embedding' (vector).
df = pd.concat(frames, ignore_index=True)

..\Texts\wells
Length: 32,442 words, on 216 chunks of length 150.


Stacking the embeddings and labels and saving them as `npy`-files

In [33]:
# Class encoding: label i belongs to the i-th entry of `filepaths`
# (0 through len(filepaths) - 1, in list order).
# NOTE(review): expects one '<filepath>.npy' file per book to exist already;
# the cells visible here do not write them - confirm where they are saved.
data = [np.load(filepath+'.npy') for filepath in filepaths]
lengths = [len(author_embeddings) for author_embeddings in data]

X = np.vstack( data )
# One label per row of X; derived from the number of books, not hard-coded.
y = np.repeat(np.arange(len(data)), lengths)

# np.save appends the '.npy' extension: writes text_data.npy and labels.npy.
np.save('text_data', X)
np.save('labels', y)