# Preprocessing and embedding
This file includes code for classifying text chunks by author: Austen (0), Cervantes (1), Dostoyevsky (2), God (3) and Sturluson (4).
The training data consists of text chunks from their respective works: 'Pride and Prejudice', 'Don Quixote', 'Crime and Punishment', 'The King James Bible' and 'Heimskringla'. We obtain the texts from Project Gutenberg.

In [1]:
# Imports
import numpy as np
import pandas as pd
import os

In [2]:
# Importing the transformer
from sentence_transformers import SentenceTransformer
# Model name (or, once the two lines further down are uncommented, a local
# folder path) that is handed to SentenceTransformer below.
model_name = "mixedbread-ai/mxbai-embed-large-v1" 
"""
Change the default model here. To save the model locally, replace the path with your path and run this module.
At the end of your path, add the name of the folder you want create for the model, eg. ".../local_model"
When you have downloaded the model, uncomment model_name = path to use the stored model.
"""
# path = "C:/Users/jonas/OneDrive/Dokumenter/Python Scripts/embed/local_model_sentence_transformers" # (example path / for my convenience)
# model_name = path # Uncomment this line once you have downloaded the model.
# Module-level model instance; embed() further down reads this global.
model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [3]:
def preprocessing(filepath, text, end=False):
    """Cut a book down to its main body and return it as a list of words.

    The book is identified by the first known author key that occurs as a
    substring of ``filepath``. Each book has a start marker and, for some
    books, an end marker that restricts the excerpt to part of the work.

    Args:
        filepath: Path of the book file; only used to identify the book.
        text: Complete content of the file.
        end: Optional end index for the excerpt. It is only used for books
            without an end marker of their own; a falsy value means
            "not given".

    Returns:
        The whitespace-separated words of ``text[start:end]``.

    Raises:
        ValueError: If ``filepath`` contains none of the known author keys.
    """
    import warnings

    # (author key, start marker, end marker or None).
    # Order matters: the first key found in the path wins.
    book_markers = (
        ('austen', "Chapter I.]", None),
        ('dostoyevsky', "CHAPTER I", None),
        ('god', "1:1", "in the sight of all Israel."),  # Only Genesis through Deuteronomy
        ('kafka', "Chapter One", None),
        ('shelley', "_To", None),
        ('tolstoy', "Chapter 1", None),
        ('sturluson', "PREFACE OF SNORRE STURLASON.", "SAGA OF HARALD HARDRADE."),  # Only first half
        ('cervantes', "Idle reader:", "Forse altro cantera con miglior plettro."),  # Only Volume I
    )

    for key, start_marker, end_marker in book_markers:
        if key in filepath:
            break
    else:
        raise ValueError("This book is not in our library!")

    # str.find returns -1 when the marker is absent. Using -1 as a slice
    # start would keep only the last character, so fall back to the
    # beginning of the text instead.
    start = text.find(start_marker)
    if start == -1:
        warnings.warn(f"Start marker {start_marker!r} not found in {filepath}; using start of text.")
        start = 0

    if end_marker is not None:
        # Search after the start so an earlier occurrence cannot produce an
        # empty excerpt.
        end = text.find(end_marker, start)
        if end == -1:
            warnings.warn(f"End marker {end_marker!r} not found in {filepath}; using the '*** END' footer.")
            end = False  # -1 is truthy and would otherwise skip the footer fallback

    if not end:
        end = text.find("*** END", start)
        if end == -1:
            # No footer either: keep everything up to the end of the text
            # rather than silently dropping the final character.
            end = len(text)

    return text[start:end].split()


In [6]:
def read_chunks(filepath, chunksize=50, max_len=0, Feedback=True):
    """Read a book and split it into chunks of exactly ``chunksize`` words.

    The file is read as UTF-8 and passed through ``preprocessing``. Trailing
    words that do not fill a whole chunk are discarded so that every chunk
    has the same number of words.

    Args:
        filepath: Path of the text file to read.
        chunksize: Number of words per chunk.
        max_len: If non-zero, only the first ``max_len`` words are used.
        Feedback: If true, print the word count and number of chunks.

    Returns:
        A 1-D NumPy object array holding one space-joined string per chunk.
    """
    # Only the read needs the file handle; close it before processing.
    with open(filepath, encoding='utf-8') as infile:
        text = infile.read()

    words = preprocessing(filepath, text)  # list of words, preprocessed
    if max_len:
        assert max_len > chunksize, "max_len must be larger than chunksize"
        words = words[:max_len]

    length_words = len(words)
    n_chunks = length_words // chunksize
    chunks = np.empty(n_chunks, dtype=object)

    # Iterate over the full chunks only. The previous strict
    # `start+chunksize < length_words` test skipped the last chunk whenever
    # the word count was an exact multiple of chunksize, leaving None in the
    # returned array.
    for i in range(n_chunks):
        begin = i * chunksize
        chunks[i] = ' '.join(words[begin:begin + chunksize])

    if Feedback:
        print(f"Length: {length_words:,} words, on {n_chunks:,} chunks of length {chunksize}.")

    return chunks


In [20]:
def embed(chunks):
    """Return a list with one ``model.encode`` result per chunk, in order.

    Uses the module-level ``model`` and encodes the chunks one at a time.
    """
    return [model.encode(text_chunk) for text_chunk in chunks]

In [28]:
# --- Main
# Embed a short excerpt of every book and collect the results in one
# DataFrame with the columns 'author', 'text' and 'embedding'.
folder = ".."
subfolder = "Texts"
filenames = ['austen', 'cervantes', 'dostoyevsky', 'god', 'sturluson']
filepaths = [os.path.join(folder, subfolder, filename) for filename in filenames]

# One frame per author, concatenated once at the end. This avoids growing a
# DataFrame inside the loop and concatenating onto an empty frame.
frames = []
for author_idx, filepath in enumerate(filepaths):
    chunks = read_chunks(filepath+'.txt', chunksize=150, max_len=301) # remove max_len to run on entire text
    embeddings = embed(chunks)
    # Integer class label per chunk. np.ones_like on the object array of
    # strings would produce an object-dtype column instead of ints.
    author = np.full(len(chunks), author_idx, dtype=int)

    frames.append(pd.DataFrame({'author': author, 'text': chunks, 'embedding': embeddings}))

df = pd.concat(frames, ignore_index=True)

# and then save this df...

Length: 301 words, on 2 chunks of length 150.
Length: 301 words, on 2 chunks of length 150.
Length: 301 words, on 2 chunks of length 150.
Length: 301 words, on 2 chunks of length 150.
Length: 301 words, on 2 chunks of length 150.


In [34]:
# Retrieving embeddings and adding to df
import sys

folder = ".."
subfolder = "Texts"
filenames = ['austen', 'cervantes', 'dostoyevsky', 'god', 'sturluson']
filepaths = [os.path.join(folder, subfolder, filename) for filename in filenames]

# Text chunks of all books, in the same order as filepaths.
chunks = []
for filepath in filepaths:
    chunks.extend(read_chunks(filepath+'.txt', chunksize=150))

# Class encoding (0 through len(filenames)-1, alphabetically)
# Each <author>.npy is loaded and stacked row-wise; presumably one embedding
# row per chunk -- the DataFrame constructor raises if the counts differ.
data = [np.load(filepath+'.npy') for filepath in filepaths]
lengths = [len(author) for author in data]
X = np.vstack( data )
# Derive the number of classes from the data instead of hard-coding 5, so
# the labels stay correct when filenames changes.
y = np.repeat(np.arange(len(data)), lengths)

df = pd.DataFrame({
    'author': y,
    'text': chunks, 
    'embedding': list(X)
})

# to_csv writes each embedding array as its string form. With NumPy's default
# print threshold, arrays of more than 1000 elements are summarised with
# "..." and the values are lost. Raising the threshold keeps the same text
# format but writes every element.
# NOTE(review): a binary format (e.g. np.save / pickle) would round-trip
# embeddings more reliably than CSV -- confirm what downstream code expects.
with np.printoptions(threshold=sys.maxsize):
    df.to_csv("author_text_embedding.csv", index=False)

Length: 122,411 words, on 816 chunks of length 150.
Length: 198,975 words, on 1,326 chunks of length 150.
Length: 202,706 words, on 1,351 chunks of length 150.
Length: 162,607 words, on 1,084 chunks of length 150.
Length: 201,265 words, on 1,341 chunks of length 150.
