# Preprocessing and embedding
This file contains the preprocessing and embedding code used to classify text chunks by author (Austen, Dostoyevsky, God, Cervantes and Sturluson).
The training data is text chunks from their respective works: _Pride and Prejudice_, _Crime and Punishment_, the _King James Bible_ (Genesis through Deuteronomy), _Don Quixote_ and _Heimskringla_. We obtain the texts from Project Gutenberg.

In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from nltk import tokenize
import nltk

We retrieve the sentence-embedding model from Hugging Face.

In [None]:
# Importing the transformer
from sentence_transformers import SentenceTransformer
# Hugging Face model identifier handed to SentenceTransformer below
model_name = "mixedbread-ai/mxbai-embed-large-v1"

# Instantiate the model once at notebook level; `embed` further down calls
# `model.encode` on this global object.
model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Function for reading in the desired parts of the books

In [None]:
# Helpers
def preprocessing(filepath, text, end=False):
    """Cut a book's text down to its body and split it into words.

    The book is recognised by a keyword contained in ``filepath``. Each known
    keyword has a start marker, and some also have an end marker, that is
    searched for in ``text``. The returned words span from the start marker
    (inclusive) up to the end position (exclusive).

    Args:
        filepath: Path of the book file. It is only used to recognise the
            book through a substring match (e.g. ``'austen'``); the file is
            not opened here.
        text: Full text of the book file.
        end: Optional end index for the slice. It is overridden for books
            that define their own end marker. When falsy, the text is cut at
            the first ``"*** END"`` found after the start marker, or at the
            end of ``text`` if that marker is absent.

    Returns:
        list[str]: The whitespace-separated tokens of the selected span.

    Raises:
        ValueError: If no known keyword occurs in ``filepath``, or if the
            book's start marker does not occur in ``text``.
    """
    # (keyword in filepath, start marker, end marker or None).
    # Order matters: the first keyword found in the path wins.
    book_markers = (
        ('austen', "Chapter I.]", None),
        ('dostoyevsky', "CHAPTER I", None),
        # The end marker closes Deuteronomy, i.e. only Genesis-Deuteronomy is kept.
        ('god', "1:1", "in the sight of all Israel."),
        ('kafka', "Chapter One", None),
        ('shelley', "_To", None),
        ('tolstoy', "Chapter 1", None),
        # Stop at the heading of the saga of Harald Hardrade.
        ('sturluson', "PREFACE OF SNORRE STURLASON.", "SAGA OF HARALD HARDRADE."),
        ('cervantes', "Idle reader:", None),
        ('brother_karamazov', "Fyodor Pavlovitch Karamazov", None),
        ('sense_and_sensibility', "CHAPTER I.", None),
        ('wells', "Introduction", None),
    )

    for keyword, start_marker, end_marker in book_markers:
        if keyword in filepath:
            break
    else:
        raise ValueError("This book is not in our library!")

    start = text.find(start_marker)
    if start == -1:
        # str.find signals "not found" with -1; using that as a slice index
        # would silently return (almost) nothing.
        raise ValueError(
            f"Start marker {start_marker!r} not found in the text of {filepath!r}"
        )

    if end_marker is not None:
        # Search after the start so an earlier occurrence cannot produce an
        # empty slice.
        end = text.find(end_marker, start)
        if end == -1:
            end = False  # fall back to the generic end marker below

    if not end:
        end = text.find("*** END", start)
        if end == -1:
            # No end marker at all: keep everything up to the end of the text
            # instead of slicing with -1 (which would drop the last character).
            end = len(text)

    return text[start:end].split()


Function for splitting the sentences using `nltk`

In [38]:
def split_sentences(filepath, Feedback=True):
    """Read a book file and return its body as a list of sentences.

    The file is read as UTF-8, trimmed and tokenised into words by
    ``preprocessing``, re-joined with single spaces and finally split into
    sentences with nltk's ``sent_tokenize``.

    Args:
        filepath: Path of the text file; also passed to ``preprocessing``.
        Feedback: When true, print the number of sentences found.

    Returns:
        The list of sentences produced by ``tokenize.sent_tokenize``.
    """
    with open(filepath, encoding='utf-8') as book:
        raw_text = book.read()

    # Joining the word list with single spaces collapses line breaks and
    # repeated whitespace before sentence splitting.
    body = ' '.join(preprocessing(filepath, raw_text))
    sentences = tokenize.sent_tokenize(body)

    if Feedback:
        print(f"Length: {len(sentences):,} sentences.")

    return sentences

Function for embedding the sentences

In [39]:
def embed(chunks):
    """Embed a collection of text chunks with the global ``model``.

    All chunks are handed to ``model.encode`` in a single call so the library
    can batch them, instead of running one forward pass per chunk.

    Args:
        chunks: Iterable of strings to embed.

    Returns:
        numpy.ndarray: One embedding row per chunk, in input order. An empty
        input gives an empty array, as before.

    NOTE(review): batched encoding may differ from per-chunk encoding at
    floating-point rounding level - confirm this is acceptable downstream.
    """
    texts = list(chunks)
    if not texts:
        # Preserve the previous result for empty input without calling the model.
        return np.asarray([])
    return np.asarray(model.encode(texts))

Embedding all the sentences. The sentences are also collected in a list for later use.

Do note that this was not the ideal way of doing this, but the sentences were embedded first, so it was done this way to avoid having to embed the sentences twice.

In [None]:
# Raw book files live at <folder>/<subfolder>/<name>.txt
folder = ".."
subfolder = "Texts"
filenames = ['austen', 'dostoyevsky', 'god', 'cervantes', 'sturluson']
filepaths = [os.path.join(folder, subfolder, name) for name in filenames]

# Sentences of every book, kept in memory in the same order as `filenames`
all_sentences = []

# Split each book into sentences, embed them and store the embeddings on disk
for filepath in filepaths:
    print(filepath)
    sentences = split_sentences(f"{filepath}.txt")
    all_sentences.append(sentences)
    embeddings = embed(sentences)
    # np.save appends the ".npy" extension to the given name
    np.save(f"{filepath}_sentences", embeddings)

..\Texts\austen
Length: 4,657 sentences.
..\Texts\dostoyevsky
Length: 11,906 sentences.
..\Texts\god
Length: 4,962 sentences.
..\Texts\cervantes
Length: 5,883 sentences.
..\Texts\sturluson
Length: 8,781 sentences.


We filter out the sentences that are shorter than 100 characters and then randomly pick 2500 of the remaining sentences from each author.

In [None]:
n = 2500
sent_reduced = []
embeddings_red = []
length_threshold = 100
filepath = '../Texts/'
files = ['austen_sentences.npy', 'dostoyevsky_sentences.npy', 'god_sentences.npy', 'cervantes_sentences.npy', 'sturluson_sentences.npy']

# NOTE: the sampling below uses NumPy's global random state and no seed is set
# in this cell, so every run selects a different subset.

# `all_sentences` and `files` are expected to list the books in the same order.
for sent, fname in zip(all_sentences, files):
    sentences_array = np.array(sent)
    embedding = np.load(filepath + fname)

    # The embeddings come from disk while the sentences come from memory;
    # make sure they still describe the same sentences.
    if len(embedding) != len(sentences_array):
        raise ValueError(
            f"{fname}: {len(embedding)} embeddings but {len(sentences_array)} sentences"
        )

    # keep only sentences with at least `length_threshold` characters,
    # applying the same mask to sentences and embeddings
    mask = np.char.str_len(sentences_array) >= length_threshold
    filtered_sentences = sentences_array[mask]
    embedding = embedding[mask]

    if len(filtered_sentences) < n:
        raise ValueError(
            f"{fname}: only {len(filtered_sentences)} sentences of at least "
            f"{length_threshold} characters, cannot sample {n}"
        )

    # picking out n random sentences (without replacement)
    indices = np.random.choice(len(filtered_sentences), size=n, replace=False)

    sent_reduced.append(filtered_sentences[indices])
    embeddings_red.append(embedding[indices])

In [None]:
# Stack the per-author embedding matrices into one feature matrix
X = np.vstack(embeddings_red)
# One integer label per row: the author's position in `embeddings_red`,
# repeated once for each of that author's rows (instead of hard-coding
# 5 authors x 2500 sentences).
y = np.repeat(np.arange(len(embeddings_red)), [len(e) for e in embeddings_red])

np.save('text_data_sentences', X)
np.save('labels_sentences', y)

Creating a dataframe with embeddings and sentences

In [69]:
# Flatten the per-author lists into one array of sentences and one embedding
# matrix. Concatenating keeps the author order, needs no hard-coded embedding
# width, and does not require every author to have the same number of rows.
sentences = np.concatenate(sent_reduced)
embedding = np.vstack(embeddings_red)

# One column per embedding component, with the sentence text as first column
df = pd.DataFrame(embedding, columns=[f"component {i}" for i in range(embedding.shape[1])])
df.insert(0, "sentence", sentences)

df.to_csv('../Classification/sentences_embeddings', index=False)