# Classification
This notebook classifies text chunks by author (Austen, Shelley, Kafka, Tolstoy or Dostoyevsky).
The training data consists of text chunks from their respective works 'Pride and Prejudice', 'Frankenstein', 'The Trial', 'Anna Karenina' and 'Crime and Punishment'. We obtain the texts from Project Gutenberg.

## Importing the data

In [27]:
# Imports
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

from textdataset import TextDataset
from neural_net import NeuralNet

In [None]:
# Load the sentence-embedding model used by embed().
from sentence_transformers import SentenceTransformer

# Change the default model here.
# To work from a locally stored copy instead, set `path` to your own folder (ending in the
# folder name you want for the model, e.g. ".../local_model") and, once the model has been
# downloaded there, uncomment `model_name = path`.
# NOTE(review): nothing in this cell writes the model to `path`; storing it there has to be
# done separately before switching to the local copy.
model_name = "mixedbread-ai/mxbai-embed-large-v1"
# path = "C:/Users/jonas/OneDrive/Dokumenter/Python Scripts/embed/local_model_sentence_transformers"  # example path
# model_name = path  # uncomment this line once you have downloaded the model
model = SentenceTransformer(model_name)

In [21]:
# Helpers
def preprocessing(filepath, text):
    """
    Cut a book down to its body and split the body into words.

    The book is recognised by the author name contained in `filepath`. The body
    starts at that book's first-chapter marker and stops right before the
    "*** END" marker (or at the end of `text` if that marker is absent).

    Parameters
    ----------
    filepath : str
        Path of the text file; must contain one of 'austen', 'dostoyevsky',
        'kafka', 'shelley' or 'tolstoy'.
    text : str
        Full contents of the file.

    Returns
    -------
    list of str
        The whitespace-separated words of the body.

    Raises
    ------
    Exception
        If `filepath` contains none of the known author names.
    ValueError
        If the book's start marker does not occur in `text`.
    """
    # (author substring, start marker) pairs; checked in this order, first match wins.
    start_markers = (
        ('austen', "Chapter I.]"),
        ('dostoyevsky', "CHAPTER I"),
        ('kafka', "Chapter One"),
        ('shelley', "_To"),
        ('tolstoy', "Chapter 1"),
    )

    marker = None
    for author, candidate in start_markers:
        if author in filepath:
            marker = candidate
            break
    if marker is None:
        raise Exception("This book is not in our library!")

    # str.find returns -1 on failure; slicing with -1 would silently return
    # (almost) nothing, so fail loudly instead.
    start = text.find(marker)
    if start == -1:
        raise ValueError(f"Start marker {marker!r} not found in {filepath!r}.")

    # Look for the end marker after the start only; without one keep everything
    # up to the end of the text (slicing with -1 would drop the last character).
    end = text.find("*** END", start)
    if end == -1:
        end = len(text)

    return text[start:end].split()


In [22]:
def read_chunks(filepath, chunksize=50, max_len=0, Feedback=True):
    """
    Reads a text into an array of strings of exactly `chunksize` words each.

    The file is read as UTF-8, passed through `preprocessing`, optionally truncated
    to `max_len` words and then cut into consecutive chunks. Words left over after
    the last full chunk are discarded so that all chunks have the same length.

    Parameters
    ----------
    filepath : str
        Path of the text file (also used by `preprocessing` to identify the book).
    chunksize : int
        Number of words per chunk; must be positive.
    max_len : int
        If non-zero, only the first `max_len` words are used.
    Feedback : bool
        If True, print the number of words and chunks.

    Returns
    -------
    numpy.ndarray
        1-D object array holding one space-joined string per chunk.

    Raises
    ------
    ValueError
        If `chunksize` is not positive.
    AssertionError
        If `max_len` is non-zero but smaller than `chunksize`.
    """
    if chunksize <= 0:
        raise ValueError("chunksize must be a positive integer")

    # Only the read needs the file handle; close it before the processing starts.
    with open(filepath, encoding='utf-8') as infile:
        raw_text = infile.read()

    words = preprocessing(filepath, raw_text)  # list of words, preprocessed
    if max_len:
        assert max_len >= chunksize, "max_len must be at least chunksize"
        words = words[:max_len]

    length_words = len(words)
    n_chunks = length_words // chunksize  # full chunks only
    chunks = np.empty(n_chunks, dtype=object)
    # Iterate over the chunk index so that every slot is filled, including the last
    # one when the word count is an exact multiple of chunksize.
    for i in range(n_chunks):
        start = i * chunksize
        chunks[i] = ' '.join(words[start:start + chunksize])

    if Feedback:
        print(f"Length: {length_words:,} words, on {n_chunks:,} chunks of length {chunksize}.")

    return chunks


In [23]:
def embed(chunks, batch_size=32):
    """
    Embed text chunks with the notebook-level SentenceTransformer `model`.

    All chunks are handed to `model.encode` in a single call so that the library can
    process them in batches instead of running one forward pass per chunk.

    Parameters
    ----------
    chunks : iterable of str
        The text chunks, e.g. the array returned by `read_chunks`.
    batch_size : int
        Number of chunks encoded per batch (forwarded to `model.encode`).

    Returns
    -------
    numpy.ndarray
        One embedding row per chunk; an empty array if `chunks` is empty.
    """
    sentences = list(chunks)
    if not sentences:
        # same result the per-chunk loop produced for empty input
        return np.asarray([])
    # NOTE(review): batched encoding may differ from one-at-a-time encoding at
    # floating-point precision; re-create cached .npy files rather than mixing both.
    return np.asarray(model.encode(sentences, batch_size=batch_size))

In [24]:
# Location of the books relative to this notebook. Every entry of `filepaths` is an
# extension-less path: '.txt' is appended to read the raw text, '.npy' to load the embeddings.
folder = ".."
subfolder = "Texts"
filenames = ['austen', 'dostoyevsky', 'kafka', 'shelley', 'tolstoy']
filepaths = list(map(lambda name: os.path.join(folder, subfolder, name), filenames))

# One-off embedding job, left commented out: it might take days on the full dataset
# (run it on a cluster; pass max_len to read_chunks for a quick trial). Once the .npy
# files exist it never has to be repeated. np.save appends the '.npy' extension itself.
# for filepath in filepaths:
#     print(filepath)
#     chunks = read_chunks(filepath+'.txt')
#     embeddings = embed(chunks)
#     np.save(filepath, embeddings)

In [25]:
# Class encoding: label i belongs to the i-th entry of `filepaths` (0 through 4, alphabetically)
data = [np.load(filepath + '.npy') for filepath in filepaths]
lengths = list(map(len, data))

# X stacks the rows of all loaded arrays; y is a column vector holding one label per row of X.
X = np.vstack(data)
y = np.vstack(
    [
        [[label]] * n_rows
        for label, n_rows in enumerate(lengths)
    ]
)
print(y)



[[0]
 [0]
 [0]
 ...
 [4]
 [4]
 [4]]


In [17]:
# Sanity check: display the first chunk (default chunksize of 50 words) of the Austen text.
read_chunks('../Texts/austen.txt')[0]

Length: 122,411 words, on 2,448 chunks of length 50.


'Chapter I.] It is a truth universally acknowledged, that a single man in possession of a good fortune must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed'

### Dataframe

Collects the Austen text chunks, their embeddings and their labels in a single DataFrame. Not the best way of doing it, but it works.

In [24]:
# load data (Austen is class 0, i.e. the first `lengths[0]` rows of y)
text_data = read_chunks('../Texts/austen.txt')
embeddings_data = np.load('../Texts/austen.npy')
labels_data = y[:lengths[0]].ravel()

# pd.concat(axis=1) aligns on the index and pads missing rows with NaN, so a mismatch
# between chunks and embeddings (e.g. embeddings saved with another chunksize or max_len)
# would otherwise go unnoticed.
assert len(text_data) == len(embeddings_data) == len(labels_data), (
    f"Misaligned inputs: {len(text_data)} chunks, {len(embeddings_data)} embeddings, "
    f"{len(labels_data)} labels."
)

# One frame per part: text, embeddings (one column per embedding dimension), labels
df_text = pd.DataFrame({
    'Text': text_data,
})
embedding_columns = [f'Embedding_{i}' for i in range(embeddings_data.shape[1])]
df_embeddings = pd.DataFrame(embeddings_data, columns=embedding_columns)
df_labels = pd.DataFrame(labels_data, columns=['Labels'])

# Concatenate the text, embeddings and labels side by side
df = pd.concat([df_text, df_embeddings, df_labels], axis=1)

df

Length: 122,411 words, on 2,448 chunks of length 50.


Unnamed: 0,Text,Embedding_0,Embedding_1,Embedding_2,Embedding_3,Embedding_4,Embedding_5,Embedding_6,Embedding_7,Embedding_8,...,Embedding_1015,Embedding_1016,Embedding_1017,Embedding_1018,Embedding_1019,Embedding_1020,Embedding_1021,Embedding_1022,Embedding_1023,Labels
0,Chapter I.] It is a truth universally acknowle...,0.284331,0.198523,0.321001,-0.224241,-0.742237,-0.527830,-0.146651,0.651597,0.721892,...,0.064762,0.014574,-0.738236,-0.034588,0.414326,0.705330,0.079874,0.260653,-0.002987,0
1,"in the minds of the surrounding families, that...",0.076889,0.027343,0.093580,0.140345,0.367784,-0.355897,-0.146437,0.422682,1.344679,...,-0.454421,-0.589578,-0.519553,1.044313,0.315445,0.209593,-0.140470,0.346846,-0.364940,0
2,"not. “But it is,” returned she; “for Mrs. Long...",0.339588,-0.940871,0.216766,0.602502,-0.659164,-0.278765,-0.031425,0.338654,0.699363,...,0.068579,-0.342067,-0.601994,0.441354,0.222831,-0.364223,0.014880,0.178083,-0.312789,0
3,hearing it.” [Illustration: “He came down to s...,0.149145,0.064928,0.101019,-0.188449,-0.492822,-0.129286,0.132692,0.402831,0.482742,...,-0.533847,-0.665060,-0.823555,0.521699,0.257297,0.023976,-0.694296,0.005413,-0.178064,0
4,"in a chaise and four to see the place, and was...",0.146287,-0.309895,0.128825,-0.141531,-0.506306,0.131742,-0.344784,1.192157,0.450028,...,-0.425849,-0.648054,-0.203158,0.706744,0.299172,0.223305,-0.565342,-0.069827,0.115047,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2443,"respect which almost overcame her affection, s...",-0.010220,-0.190580,0.207971,-0.037104,-1.292766,-0.183501,-0.409878,0.427105,-0.085560,...,-0.191353,-0.386279,-0.792590,0.331757,-0.061435,-1.001084,-0.010191,0.232712,0.321188,0
2444,a sister more than ten years younger than hims...,0.043020,-0.176304,0.330010,0.180056,-0.655494,-0.232812,0.092122,0.034753,-0.082347,...,0.120363,-0.810071,-0.149972,0.544790,0.078733,-0.531377,-0.502457,1.025330,-0.082401,0
2445,"especially of Elizabeth, that for some time al...",-0.211630,-0.400246,0.100616,0.886872,-0.676006,-0.668235,-0.701038,-0.522564,0.110424,...,-0.466563,-0.606127,-0.594379,0.529732,0.044274,-0.984253,-0.426623,0.508764,0.275507,0
2446,"affection for him, or her curiosity to see how...",0.277620,-0.042760,0.356489,0.322990,-0.128166,0.036034,-0.428013,0.764357,0.315446,...,-0.628144,-0.216785,-1.008442,0.543078,-0.381914,-0.627292,-0.306053,0.513492,-0.119869,0


## FFNN / Logistic regression

In [None]:
# splitting into train and test
# train_test_split returns the splits grouped per input array, i.e.
# (X_train, X_test, y_train, y_test) -- not (X_train, y_train, X_test, y_test).
# random_state makes the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.ravel(), test_size=0.2, stratify=y.ravel(), random_state=1
)

# the training dataset pairs the training features with the training labels
dataset = TextDataset(torch.tensor(X_train), torch.tensor(y_train))

# splitting the data into batches
batch_size = 64
torch.manual_seed(1)  # fixes the shuffling order of the DataLoader
dl = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

## Kmeans