# Classification
This file includes code which classifies text chunks as (Austen, Shelly, Kafka, Tolstoy or Dostoyevsky).
The training data is text chunks from their respective works _Pride and predjudice_, _Frankenstein_, _The trial_, _Anna Karenina_ and _Crime and punishment_. We obtain the texts from the Gutenberg Project.

## Importing the data

In [12]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

from textdataset import TextDataset
from neural_net import NeuralNet

We use the `mixedbread-ai`-embedding model from Huggingface

In [None]:
# Importing the transformer
from sentence_transformers import SentenceTransformer
model_name = "mixedbread-ai/mxbai-embed-large-v1" 
"""
Change the default model here. To save the model locally, replace the path with your path and run this module.
At the end of your path, add the name of the folder you want create for the model, eg. ".../local_model"
When you have downloaded the model, uncomment model_name = path to use the stored model.
"""
# path = "C:/Users/jonas/OneDrive/Dokumenter/Python Scripts/embed/local_model_sentence_transformers" # (example path / for my convenience)
# model_name = path # Uncomment this line once you have downloaded the model.
model = SentenceTransformer(model_name)

In [13]:
# Helpers
def preprocessing(filepath, text):
    if 'austen' in filepath:
        start = text.find("Chapter I.]")
        # potential other preprocessing

    elif 'dostoyevsky' in filepath:
        start = text.find("CHAPTER I")
        # potential other preprocessing

    elif 'kafka' in filepath:
        start = text.find("Chapter One")
        # potential other preprocessing

    elif 'shelley' in filepath:
        start = text.find("_To")
        # potential other preprocessing

    elif 'tolstoy' in filepath:
        start = text.find("Chapter 1")
        # potential other preprocessing
    else:
        raise Exception("This book is not in our library!")
    
    end = text.find("*** END")
    return text[start:end].split()


In [14]:
def read_chunks(filepath, chunksize=50, max_len=0, Feedback=True):
    """
    Reads text into a list of strings with the specified number of words (discards final chunk to ensure similar length).
    """
    with open(filepath, encoding='utf-8') as infile:
        words = preprocessing( filepath, infile.read() ) # list of words, preprocessed
        if max_len:
            assert max_len > chunksize
            words = words[:max_len]

        length_words = len(words)
        n_chunks = int(length_words / chunksize)
        chunks = np.empty(n_chunks, dtype=object)
        for i, start in enumerate(range(0, length_words, chunksize)):
            if start+chunksize < length_words: # we discard the final chunk if it is shorter than 50 words
                chunks[i] = ' '.join(words[start:start+chunksize])

        if Feedback:
            print(f"Length: {length_words:,} words, on {n_chunks:,} chunks of length {chunksize}.")
        
        return chunks


In [23]:
def embed(chunks):
    embeddings = []
    for chunk in chunks:
        embeddings.append( model.encode(chunk) )
    return np.asarray(embeddings)

Embedding the texts

In [15]:
# Change max_len and run on clusters - this might take days on the full dataset (but then will never have to be done again:)
folder = ".."
subfolder = "Texts"
filenames = ['austen', 'dostoyevsky', 'kafka', 'shelley', 'tolstoy']
filepaths = [os.path.join(folder, subfolder, filename) for filename in filenames]

# for filepath in filepaths:
#     print(filepath)
#     chunks = read_chunks(filepath+'.txt') # remove max_len to run on entire text
#     embeddings = embed(chunks)
#     np.save(filepath, embeddings)

In [16]:
# Class encoding (0 through 4, alphabetically)
data = [np.load(filepath+'.npy') for filepath in filepaths]
lengths = [len(author) for author in data]

X = np.vstack( data )
y = np.vstack( [[[i]]*length for i, length in enumerate(lengths)] ) 
print(y)

[[0]
 [0]
 [0]
 ...
 [4]
 [4]
 [4]]


## FFNN / Logistic regression

In [17]:
# splitting into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y.ravel(), test_size=0.2, stratify=y.ravel(), random_state=3)

y_train = torch.from_numpy(y_train).long()
y_test = torch.from_numpy(y_test).long()

dataset_train = TextDataset(torch.tensor(X_train), y_train)
dataset_test = TextDataset(torch.tensor(X_test), y_test)

# splitting the data into batches
batch_size = 64
torch.manual_seed(1)
dl_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, drop_last=True)
dl_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=True, drop_last=True)

# defining the model, optimizer and loss function
model = NeuralNet()

learning_rate = 0.001
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Training the model

In [None]:
def training_model(dataloader, model, loss_fn, optimizer, num_epochs=100):
    for epoch in range(num_epochs):
        # setting the model to train mode
        model.train()
        
        for (X, y) in dataloader:
            pred = model(X)
            loss = loss_fn(pred, y)

            # Backpropagation
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()           # resets the gradients

        if epoch % 1 == 0:
            loss = loss.item()
            print(f"Epoch {epoch+1:>4f}      loss: {loss:>7f}")


def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode 
    model.eval()
    size = len(dataloader)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():           # gradient computation excluded, unecessary
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(0) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [19]:
training_model(dataset_train, model, loss_fn, optimizer, num_epochs=1)

  return self._call_impl(*args, **kwargs)


Epoch 1.000000      loss: 0.904832


In [188]:
test_loop(dataset_test, model, loss_fn)

Test Error: 
 Accuracy: 85.7%, Avg loss: 1.048907 



In [24]:
import torch
from sklearn.metrics import confusion_matrix
import numpy as np

def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode 
    model.eval()
    size = len(dataloader)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    all_preds = []
    all_labels = []

    with torch.no_grad():           # gradient computation excluded, unnecessary
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

            # Store predictions and true labels
            all_preds.extend(pred.argmax(1).cpu().numpy())
            all_labels.extend(y.cpu().numpy())

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

    # Compute and print confusion matrix
    cm = confusion_matrix(all_labels, all_preds)
    print("Confusion Matrix:")
    print(cm)

# Example usage:
# Assuming you have a test dataloader `test_dataloader`, a trained model `model`, and a loss function `loss_fn`
test_loop(dataset_test, model, loss_fn)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


cm = confusion_matrix(y_test, predictions)
ConfusionMatrixDisplay(cm).plot()

### Scikit-learn wrapping using `skorch`

In [None]:
import numpy as np
from sklearn.datasets import make_classification
from torch import nn
from skorch import NeuralNetClassifier

X, y = make_classification(1000, 20, n_informative=10, random_state=0)
X = X.astype(np.float32)
y = y.astype(np.int64)

net = NeuralNetClassifier(
    NeuralNet,
    max_epochs=10,
    lr=0.1,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

net.fit(X, y)
y_proba = net.predict_proba(X)