In [7]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import os

from torch.distributions.categorical import Categorical

# Importing the transformer
from sentence_transformers import SentenceTransformer
# Embedding model selection.
#
# Change the default model here. To save the model locally, replace the path
# below with your own path and run this module. At the end of your path, add
# the name of the folder you want to create for the model,
# e.g. ".../local_model". Once the model has been downloaded, uncomment
# `model_name = path` to use the stored copy.
model_name = "mixedbread-ai/mxbai-embed-large-v1"
# path = "C:/Users/jonas/OneDrive/Dokumenter/Python Scripts/embed/local_model_sentence_transformers" # (example path / for my convenience)
# model_name = path # Uncomment this line once you have downloaded the model.
model = SentenceTransformer(model_name)

In [9]:
# Helpers
# (filename keyword, start marker, optional end marker).
# Entries are checked in order and the first keyword found in the path wins.
_BOOK_MARKERS = (
    ('austen', "Chapter I.]", None),
    ('dostoyevsky', "CHAPTER I", None),
    ('god', "1:1", "in the sight of all Israel."),  # Only old testament
    ('kafka', "Chapter One", None),
    ('shelley', "_To", None),
    ('tolstoy', "Chapter 1", None),
    ('sturluson', "PREFACE OF SNORRE STURLASON.", "SAGA OF HARALD HARDRADE."),  # Only Heimskringla
    ('cervantes', "Idle reader:", "Forse altro cantera con miglior plettro."),  # Only Volume I
    ('brother_karamazov', "Fyodor Pavlovitch Karamazov", None),
    ('sense_and_sensibility', "CHAPTER I.", None),
    ('wells', "Introduction", None),
)


def preprocessing(filepath, text, end=False):
    """Cut a raw book file down to the part between its start and end markers.

    The book is identified by a keyword contained in ``filepath``. The returned
    slice begins at the first occurrence of the book's start marker and stops
    just before its end marker.

    Args:
        filepath: Path of the book file; only used to pick the marker set.
        text: Full content of the file.
        end: Optional end index supplied by the caller. It is used only when
            the book has no end marker of its own; a falsy value means
            "cut at the first '*** END'".

    Returns:
        The selected slice of ``text``.

    Raises:
        ValueError: If no known keyword occurs in ``filepath``.

    Missing markers no longer corrupt the slice: ``str.find`` returns -1 when a
    marker is absent, which previously ended up as a slice bound. Now a missing
    start marker falls back to the beginning of the text, a missing book end
    marker falls back to '*** END', and a missing '*** END' falls back to the
    end of the text.
    """
    for keyword, start_marker, end_marker in _BOOK_MARKERS:
        if keyword in filepath:
            break
    else:
        raise ValueError("This book is not in our library!")

    start = text.find(start_marker)
    if start == -1:
        start = 0  # marker absent: keep the text from its beginning

    if end_marker is not None:
        end = text.find(end_marker)
        if end == -1:
            end = False  # book-specific marker absent: use the generic one

    if not end:
        end = text.find("*** END")
        if end == -1:
            end = len(text)  # no terminator at all: keep the whole tail

    return text[start:end]


In [26]:
# Locations of the raw texts and of the trained character-level generators.
folder = ".."
subfolder = "Texts"
subfolder_generator = "../Generation/Models_and_Data"
filenames = ['austen', 'dostoyevsky', 'god', 'cervantes', 'sturluson']

# One '<author>_generator.pt' file per author, loaded in the order of `filenames`.
paths = [os.path.join(subfolder_generator, f"{author}_generator.pt") for author in filenames]
generators = list(map(torch.load, paths))

# retrieving the 100 most used english words
with open("common-words.txt", encoding='utf-8') as infile:
    words = infile.read().split()

In [11]:
# Build one character vocabulary per author from that author's preprocessed text.
filepaths = [os.path.join(folder, subfolder, filename)+'.txt' for filename in filenames]

char2int = []    # per author: dict mapping character -> integer id
char_array = []  # per author: array mapping integer id -> character
for filepath in filepaths:
    with open(filepath, encoding='utf-8') as infile:
        text = preprocessing(filepath, infile.read())  # preprocessed text as one string

    # The file is closed at this point; the vocabulary only needs `text`.
    char_set = set(text)
    chars_sorted = sorted(char_set)
    char2int.append({ch: j for j, ch in enumerate(chars_sorted)})
    char_array.append(np.array(chars_sorted))


def sample(model, starting_str, author,
           len_generated_text=500,
           scale_factor=2.0):
    """Generate text character by character with a recurrent generator.

    Args:
        model: Generator exposing ``eval()``, ``init_hidden(batch_size)`` and
            callable as ``model(x, hidden, cell)`` returning
            ``(logits, hidden, cell)``.
        starting_str: Non-empty seed text. Every character must be a key of
            ``char2int[author]``.
        author: Index into the module-level ``char2int`` / ``char_array`` lists.
        len_generated_text: Number of characters to generate after the seed.
        scale_factor: Multiplier applied to the logits before sampling; larger
            values make the sampling distribution sharper.

    Returns:
        ``starting_str`` followed by ``len_generated_text`` sampled characters.

    Raises:
        ValueError: If ``starting_str`` is empty.
        KeyError: If the seed contains a character missing from the vocabulary.
    """
    if not starting_str:
        raise ValueError("starting_str must contain at least one character")

    encoded_input = torch.tensor([char2int[author][s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))

    pieces = [starting_str]

    model.eval()
    # Pure inference: without no_grad every step would extend an autograd graph
    # through the recurrent state for the whole generated sequence.
    with torch.no_grad():
        hidden, cell = model.init_hidden(1)
        hidden = hidden.to('cpu')
        cell = cell.to('cpu')

        # Warm up the recurrent state on all seed characters except the last.
        for c in range(len(starting_str) - 1):
            _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)

        # The last seed character is the first input of the generation loop.
        last_char = encoded_input[:, -1]
        for _ in range(len_generated_text):
            logits, hidden, cell = model(last_char.view(1), hidden, cell)
            logits = torch.squeeze(logits, 0)
            m = Categorical(logits=logits * scale_factor)
            last_char = m.sample()
            pieces.append(str(char_array[author][last_char]))

    return ''.join(pieces)

In [27]:
# Show the word list read from common-words.txt.
print(words)

['the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'I', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'take', 'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also', 'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way', 'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us']


In [30]:
# Show how many words were read from common-words.txt.
print(len(words))

100


In [29]:
# For every (author, seed word) pair, generate and embed two samples:
#   * a short one: the seed word followed by 90 generated characters
#   * a long one: up to 150 words taken from 1000 generated characters
torch.manual_seed(1)

# Size the result grids from the inputs, so adding or removing an author or a
# seed word cannot cause an out-of-range write.
n_authors, n_words = len(generators), len(words)
sentences_embedded = np.empty((n_authors, n_words), object)
chunks_embedded = np.empty((n_authors, n_words), object)
for author, generator in enumerate(generators):
    generator.to('cpu')
    for line, word in enumerate(words):
        # sentence
        sentences_embedded[author][line] = model.encode(sample(generator, word, author, len_generated_text=90))
        # 150-words
        new_words = sample(generator, word, author, len_generated_text=1000).split() # generating more than 150 words
        # The first token is skipped; presumably because it is the seed word
        # (possibly extended by the generator) - TODO confirm.
        chunk = ' '.join(new_words[1:151])
        chunks_embedded[author][line] = model.encode(chunk)

In [44]:
# Persist both embedding grids (np.save appends '.npy'). The classification
# step loads 'embedded_sentences_generated_text.npy' as well as the chunk file,
# so the sentence embeddings have to be written here too.
np.save('../Generation/Models_and_Data/embedded_sentences_generated_text', sentences_embedded)
np.save('../Generation/Models_and_Data/embedded_chunks_generated_text', chunks_embedded)

In [2]:
def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode 
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    all_preds = []
    all_labels = []

    with torch.no_grad():           # gradient computation excluded, unnecessary
        for X, y in dataloader:
            pred = model(X)

            all_preds.extend(pred.argmax(1).tolist())
            all_labels.extend(y.tolist())

            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

        test_loss /= num_batches
        correct /= size
        print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

    return all_preds, all_labels, correct

In [3]:
import numpy as np
import torch
import os

import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import confusion_matrix

from textdataset import TextDataset
from neural_net import NeuralNet

# Classify the generated text with the trained author classifiers.
folder = "../Generation/Models_and_Data"
sentence_name = "sentences_model.pt"
chunk_name = "150_chunksize_model.pt"
loss_fn = nn.CrossEntropyLoss()


def _embeddings_and_labels(path):
    """Load a saved grid of embeddings and build the matching author labels.

    The file is expected to hold an object array with one row per author and
    one embedding vector per cell. Returns ``(matrix, labels)``: the vectors
    stacked row by row, and the row (author) index of every vector.
    """
    # NOTE(review): allow_pickle=True unpickles the file; only load files
    # produced by this project.
    grid = np.load(path, allow_pickle=True)
    n_authors = grid.shape[0]
    per_author = grid.size // n_authors
    matrix = np.array([vector for vector in grid.flat])
    labels = torch.from_numpy(np.repeat(np.arange(n_authors), per_author)).long()
    return matrix, labels


# NOTE(review): torch.load unpickles the checkpoint, so only trusted files
# should be loaded. Newer PyTorch releases may require weights_only=False for
# full-model checkpoints like these - confirm against the installed version.

# Classifying (sentences)
sentences_embedded, y1 = _embeddings_and_labels(
    '../Generation/Models_and_Data/embedded_sentences_generated_text.npy')
X1 = torch.tensor(sentences_embedded)
dataset_sentence = TextDataset(X1, y1)

# import best classification model - sentences
dl = DataLoader(dataset_sentence)
sentence_model = torch.load(os.path.join(folder, sentence_name))
predictions, labels, correct = test_loop(dl, sentence_model, loss_fn)

# Compute and print confusion matrix
cm = confusion_matrix(labels, predictions)
print("Confusion Matrix:")
print(cm)


# Classifying (150 chunks)
chunks_embedded, y1 = _embeddings_and_labels(
    '../Generation/Models_and_Data/embedded_chunks_generated_text.npy')
X1 = torch.tensor(chunks_embedded)
dataset_chunks = TextDataset(X1, y1)

# import best classification model - 150-word chunks
# (the chunk data was previously evaluated with the sentence model by mistake)
dl = DataLoader(dataset_chunks)
chunk_model = torch.load(os.path.join(folder, chunk_name))
predictions, labels, correct = test_loop(dl, chunk_model, loss_fn)

# Compute and print confusion matrix
cm = confusion_matrix(labels, predictions)
print("Confusion Matrix:")
print(cm)



  sentence_model = torch.load(os.path.join(folder, sentence_name))


Test Error: 
 Accuracy: 99.2%, Avg loss: 0.913381 

Confusion Matrix:
[[100   0   0   0   0]
 [  2  97   0   1   0]
 [  0   0 100   0   0]
 [  0   0   1  99   0]
 [  0   0   0   0 100]]


NameError: name 'chunks_embedded' is not defined

: 