In [1]:
# Import list

import numpy as np
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelBinarizer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Read Data 

In [2]:
# Different Editors paths
PATH_A = ""
PATH_E = ""
PATH_G = "0. Projects/3/Project-III/data/sherlock-holm.es_stories_plain-text_advs.txt"
PATH_J = ""
PATH_M = ""

PATHS = [PATH_A, PATH_E, PATH_G, PATH_J, PATH_M]

text = ""

for path in PATHS:
    try:
        # Read the text file
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read()
            print(path)
    except:
        continue
    else:
        break


text = text.lower()
print(text[:200])

0. Projects/3/Project-III/data/sherlock-holm.es_stories_plain-text_advs.txt




                        the adventures of sherlock holmes

                               arthur conan doyle



                                table of contents

               a scandal in bohem


## Data Preprocess

### Divide the set

In [3]:
# Divide with regular expressions
DIVIDERS_ORIGINAL = "\n"
DIVIDERS_ALL = "[,.!?:;\"]|\n\n|--| and | that | which "
DIVIDERS_MIN = "[.!]|\n\n"
DIVIDERS_BAL = "[,.!?]|\n\n|--"
divide_set = True

text_try = text

if divide_set:
    # Delete cover of book and extra information
    text_try = text[980:-550]

# Split following the dividers given
text_try = re.split(DIVIDERS_ALL, text_try)

# Delete all the new line comments 
text_try = [el.replace('\n', '') for el in text_try]

text_try[:10]

['     chapter i',
 '',
 '     to sherlock holmes she is always the woman',
 ' i have seldom heard him     mention her under any other name',
 ' in his eyes she eclipses and     predominates the whole of her sex',
 ' it was not',
 'he felt any     emotion akin to love for irene adler',
 ' all emotions',
 '',
 'that one     particularly']

### Tokenization

In [4]:
# Create Tokenizer object in python
tokens = word_tokenize(text)
vocabulary = set(tokens)
total_words = len(vocabulary) + 1

word_to_idx = {word:idx for idx, word in enumerate(vocabulary)}

print(f"total_words: {total_words}")
print(f"Índice de palabras: {word_to_idx}")

total_words: 8388


In [5]:
# Create input-output sequences
input_sequences = []
for line in text_try:
    line_list = line.rstrip(",.;:").split(' ')

    # Tokenize each sentence
    token_list = []
    for char in line_list:
        if char in word_to_idx.keys():
            token_list.append(word_to_idx[char])

    # Divide the different sentences in n-grams
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

input_sequences

[[3493, 8035],
 [7693, 8330],
 [7693, 8330, 1633],
 [7693, 8330, 1633, 4604],
 [7693, 8330, 1633, 4604, 6263],
 [7693, 8330, 1633, 4604, 6263, 1964],
 [7693, 8330, 1633, 4604, 6263, 1964, 5985],
 [7693, 8330, 1633, 4604, 6263, 1964, 5985, 4118],
 [8035, 5029],
 [8035, 5029, 3665],
 [8035, 5029, 3665, 4035],
 [8035, 5029, 3665, 4035, 3551],
 [8035, 5029, 3665, 4035, 3551, 3782],
 [8035, 5029, 3665, 4035, 3551, 3782, 5122],
 [8035, 5029, 3665, 4035, 3551, 3782, 5122, 410],
 [8035, 5029, 3665, 4035, 3551, 3782, 5122, 410, 2772],
 [8035, 5029, 3665, 4035, 3551, 3782, 5122, 410, 2772, 8081],
 [8035, 5029, 3665, 4035, 3551, 3782, 5122, 410, 2772, 8081, 1955],
 [4831, 8271],
 [4831, 8271, 964],
 [4831, 8271, 964, 4604],
 [4831, 8271, 964, 4604, 2393],
 [4831, 8271, 964, 4604, 2393, 4753],
 [4831, 8271, 964, 4604, 2393, 4753, 5919],
 [4831, 8271, 964, 4604, 2393, 4753, 5919, 5985],
 [4831, 8271, 964, 4604, 2393, 4753, 5919, 5985, 6515],
 [4831, 8271, 964, 4604, 2393, 4753, 5919, 5985, 6515, 59

### Padding

In [24]:
# Get the max value to add padding to other entries

average = 0
for seq in input_sequences:
    average += len(seq) 

max_sequence_len, value = max([(len(seq), seq) for seq in input_sequences])
input_seq_pad = np.array([np.pad(seq, (max_sequence_len - len(seq), 0), mode='constant') for seq in input_sequences])

print (f"average = {average / len(input_sequences)}")
print (f"Max seq length = {max_sequence_len}")
input_seq_pad

average = 5.456304696166646
Max seq length = 33


array([[   0,    0,    0, ...,    0, 3493, 8035],
       [   0,    0,    0, ...,    0, 7693, 8330],
       [   0,    0,    0, ..., 7693, 8330, 1633],
       ...,
       [   0,    0,    0, ..., 4970, 1069, 3124],
       [   0,    0,    0, ..., 1069, 3124, 1921],
       [   0,    0,    0, ..., 3124, 1921,   80]])

## Model Train

### X and Y separation

In [7]:
# Split the sequences into input (X) and output (y)
X = input_seq_pad[:, :-1]
y = input_seq_pad[:, -1]

# Convert output to one-hot encoded vectors
y = np.array(torch.nn.functional.one_hot(torch.tensor(y), num_classes=total_words))

print (X)
y

[[   0    0    0 ...    0    0 3493]
 [   0    0    0 ...    0    0 7693]
 [   0    0    0 ...    0 7693 8330]
 ...
 [   0    0    0 ... 4604 4970 1069]
 [   0    0    0 ... 4970 1069 3124]
 [   0    0    0 ... 1069 3124 1921]]


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
# Create a custom Dataset class
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = TextDataset(X, y)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [12]:
# Define the model
class NextWordPredictor(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(NextWordPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.fc(x)
        return x

In [13]:
model = NextWordPredictor(vocab_size = total_words, 
embed_dim = 100, hidden_dim = 150, output_dim = total_words)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [14]:
# Train the model
epochs = 3
for epoch in range(epochs):
    for i, (inputs, labels) in enumerate(dataloader):
        outputs = model(inputs)
        loss = criterion(outputs, labels.argmax(dim=1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

Epoch 1/3, Loss: 5.495655536651611
Epoch 2/3, Loss: 5.30692720413208
Epoch 3/3, Loss: 5.404407024383545


## Prediction

In [26]:
# Generate predictions

# Initial text to predict
seed_text = "I will leave if they"
next_words = 3

# Index to word
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Generate the n next words
model.eval()  # Set the model to evaluation
for _ in range(next_words):
    tokens = word_tokenize(seed_text)
    token_list = [word_to_idx[word] for word in tokens if word in word_to_idx]
    token_list = np.pad(token_list, (max_sequence_len - len(token_list), 0), mode='constant')
    token_list = torch.tensor(token_list[-max_sequence_len:], dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        predicted = model(token_list).argmax(dim=1).item()

    output_word = idx_to_word[predicted]
    seed_text += " " + output_word


print(seed_text)

I will leave if they were to the
