In [3]:
import torch
import torch.nn as nn
import pandas as pd
import os
import numpy as np

In [4]:
import re
import string
from nltk.stem import PorterStemmer

# Read the contents of the text file
with open("data/Elman.txt", "r") as file:
    text = file.read()
    
def remove_non_ascii(s):
    return re.sub(r'[^\x00-\x7F]+|\d', '', s)

# Create a PorterStemmer object
stemmer = PorterStemmer()

# Splitting the text into sentences
sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s|\n', text)
sentences = [remove_non_ascii(item) for item in sentences]
# print(sentences)

In [5]:
# Preprocessing function
def preprocess(sentence):
    # Convert to lowercase
    sentence = sentence.lower()
    # Remove punctuation
    sentence = sentence.translate(str.maketrans("", "", string.punctuation))
    # Tokenize the sentence
    words = sentence.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return stemmed_words
    # return words

# Applying the preprocessing function to each sentence
preprocessed_sentences = [preprocess(sentence) for sentence in sentences]
sentences_tokens = [['<SOS>']+s+['<EOS>'] for s in preprocessed_sentences]
# print(sentences_tokens)

In [6]:
unique_tokens = sorted(set(sum(sentences_tokens,[])))
# print(unique_tokens)
def sentenceToTensor(tokens_list):
    # Convert list of strings to tensor of token indices (integers)
    #
    # Input
    #  tokens_list : list of strings, e.g. ['<SOS>','lion','eat','man','<EOS>']
    # Output
    #  1D tensor of the same length (integers), e.g., tensor([ 2, 18, 13, 19,  0])
    assert(isinstance(tokens_list,list))
    tokens_index = [token_to_index[token] for token in tokens_list]
    return torch.tensor(tokens_index)

n_tokens = len(unique_tokens) # all words and special tokens
token_to_index = {t : i for i,t in enumerate(unique_tokens)}
index_to_token = {i : t for i,t in enumerate(unique_tokens)}
training_pats = [sentenceToTensor(s) for s in sentences_tokens] # python list of 1D sentence tensors
ntrain = len(training_pats)
print('mapping unique tokens to integers: %s \n' % token_to_index)
print('example sentence as string: %s \n' % ' '.join(sentences_tokens[0]))
print('example sentence as tensor: %s \n' % training_pats[0])

print(unique_tokens)

mapping unique tokens to integers: {'<EOS>': 0, '<SOS>': 1, 'book': 2, 'boy': 3, 'bread': 4, 'break': 5, 'car': 6, 'cat': 7, 'chase': 8, 'cooki': 9, 'dog': 10, 'dragon': 11, 'eat': 12, 'exist': 13, 'girl': 14, 'glass': 15, 'like': 16, 'lion': 17, 'man': 18, 'monster': 19, 'mous': 20, 'move': 21, 'plate': 22, 'rock': 23, 'sandwich': 24, 'see': 25, 'sleep': 26, 'smash': 27, 'smell': 28, 'think': 29, 'woman': 30} 

example sentence as string: <SOS> dragon break plate <EOS> 

example sentence as tensor: tensor([ 1, 11,  5, 22,  0]) 

['<EOS>', '<SOS>', 'book', 'boy', 'bread', 'break', 'car', 'cat', 'chase', 'cooki', 'dog', 'dragon', 'eat', 'exist', 'girl', 'glass', 'like', 'lion', 'man', 'monster', 'mous', 'move', 'plate', 'rock', 'sandwich', 'see', 'sleep', 'smash', 'smell', 'think', 'woman']


In [7]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
         # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        
        # Fully connected layer (output layer)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Initialize hidden state and cell state
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        
        # LSTM forward pass
        out, _ = self.lstm(x, (h0, c0))
        
        # Extract the output of the last time step and pass it through the fully connected layer
        out = self.fc(out[:, -1, :])
        
        return out
    
    def initHidden(self):
        # Returns length hidden_size 1D tensor of zeros
        return torch.zeros(self.hidden_size)
    
    def get_embeddings(self):
        # Returns [vocab_size x hidden_size] numpy array of input embeddings
        return self.embed(torch.arange(self.vocab_size)).detach().numpy()

In [16]:
def train(seq_tensor, lstm):
    # Process a sentence and update the SRN weights. With <SOS> as the input at step 0,
    # predict every subsequent word given the past words.
    # Return the mean loss across each symbol prediction.
    #
    # Input
    #   seq_tensor: [1D tensor] sentence as token indices
    #   rnn : instance of SRN class
    # Output
    #   loss : [scalar] average NLL loss across prediction steps
    # TODO : YOUR CODE GOES HERE
    
    hidden = lstm.initHidden()
    lstm.train()
    lstm.zero_grad()
    loss = 0
    seq_length = seq_tensor.shape[0]

    for i in range(seq_length - 1):
        output, hidden = lstm(seq_tensor[i])
        loss += criterion(output, seq_tensor[i+1]) 
    loss.backward()
    optimizer.step()
    
    return loss.item() / float(seq_length-1)
    # raise Exception('Replace with your code.')

In [18]:
input_size = 10  # The number of features in your input
hidden_size = 128  # The number of hidden units in the LSTM layer
num_layers = 2  # The number of stacked LSTM layers
output_size = 1  # The size of the output (e.g., for a regression problem, it's 1)
nepochs = 20

lstm = LSTMModel(input_size, hidden_size, num_layers, output_size)
optimizer = torch.optim.AdamW(lstm.parameters(), weight_decay=0.04) # w/ default learning rate 0.001
criterion = nn.NLLLoss()

for i in range(nepochs):
    perm = np.random.permutation(len(training_pats))
    error_epoch = 0.
    for p in perm:
        loss = train(training_pats[p], lstm)
        error_epoch += loss
    error_epoch = error_epoch / float(len(training_pats)) 
    print(f"loss for epoch {i+1} is: {error_epoch}")
# raise Exception('Replace with your code.')

IndexError: Dimension specified as 0 but tensor has no dimensions