In [2]:
import import_ipynb

In [3]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import string
import random
import os
import glob

from vocab_building import load_tokenized_file, load_vocab, decode_vocab, nlp, get_vocab_indx_vector

importing Jupyter notebook from vocab_building.ipynb


Load categories:

In [4]:
"""def find_files(path): return glob.glob(path)

all_categories = []
for filename in find_files('../data/languages/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    
n_categories_languages = len(all_categories)

if n_categories_languages == 0:
    raise RuntimeError('Data not found.')

print('# categories:', n_categories_languages, all_categories)
all_categories.remove('combined')
all_categories.remove('hungarian')
print('# categories:', n_categories_languages, all_categories)"""

"def find_files(path): return glob.glob(path)\n\nall_categories = []\nfor filename in find_files('../data/languages/*.txt'):\n    category = os.path.splitext(os.path.basename(filename))[0]\n    all_categories.append(category)\n    \nn_categories_languages = len(all_categories)\n\nif n_categories_languages == 0:\n    raise RuntimeError('Data not found.')\n\nprint('# categories:', n_categories_languages, all_categories)\nall_categories.remove('combined')\nall_categories.remove('hungarian')\nprint('# categories:', n_categories_languages, all_categories)"

In [5]:
"""def find_files(path): return glob.glob(path)

all_categories = []
for filename in find_files('../data/scripts/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    
n_categories_languages = len(all_categories)

if n_categories_languages == 0:
    raise RuntimeError('Data not found.')

print('# categories:', n_categories_languages, all_categories)"""

"def find_files(path): return glob.glob(path)\n\nall_categories = []\nfor filename in find_files('../data/scripts/*.txt'):\n    category = os.path.splitext(os.path.basename(filename))[0]\n    all_categories.append(category)\n    \nn_categories_languages = len(all_categories)\n\nif n_categories_languages == 0:\n    raise RuntimeError('Data not found.')\n\nprint('# categories:', n_categories_languages, all_categories)"

In [6]:
def find_files(path):
    """Return the paths matching the glob pattern ``path``, in sorted order.

    ``glob.glob`` gives no ordering guarantee (results follow the file system's
    listing order). Code in this notebook turns positions in the resulting list
    into category indices, so the result is sorted to keep those indices stable
    across machines and runs.
    """
    return sorted(glob.glob(path))

# Every review file contributes one category, named after the file without its extension.
all_categories = [
    os.path.splitext(os.path.basename(review_path))[0]
    for review_path in find_files('../data/reviews/*.txt')
]
n_categories_languages = len(all_categories)

if not n_categories_languages:
    raise RuntimeError('Data not found.')

print('# categories:', n_categories_languages, all_categories)

# Discard the categories that are not wanted, then report what is left.
for unused_category in ('garden', 'music', 'small_combined'):
    all_categories.remove(unused_category)
n_categories_languages = len(all_categories)
print('# categories:', n_categories_languages, all_categories)

# categories: 5 ['garden', 'garden_small', 'music', 'music_small', 'small_combined']
# categories: 2 ['garden_small', 'music_small']


In [6]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
# Batch size: used by the datasets' batch-count bookkeeping and passed to train() by the training cells.
BATCH_SIZE=16
# Token window length: passed to the datasets as sequence_length and used by them when counting samples.
TRAIN_TOKEN_LEN=256
#VOCAB_SIZE = len(vocab)

In [8]:
class RNN_Dataset(torch.utils.data.Dataset):
    """Next-token prediction dataset over a single token stream.

    The stream is cut into consecutive, non-overlapping windows of
    ``sequence_length`` tokens. Sample ``i`` is ``(x, y, 0)`` where ``x`` is
    window ``i``, ``y`` is the same window shifted one token to the right, and
    the constant ``0`` stands in for "no category".
    """

    def __init__(
        self,
        sequence_length,
        vocab_file,
        token_file
    ):
        """
        Args:
            sequence_length: number of tokens in every input/target window.
            vocab_file: path handed to ``load_vocab``.
            token_file: path handed to ``load_tokenized_file``.
        """
        self.sequence_length = sequence_length
        self.load_words(vocab_file, token_file)

        self.uniq_words = len(self.vocab)

    def load_words(self, vocab_file, token_file):
        """Load the vocabulary and tokens, then derive the sample and batch counts.

        Returns:
            1 (kept unchanged from the original interface).
        """
        self.vocab = load_vocab(vocab_file)
        self.raw_tokens = load_tokenized_file(token_file)

        # The target window is the input window shifted right by one token, so the
        # last token of the stream can only ever be a target. Counting windows over
        # len - 1 tokens guarantees every sample has a full-length target; counting
        # over len hands out a target that is one token short whenever the stream
        # length is an exact multiple of the window length, which breaks batching.
        usable_tokens = len(self.raw_tokens) - 1
        # max(1, ...) keeps at least one sample even for a stream shorter than one
        # window; that single sample is then truncated.
        self.num_samples = max(1, usable_tokens // self.sequence_length)
        self.num_batches = max(1, self.num_samples // BATCH_SIZE)

        print('Number of raw_tokens: ', len(self.raw_tokens))
        # NOTE: despite the label, this value is the sample count of the whole dataset.
        print('Number of samples in a batch: ', self.num_samples)
        print('Number of batches: ', self.num_batches)

        return 1

    def __len__(self):
        """Number of windows (samples) available."""
        return self.num_samples

    def __getitem__(self, index):
        """Return ``(x, y, 0)`` for window ``index``.

        Raises:
            IndexError: if ``index`` is outside ``[0, len(self))``. This also lets
                plain iteration over the dataset terminate.
        """
        if not 0 <= index < self.num_samples:
            raise IndexError(f'sample index {index} out of range for {self.num_samples} samples')

        # Stride by the dataset's own window length so windows never overlap or skip tokens.
        start = index * self.sequence_length
        stop = start + self.sequence_length
        return (
            torch.tensor(self.raw_tokens[start:stop]).to(device), # x
            torch.tensor(self.raw_tokens[start + 1:stop + 1]).to(device), # y
            0 # no cat
        )

In [9]:
class RNN_Dataset_multiple_sources(torch.utils.data.Dataset):
    """Next-token prediction dataset that samples random windows from two token streams.

    The vocabulary is shared, but each source keeps its own token list. Every item is
    ``(x, y, category)`` where ``category`` is a per-token index tensor: all zeros when
    the window comes from ``token_file_1`` and all ones when it comes from
    ``token_file_2``.

    The source is chosen through the module-level ``all_categories`` list: its first
    entry selects ``token_file_1`` and any other entry selects ``token_file_2``, so the
    token files must be passed in the same order as the categories.
    """

    def __init__(
        self,
        sequence_length,
        vocab_file,
        token_file_1,
        token_file_2
    ):
        """
        Args:
            sequence_length: number of tokens in every input/target window.
            vocab_file: path handed to ``load_vocab``.
            token_file_1: path handed to ``load_tokenized_file`` for the first category.
            token_file_2: path handed to ``load_tokenized_file`` for every other category.
        """
        self.sequence_length = sequence_length
        self.load_words(vocab_file, token_file_1, token_file_2)

        self.uniq_words = len(self.vocab)

    def load_words(self, vocab_file, token_file_1, token_file_2):
        """Load the shared vocabulary and both token lists, then derive the counts.

        Returns:
            1 (kept unchanged from the original interface).

        Raises:
            ValueError: if a source holds fewer than ``sequence_length + 1`` tokens and
                therefore cannot supply a single input/target window pair.
        """
        self.vocab = load_vocab(vocab_file)
        self.raw_tokens_1 = load_tokenized_file(token_file_1)
        self.raw_tokens_2 = load_tokenized_file(token_file_2)

        self.num_samples_1 = len(self.raw_tokens_1)
        self.num_samples_2 = len(self.raw_tokens_2)

        # Fail here with a clear message instead of later, inside random.randint,
        # the first time the short source happens to be picked.
        minimum_tokens = self.sequence_length + 1
        for source_name, token_count in (
            ('token_file_1', self.num_samples_1),
            ('token_file_2', self.num_samples_2),
        ):
            if token_count < minimum_tokens:
                raise ValueError(
                    f'{source_name} has {token_count} tokens; at least {minimum_tokens} are required'
                )

        total_tokens = self.num_samples_1 + self.num_samples_2
        # Nominal epoch size only: items are drawn at random, so an epoch neither
        # covers every token nor avoids repeats.
        self.num_samples = max(1, total_tokens // self.sequence_length)
        self.num_batches = max(1, self.num_samples // BATCH_SIZE)

        # Add the two lengths rather than concatenating the token lists just to count them.
        print('Number of raw_tokens: ', total_tokens)
        # NOTE: despite the label, this value is the nominal sample count of the whole dataset.
        print('Number of samples in a batch: ', self.num_samples)
        print('Number of batches: ', self.num_batches)

        return 1

    def random_choice(self, l):
        """Return a uniformly random element of ``l``."""
        return l[random.randint(0, len(l)-1)]

    def category_tensor(self, category):
        """Return ``(tensor, index)`` for the category name ``category``.

        ``index`` is the position of ``category`` in ``all_categories``. ``tensor`` is a
        long tensor of length ``sequence_length`` holding 0 when ``index`` is 0 and 1
        otherwise.
        """
        li = all_categories.index(category)
        fill = torch.zeros if li == 0 else torch.ones
        tensor = fill(self.sequence_length, dtype=torch.long, device=device)
        return tensor, li

    def __len__(self):
        """Nominal number of samples per epoch."""
        return self.num_samples

    def __getitem__(self, index):
        """Return a random ``(x, y, category)`` triple; ``index`` is ignored."""
        string_category = self.random_choice(all_categories)
        category, category_index = self.category_tensor(string_category)

        # The first category reads from source 1, every other category from source 2.
        current_sample = self.raw_tokens_1 if category_index == 0 else self.raw_tokens_2

        # randint is inclusive, so the largest start still leaves sequence_length
        # tokens for the input and one extra token for the shifted target.
        random_index = random.randint(0, len(current_sample) - (self.sequence_length + 1))
        end_index = random_index + self.sequence_length
        return (
            torch.tensor(current_sample[random_index:end_index]).to(device), # x
            torch.tensor(current_sample[random_index+1:end_index+1]).to(device), # y
            category
        )
        

In [10]:
class GRU(nn.Module):
    """Language model: token embedding -> stacked ``nn.GRU`` -> linear projection."""

    def __init__(self, input_size, hidden_size, output_size, n_layers):
        """
        Args:
            input_size: number of distinct input token ids (embedding table size).
            hidden_size: embedding width and GRU hidden width.
            output_size: number of output classes produced per time step.
            n_layers: number of stacked GRU layers.
        """
        super(GRU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)

        # nn.GRU applies dropout only between stacked layers and warns when dropout is
        # requested for a single layer, so only ask for it when it can take effect.
        gru_dropout = 0.2 if n_layers > 1 else 0.0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout, batch_first=True)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_token, hidden, batch_size=None):
        """Run the model over a batch of token sequences.

        Args:
            input_token: integer token ids, batch first.
            hidden: GRU hidden state, e.g. from ``init_hidden``.
            batch_size: unused; accepted so existing callers that pass it keep
                working, and optional so callers that omit it work too.

        Returns:
            ``(out, hidden)``: per-step outputs of the linear layer and the updated
            hidden state.
        """
        embedded = self.embedding(input_token)
        out, hidden = self.gru(embedded, hidden)
        return self.fc(out), hidden

    def init_hidden(self, batch_size):
        """Return a zero hidden state shaped ``(n_layers, batch_size, hidden_size)``.

        The state is created on the device that holds the model's weights, so it
        always matches the model regardless of any notebook-level device variable.
        """
        return torch.zeros(
            self.n_layers, batch_size, self.hidden_size, device=self.fc.weight.device
        )

In [11]:
class GRU_category(nn.Module):
    """Category-conditioned GRU language model.

    Tokens and categories are embedded separately, concatenated along the feature
    axis, and fed through a stacked ``nn.GRU`` followed by a linear projection.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers, n_categories=2):
        """
        Args:
            input_size: number of distinct input token ids (word embedding table size).
            hidden_size: width of each embedding and of the GRU hidden state.
            output_size: number of output classes produced per time step.
            n_layers: number of stacked GRU layers.
            n_categories: number of distinct category ids (defaults to the original 2).
        """
        super(GRU_category, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        # The word table is indexed by input tokens, so it is sized by input_size
        # (it used to be sized by output_size, which only works when the two match).
        self.word_embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.cat_embedding = nn.Embedding(num_embeddings=n_categories, embedding_dim=hidden_size)

        # Word and category embeddings are concatenated, hence the doubled input width.
        self.gru = nn.GRU(hidden_size*2, hidden_size, n_layers, batch_first=True)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_token, hidden, batch_size, category):
        """Run the model over a batch of token sequences.

        Args:
            input_token: integer token ids, batch first.
            hidden: GRU hidden state, e.g. from ``init_hidden``.
            batch_size: unused; kept so existing positional calls stay valid.
            category: integer category ids with the same leading two dimensions as
                ``input_token`` (one id per token).

        Returns:
            ``(out, hidden)``: per-step outputs of the linear layer and the updated
            hidden state.
        """
        embedded_word = self.word_embedding(input_token)
        embedded_cat = self.cat_embedding(category)

        # Join the two embeddings along the feature axis.
        combined = torch.cat((embedded_word, embedded_cat), 2)

        out, hidden = self.gru(combined, hidden)
        return self.fc(out), hidden

    def init_hidden(self, batch_size):
        """Return a zero hidden state shaped ``(n_layers, batch_size, hidden_size)``.

        The state is created on the device that holds the model's weights.
        """
        return torch.zeros(
            self.n_layers, batch_size, self.hidden_size, device=self.fc.weight.device
        )

In [12]:
class GRU_with_cells(nn.Module):
    """Three-layer GRU language model unrolled by hand with ``nn.GRUCell``s.

    Each time step feeds the token embedding through three chained GRU cells and
    projects the third cell's hidden state to the output classes.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers):
        """
        Args:
            input_size: number of distinct input token ids (embedding table size).
            hidden_size: embedding width and hidden width of every cell.
            output_size: number of output classes produced per time step.
            n_layers: stored on the instance only; the model always has three cells.
        """
        super(GRU_with_cells, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)

        self.gru_cell_1 = nn.GRUCell(hidden_size, hidden_size)
        self.gru_cell_2 = nn.GRUCell(hidden_size, hidden_size)
        self.gru_cell_3 = nn.GRUCell(hidden_size, hidden_size)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_token, hidden_state, batch_size=None):
        """Run the model over a batch of token sequences of any length.

        Args:
            input_token: integer token ids shaped ``(batch, sequence)``.
            hidden_state: tuple of three per-cell hidden states, e.g. from
                ``init_hidden``.
            batch_size: unused; the batch and sequence sizes are read from
                ``input_token``. Kept (now optional) for existing callers.

        Returns:
            ``(outputs, (hidden_1, hidden_2, hidden_3))`` where ``outputs`` is
            ``(batch, sequence, output_size)``.
        """
        hidden_1, hidden_2, hidden_3 = hidden_state

        embedded = self.embedding(input_token)

        # Iterate over the actual sequence length instead of a hard-coded 256 so the
        # model also works for shorter inputs such as one-token generation steps.
        step_outputs = []
        for step in range(embedded.size(1)):
            hidden_1 = self.gru_cell_1(embedded[:, step, :], hidden_1)
            hidden_2 = self.gru_cell_2(hidden_1, hidden_2)
            hidden_3 = self.gru_cell_3(hidden_2, hidden_3)
            step_outputs.append(self.fc(hidden_3))

        if not step_outputs:
            # Zero-length sequence: nothing to stack, return an empty output.
            empty = embedded.new_zeros((embedded.size(0), 0, self.output_size))
            return empty, (hidden_1, hidden_2, hidden_3)

        # Stack the per-step outputs along a new sequence axis.
        return torch.stack(step_outputs, dim=1), (hidden_1, hidden_2, hidden_3)

    def init_hidden(self, batch_size):
        """Return three zero hidden states, each ``(batch_size, hidden_size)``.

        The states are created on the device that holds the model's weights.
        """
        weights_device = self.fc.weight.device
        return tuple(
            torch.zeros(batch_size, self.hidden_size, device=weights_device)
            for _ in range(3)
        )

In [13]:
class GRU_with_cells_category(nn.Module):
    """Category-conditioned three-cell GRU language model unrolled by hand.

    The concatenated word and category embeddings feed the first cell; the second and
    third cells take the hidden state of the cell before them.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers, n_categories=2):
        """
        Args:
            input_size: number of distinct input token ids (word embedding table size).
            hidden_size: width of each embedding and of every cell's hidden state.
            output_size: number of output classes produced per time step.
            n_layers: stored on the instance only; the model always has three cells.
            n_categories: number of distinct category ids (defaults to the original 2).
        """
        super(GRU_with_cells_category, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        # The word table is indexed by input tokens, so it is sized by input_size
        # (it used to be sized by output_size, which only works when the two match).
        self.word_embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.cat_embedding = nn.Embedding(num_embeddings=n_categories, embedding_dim=hidden_size)

        # Only the first cell sees the concatenated (word + category) features.
        self.gru_cell_1 = nn.GRUCell(hidden_size*2, hidden_size)
        self.gru_cell_2 = nn.GRUCell(hidden_size, hidden_size)
        self.gru_cell_3 = nn.GRUCell(hidden_size, hidden_size)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_token, hidden_state, batch_size, category):
        """Run the model over a batch of token sequences.

        Args:
            input_token: integer token ids shaped ``(batch, sequence)``.
            hidden_state: tuple of three per-cell hidden states, e.g. from
                ``init_hidden``.
            batch_size: unused; the batch size is read from ``input_token``. Kept so
                existing positional calls stay valid.
            category: integer category ids shaped like ``input_token``.

        Returns:
            ``(outputs, (hidden_1, hidden_2, hidden_3))`` where ``outputs`` is
            ``(batch, sequence, output_size)``.
        """
        hidden_1, hidden_2, hidden_3 = hidden_state

        embedded_word = self.word_embedding(input_token)
        embedded_cat = self.cat_embedding(category)
        combined = torch.cat((embedded_word, embedded_cat), 2)

        step_outputs = []
        for step in range(combined.size(1)):
            hidden_1 = self.gru_cell_1(combined[:, step, :], hidden_1)
            hidden_2 = self.gru_cell_2(hidden_1, hidden_2)
            hidden_3 = self.gru_cell_3(hidden_2, hidden_3)
            step_outputs.append(self.fc(hidden_3))

        if not step_outputs:
            # Zero-length sequence: nothing to stack, return an empty output.
            empty = combined.new_zeros((combined.size(0), 0, self.output_size))
            return empty, (hidden_1, hidden_2, hidden_3)

        # Stacking the per-step outputs replaces the preallocated buffer, whose size
        # and device came from the batch_size argument and a notebook-level variable.
        return torch.stack(step_outputs, dim=1), (hidden_1, hidden_2, hidden_3)

    def init_hidden(self, batch_size):
        """Return three zero hidden states, each ``(batch_size, hidden_size)``.

        The states are created on the device that holds the model's weights.
        """
        weights_device = self.fc.weight.device
        return tuple(
            torch.zeros(batch_size, self.hidden_size, device=weights_device)
            for _ in range(3)
        )

In [14]:
class GRU_with_cells_category_edited(nn.Module):
    """Three-cell GRU language model that re-injects the category at every layer.

    The first cell receives the concatenated word and category embeddings. The second
    and third cells, and the final linear layer, each receive the category embedding
    concatenated with the hidden state of the layer before them.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers, n_categories=2):
        """
        Args:
            input_size: number of distinct input token ids (word embedding table size).
            hidden_size: width of each embedding and of every cell's hidden state.
            output_size: number of output classes produced per time step.
            n_layers: stored on the instance only; the model always has three cells.
            n_categories: number of distinct category ids (defaults to the original 2).
        """
        super(GRU_with_cells_category_edited, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        # The word table is indexed by input tokens, so it is sized by input_size
        # (it used to be sized by output_size, which only works when the two match).
        self.word_embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.cat_embedding = nn.Embedding(num_embeddings=n_categories, embedding_dim=hidden_size)

        # Every layer takes a category embedding next to its main input, hence *2.
        self.gru_cell_1 = nn.GRUCell(hidden_size*2, hidden_size)
        self.gru_cell_2 = nn.GRUCell(hidden_size*2, hidden_size)
        self.gru_cell_3 = nn.GRUCell(hidden_size*2, hidden_size)

        self.fc = nn.Linear(hidden_size*2, output_size)

    def forward(self, input_token, hidden_state, batch_size, category):
        """Run the model over a batch of token sequences.

        Args:
            input_token: integer token ids shaped ``(batch, sequence)``.
            hidden_state: tuple of three per-cell hidden states, e.g. from
                ``init_hidden``.
            batch_size: unused; the batch size is read from ``input_token``. Kept so
                existing positional calls stay valid.
            category: integer category ids shaped like ``input_token``.

        Returns:
            ``(outputs, (hidden_1, hidden_2, hidden_3))`` where ``outputs`` is
            ``(batch, sequence, output_size)``.
        """
        hidden_1, hidden_2, hidden_3 = hidden_state

        embedded_word = self.word_embedding(input_token)
        embedded_cat = self.cat_embedding(category)
        combined = torch.cat((embedded_word, embedded_cat), 2)

        step_outputs = []
        for step in range(combined.size(1)):
            cat = embedded_cat[:, step, :]
            hidden_1 = self.gru_cell_1(combined[:, step, :], hidden_1)
            # The category embedding is placed before the previous layer's state.
            hidden_2 = self.gru_cell_2(torch.cat((cat, hidden_1), 1), hidden_2)
            hidden_3 = self.gru_cell_3(torch.cat((cat, hidden_2), 1), hidden_3)
            step_outputs.append(self.fc(torch.cat((cat, hidden_3), 1)))

        if not step_outputs:
            # Zero-length sequence: nothing to stack, return an empty output.
            empty = combined.new_zeros((combined.size(0), 0, self.output_size))
            return empty, (hidden_1, hidden_2, hidden_3)

        # Stacking the per-step outputs replaces the preallocated buffer, whose size
        # and device came from the batch_size argument and a notebook-level variable.
        return torch.stack(step_outputs, dim=1), (hidden_1, hidden_2, hidden_3)

    def init_hidden(self, batch_size):
        """Return three zero hidden states, each ``(batch_size, hidden_size)``.

        The states are created on the device that holds the model's weights.
        """
        weights_device = self.fc.weight.device
        return tuple(
            torch.zeros(batch_size, self.hidden_size, device=weights_device)
            for _ in range(3)
        )

In [15]:
def train(dataset, model, max_epochs, batch_size, cat = False):
    """Train ``model`` on ``dataset`` with Adam and cross-entropy loss.

    Args:
        dataset: map-style dataset whose items are ``(x, y, category)`` triples.
        model: module exposing ``init_hidden(batch_size)`` and a forward call taking
            ``(x, hidden, batch_size)``, plus ``category`` as a fourth argument when
            ``cat`` is true. Its output is expected as (batch, sequence, classes).
        max_epochs: number of passes over the data loader.
        batch_size: batch size; incomplete final batches are dropped.
        cat: pass the category tensor to the model when true.

    Returns:
        A list with one entry per epoch: the mean batch loss of that epoch
        (``0.0`` for an epoch that produced no batches).
    """
    train_losses = []

    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size, drop_last=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):

        total_loss = 0.0
        num_batches = 0

        for batch, (x, y, category) in enumerate(dataloader):
            # Every batch starts from a fresh hidden state.
            hidden_states = model.init_hidden(batch_size)

            optimizer.zero_grad()

            if cat:
                y_pred, _ = model(x, hidden_states, batch_size, category)
            else:
                y_pred, _ = model(x, hidden_states, batch_size)

            # CrossEntropyLoss expects the class dimension second, so move it from
            # (batch, sequence, classes) to (batch, classes, sequence).
            loss = criterion(y_pred.transpose(1, 2), y)
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1

            loss.backward()
            optimizer.step()

            print({ 'epoch': epoch, 'batch': batch, 'loss': batch_loss })

        # Average over the number of batches seen. Dividing by batch_size, as before,
        # scaled the curve by an unrelated constant.
        train_losses.append(total_loss / max(1, num_batches))

    return train_losses

In [16]:
def predict(dataset, model, text, next_words=100):
    """Generate ``next_words`` tokens that continue ``text``.

    The prompt is converted to vocabulary indices, all but its last token are fed
    through the model to warm up the hidden state, and new tokens are then sampled
    one at a time from the softmax of the model's output.

    Args:
        dataset: object exposing the ``vocab`` used for encoding and decoding.
        model: module with ``init_hidden(batch_size)`` and a forward call taking
            ``(tokens, hidden, batch_size)``.
        text: prompt to continue.
        next_words: number of tokens to sample.

    Returns:
        Whatever ``decode_vocab`` returns for the prompt indices followed by the
        sampled indices.

    Raises:
        ValueError: if ``text`` yields no tokens.
    """
    model.eval()

    prediction = list(get_vocab_indx_vector(dataset.vocab, nlp, text))
    if not prediction:
        raise ValueError('text produced no tokens to start generation from')
    tokens = torch.tensor(prediction).to(device)

    state_h = model.init_hidden(1)

    # Generation needs no gradients; disabling them avoids building a graph that
    # grows with every generated token.
    with torch.no_grad():
        # Prime the hidden state with every prompt token except the last one.
        for p in range(len(tokens)-1):
            # The models' forward takes the batch size as its third argument.
            _, state_h = model(tokens[p].view(1,-1), state_h, 1)

        last_token = tokens[-1]
        for _ in range(next_words):
            y_pred, state_h = model(last_token.view(1,-1), state_h, 1)

            # Sample the next token from the distribution at the last position.
            probabilities = torch.nn.functional.softmax(y_pred[0][-1], dim=0)
            # Store a plain int so the index list never mixes ints and tensors.
            word_index = int(torch.multinomial(probabilities, 1)[0])

            prediction.append(word_index)
            last_token = torch.tensor([word_index]).to(device)

    return decode_vocab(dataset.vocab, prediction)


In [17]:
def predict_with_category(dataset, model, text, category, next_words=100):
    """Generate ``next_words`` tokens that continue ``text``, conditioned on ``category``.

    Args:
        dataset: object exposing the ``vocab`` used for encoding and decoding.
        model: module with ``init_hidden(batch_size)`` and a forward call taking
            ``(tokens, hidden, batch_size, category)``.
        text: prompt to continue.
        category: category name; must be an entry of ``all_categories``.
        next_words: number of tokens to sample.

    Returns:
        Whatever ``decode_vocab`` returns for the prompt indices followed by the
        sampled indices.

    Raises:
        ValueError: if ``category`` is not in ``all_categories`` or ``text`` yields
            no tokens.
    """
    model.eval()

    if category not in all_categories:
        raise ValueError(f'Unknown category {category!r}; expected one of {all_categories}')

    prediction = list(get_vocab_indx_vector(dataset.vocab, nlp, text))
    if not prediction:
        raise ValueError('text produced no tokens to start generation from')
    tokens = torch.tensor(prediction).to(device)

    # Same encoding as before: the first category is 0, every other category is 1.
    category_id = 0 if all_categories.index(category) == 0 else 1
    # The category is constant for the whole generation, so one (1, 1) tensor is reused.
    category_step = torch.tensor([[category_id]], dtype=torch.long).to(device)

    state_h = model.init_hidden(1)

    # Generation needs no gradients; disabling them avoids building a graph that
    # grows with every generated token.
    with torch.no_grad():
        # Prime the hidden state with every prompt token except the last one.
        for p in range(len(tokens)-1):
            _, state_h = model(tokens[p].view(1,-1), state_h, 1, category_step)

        last_token = tokens[-1]
        for _ in range(next_words):
            y_pred, state_h = model(last_token.view(1,-1), state_h, 1, category_step)

            # Sample the next token from the distribution at the last position.
            probabilities = torch.nn.functional.softmax(y_pred[0][-1], dim=0)
            # Store a plain int so the index list never mixes ints and tensors.
            word_index = int(torch.multinomial(probabilities, 1)[0])

            prediction.append(word_index)
            last_token = torch.tensor([word_index]).to(device)

    return decode_vocab(dataset.vocab, prediction)


In [None]:
# Train the plain GRU model on the Peter Pan vocabulary/token files and save its weights.
hidden_size = 64
n_layers = 3
num_epochs = 50
file_path = "gru_trained.pt"

dataset = RNN_Dataset(
    TRAIN_TOKEN_LEN,
    "mar_2023_lowercase_peter_pan_vocab.pt",
    "lowercase_peter_pan_Transformer_tok.pkl",
)
# The vocabulary size serves as both the model's input size and its output size.
input_size = dataset.uniq_words

model = GRU(input_size, hidden_size, input_size, n_layers).to(device)
losses = train(dataset, model, num_epochs, BATCH_SIZE)

torch.save(model.state_dict(), file_path)

In [None]:
# Train the hand-unrolled GRU-cell model on the Peter Pan vocabulary/token files and save its weights.
hidden_size = 512
n_layers = 3
num_epochs = 50
file_path_cells = "gru_trained_cells.pt"

dataset = RNN_Dataset(
    TRAIN_TOKEN_LEN,
    "mar_2023_lowercase_peter_pan_vocab.pt",
    "lowercase_peter_pan_Transformer_tok.pkl",
)
# The vocabulary size serves as both the model's input size and its output size.
input_size = dataset.uniq_words

model_with_cells = GRU_with_cells(input_size, hidden_size, input_size, n_layers).to(device)
losses_with_cells = train(dataset, model_with_cells, num_epochs, BATCH_SIZE)

torch.save(model_with_cells.state_dict(), file_path_cells)

CATEGORY MODELS:

In [18]:
# Train the category-conditioned nn.GRU model on the two small review token files.
hidden_size = 256
n_layers = 3
num_epochs = 3
file_path = "gru_trained_cat_reviews.pt"

dataset = RNN_Dataset_multiple_sources(
    TRAIN_TOKEN_LEN,
    "mar_2023_lowercase_reviews_small_vocab.pt",
    "lowercase_garden_small_tok.pkl",
    "lowercase_music_small_tok.pkl",
)
# The vocabulary size serves as both the model's input size and its output size.
input_size = dataset.uniq_words

category_model = GRU_category(input_size, hidden_size, input_size, n_layers).to(device)
losses_cat = train(dataset, category_model, num_epochs, BATCH_SIZE, cat=True)

torch.save(category_model.state_dict(), file_path)

Finished.
Vocabulary sizes:
33806
Number of raw_tokens:  1245513
Number of samples in a batch:  4865
Number of batches:  304
{'epoch': 0, 'batch': 0, 'loss': 10.426780700683594}
{'epoch': 0, 'batch': 1, 'loss': 10.294330596923828}
{'epoch': 0, 'batch': 2, 'loss': 10.063139915466309}
{'epoch': 0, 'batch': 3, 'loss': 9.715354919433594}
{'epoch': 0, 'batch': 4, 'loss': 9.317502975463867}
{'epoch': 0, 'batch': 5, 'loss': 8.895532608032227}
{'epoch': 0, 'batch': 6, 'loss': 8.585273742675781}
{'epoch': 0, 'batch': 7, 'loss': 8.206415176391602}
{'epoch': 0, 'batch': 8, 'loss': 7.929788589477539}
{'epoch': 0, 'batch': 9, 'loss': 7.640334129333496}
{'epoch': 0, 'batch': 10, 'loss': 7.458206653594971}
{'epoch': 0, 'batch': 11, 'loss': 7.248251914978027}
{'epoch': 0, 'batch': 12, 'loss': 7.090412616729736}
{'epoch': 0, 'batch': 13, 'loss': 6.993451118469238}
{'epoch': 0, 'batch': 14, 'loss': 6.838291168212891}
{'epoch': 0, 'batch': 15, 'loss': 6.721880912780762}
{'epoch': 0, 'batch': 16, 'loss': 

In [19]:
# Train the category-conditioned GRU-cell model on the two small review token files.
hidden_size = 256
n_layers = 3
num_epochs = 3
file_path = "gru_trained_cat_cells_reviews.pt"

dataset = RNN_Dataset_multiple_sources(
    TRAIN_TOKEN_LEN,
    "mar_2023_lowercase_reviews_small_vocab.pt",
    "lowercase_garden_small_tok.pkl",
    "lowercase_music_small_tok.pkl",
)
# The vocabulary size serves as both the model's input size and its output size.
input_size = dataset.uniq_words

cells_category_model = GRU_with_cells_category(input_size, hidden_size, input_size, n_layers).to(device)
losses_cat_cells = train(dataset, cells_category_model, num_epochs, BATCH_SIZE, cat=True)

torch.save(cells_category_model.state_dict(), file_path)

Finished.
Vocabulary sizes:
33806
Number of raw_tokens:  1245513
Number of samples in a batch:  4865
Number of batches:  304
{'epoch': 0, 'batch': 0, 'loss': 10.433921813964844}
{'epoch': 0, 'batch': 1, 'loss': 10.302325248718262}
{'epoch': 0, 'batch': 2, 'loss': 10.084747314453125}
{'epoch': 0, 'batch': 3, 'loss': 9.768202781677246}
{'epoch': 0, 'batch': 4, 'loss': 9.317633628845215}
{'epoch': 0, 'batch': 5, 'loss': 8.877665519714355}
{'epoch': 0, 'batch': 6, 'loss': 8.537729263305664}
{'epoch': 0, 'batch': 7, 'loss': 8.163792610168457}
{'epoch': 0, 'batch': 8, 'loss': 7.851109027862549}
{'epoch': 0, 'batch': 9, 'loss': 7.532853603363037}
{'epoch': 0, 'batch': 10, 'loss': 7.365812301635742}
{'epoch': 0, 'batch': 11, 'loss': 7.108844757080078}
{'epoch': 0, 'batch': 12, 'loss': 7.009706974029541}
{'epoch': 0, 'batch': 13, 'loss': 6.869292736053467}
{'epoch': 0, 'batch': 14, 'loss': 6.7958221435546875}
{'epoch': 0, 'batch': 15, 'loss': 6.790960311889648}
{'epoch': 0, 'batch': 16, 'loss':

In [None]:
# Train the GRU-cell variant that re-injects the category at every layer, on the two
# small review token files.
hidden_size = 256
n_layers = 3
num_epochs = 3
file_path = "gru_trained_cat_cells_edited_reviews.pt"

dataset = RNN_Dataset_multiple_sources(
    TRAIN_TOKEN_LEN,
    "mar_2023_lowercase_reviews_small_vocab.pt",
    "lowercase_garden_small_tok.pkl",
    "lowercase_music_small_tok.pkl",
)
# The vocabulary size serves as both the model's input size and its output size.
input_size = dataset.uniq_words

cells_category_edited_model = GRU_with_cells_category_edited(input_size, hidden_size, input_size, n_layers).to(device)
losses_cat_cells_edited = train(dataset, cells_category_edited_model, num_epochs, BATCH_SIZE, cat=True)

torch.save(cells_category_edited_model.state_dict(), file_path)

In [None]:
import matplotlib.pyplot as plt
# Overlay the loss histories returned by train() for the three category models.
fig, ax = plt.subplots()
loss_curves = (
    (losses_cat, "original"),
    (losses_cat_cells, "original with cells"),
    (losses_cat_cells_edited, "edited"),
)
for curve, curve_label in loss_curves:
    ax.plot(range(len(curve)), curve, label=curve_label)
ax.set_title("Loss over time")
ax.set_xlabel("Time")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

In [None]:
# 'english' is not among the review categories loaded earlier in this notebook, so looking
# it up raised ValueError; condition on the first loaded category instead.
print(' '.join(predict_with_category(dataset, category_model, text='i am', category=all_categories[0], next_words=100)))

In [None]:
# 'english' is not among the review categories loaded earlier in this notebook, so looking
# it up raised ValueError; condition on the first loaded category instead.
print(' '.join(predict_with_category(dataset, cells_category_model, text='i am', category=all_categories[0], next_words=100)))

In [None]:
# 'english' is not among the review categories loaded earlier in this notebook, so looking
# it up raised ValueError; condition on the first loaded category instead.
print(' '.join(predict_with_category(dataset, cells_category_edited_model, text='i am', category=all_categories[0], next_words=100)))

In [None]:
# 'news' is not among the review categories loaded earlier in this notebook, so looking it
# up raised ValueError; condition on the second loaded category instead.
print(' '.join(predict_with_category(dataset, category_model, text='i am', category=all_categories[1], next_words=100)))