In [1]:
# RNN which will (hopefully) generate a random chat between 2 people

In [3]:
# # for kaggle
# # import kaggle
# for work
import torch
import nltk # to tokenize the text
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# get and unzip dataset - only need to run once
# dataset = 'projjal1/human-conversation-training-data'
# kaggle.api.dataset_download_files(dataset, path=".", unzip=True)

In [6]:
# reading the data line by line into list
text = None
with open('human_chat.txt','r',encoding='utf-8') as dataset_file:
    text = dataset_file.read()
print("Total characters in text: ",len(text))
print(text[:495])

Total characters in text:  115782
Human 1: Hi!
Human 2: What is your favorite holiday?
Human 1: one where I get to meet lots of different people.
Human 2: What was the most number of people you have ever met during a holiday?
Human 1: Hard to keep a count. Maybe 25.
Human 2: Which holiday was that?
Human 1: I think it was Australia
Human 2: Do you still talk to the people you met?
Human 1: Not really. The interactions are usually short-lived but it's fascinating to learn where people are coming from and what matters to them


In [7]:
# building vocabulary in token form
# words = [line.lower().split() for line in lines] # using nltk tokens for better splitting (fine! vs fine)
words = word_tokenize(text.lower()) # get tokens in the form of words and punctuations from the text
print(words[:100])
print("Total words: ",len(words))
print("Unique words: ", len(set((words))))

['human', '1', ':', 'hi', '!', 'human', '2', ':', 'what', 'is', 'your', 'favorite', 'holiday', '?', 'human', '1', ':', 'one', 'where', 'i', 'get', 'to', 'meet', 'lots', 'of', 'different', 'people', '.', 'human', '2', ':', 'what', 'was', 'the', 'most', 'number', 'of', 'people', 'you', 'have', 'ever', 'met', 'during', 'a', 'holiday', '?', 'human', '1', ':', 'hard', 'to', 'keep', 'a', 'count', '.', 'maybe', '25.', 'human', '2', ':', 'which', 'holiday', 'was', 'that', '?', 'human', '1', ':', 'i', 'think', 'it', 'was', 'australia', 'human', '2', ':', 'do', 'you', 'still', 'talk', 'to', 'the', 'people', 'you', 'met', '?', 'human', '1', ':', 'not', 'really', '.', 'the', 'interactions', 'are', 'usually', 'short-lived', 'but', 'it', "'s"]
Total words:  27943
Unique words:  2813


In [8]:
vocabulary = sorted(set(words)) # sorts the words alphabetically
print(len(vocabulary))
print(vocabulary[:50])

2813
['!', '%', '&', "'", "''", "'billions", "'court", "'d", "'ll", "'m", "'re", "'s", "'ve", '(', ')', '*', '+', ',', '-', '--', '.', '..', '...', '....', '/', '1', '1-2', '1.', '10', '114', '12', '15', '1:1', '2', '2-3', '2.', '20', '2019', '23rd', '24', '24th', '25.', '3', '3.', '30c', '320', '3pm', '4', '4.', '4.30']


In [9]:
# create lookup table for word to in and reverse
word2int_mapping = {word:i for i,word in enumerate(vocabulary)}
word_array = np.array(vocabulary) # reverse of word2int_mapping, the index holds the word
# print(word2int_mapping)
print(word_array[:50])

['!' '%' '&' "'" "''" "'billions" "'court" "'d" "'ll" "'m" "'re" "'s"
 "'ve" '(' ')' '*' '+' ',' '-' '--' '.' '..' '...' '....' '/' '1' '1-2'
 '1.' '10' '114' '12' '15' '1:1' '2' '2-3' '2.' '20' '2019' '23rd' '24'
 '24th' '25.' '3' '3.' '30c' '320' '3pm' '4' '4.' '4.30']


In [10]:
# encoding and decoding the sentences according to the vocabulary
encoded_lines = np.array([word2int_mapping[word] for word in words], dtype=np.int32)
print(encoded_lines)
print("Encoded lines shape:",encoded_lines.shape)
print("Words", words[:20])
print("Encoding,", encoded_lines[:20])
print("Reverse conversion: ", ' '.join(word_array[encoded_lines[:20]])) # ' '.join(word_array[np.array(list_of_indices))

[1187   25   56 ... 1574 1281   20]
Encoded lines shape: (27943,)
Words ['human', '1', ':', 'hi', '!', 'human', '2', ':', 'what', 'is', 'your', 'favorite', 'holiday', '?', 'human', '1', ':', 'one', 'where', 'i']
Encoding, [1187   25   56 1134    0 1187   33   56 2708 1278 2783  876 1159   60
 1187   25   56 1664 2713 1203]
Reverse conversion:  human 1 : hi ! human 2 : what is your favorite holiday ? human 1 : one where i


In [11]:
# make sequences and chunks
sequence_size = 50 # input sequence length
chunk_size = sequence_size + 1

text_chunks = [encoded_lines[i:i+chunk_size] for i in range(len(encoded_lines)-chunk_size+1)] # make text chunks of sequence size
# each chunk start +1 to the right of the previous
print(text_chunks[:1],end="\n\n")

for seq in text_chunks[:1]:
    input_seq = seq[:sequence_size] # input sequence = 50 words
    target = seq[sequence_size] # target output = one word
    print(input_seq,'->',target)
    print(repr(' '.join(word_array[input_seq])),'->',repr(''.join(word_array[target])))

[array([1187,   25,   56, 1134,    0, 1187,   33,   56, 2708, 1278, 2783,
        876, 1159,   60, 1187,   25,   56, 1664, 2713, 1203, 1009, 2501,
       1491, 1427, 1642,  688, 1762,   20, 1187,   33,   56, 2708, 2681,
       2461, 1556, 1628, 1642, 1762, 2781, 1103,  812, 1512,  749,   63,
       1159,   60, 1187,   25,   56, 1095, 2501], dtype=int32)]

[1187   25   56 1134    0 1187   33   56 2708 1278 2783  876 1159   60
 1187   25   56 1664 2713 1203 1009 2501 1491 1427 1642  688 1762   20
 1187   33   56 2708 2681 2461 1556 1628 1642 1762 2781 1103  812 1512
  749   63 1159   60 1187   25   56 1095] -> 2501
'human 1 : hi ! human 2 : what is your favorite holiday ? human 1 : one where i get to meet lots of different people . human 2 : what was the most number of people you have ever met during a holiday ? human 1 : hard' -> 'to'


In [12]:
# convert into proper pytorch dataset form

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks
    def __len__(self):
        return len(self.text_chunks) # number of text chunks
    def __getitem__(self,idx):
        text_chunk = self.text_chunks[idx] # return the idxth text chunk
        return text_chunk[:-1].long(), text_chunk[1:].long() # return it in 2 parts: all words cept last (input sequence) and all words cept 1st (target sequence)

seq_dataset = TextDataset(torch.tensor((text_chunks)))

  seq_dataset = TextDataset(torch.tensor((text_chunks)))


In [13]:
# checking out our get item chunks functionality
for i,(seq,target) in enumerate(seq_dataset): # gets items in order using overloaded getitem
    print("Input:",repr(' '.join(word_array[seq])))
    print("Target:",repr(' '.join(word_array[target])))
    print()
    if i == 2:
        break

Input: 'human 1 : hi ! human 2 : what is your favorite holiday ? human 1 : one where i get to meet lots of different people . human 2 : what was the most number of people you have ever met during a holiday ? human 1 : hard'
Target: '1 : hi ! human 2 : what is your favorite holiday ? human 1 : one where i get to meet lots of different people . human 2 : what was the most number of people you have ever met during a holiday ? human 1 : hard to'

Input: '1 : hi ! human 2 : what is your favorite holiday ? human 1 : one where i get to meet lots of different people . human 2 : what was the most number of people you have ever met during a holiday ? human 1 : hard to'
Target: ': hi ! human 2 : what is your favorite holiday ? human 1 : one where i get to meet lots of different people . human 2 : what was the most number of people you have ever met during a holiday ? human 1 : hard to keep'

Input: ': hi ! human 2 : what is your favorite holiday ? human 1 : one where i get to meet lots of differe

In [14]:
# create a dataloader
batch_size = 32
dataloader = torch.utils.data.DataLoader(seq_dataset,batch_size=batch_size,shuffle=True,drop_last=True) # drop last batch if it is smaller than batch size

In [15]:
# this code tells what to use, not neccessary if u dont have a gpu to run on
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda:0


In [16]:
# my rnn
class ChatGeneratingRNN(torch.nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embed_size) # embedds the encodings into vectors
        self.rnn_hidden_size = hidden_size # num hidden layer neurons
        self.rnn = torch.nn.LSTM(embed_size, hidden_size, batch_first=True) # LSTM layer as hidden layer
        self.fc = torch.nn.Linear(hidden_size, vocab_size) # fully connected layer as output layer

    def forward(self, inputs, hidden, cell):
        layer_output = self.embedding(inputs).unsqueeze(1) # unqueeze = add a dim of size = 1 at indx = 1
        layer_output, (hidden,cell) = self.rnn(layer_output,(hidden,cell))
        layer_output = self.fc(layer_output).reshape(layer_output.size(0),-1) # output dim (dim0, mult_of_other_dims)
        return layer_output, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1,batch_size,self.rnn_hidden_size) # init hidden with all 0s - 1 is num layers in LSTM
        cell = torch.zeros(1,batch_size,self.rnn_hidden_size) # init cell with all 0s
        return hidden.to(DEVICE), cell.to(DEVICE)

In [17]:
# Hyperparameters
vocab_size = len(word_array)
embed_size = 256
hidden_size = 512

In [18]:
# creating model
model = ChatGeneratingRNN(vocab_size, embed_size, hidden_size)
model = model.to(DEVICE)
model

ChatGeneratingRNN(
  (embedding): Embedding(2813, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=2813, bias=True)
)

In [19]:
# setting output activation and loss
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [37]:
# training the model
training_epochs = 5001
model.to(DEVICE)
model.train()
for epoch in range(training_epochs):
    hidden,cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(dataloader)) # get batches of batch size from the dataloader
    seq_batch = seq_batch.to(DEVICE)
    target_batch = target_batch.to(DEVICE)
    optimizer.zero_grad()
    loss = 0
    for w in range(sequence_size): # w: 0 -> sequence_size - 1
        pred, hidden, cell = model(seq_batch[:,w],hidden,cell) # run forward iteration, all rows entries at column = w
        loss += loss_function(pred, target_batch[:,w]) # caculate cumulative loss, all rows entries at column w
    loss.backward()
    optimizer.step()
    loss = loss.item()/sequence_size # normalize loss
    if epoch % 500 == 0:
      print(f'Epoch [{epoch}/{training_epochs}], Loss: {loss:.4f}')

Epoch [0/5001], Loss: 0.4349
Epoch [500/5001], Loss: 0.3923
Epoch [1000/5001], Loss: 0.3332
Epoch [1500/5001], Loss: 0.3252
Epoch [2000/5001], Loss: 0.2842
Epoch [2500/5001], Loss: 0.2828
Epoch [3000/5001], Loss: 0.2413
Epoch [3500/5001], Loss: 0.2597
Epoch [4000/5001], Loss: 0.2171
Epoch [4500/5001], Loss: 0.2148
Epoch [5000/5001], Loss: 0.2033


In [38]:
def top_p_sampling(logits, temperature=1.0, top_p=0.9):
    # Apply temperature scaling
    scaled_logits = logits / temperature

    # Convert logits to probabilities using softmax
    probabilities = torch.softmax(scaled_logits, dim=-1)

    # Sort probabilities and compute cumulative sum
    sorted_indices = torch.argsort(probabilities, descending=True)
    sorted_probabilities = probabilities[sorted_indices]
    cumulative_probabilities = torch.cumsum(sorted_probabilities, dim=-1)

    # Apply top-p filtering
    indices_to_keep = cumulative_probabilities <= top_p
    truncated_probabilities = sorted_probabilities[indices_to_keep]

    # Rescale the probabilities
    truncated_probabilities /= torch.sum(truncated_probabilities)

    # Convert to numpy arrays for random choice
    truncated_probabilities = truncated_probabilities.cpu().numpy()
    sorted_indices = sorted_indices.cpu().numpy()
    indices_to_keep = indices_to_keep.cpu().numpy()

    # Sample from the truncated distribution
    if not indices_to_keep.any():
        # Handle the empty case - for example, using regular sampling without top-p
        probabilities = torch.softmax(logits / temperature, dim=-1)
        next_word_index = torch.multinomial(probabilities, 1).item() # sample 1 item based on probabilities
    else:
        # Existing sampling process
        next_word_index = np.random.choice(sorted_indices[indices_to_keep], p=truncated_probabilities)

    return torch.tensor(next_word_index).to(DEVICE)

In [39]:
def generate(model, seed_string, len_generated_text=50, temperature=1.0, top_p=0.95):
    seed_tokens = word_tokenize(seed_string.lower())
    encoded_input = torch.tensor([word2int_mapping[t] for t in seed_tokens])
    encoded_input = torch.reshape(encoded_input, (1, -1)).to(DEVICE) # reshape to 2d form (1, num_tokens)
    generated_str = seed_string # init with input
    model.eval()
    with torch.inference_mode():
        hidden, cell = model.init_hidden(1)
        hidden = hidden.to(DEVICE)
        cell = cell.to(DEVICE)
        for w in range(len(seed_tokens) - 1):
            _, hidden, cell = model(encoded_input[:, w].view(1), hidden, cell)
        last_word = encoded_input[:, -1] # initializes with last word of seed string
        for i in range(len_generated_text):
            logits, hidden, cell = model(last_word.view(1), hidden, cell) # start generating starting with giving the last word as 1st input
            logits = torch.squeeze(logits, 0)
            last_word = top_p_sampling(logits.cpu(), temperature, top_p)  # Ensure logits is on CPU and replace last word with generated next word
            generated_str += " " + str(word_array[last_word]) # add the generated next word tio generated_str

    return generated_str.replace(" . ", ". ") # ok thanks . i will look into it -> ok thanks. i will look into it

In [40]:
model.to(DEVICE)
print(generate(model, seed_string='how did that'))

how did that you enjoyed it ? human 1 : actually no idea of it was a little as well. do you have any plans for the break ? human 2 : what 's the book is about ? human 1 : it 's a sci-fi book about aliens yourself ? human


In [41]:
print(generate(model, seed_string= 'the human'))

the human 1 : oh that 's cool ! what is your favorite ? human 2 : not great , but i know too much is things. any plans for the break ? human 1 : there 's the one of < redacted_term > used to run after death. i


In [36]:
print(generate(model, seed_string='once upon a time'))

once upon a time best ! human 2 : hello there ! who are you doing ? human 1 : i 've been reading the conference i prefer it everyday over the weekends. human 2 : lol i ca n't tell my cat the city and i think its best to start discussing
