In [2]:
import math
import torch
import torch.nn as nn

### Exercise 2. (A)


When we anticipate numerous out-of-vocabulary (OOV) words at test time, the preferred tokenization method is **subword tokenization**, because it handles OOV words by decomposing them into smaller, more common units known as subwords. By representing OOV words using subwords, the model can still capture meaningful information from these words, even if they were never encountered during training.


Two widely used subword tokenization approaches are WordPiece and Byte Pair Encoding (BPE). **WordPiece** splits words into character subsequences based on a predefined vocabulary of subwords. It's particularly effective as it can generate subwords that correspond to meaningful morphemes.
**Byte Pair Encoding (BPE)** starts from individual characters (or bytes) and iteratively merges the most frequent pair of adjacent symbols in the corpus until it reaches a specified vocabulary size. Because its subword vocabulary is learned directly from the training corpus rather than specified by hand, it adapts readily to new words and languages.



In situations where OOV words are prevalent, **subword tokenization** offers several advantages over traditional word-based tokenization:

**Reduced OOV Rate**: Subwords effectively decompose OOV words into known units, significantly reducing the number of true OOV tokens encountered by the model.

**Improved Representation**: By representing OOV words using subwords, the model can still extract contextual information and semantic relationships from these words, even if it has not been explicitly trained on them.

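**Vocabulary Flexibility**: Subword tokenization techniques don't require every word to appear in the vocabulary: a compact set of subwords can be recombined to represent unseen words, allowing the model to handle new words (and, to a degree, new languages) without explicit vocabulary updates.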

Therefore, **subword tokenization** is the preferred choice for scenarios where OOV words are expected, as it effectively handles these rare words while still preserving meaningful information for model training and prediction.

### References:
 - https://www.datacamp.com/blog/what-is-tokenization

### Exercise 2. (B) I.


In [2]:
def bag_of_words(sentence, vocabulary):
    """
    Build a bag-of-words count vector for a sentence.

    The sentence is split on whitespace and matched against the vocabulary
    exactly (case-sensitive, no punctuation stripping). Tokens that are not
    in the vocabulary are ignored.

    Parameters:
        - sentence (str): Text to encode.
        - vocabulary (sequence of str): Known terms; position i of the result
          counts occurrences of vocabulary[i].

    Returns:
        - list of int: One count per vocabulary entry, in vocabulary order.
    """
    # Map each term to its position once, instead of scanning the vocabulary
    # twice (`in` + `.index`) for every token. `setdefault` keeps the FIRST
    # position of a duplicated term, which is what `list.index` returns.
    term_position = {}
    for position, term in enumerate(vocabulary):
        term_position.setdefault(term, position)

    # Initialize a vector with zeros for each word in the vocabulary
    bag_of_words_vector = [0] * len(vocabulary)

    # Count the frequency of each in-vocabulary word in the sentence
    for word in sentence.split():
        position = term_position.get(word)
        if position is not None:
            bag_of_words_vector[position] += 1

    return bag_of_words_vector

# Worked example for Exercise 2 (B) I. The order of the vocabulary terms
# defines the positions of the resulting bag-of-words vector.
vocabulary = [
    "and", "apple", "banana", "eat", "hate",
    "I", "pie", "strawberry", "the", "they",
]

input_sentence = 'You and I eat the strawberry pie'

# Count how often each vocabulary term occurs in the sentence.
result = bag_of_words(sentence=input_sentence, vocabulary=vocabulary)

# Bare trailing expression so the notebook displays the vector.
result

[1, 0, 0, 1, 0, 1, 1, 1, 1, 0]

### Exercise 2. (B) II.

In [4]:
vocabulary = ["and", "apple", "banana", "eat", "hate", "I", "pie", "strawberry", "the", "they"]
# Lower-case the vocabulary so it matches the lower-cased sentence tokens below
vocabulary = [word.lower() for word in vocabulary]

# Per-term document counts used as the IDF denominator; paired with
# `vocabulary` by position.
document_counts = [90, 30, 15, 40, 10, 60, 20, 5, 85, 30]

total_documents = 100

sentence = "You and I eat the strawberry pie"

# Tokenize the sentence into lower-cased, whitespace-separated words
words = sentence.lower().split()
num_words = len(words)

# The two lists are walked in lockstep, so a length mismatch would silently
# drop terms; fail loudly instead.
assert len(vocabulary) == len(document_counts), "vocabulary and document_counts must align"

# Compute TF-IDF representation
tf_representation = []
idf_representation = []
tfidf_representation = []

# Pair each term with its own document count by position rather than looking
# it up with vocabulary.index(term) on every iteration.
for term, document_count in zip(vocabulary, document_counts):
    # TF (Term Frequency): share of the sentence's tokens equal to this term
    tf = round(words.count(term) / num_words if num_words > 0 else 0, 6)
    tf_representation.append(tf)

    # IDF (Inverse Document Frequency): natural log of total / document count
    idf = round(math.log(total_documents / document_count), 6)
    idf_representation.append(idf)

    # TF-IDF. NOTE: this multiplies the already-rounded TF and IDF, so the
    # last decimal can differ from rounding the exact product once.
    tfidf = round(tf * idf, 6)
    tfidf_representation.append(tfidf)

for word, tf, idf, tfidf in zip(vocabulary, tf_representation, idf_representation, tfidf_representation):
    print(f"Word: {word}, TF: {tf}, IDF: {idf}, TF-IDF: {tfidf}")

print()
print("TF-IDF representation of vocabulary for sentence:")
print(tfidf_representation)

Word: and, TF: 0.142857, IDF: 0.105361, TF-IDF: 0.015052
Word: apple, TF: 0.0, IDF: 1.203973, TF-IDF: 0.0
Word: banana, TF: 0.0, IDF: 1.89712, TF-IDF: 0.0
Word: eat, TF: 0.142857, IDF: 0.916291, TF-IDF: 0.130899
Word: hate, TF: 0.0, IDF: 2.302585, TF-IDF: 0.0
Word: i, TF: 0.142857, IDF: 0.510826, TF-IDF: 0.072975
Word: pie, TF: 0.142857, IDF: 1.609438, TF-IDF: 0.229919
Word: strawberry, TF: 0.142857, IDF: 2.995732, TF-IDF: 0.427961
Word: the, TF: 0.142857, IDF: 0.162519, TF-IDF: 0.023217
Word: they, TF: 0.0, IDF: 1.203973, TF-IDF: 0.0

TF-IDF representation of vocabulary for sentence:
[0.015052, 0.0, 0.0, 0.130899, 0.0, 0.072975, 0.229919, 0.427961, 0.023217, 0.0]


### References:
 - https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency/#:~:text=The%20TF%2DIDF%20of%20a,multiplying%20TF%20and%20IDF%20scores.&text=Translated%20into%20plain%20English%2C%20importance,between%20documents%20measured%20by%20IDF.

### Exercise

In [8]:
import torch
import torch.nn as nn

class LSTMLanguageModel(nn.Module):
    """Language model made of an embedding layer, a multi-layer LSTM and a
    linear projection from hidden states to vocabulary logits."""

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        """
        Initializes the LSTMLanguageModel.

        Parameters:
            - vocab_size (int): Size of the vocabulary.
            - embed_size (int): Size of the word embeddings.
            - hidden_size (int): Size of the hidden state of the LSTM.
            - num_layers (int): Number of layers in the LSTM.
        """
        super(LSTMLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h0=None):
        """
        Defines the forward pass of the LSTMLanguageModel.

        Parameters:
            - x (torch.Tensor): Input sequence of token indices (batch_size, sequence_length).
            - h0 (tuple of torch.Tensor, optional): Initial LSTM state (h_0, c_0), each of shape
              (num_layers, batch_size, hidden_size). None lets the LSTM start from zeros.

        Returns:
            - output (torch.Tensor): Unnormalized logits (batch_size, sequence_length, vocab_size).
            - hn (tuple of torch.Tensor): Final LSTM state (h_n, c_n), each of shape
              (num_layers, batch_size, hidden_size).
        """
        embedded = self.embedding(x)
        output, hn = self.lstm(embedded, h0)
        output = self.fc(output)
        return output, hn

    def generate(self, x, h0=None, no=10):
        """
        Generates a sequence of token indices using greedy decoding.

        The recurrent state returned by each step is fed into the next step,
        so every prediction is conditioned on all previously generated tokens.

        Parameters:
            - x (torch.Tensor): Input token index (batch_size, 1).
            - h0 (tuple of torch.Tensor, optional): Initial LSTM state (h_0, c_0), each of shape
              (num_layers, batch_size, hidden_size).
            - no (int): Number of tokens to be generated.

        Returns:
            - generated_sequence (torch.Tensor): Decoded sequence of token indices (batch_size, no).
              Has shape (batch_size, 0) when no <= 0.
        """
        # torch.cat cannot concatenate an empty list, so handle "nothing to
        # generate" explicitly.
        if no <= 0:
            return torch.empty((x.size(0), 0), dtype=torch.long, device=x.device)

        with torch.no_grad():
            generated_sequence = []
            current_token = x
            state = h0

            for _ in range(no):
                # Carry the state forward; reusing h0 at every step would make
                # each prediction depend on the previous token only.
                logits, state = self(current_token, state)
                # argmax over logits picks the same token as argmax over
                # softmax(logits), so the normalization is skipped.
                next_token = torch.argmax(logits[:, -1, :], dim=1, keepdim=True)
                generated_sequence.append(next_token)
                current_token = next_token

            return torch.cat(generated_sequence, dim=1)

In [11]:
class RNNLM(nn.Module):
    """Recurrent Neural Network (RNN) Language Model"""
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, rnn_type='lstm'):
        """
        Build the embedding, recurrent and output layers.

        Args:
            vocab_size (int): Number of tokens in the vocabulary
            embedding_dim (int): Size of the token embeddings
            hidden_dim (int): Size of the recurrent hidden state
            num_layers (int): Number of stacked recurrent layers
            rnn_type (str): Either 'lstm' or 'gru'

        Raises:
            ValueError: If rnn_type is neither 'lstm' nor 'gru'
        """
        super(RNNLM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if rnn_type == 'lstm':
            self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        elif rnn_type == 'gru':
            self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers, batch_first=True)
        else:
            # Fail at construction time instead of leaving self.rnn undefined
            # and crashing later inside forward()/generate().
            raise ValueError(f"Unsupported rnn_type {rnn_type!r}; expected 'lstm' or 'gru'")
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """
        Forward pass through the RNNLM model

        Args:
            x (torch.Tensor): Input sequence of token indices (integers)
                Shape: (batch_size, seq_len)

        Returns:
            logits (torch.Tensor): Unnormalized log probabilities of the next word for each token in the input sequence
                Shape: (batch_size, seq_len, vocab_size)
        """
        embedded = self.embedding(x)
        # Project the per-time-step outputs, not the final hidden state: the
        # final state is a (h, c) tuple for LSTM and is laid out as
        # (num_layers, batch_size, hidden_dim) for GRU, neither of which gives
        # one prediction per input position.
        output, _ = self.rnn(embedded)
        logits = self.linear(output)
        return logits

    def generate(self, x, h0, no):
        """
        Generate text using the greedy decoding algorithm

        Args:
            x (torch.Tensor): Input token (integer)
                Shape: (1,)
            h0 (tuple or torch.Tensor or None): Initial hidden state of the RNN;
                a (h_0, c_0) tuple for LSTM, a single tensor for GRU, or None
                to start from zeros
                Shape (each tensor): (num_layers, 1, hidden_dim)
            no (int): Desired number of tokens to be generated

        Returns:
            decoded_tokens (torch.Tensor): Sequence of token indices (integers) representing the generated text
                Shape: (no,)
        """
        decoded_tokens = torch.empty(no, dtype=torch.long, device=x.device)
        # Keep the running token as a fixed (batch=1, seq_len=1) tensor so its
        # rank does not grow from one step to the next.
        token = x.reshape(1, 1)
        state = h0
        # Decoding is inference only; avoid building an autograd graph.
        with torch.no_grad():
            for i in range(no):
                embedded = self.embedding(token)
                output, state = self.rnn(embedded, state)
                logits = self.linear(output[:, -1, :])
                token = logits.argmax(dim=1, keepdim=True)
                decoded_tokens[i] = token.squeeze()

        return decoded_tokens