In [9]:
import math
import torch
import torch.nn as nn

### Exercise 2. (B) I.

In [12]:
def bag_of_words(sentence, vocabulary):
    # Initialize a vector with zeros for each word in the vocabulary
    bag_of_words_vector = [0] * len(vocabulary)

    # Tokenize the sentence into words
    words = sentence.split()

    # Count the frequency of each word in the sentence
    for word in words:
        if word in vocabulary:
            index = vocabulary.index(word)
            bag_of_words_vector[index] += 1

    return bag_of_words_vector

vocabulary = ['and', 'apple', 'banana', 'eat', 'hate', 'I', 'pie', 'strawberry', 'the', 'they']

input_sentence = "You and I eat the strawberry pie"

result = bag_of_words(input_sentence, vocabulary)


result

[1, 0, 0, 1, 0, 1, 1, 1, 1, 0]

### Exercise 2. (B) II.

In [13]:
def tf(term, document):
    term_count = document.count(term)
    total_terms = len(document)

    return term_count / total_terms if total_terms > 0 else 0.0

def idf(term, document_frequency, total_documents):
    term_document_frequency = document_frequency.get(term, 0)

    return math.log(total_documents / (term_document_frequency + 1)) + 1

def tfidf(term, document, document_frequency, total_documents):

    return tf(term, document) * idf(term, document_frequency, total_documents)

def compute_tfidf_representation(sentence, vocabulary, document_frequency, total_documents):
    words = sentence.split()
    tfidf_vector = [tfidf(word, words, document_frequency, total_documents) for word in vocabulary]

    return tfidf_vector

vocabulary = ['and', 'apple', 'banana', 'eat', 'hate', 'I', 'pie', 'strawberry', 'the', 'they']
document_frequency = {'and': 90, 'apple': 30, 'banana': 15, 'eat': 40, 'hate': 10, 'I': 60, 'pie': 20, 'strawberry': 5, 'the': 85, 'they': 30}
total_documents = 100


input_sentence = "You and I eat the strawberry pie"


tfidf_result = compute_tfidf_representation(input_sentence, vocabulary, document_frequency, total_documents)

tfidf_result

[0.15633009706732018,
 0.0,
 0.0,
 0.2702283027548262,
 0.0,
 0.21347090311639716,
 0.3658068211806669,
 0.544772959537148,
 0.16440326996208335,
 0.0]

In [7]:
vocabulary = ["and", "apple", "banana", "eat", "hate", "I", "pie", "strawberry", "the", "they"]

document_counts = [90, 30, 15, 40, 10, 60, 20, 5, 85, 30]

total_documents = 100

sentence = "You and I eat the strawberry pie"

# Tokenize the sentence into words
words = sentence.lower().split()

# Compute TF-IDF representation
tfidf_representation = []

for term in vocabulary:
    # Compute TF (Term Frequency)
    tf = words.count(term) / len(words) if len(words) > 0 else 0

    # Compute IDF (Inverse Document Frequency)
    idf = math.log(total_documents / document_counts[vocabulary.index(term)])

    # Compute TF-IDF
    tfidf = tf * idf

    # Append TF-IDF value to the representation
    tfidf_representation.append(tfidf)

print("TF-IDF representation:", tfidf_representation)

TF-IDF representation: [0.015051502236832335, 0.0, 0.0, 0.13089867598202215, 0.0, 0.0, 0.22991970177630003, 0.4279617533648558, 0.02321698992825356, 0.0]


References
 - https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency/#:~:text=The%20TF%2DIDF%20of%20a,multiplying%20TF%20and%20IDF%20scores.&text=Translated%20into%20plain%20English%2C%20importance,between%20documents%20measured%20by%20IDF.

In [8]:
import torch
import torch.nn as nn

class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        """
        Initializes the LSTMLanguageModel.

        Parameters:
            - vocab_size (int): Size of the vocabulary.
            - embed_size (int): Size of the word embeddings.
            - hidden_size (int): Size of the hidden state of the LSTM.
            - num_layers (int): Number of layers in the LSTM.
        """
        super(LSTMLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h0=None):
        """
        Defines the forward pass of the LSTMLanguageModel.

        Parameters:
            - x (torch.Tensor): Input sequence of token indices (batch_size, sequence_length).
            - h0 (torch.Tensor, optional): Initial hidden state (num_layers * num_directions, batch_size, hidden_size).

        Returns:
            - output (torch.Tensor): Output sequence from the language model (batch_size, sequence_length, vocab_size).
            - hn (torch.Tensor): Final hidden state (num_layers * num_directions, batch_size, hidden_size).
        """
        embedded = self.embedding(x)
        output, hn = self.lstm(embedded, h0)
        output = self.fc(output)
        return output, hn

    def generate(self, x, h0=None, no=10):
        """
        Generates a sequence of token indices using greedy decoding.

        Parameters:
            - x (torch.Tensor): Input token index (batch_size, 1).
            - h0 (torch.Tensor, optional): Initial hidden state (num_layers * num_directions, batch_size, hidden_size).
            - no (int): Number of tokens to be generated.

        Returns:
            - generated_sequence (torch.Tensor): Decoded sequence of token indices (batch_size, no).
        """
        with torch.no_grad():
            generated_sequence = []
            current_token = x

            for _ in range(no):
                output, hn = self.forward(current_token, h0)
                probabilities = nn.functional.softmax(output[:, -1, :], dim=1)
                next_token = torch.argmax(probabilities, dim=1, keepdim=True)
                generated_sequence.append(next_token)
                current_token = next_token

            generated_sequence = torch.cat(generated_sequence, dim=1)
            return generated_sequence