# **Data Preparation (make_batch)**

In [1]:


import torch
import torch.nn as nn
import torch.optim as optim

def make_batch():
    """
    Build next-word training pairs from the module-level ``sentences``.

    Each sentence is split on whitespace; every token except the last is
    mapped through the module-level ``word_dict`` to form the context ids,
    and the last token's id becomes the target.

    Returns a tuple ``(input_batch, target_batch)``: a list of context-id
    lists and a list of target ids, one entry per sentence.
    """
    # One pass per sentence: context ids are looked up before the target id.
    pairs = [
        ([word_dict[token] for token in tokens[:-1]], word_dict[tokens[-1]])
        for tokens in (sentence.split() for sentence in sentences)
    ]

    input_batch = [context_ids for context_ids, _ in pairs]
    target_batch = [target_id for _, target_id in pairs]
    return input_batch, target_batch


# **Model Definition (NNLM)**

In [2]:


class NNLM(nn.Module):
    """
    Feed-forward language model over a fixed-size context window.

    The logits are the sum of three terms: an output bias ``b``, a direct
    linear map ``W`` of the flattened context embeddings, and a linear map
    ``U`` of a tanh hidden layer (``H`` plus bias ``d``) over the same
    flattened embeddings.

    Sizes come from the module-level names ``n_class``, ``m``, ``n_step``
    and ``n_hidden``, which must be defined before instantiation.
    """

    def __init__(self):
        super().__init__()
        flat_dim = n_step * m  # width of the concatenated context embeddings

        # Token id -> m-dimensional embedding vector.
        self.C = nn.Embedding(n_class, m)
        # Hidden path: flattened embeddings -> hidden features (bias kept separately in d).
        self.H = nn.Linear(flat_dim, n_hidden, bias=False)
        # Hidden-layer bias, initialised to ones.
        self.d = nn.Parameter(torch.ones(n_hidden))
        # Hidden features -> vocabulary logits.
        self.U = nn.Linear(n_hidden, n_class, bias=False)
        # Direct path: flattened embeddings -> vocabulary logits.
        self.W = nn.Linear(flat_dim, n_class, bias=False)
        # Output bias, initialised to ones.
        self.b = nn.Parameter(torch.ones(n_class))

    def forward(self, X):
        """Return vocabulary logits of shape [rows, n_class] for token ids X."""
        embedded = self.C(X)                       # [batch_size, n_step, m] for X of [batch_size, n_step]
        flat = embedded.view(-1, n_step * m)       # [batch_size, n_step*m]
        hidden = (self.H(flat) + self.d).tanh()    # non-linear hidden features
        direct_logits = self.W(flat)               # direct embedding -> logits path
        hidden_logits = self.U(hidden)             # hidden -> logits path
        return self.b + direct_logits + hidden_logits




# **Training & Prediction**

In [3]:
# === Cell:  ===

if __name__ == '__main__':
    # Hyperparameters. These are module-level names on purpose: NNLM and
    # make_batch read them (and the vocabulary below) as globals.
    n_step = 2        # Sequence length (context window)
    n_hidden = 2      # Hidden layer size
    m = 2             # Embedding dimension

    # Example sentences; each must contain exactly n_step + 1 words so the
    # contexts stack into a rectangular tensor.
    sentences = ["i like dog", "i love coffee", "i hate milk"]

    # Build vocabulary mappings. sorted() pins the token -> index assignment;
    # iterating a bare set of strings can yield a different order on every
    # interpreter run because of string hash randomization.
    word_list = sorted(set(" ".join(sentences).split()))
    word_dict = {w: i for i, w in enumerate(word_list)}    # token → index
    number_dict = {i: w for i, w in enumerate(word_list)}  # index → token
    n_class = len(word_dict)                               # vocabulary size

    # Instantiate model, loss, and optimizer
    model = NNLM()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Prepare training batches
    input_batch, target_batch = make_batch()
    input_batch = torch.LongTensor(input_batch)    # shape: [batch_size, n_step]
    target_batch = torch.LongTensor(target_batch)  # shape: [batch_size]

    # Training loop (full-batch: every step sees all sentences)
    for epoch in range(5000):
        optimizer.zero_grad()
        output = model(input_batch)                 # logits over vocabulary
        loss = criterion(output, target_batch)      # compute cross‑entropy loss

        # Log every 1000 epochs
        if (epoch + 1) % 1000 == 0:
            print(f'Epoch: {epoch+1:04d}, Loss = {loss.item():.6f}')

        loss.backward()
        optimizer.step()

    # Prediction: pick highest‑scoring word for each input. No gradients are
    # needed here, so skip building the autograd graph.
    with torch.no_grad():
        _, predicted_indices = model(input_batch).max(dim=1)

    # Display results; the context shown is the same n_step-word window the
    # model was fed, rather than a hard-coded width.
    contexts = [sen.split()[:n_step] for sen in sentences]
    predictions = [number_dict[idx.item()] for idx in predicted_indices]
    print(contexts, '->', predictions)


Epoch: 1000, Loss = 0.057152
Epoch: 2000, Loss = 0.010861
Epoch: 3000, Loss = 0.003912
Epoch: 4000, Loss = 0.001764
Epoch: 5000, Loss = 0.000885
[['i', 'like'], ['i', 'love'], ['i', 'hate']] -> ['dog', 'coffee', 'milk']
