In [None]:
import nltk
# NOTE(review): none of the cells visible in this notebook call an NLTK tokenizer
# (sentences are split with str.split()) — confirm whether this download is still needed.
nltk.download('punkt')  # Download the Punkt tokenizer models


In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter

# Define the NPLM architecture
class NPLM(nn.Module):
    """Feed-forward neural probabilistic language model.

    The model embeds each of the ``context_size`` context words, concatenates
    the embeddings, passes them through one tanh hidden layer and produces
    log-probabilities over the vocabulary for the next word.

    Args:
        vocab_size: Number of distinct word indices (rows of the embedding
            table and width of the output layer).
        embedding_dim: Size of each word embedding.
        context_size: Number of context words fed to the model per example.
        hidden_dim: Width of the tanh hidden layer.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(NPLM, self).__init__()
        # Embedding layer (lookup table)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # First fully connected layer followed by tanh
        self.fc1 = nn.Linear(context_size * embedding_dim, hidden_dim)

        # Output layer (softmax over the vocabulary)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, context):
        """Return next-word log-probabilities for one context or a batch.

        Args:
            context: Long tensor of word indices, either a single context of
                shape ``(context_size,)`` or a batch of shape
                ``(batch, context_size)``.

        Returns:
            Tensor of shape ``(1, vocab_size)`` for a single context, or
            ``(batch, vocab_size)`` for a batch, holding log-probabilities.
        """
        # Look up the embeddings for the context words
        embeds = self.embeddings(context)

        if context.dim() <= 1:
            # Single example: concatenate its embeddings into one row.
            embeds = embeds.reshape(1, -1)
        else:
            # Batched input: keep the leading batch dimension and concatenate
            # the embeddings of each example separately. Flattening everything
            # into a single row (as a fixed view((1, -1)) would) mixes examples
            # and no longer matches fc1's input width.
            embeds = embeds.reshape(context.size(0), self.fc1.in_features)

        # Pass through the first hidden layer with Tanh activation
        hidden = torch.tanh(self.fc1(embeds))

        # Compute output with the softmax layer
        output = self.fc2(hidden)
        return F.log_softmax(output, dim=-1)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter

# Define the NPLM architecture
class NPLM(nn.Module):
    """Feed-forward neural probabilistic language model.

    The model embeds each of the ``context_size`` context words, concatenates
    the embeddings, passes them through one tanh hidden layer and produces
    log-probabilities over the vocabulary for the next word.

    Args:
        vocab_size: Number of distinct word indices (rows of the embedding
            table and width of the output layer).
        embedding_dim: Size of each word embedding.
        context_size: Number of context words fed to the model per example.
        hidden_dim: Width of the tanh hidden layer.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(NPLM, self).__init__()
        # Embedding layer (lookup table)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # First fully connected layer followed by tanh
        self.fc1 = nn.Linear(context_size * embedding_dim, hidden_dim)

        # Output layer (softmax over the vocabulary)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, context):
        """Return next-word log-probabilities for one context or a batch.

        Args:
            context: Long tensor of word indices, either a single context of
                shape ``(context_size,)`` or a batch of shape
                ``(batch, context_size)``.

        Returns:
            Tensor of shape ``(1, vocab_size)`` for a single context, or
            ``(batch, vocab_size)`` for a batch, holding log-probabilities.
        """
        # Look up the embeddings for the context words
        embeds = self.embeddings(context)

        if context.dim() <= 1:
            # Single example: concatenate its embeddings into one row.
            embeds = embeds.reshape(1, -1)
        else:
            # Batched input: keep the leading batch dimension and concatenate
            # the embeddings of each example separately. Flattening everything
            # into a single row (as a fixed view((1, -1)) would) mixes examples
            # and no longer matches fc1's input width.
            embeds = embeds.reshape(context.size(0), self.fc1.in_features)

        # Pass through the first hidden layer with Tanh activation
        hidden = torch.tanh(self.fc1(embeds))

        # Compute output with the softmax layer
        output = self.fc2(hidden)
        return F.log_softmax(output, dim=-1)

# Load SST2 dataset from the local file (directly using words without tokenization)
# NOTE(review): this is an absolute, machine-specific path; point it at your own copy of the data.
file_path = '/home/mohammad/Safety-Driven-Self-Compressing-Neural-Networks/Neural Probablistic /data/sst2_train.txt'

# Each entry is a (label, words) tuple; the label is kept as the raw string read from the file.
data = []

# Read the file and split it into words directly.
# Expected line format: "<label>\t<sentence>".
with open(file_path, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at the end of the file) carry no example.
            continue
        # Split on the first tab only, so a stray tab inside the sentence does not break parsing.
        label, separator, sentence = line.partition('\t')
        if not separator:
            raise ValueError(
                f"{file_path}:{line_number}: expected '<label>\\t<sentence>', got {line!r}"
            )
        words = sentence.split()  # Split the sentence into words
        data.append((label, words))

# Build a vocabulary from the words
def build_vocab(data):
    """Map each distinct word in ``data`` to a consecutive integer index.

    Args:
        data: Iterable of ``(label, words)`` pairs; the label is ignored.

    Returns:
        Dict from word to index. Indices start at 0 and follow the order in
        which words are first encountered.
    """
    word_to_idx = {}
    for _, sentence_words in data:
        for token in sentence_words:
            # setdefault leaves already-seen words untouched, so the next
            # free index is always the current size of the mapping.
            word_to_idx.setdefault(token, len(word_to_idx))
    return word_to_idx

# Create a vocabulary from the SST2 dataset
# `vocab` maps every distinct word found in `data` to an integer index; its size
# later determines the embedding table and output layer dimensions of the model.
vocab = build_vocab(data)

# Create context-target pairs (3 words as context, next word as target)
def create_context_target_pairs(data, context_size):
    """Slide a window over every sentence to build (context, target) pairs.

    Args:
        data: Iterable of ``(label, words)`` pairs; the label is ignored.
        context_size: Number of consecutive words used as context.

    Returns:
        List of ``(context, target)`` tuples, where ``context`` is a list of
        ``context_size`` consecutive words and ``target`` is the word that
        immediately follows them. Sentences with no more than
        ``context_size`` words contribute nothing.
    """
    return [
        (tokens[start:start + context_size], tokens[start + context_size])
        for _, tokens in data
        for start in range(len(tokens) - context_size)
    ]

# Create the training data
# Each element is (list of 3 consecutive words, the word that follows them).
# NOTE: the literal 3 must stay equal to the `context_size` hyperparameter assigned
# further down, because the model's first linear layer is sized from that value.
train_data = create_context_target_pairs(data, context_size=3)

# Convert words to indices using the vocabulary
def words_to_indices(context, vocab):
    """Convert a sequence of words into a 1-D long tensor of vocabulary indices.

    Args:
        context: Iterable of words.
        vocab: Mapping from word to integer index.

    Returns:
        ``torch.long`` tensor with one index per word, in the same order.

    Raises:
        KeyError: If any word is not present in ``vocab``; the message lists
            every missing word instead of only the first one.
    """
    # Materialize once so generators can be both checked and converted.
    context = list(context)
    missing = [word for word in context if word not in vocab]
    if missing:
        raise KeyError(f"words not in vocabulary: {missing}")
    return torch.tensor([vocab[word] for word in context], dtype=torch.long)

# Define hyperparameters
embedding_dim = 50  # Embedding dimension
hidden_dim = 100    # Hidden layer size
context_size = 3    # Context of 3 words
vocab_size = len(vocab)
epochs = 50

# Fail fast with a clear message instead of a ZeroDivisionError when averaging the loss.
if not train_data:
    raise ValueError("train_data is empty; there is nothing to train on")

# Encode every (context, target) pair once, up front. The mapping from words to
# indices never changes between epochs, so redoing it for each pair in each
# epoch is wasted work.
context_tensor = torch.tensor(
    [[vocab[word] for word in context] for context, _ in train_data],
    dtype=torch.long,
)
target_tensor = torch.tensor([vocab[target] for _, target in train_data], dtype=torch.long)
num_pairs = len(train_data)

# The model's first linear layer is sized from context_size, so the training
# pairs must have been built with the same window length.
if context_tensor.shape[1] != context_size:
    raise ValueError(
        f"train_data contexts have {context_tensor.shape[1]} words, "
        f"but context_size is {context_size}"
    )

# Seed the RNG so the random weight initialization is reproducible across runs.
torch.manual_seed(0)

# Instantiate the model
model = NPLM(vocab_size=vocab_size, embedding_dim=embedding_dim, context_size=context_size, hidden_dim=hidden_dim)

# Define the loss function and optimizer
# NLLLoss pairs with the log-softmax output produced by the model's forward pass.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Train the NPLM model (one SGD step per context-target pair)
for epoch in range(epochs):
    total_loss = 0
    for pair_index in range(num_pairs):
        # Row of pre-computed context indices and a length-1 target tensor,
        # matching the (1, vocab_size) output of a single-context forward pass.
        context_idxs = context_tensor[pair_index]
        target_idx = target_tensor[pair_index:pair_index + 1]

        # Zero gradients
        model.zero_grad()

        # Forward pass
        log_probs = model(context_idxs)

        # Compute loss and backward pass
        loss = loss_function(log_probs, target_idx)
        loss.backward()

        # Update weights
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-pair loss every fifth epoch
    if epoch % 5 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss/num_pairs}')

# Example usage: Predict the next word given a context of 3 words
# Gradients are not needed for inference, so the forward pass runs under no_grad.
with torch.no_grad():
    test_context = ['i', 'love', 'this']  # Replace with any context from SST2
    test_context_idxs = words_to_indices(test_context, vocab)
    prediction = model(test_context_idxs)
    # Index of the highest log-probability in the (1, vocab_size) output
    predicted_word_idx = prediction.argmax(dim=-1).item()
    # Invert the word -> index mapping once instead of materializing the key and
    # value lists and scanning them linearly for the predicted index.
    idx_to_word = {idx: word for word, idx in vocab.items()}
    predicted_word = idx_to_word[predicted_word_idx]
    print(f"Given context: {' '.join(test_context)}, predicted next word: {predicted_word}")
