In [1]:
!nvidia-smi

Fri Aug 29 18:54:27 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.5     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   53C    P0              68W / 300W |  75260MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:31:00.0 Off |  

In [2]:
import os
# Expose only GPU index 1 to CUDA. This is set before the sentence_transformers / torch
# imports that appear in later cells.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
import nltk
from nltk.corpus import brown
import numpy as np
from nltk.tag import map_tag
from collections import defaultdict, Counter

In [4]:
# Fetch the two NLTK resources used by this notebook: the Brown corpus and the
# universal tagset mapping.
for _package in ('brown', 'universal_tagset'):
    nltk.download(_package)

# Extra tag symbol kept alongside the corpus tags.
start_tag = '^'

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [5]:
# Brown corpus as a sequence of sentences, each a list of (word, universal_tag) pairs.
data = brown.tagged_sents(tagset='universal')
# data = [(word.lower(),tag) for i in data for word, tag in i]
# train_size = int(len(data) * 0.8)
# train_data = data[:train_size]
# test_data = data[train_size:]

# train_data
K = 5                         # number of folds
fold_size = len(data) // K    # sentences per fold (the last fold also takes any remainder)

emission_prob_list = []
transition_prob_list = []

# Split the corpus into K contiguous, non-overlapping folds that together cover every
# sentence. Slice ends are exclusive, so each fold runs to start + fold_size; subtracting
# one from the end index would silently drop the last sentence of each fold.
data_fold_wise = []
for i in range(K):
    start = i * fold_size
    end = len(data) if i == K - 1 else start + fold_size
    data_fold_wise.append(data[start:end])


In [6]:
# Every distinct tag that occurs in the corpus.
tags = {tag for sent in data for _, tag in sent}
# Sorted tag list, with the start symbol placed after all corpus tags.
sorted_tags = sorted(tags) + [start_tag]
print(tags)
sorted_tags

{'VERB', 'CONJ', 'PRON', 'NUM', 'NOUN', 'ADJ', 'X', 'ADP', 'ADV', '.', 'PRT', 'DET'}


['.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X',
 '^']

In [7]:
# Number of entries in sorted_tags (corpus tags plus the appended start symbol).
tag_size = len(sorted_tags)

In [8]:
# Two-way lookup between a tag string and its position in sorted_tags.
tag_to_idx = dict(zip(sorted_tags, range(len(sorted_tags))))
idx_to_tag = dict(enumerate(sorted_tags))

In [9]:
# Distinct word forms across the corpus (no case folding is applied).
words = {token for sent in data for token, _ in sent}
vocab_size = len(words)
print(vocab_size)

56057


In [10]:
# Sort instead of calling list(): iteration order of a set of strings changes between
# interpreter sessions (string hashing is randomised), which would make the word <-> index
# maps built from this list differ on every kernel restart.
words = sorted(words)

In [11]:
# Two-way lookup between a word and its position in the `words` list.
word_to_idx = dict(zip(words, range(len(words))))
idx_to_word = dict(enumerate(words))

In [12]:
# Unzip each tagged sentence into two parallel lists: its tokens and its tags.
sent_by_word, sent_by_tag = [], []
for tagged_sent in data:
    sent_by_word.append([token for token, _ in tagged_sent])
    sent_by_tag.append([label for _, label in tagged_sent])
print(len(sent_by_word))
print(len(sent_by_tag))

57340
57340


In [13]:
# Sanity check: number of sentences in `data`.
print(len(data))

57340


In [14]:
# !pip install sentence-transformers

In [15]:
# pip install sentence-transformers
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the sentence-embedding model once at module level so that every Embedding() /
# Encode() call reuses it (the model is downloaded the first time this runs).
_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# NOTE: this rebinds vocab_size, which previously held the distinct-word count, to 384 —
# the same value that is assigned to embedding_dim in a later cell.
vocab_size = 384

In [17]:
# Inputs are the token lists; targets are the aligned tag lists.
data_X, data_y = sent_by_word, sent_by_tag

In [18]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 sentence-level split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data_X,
    data_y,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

In [21]:
##### Helper Functions #####
def Encode(text, vocab_size):
    """Embed `text` and return it as a float32 column tensor on `device`.

    Args:
        text: value handed to `_model.encode`.
        vocab_size: accepted for signature compatibility; not read by this function.

    Returns:
        torch.Tensor of shape (N, 1), where N is the total number of values the encoder
        produced, placed on the module-level `device`.

    Relies on the module-level names `_model`, `torch` and `device` being defined by the
    time the function is called.
    """
    raw = _model.encode(text, normalize_embeddings=True)
    as_float = (
        torch.from_numpy(raw).float()
        if isinstance(raw, np.ndarray)
        else torch.tensor(raw, dtype=torch.float32)
    )
    # Flatten into a single column, then move to the target device.
    return as_float.reshape(-1, 1).to(device)

def Embedding(text: str) -> np.ndarray:
    """Return the normalized embedding that the shared `_model` produces for `text`.

    The comments elsewhere in this notebook state the vector is 384-dimensional.
    """
    vector = _model.encode(text, normalize_embeddings=True)
    return vector

# Xavier (Glorot) uniform initialization
def initWeights(input_size, output_size):
    """Draw an (output_size, input_size) weight matrix from a scaled uniform distribution.

    Entries are sampled from U(-1, 1) with NumPy's global RNG and multiplied by
    sqrt(6 / (input_size + output_size)).
    """
    scale = np.sqrt(6 / (input_size + output_size))
    unit_draws = np.random.uniform(-1, 1, (output_size, input_size))
    return unit_draws * scale

##### Activation Functions #####
def sigmoid(input, derivative = False):
    """Logistic function 1 / (1 + exp(-input)).

    With ``derivative=True`` the function instead returns ``input * (1 - input)``, which
    is the sigmoid derivative when `input` already holds sigmoid outputs.
    """
    if not derivative:
        return 1 / (1 + np.exp(-input))

    return input * (1 - input)

def tanh(input, derivative = False):
    """Hyperbolic tangent of `input`.

    With ``derivative=True`` the function instead returns ``1 - input ** 2``, which is
    the tanh derivative when `input` already holds tanh outputs.
    """
    if not derivative:
        return np.tanh(input)

    return 1 - input ** 2

def softmax(input):
    """Softmax over all elements of `input`.

    Args:
        input: array-like of scores.

    Returns:
        Array of the same shape whose entries are exp(score) normalised to sum to 1
        across the whole array.

    The maximum score is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps exp() from overflowing to inf (and the ratio
    from becoming NaN) when scores are large.
    """
    scores = np.asarray(input)
    if scores.size == 0:
        # Nothing to normalise; return an empty float array.
        return np.exp(scores)
    exps = np.exp(scores - np.max(scores))
    return exps / np.sum(exps)

In [25]:
##### Imports #####
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import numpy as np
from tqdm import tqdm

##### Assume you already have these #####
# train_X = list of sentences (each sentence = list of tokens)
# train_y = list of tags (each tag sequence aligned with train_X)
# vocab_size = 384 at this point (rebound after the vocabulary count was printed); not referenced by the code in this cell
# tag_size = number of possible tags
# word_to_idx, tag_to_idx, idx_to_tag dictionaries
# Embedding(word) -> returns np.array of shape (embedding_dim,)

##### Hyperparameters #####
hidden_size = 32       # hidden dimension passed to LSTMTagger
num_epochs = 10        # module-level value; train() has its own num_epochs argument
learning_rate = 0.05   # used by the module-level SGD optimizer created below
embedding_dim = 384    # input dimension of the tagger; Embedding() is documented as 384-dim
PAD_IDX = -1           # not really needed since no padding

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##### Model #####
class LSTMTagger(nn.Module):
    """Single-layer batch-first LSTM followed by a linear layer that scores each tag.

    Args:
        input_dim: size of each input vector.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of scores produced per time step.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names `lstm` and `fc` determine the state_dict keys; keep them stable.
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map inputs [batch, seq_len, input_dim] to logits [batch, seq_len, output_dim]."""
        hidden_states, _final_state = self.lstm(x)
        return self.fc(hidden_states)

##### Weight Initialization #####
def init_lstm_weights(lstm):
    """Re-initialise the parameters of `lstm` in place, chosen by parameter name.

    - names containing "weight_ih": Xavier uniform
    - names containing "weight_hh": orthogonal
    - names containing "bias": zeros
    Parameters matching none of these are left untouched.
    """
    for param_name, tensor in lstm.named_parameters():
        if "weight_ih" in param_name:
            init.xavier_uniform_(tensor.data)
            continue
        if "weight_hh" in param_name:
            init.orthogonal_(tensor.data)
            continue
        if "bias" in param_name:
            tensor.data.zero_()

# Build the tagger on the selected device, then re-initialise its weights.
model = LSTMTagger(embedding_dim, hidden_size, tag_size).to(device)
init_lstm_weights(model.lstm)
init.xavier_uniform_(model.fc.weight)  # fc.bias is not touched here

##### Loss & Optimizer #####
# NOTE: train() below constructs its own criterion and optimizer, so these module-level
# objects are not the ones used inside train().
criterion = nn.CrossEntropyLoss(reduction="sum")
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

def train(model, train_X, train_y, num_epochs=10, learning_rate=0.05, tag_size=None, device=None):
    """
    Train an LSTM tagger with stochastic gradient descent (sentence-by-sentence).

    Each sentence is embedded word by word with `Embedding`, run through the model as a
    batch of one, and followed immediately by an SGD step on the summed cross-entropy
    loss over its tokens. The average per-sentence loss is printed after every epoch.

    Args:
        model: LSTMTagger instance
        train_X: list of sentences (list of tokens)
        train_y: list of tag sequences aligned with train_X
        num_epochs: number of training epochs
        learning_rate: learning rate for SGD
        tag_size: number of possible tags; when None it is taken from the last
            dimension of the model output
        device: torch device (cpu or cuda); auto-detected when None

    Returns:
        None. The model is updated in place.

    Relies on the module-level names `Embedding`, `tag_to_idx` and `tqdm`.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    criterion = nn.CrossEntropyLoss(reduction="sum")
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for i in tqdm(range(len(train_X)), desc=f"Epoch {epoch+1}/{num_epochs}"):
            sent = train_X[i]
            tags = train_y[i]

            # An empty sentence has no tokens to embed and would produce a malformed
            # input tensor; it contributes nothing to the loss, so skip it.
            if len(sent) == 0:
                continue

            # Convert sentence and tags to tensors
            X_tensor = torch.tensor(np.array([Embedding(w) for w in sent]), dtype=torch.float32).unsqueeze(0).to(device)
            y_tensor = torch.tensor([tag_to_idx[t] for t in tags], dtype=torch.long).unsqueeze(0).to(device)

            optimizer.zero_grad()
            outputs = model(X_tensor)  # [1, seq_len, tag_size]
            # Fall back to the model's own output width so the default tag_size=None works.
            n_classes = outputs.size(-1) if tag_size is None else tag_size
            loss = criterion(outputs.reshape(-1, n_classes), y_tensor.reshape(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Guard against an empty training set instead of dividing by zero.
        avg_loss = total_loss / len(train_X) if len(train_X) else 0.0
        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

##### Testing #####
def test(model, X, y):
    model.eval()
    correct, total = 0, 0
    all_preds = []  # store predictions sentence-wise
    
    with torch.no_grad():
        for sent, tags in zip(X, y):
            X_tensor = torch.tensor(
                np.array([Embedding(w) for w in sent]),
                dtype=torch.float32
            ).unsqueeze(0).to(device)
            
            outputs = model(X_tensor)  # [1, seq_len, tag_size]
            preds = torch.argmax(outputs, dim=-1).squeeze(0).cpu().numpy()
            
            # Convert indices to tags
            pred_tags = [idx_to_tag[p] for p in preds]
            all_preds.append(pred_tags)
            
            for p_tag, t in zip(pred_tags, tags):
                if p_tag == t:
                    correct += 1
                total += 1
    
    accuracy = 100 * correct / total
    print(f"Accuracy: {accuracy:.2f}%")
    return all_preds

# train(model, X_train, y_train, num_epochs=5, learning_rate=0.05, tag_size=tag_size, device=device)
# test(model, X_test, y_test)

In [26]:
import torch

# ##### Option 1: Load full model #####
# loaded_model = torch.load("pytorch_lstm_model_state.pth")
# loaded_model.to(device)
# # Use test function separately
# test(loaded_model, X_test[:10], y_test[:10])

##### Option 2: Load state dict (recommended) #####
# Recreate model instance first (same constructor arguments as the saved model)
loaded_model2 = LSTMTagger(embedding_dim, hidden_size, tag_size).to(device)

# map_location remaps the stored tensors onto `device`, so a checkpoint written on a GPU
# still loads on a CPU-only machine or under a different GPU index.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted
# source (recent PyTorch versions offer weights_only=True — confirm the installed version).
state_dict = torch.load("complete_pytorch_lstm_model_state.pth", map_location=device)
loaded_model2.load_state_dict(state_dict)

# Evaluate on the first five held-out sentences only.
y_pred = test(loaded_model2, X_test[:5], y_test[:5])

Accuracy: 96.55%


In [27]:
# Show the predicted tag sequences returned by test() for the evaluated sentences.
y_pred

[['ADJ', 'NOUN', 'NOUN'],
 ['CONJ', 'PRON', 'VERB', 'PRON', 'VERB', 'NOUN', 'NOUN', '.'],
 ['NOUN',
  'VERB',
  'DET',
  'NOUN',
  'ADP',
  'DET',
  'NOUN',
  'ADP',
  'DET',
  'NOUN',
  '.'],
 ['DET',
  'NOUN',
  'VERB',
  'PRON',
  'PRT',
  'VERB',
  'DET',
  'ADJ',
  'NOUN',
  'ADP',
  'ADV',
  'ADJ',
  'DET',
  'NOUN',
  'ADP',
  'DET',
  'NOUN',
  'VERB',
  'VERB',
  '.'],
 ['DET',
  'NOUN',
  '.',
  'VERB',
  'NOUN',
  '.',
  '.',
  'ADP',
  'DET',
  'NOUN',
  'VERB',
  'VERB',
  'ADP',
  'NOUN',
  '.',
  '.']]