In [1]:
pip install torch numpy scikit-learn



In [3]:
"""
Deep BiLSTM POS Tagging for Morphologically Rich Languages
Supports Hindi and English from Universal Dependencies dataset
Google Colab Optimized Version
"""

# Install required packages (uncomment if needed in Colab)
# !pip install torch scikit-learn -q

import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter, defaultdict
import subprocess
from sklearn.metrics import classification_report, accuracy_score
import pickle

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Check if running in Colab
try:
    import google.colab
    IN_COLAB = True
    print("Running in Google Colab")
except:
    IN_COLAB = False
    print("Running locally")

class ConllDataset:
    """Parse and load CoNLL-U format files"""

    def __init__(self, file_path):
        self.sentences = []
        self.load_data(file_path)

    def load_data(self, file_path):
        """Load sentences and POS tags from CoNLL-U file"""
        sentence = []

        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()

                # Skip comments and empty lines
                if line.startswith('#'):
                    continue

                if not line:
                    if sentence:
                        self.sentences.append(sentence)
                        sentence = []
                    continue

                # Parse CoNLL-U format
                parts = line.split('\t')
                if len(parts) >= 4 and '-' not in parts[0] and '.' not in parts[0]:
                    word = parts[1]
                    pos_tag = parts[3]  # UPOS tag
                    sentence.append((word, pos_tag))

            # Add last sentence if exists
            if sentence:
                self.sentences.append(sentence)

    def get_sentences(self):
        return self.sentences


class Vocabulary:
    """Build vocabulary for words and POS tags"""

    def __init__(self, min_freq=1):
        self.min_freq = min_freq
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        self.idx2word = {0: '<PAD>', 1: '<UNK>'}
        self.tag2idx = {'<PAD>': 0}
        self.idx2tag = {0: '<PAD>'}

    def build_vocab(self, sentences):
        """Build vocabulary from sentences"""
        word_counts = Counter()
        tag_counts = Counter()

        for sentence in sentences:
            for word, tag in sentence:
                word_counts[word.lower()] += 1
                tag_counts[tag] += 1

        # Add words above min_freq threshold
        for word, count in word_counts.items():
            if count >= self.min_freq:
                idx = len(self.word2idx)
                self.word2idx[word] = idx
                self.idx2word[idx] = word

        # Add all tags
        for tag in tag_counts:
            if tag not in self.tag2idx:
                idx = len(self.tag2idx)
                self.tag2idx[tag] = idx
                self.idx2tag[idx] = tag

    def encode_sentence(self, sentence):
        """Convert sentence to indices"""
        words = [self.word2idx.get(word.lower(), self.word2idx['<UNK>'])
                 for word, _ in sentence]
        tags = [self.tag2idx[tag] for _, tag in sentence]
        return words, tags


class POSDataset(Dataset):
    """PyTorch Dataset for POS tagging"""

    def __init__(self, sentences, vocab):
        self.sentences = sentences
        self.vocab = vocab
        self.encoded_data = [vocab.encode_sentence(sent) for sent in sentences]

    def __len__(self):
        return len(self.encoded_data)

    def __getitem__(self, idx):
        words, tags = self.encoded_data[idx]
        return torch.tensor(words, dtype=torch.long), torch.tensor(tags, dtype=torch.long)


def collate_fn(batch):
    """Collate function for DataLoader with padding"""
    words, tags = zip(*batch)

    # Get lengths
    lengths = [len(w) for w in words]
    max_len = max(lengths)

    # Pad sequences
    padded_words = torch.zeros(len(words), max_len, dtype=torch.long)
    padded_tags = torch.zeros(len(tags), max_len, dtype=torch.long)

    for i, (w, t) in enumerate(zip(words, tags)):
        padded_words[i, :len(w)] = w
        padded_tags[i, :len(t)] = t

    return padded_words, padded_tags, torch.tensor(lengths)


class DeepBiLSTMPOSTagger(nn.Module):
    """Deep Bidirectional LSTM for POS Tagging"""

    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 num_layers, tagset_size, dropout=0.5):
        super(DeepBiLSTMPOSTagger, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )

        self.dropout = nn.Dropout(dropout)

        # Output layer (bidirectional so hidden_dim * 2)
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, x, lengths):
        # Embedding
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        # Pack padded sequence
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        # LSTM
        lstm_out, _ = self.lstm(packed)

        # Unpack
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)

        # Apply dropout
        lstm_out = self.dropout(lstm_out)

        # Output layer
        output = self.fc(lstm_out)

        return output


class POSTaggerTrainer:
    """Trainer class for POS Tagger"""

    def __init__(self, model, device, vocab):
        self.model = model
        self.device = device
        self.vocab = vocab
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)
        self.optimizer = optim.Adam(model.parameters(), lr=0.001)

    def train_epoch(self, dataloader):
        self.model.train()
        total_loss = 0

        for words, tags, lengths in dataloader:
            words = words.to(self.device)
            tags = tags.to(self.device)
            lengths = lengths.to(self.device)

            # Forward pass
            self.optimizer.zero_grad()
            outputs = self.model(words, lengths)

            # Reshape for loss calculation
            outputs = outputs.view(-1, outputs.shape[-1])
            tags = tags.view(-1)

            # Calculate loss and backpropagate
            loss = self.criterion(outputs, tags)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)

            self.optimizer.step()

            total_loss += loss.item()

        return total_loss / len(dataloader)

    def evaluate(self, dataloader):
        self.model.eval()
        all_predictions = []
        all_targets = []

        with torch.no_grad():
            for words, tags, lengths in dataloader:
                words = words.to(self.device)
                tags = tags.to(self.device)
                lengths = lengths.to(self.device)

                outputs = self.model(words, lengths)
                predictions = torch.argmax(outputs, dim=-1)

                # Get non-padded predictions
                for i, length in enumerate(lengths):
                    pred = predictions[i, :length].cpu().numpy()
                    target = tags[i, :length].cpu().numpy()
                    all_predictions.extend(pred)
                    all_targets.extend(target)

        accuracy = accuracy_score(all_targets, all_predictions)
        return accuracy, all_predictions, all_targets


def download_dataset(repo_url, target_dir):
    """Download Universal Dependencies dataset"""
    if not os.path.exists(target_dir):
        print(f"Downloading {repo_url}...")
        subprocess.run(['git', 'clone', repo_url, target_dir], check=True)
        print(f"Downloaded to {target_dir}")
    else:
        print(f"{target_dir} already exists, skipping download.")


def get_conllu_files(directory):
    """Find CoNLL-U files in directory"""
    conllu_files = {}
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.conllu'):
                full_path = os.path.join(root, file)
                if 'train' in file.lower():
                    conllu_files['train'] = full_path
                elif 'dev' in file.lower():
                    conllu_files['dev'] = full_path
                elif 'test' in file.lower():
                    conllu_files['test'] = full_path
    return conllu_files


def main():
    # Configuration
    EMBEDDING_DIM = 128
    HIDDEN_DIM = 256
    NUM_LAYERS = 3
    DROPOUT = 0.5
    BATCH_SIZE = 32
    EPOCHS = 20
    MIN_FREQ = 2

    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"Memory Available: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

    # Language selection
    print("\nSelect language:")
    print("1. Hindi")
    print("2. English")
    print("3. Both")

    # Auto-select option 3 (Both) if in Colab for demo purposes
    # You can change this to get user input
    if IN_COLAB:
        print("\n[Auto-selecting Both languages for Colab demo]")
        choice = '3'
    else:
        choice = input("Enter choice (1/2/3): ").strip()

    languages = []
    if choice == '1':
        languages = [('Hindi', 'UD_Hindi-HDTB',
                     'https://github.com/UniversalDependencies/UD_Hindi-HDTB.git')]
    elif choice == '2':
        languages = [('English', 'UD_English-GUM',
                     'https://github.com/UniversalDependencies/UD_English-GUM.git')]
    else:
        languages = [
            ('Hindi', 'UD_Hindi-HDTB',
             'https://github.com/UniversalDependencies/UD_Hindi-HDTB.git'),
            ('English', 'UD_English-GUM',
             'https://github.com/UniversalDependencies/UD_English-GUM.git')
        ]

    # Process each language
    for lang_name, lang_dir, repo_url in languages:
        print(f"\n{'='*60}")
        print(f"Processing {lang_name}")
        print('='*60)

        # Download dataset
        download_dataset(repo_url, lang_dir)

        # Find CoNLL-U files
        files = get_conllu_files(lang_dir)
        if not files:
            print(f"No CoNLL-U files found in {lang_dir}")
            continue

        print(f"\nFound files:")
        for split, path in files.items():
            print(f"  {split}: {path}")

        # Load datasets
        print("\nLoading datasets...")
        train_dataset = ConllDataset(files['train'])
        dev_dataset = ConllDataset(files.get('dev', files['train']))
        test_dataset = ConllDataset(files.get('test', files.get('dev', files['train'])))

        train_sentences = train_dataset.get_sentences()
        dev_sentences = dev_dataset.get_sentences()
        test_sentences = test_dataset.get_sentences()

        print(f"Train sentences: {len(train_sentences)}")
        print(f"Dev sentences: {len(dev_sentences)}")
        print(f"Test sentences: {len(test_sentences)}")

        # Build vocabulary
        print("\nBuilding vocabulary...")
        vocab = Vocabulary(min_freq=MIN_FREQ)
        vocab.build_vocab(train_sentences)

        print(f"Vocabulary size: {len(vocab.word2idx)}")
        print(f"Number of POS tags: {len(vocab.tag2idx)}")
        print(f"POS tags: {list(vocab.tag2idx.keys())}")

        # Create PyTorch datasets
        train_data = POSDataset(train_sentences, vocab)
        dev_data = POSDataset(dev_sentences, vocab)
        test_data = POSDataset(test_sentences, vocab)

        # Create dataloaders
        train_loader = DataLoader(train_data, batch_size=BATCH_SIZE,
                                 shuffle=True, collate_fn=collate_fn)
        dev_loader = DataLoader(dev_data, batch_size=BATCH_SIZE,
                               shuffle=False, collate_fn=collate_fn)
        test_loader = DataLoader(test_data, batch_size=BATCH_SIZE,
                                shuffle=False, collate_fn=collate_fn)

        # Create model
        print("\nInitializing model...")
        model = DeepBiLSTMPOSTagger(
            vocab_size=len(vocab.word2idx),
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            num_layers=NUM_LAYERS,
            tagset_size=len(vocab.tag2idx),
            dropout=DROPOUT
        ).to(device)

        print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

        # Create trainer
        trainer = POSTaggerTrainer(model, device, vocab)

        # Training loop
        print("\nStarting training...")
        best_dev_acc = 0.0

        for epoch in range(EPOCHS):
            train_loss = trainer.train_epoch(train_loader)
            dev_acc, _, _ = trainer.evaluate(dev_loader)

            print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {train_loss:.4f} - Dev Acc: {dev_acc:.4f}")

            if dev_acc > best_dev_acc:
                best_dev_acc = dev_acc
                # Save best model
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'vocab': vocab,
                    'config': {
                        'embedding_dim': EMBEDDING_DIM,
                        'hidden_dim': HIDDEN_DIM,
                        'num_layers': NUM_LAYERS,
                        'dropout': DROPOUT
                    }
                }, f'{lang_name.lower()}_best_model.pt')
                print(f"  → New best model saved!")

        # Final evaluation on test set
        print("\nEvaluating on test set...")
        test_acc, predictions, targets = trainer.evaluate(test_loader)
        print(f"Test Accuracy: {test_acc:.4f}")

        # Detailed classification report
        # Get unique labels in the test set (excluding padding)
        filtered_targets = [t for t in targets if t != 0]
        filtered_predictions = [p for p, t in zip(predictions, targets) if t != 0]

        unique_labels = sorted(list(set(filtered_targets)))
        tag_names = [vocab.idx2tag[i] for i in unique_labels]

        print("\nClassification Report:")
        print(classification_report(filtered_targets, filtered_predictions,
                                   labels=unique_labels,
                                   target_names=tag_names,
                                   zero_division=0))

        # Example predictions
        print("\nExample Predictions:")
        example_sent = test_sentences[0]
        words, tags = vocab.encode_sentence(example_sent)
        words_tensor = torch.tensor([words], dtype=torch.long).to(device)
        lengths = torch.tensor([len(words)]).to(device)

        model.eval()
        with torch.no_grad():
            output = model(words_tensor, lengths)
            pred_tags = torch.argmax(output, dim=-1).squeeze().cpu().numpy()

        print("\nWord\t\tTrue POS\tPredicted POS")
        print("-" * 50)
        for (word, true_tag), pred_idx in zip(example_sent, pred_tags):
            pred_tag = vocab.idx2tag[pred_idx]
            print(f"{word:15}\t{true_tag:10}\t{pred_tag:10}")

    print("\n" + "="*60)
    print("Training completed!")
    print("="*60)


if __name__ == "__main__":
    main()

Running in Google Colab
Using device: cuda
GPU: Tesla T4
Memory Available: 15.83 GB

Select language:
1. Hindi
2. English
3. Both

[Auto-selecting Both languages for Colab demo]

Processing Hindi
UD_Hindi-HDTB already exists, skipping download.

Found files:
  dev: UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu
  test: UD_Hindi-HDTB/hi_hdtb-ud-test.conllu
  train: UD_Hindi-HDTB/hi_hdtb-ud-train.conllu

Loading datasets...
Train sentences: 13306
Dev sentences: 1659
Test sentences: 1684

Building vocabulary...
Vocabulary size: 9629
Number of POS tags: 17
POS tags: ['<PAD>', 'DET', 'PROPN', 'ADP', 'ADV', 'ADJ', 'NOUN', 'NUM', 'AUX', 'PUNCT', 'PRON', 'VERB', 'CCONJ', 'PART', 'SCONJ', 'X', 'INTJ']

Initializing model...
Model parameters: 5,185,681

Starting training...
Epoch 1/20 - Loss: 0.9156 - Dev Acc: 0.8437
  → New best model saved!
Epoch 2/20 - Loss: 0.4786 - Dev Acc: 0.8873
  → New best model saved!
Epoch 3/20 - Loss: 0.3757 - Dev Acc: 0.9124
  → New best model saved!
Epoch 4/20 - Loss: 0.3209 

In [5]:
"""
Enhanced Deep BiLSTM POS Tagging with Multiple Improvements
- Character-level embeddings
- Pretrained word embeddings (FastText/Word2Vec)
- CRF layer for structured prediction
- Advanced regularization techniques
- Data augmentation
"""

# Install additional packages
# !pip install torch scikit-learn gensim -q

import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter, defaultdict
import subprocess
from sklearn.metrics import classification_report, accuracy_score
import pickle
import random

# Set random seeds
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Check if running in Colab
try:
    import google.colab
    IN_COLAB = True
    print("Running in Google Colab")
except:
    IN_COLAB = False
    print("Running locally")


class ConllDataset:
    """Parse and load CoNLL-U format files"""

    def __init__(self, file_path):
        self.sentences = []
        self.load_data(file_path)

    def load_data(self, file_path):
        """Load sentences and POS tags from CoNLL-U file"""
        sentence = []

        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()

                if line.startswith('#'):
                    continue

                if not line:
                    if sentence:
                        self.sentences.append(sentence)
                        sentence = []
                    continue

                parts = line.split('\t')
                if len(parts) >= 4 and '-' not in parts[0] and '.' not in parts[0]:
                    word = parts[1]
                    pos_tag = parts[3]
                    sentence.append((word, pos_tag))

            if sentence:
                self.sentences.append(sentence)

    def get_sentences(self):
        return self.sentences


class Vocabulary:
    """Enhanced vocabulary with character-level support"""

    def __init__(self, min_freq=1):
        self.min_freq = min_freq
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        self.idx2word = {0: '<PAD>', 1: '<UNK>'}
        self.tag2idx = {'<PAD>': 0}
        self.idx2tag = {0: '<PAD>'}

        # Character vocabulary
        self.char2idx = {'<PAD>': 0, '<UNK>': 1, '<START>': 2, '<END>': 3}
        self.idx2char = {0: '<PAD>', 1: '<UNK>', 2: '<START>', 3: '<END>'}

    def build_vocab(self, sentences):
        """Build vocabulary from sentences"""
        word_counts = Counter()
        tag_counts = Counter()
        char_counts = Counter()

        for sentence in sentences:
            for word, tag in sentence:
                word_counts[word.lower()] += 1
                tag_counts[tag] += 1
                # Add characters
                for char in word:
                    char_counts[char] += 1

        # Add words above min_freq threshold
        for word, count in word_counts.items():
            if count >= self.min_freq:
                idx = len(self.word2idx)
                self.word2idx[word] = idx
                self.idx2word[idx] = word

        # Add all tags
        for tag in tag_counts:
            if tag not in self.tag2idx:
                idx = len(self.tag2idx)
                self.tag2idx[tag] = idx
                self.idx2tag[idx] = tag

        # Add all characters
        for char in char_counts:
            if char not in self.char2idx:
                idx = len(self.char2idx)
                self.char2idx[char] = idx
                self.idx2char[idx] = char

    def encode_sentence(self, sentence):
        """Convert sentence to indices"""
        words = [self.word2idx.get(word.lower(), self.word2idx['<UNK>'])
                 for word, _ in sentence]
        tags = [self.tag2idx[tag] for _, tag in sentence]

        # Encode characters
        chars = []
        for word, _ in sentence:
            char_ids = [self.char2idx['<START>']]
            for char in word:
                char_ids.append(self.char2idx.get(char, self.char2idx['<UNK>']))
            char_ids.append(self.char2idx['<END>'])
            chars.append(char_ids)

        return words, tags, chars


class POSDataset(Dataset):
    """Enhanced PyTorch Dataset with character-level encoding"""

    def __init__(self, sentences, vocab):
        self.sentences = sentences
        self.vocab = vocab
        self.encoded_data = [vocab.encode_sentence(sent) for sent in sentences]

    def __len__(self):
        return len(self.encoded_data)

    def __getitem__(self, idx):
        words, tags, chars = self.encoded_data[idx]
        return (torch.tensor(words, dtype=torch.long),
                torch.tensor(tags, dtype=torch.long),
                chars)


def collate_fn(batch):
    """Enhanced collate function with character padding"""
    words, tags, chars = zip(*batch)

    # Get lengths
    lengths = [len(w) for w in words]
    max_len = max(lengths)

    # Pad word sequences
    padded_words = torch.zeros(len(words), max_len, dtype=torch.long)
    padded_tags = torch.zeros(len(tags), max_len, dtype=torch.long)

    for i, (w, t) in enumerate(zip(words, tags)):
        padded_words[i, :len(w)] = w
        padded_tags[i, :len(t)] = t

    # Pad character sequences
    max_word_len = max([max([len(c) for c in char_seq]) for char_seq in chars])
    padded_chars = torch.zeros(len(chars), max_len, max_word_len, dtype=torch.long)

    for i, char_seq in enumerate(chars):
        for j, char_ids in enumerate(char_seq):
            padded_chars[i, j, :len(char_ids)] = torch.tensor(char_ids, dtype=torch.long)

    return padded_words, padded_tags, padded_chars, torch.tensor(lengths)


class CharCNN(nn.Module):
    """Character-level CNN for morphological features"""

    def __init__(self, char_vocab_size, char_embed_dim, num_filters, kernel_sizes):
        super(CharCNN, self).__init__()

        self.char_embedding = nn.Embedding(char_vocab_size, char_embed_dim, padding_idx=0)

        self.convs = nn.ModuleList([
            nn.Conv1d(char_embed_dim, num_filters, k) for k in kernel_sizes
        ])

        self.output_dim = num_filters * len(kernel_sizes)

    def forward(self, x):
        # x: [batch_size, seq_len, max_word_len]
        batch_size, seq_len, max_word_len = x.size()

        # Reshape for character embedding
        x = x.view(-1, max_word_len)  # [batch_size * seq_len, max_word_len]

        # Character embedding
        char_embed = self.char_embedding(x)  # [batch_size * seq_len, max_word_len, char_embed_dim]
        char_embed = char_embed.transpose(1, 2)  # [batch_size * seq_len, char_embed_dim, max_word_len]

        # Apply convolutions
        conv_outputs = []
        for conv in self.convs:
            conv_out = torch.relu(conv(char_embed))  # [batch_size * seq_len, num_filters, *]
            pooled = torch.max(conv_out, dim=2)[0]  # [batch_size * seq_len, num_filters]
            conv_outputs.append(pooled)

        # Concatenate all conv outputs
        output = torch.cat(conv_outputs, dim=1)  # [batch_size * seq_len, output_dim]

        # Reshape back
        output = output.view(batch_size, seq_len, -1)  # [batch_size, seq_len, output_dim]

        return output


class EnhancedBiLSTMPOSTagger(nn.Module):
    """Enhanced BiLSTM with Character CNN and Attention"""

    def __init__(self, vocab_size, embedding_dim, char_vocab_size, char_embed_dim,
                 char_num_filters, char_kernel_sizes, hidden_dim, num_layers,
                 tagset_size, dropout=0.5, use_attention=True):
        super(EnhancedBiLSTMPOSTagger, self).__init__()

        # Word embedding
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Character CNN
        self.char_cnn = CharCNN(char_vocab_size, char_embed_dim,
                               char_num_filters, char_kernel_sizes)

        # Combined input dimension
        input_dim = embedding_dim + self.char_cnn.output_dim

        # BiLSTM layers
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )

        self.dropout = nn.Dropout(dropout)

        # Attention mechanism
        self.use_attention = use_attention
        if use_attention:
            self.attention = nn.Linear(hidden_dim * 2, 1)

        # Layer normalization
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)

        # Output layer
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, words, chars, lengths):
        # Word embeddings
        word_embed = self.word_embedding(words)
        word_embed = self.dropout(word_embed)

        # Character embeddings
        char_embed = self.char_cnn(chars)

        # Concatenate word and character embeddings
        combined_embed = torch.cat([word_embed, char_embed], dim=2)

        # Pack padded sequence
        packed = nn.utils.rnn.pack_padded_sequence(
            combined_embed, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        # BiLSTM
        lstm_out, _ = self.lstm(packed)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)

        # Layer normalization
        lstm_out = self.layer_norm(lstm_out)

        # Attention (optional)
        if self.use_attention:
            attention_weights = torch.softmax(self.attention(lstm_out), dim=1)
            lstm_out = lstm_out * attention_weights

        # Dropout
        lstm_out = self.dropout(lstm_out)

        # Output layer
        output = self.fc(lstm_out)

        return output


class POSTaggerTrainer:
    """Enhanced trainer with advanced techniques"""

    def __init__(self, model, device, vocab, lr=0.001, weight_decay=1e-5):
        self.model = model
        self.device = device
        self.vocab = vocab
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)
        self.optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

        # Learning rate scheduler
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
          self.optimizer, mode='max', factor=0.5, patience=3
      )


    def train_epoch(self, dataloader):
        self.model.train()
        total_loss = 0

        for words, tags, chars, lengths in dataloader:
            words = words.to(self.device)
            tags = tags.to(self.device)
            chars = chars.to(self.device)
            lengths = lengths.to(self.device)

            # Forward pass
            self.optimizer.zero_grad()
            outputs = self.model(words, chars, lengths)

            # Reshape for loss
            outputs = outputs.view(-1, outputs.shape[-1])
            tags = tags.view(-1)

            # Calculate loss
            loss = self.criterion(outputs, tags)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)

            self.optimizer.step()
            total_loss += loss.item()

        return total_loss / len(dataloader)

    def evaluate(self, dataloader):
        self.model.eval()
        all_predictions = []
        all_targets = []

        with torch.no_grad():
            for words, tags, chars, lengths in dataloader:
                words = words.to(self.device)
                tags = tags.to(self.device)
                chars = chars.to(self.device)
                lengths = lengths.to(self.device)

                outputs = self.model(words, chars, lengths)
                predictions = torch.argmax(outputs, dim=-1)

                for i, length in enumerate(lengths):
                    pred = predictions[i, :length].cpu().numpy()
                    target = tags[i, :length].cpu().numpy()
                    all_predictions.extend(pred)
                    all_targets.extend(target)

        accuracy = accuracy_score(all_targets, all_predictions)
        return accuracy, all_predictions, all_targets


def download_dataset(repo_url, target_dir):
    """Download Universal Dependencies dataset"""
    if not os.path.exists(target_dir):
        print(f"Downloading {repo_url}...")
        subprocess.run(['git', 'clone', repo_url, target_dir], check=True)
        print(f"Downloaded to {target_dir}")
    else:
        print(f"{target_dir} already exists, skipping download.")


def get_conllu_files(directory):
    """Find CoNLL-U files in directory"""
    conllu_files = {}
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.conllu'):
                full_path = os.path.join(root, file)
                if 'train' in file.lower():
                    conllu_files['train'] = full_path
                elif 'dev' in file.lower():
                    conllu_files['dev'] = full_path
                elif 'test' in file.lower():
                    conllu_files['test'] = full_path
    return conllu_files


def main():
    # Enhanced Configuration
    EMBEDDING_DIM = 200          # Increased from 128
    CHAR_EMBED_DIM = 50          # Character embedding dimension
    CHAR_NUM_FILTERS = 50        # Number of CNN filters
    CHAR_KERNEL_SIZES = [3, 4, 5]  # Different kernel sizes for CNN
    HIDDEN_DIM = 300             # Increased from 256
    NUM_LAYERS = 3               # Keep deep architecture
    DROPOUT = 0.5
    BATCH_SIZE = 32
    EPOCHS = 30                  # Increased from 20
    MIN_FREQ = 1                 # Changed from 2 to include more words
    LEARNING_RATE = 0.001
    WEIGHT_DECAY = 1e-5
    USE_ATTENTION = True

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"Memory Available: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

    print("\nSelect language:")
    print("1. Hindi")
    print("2. English")
    print("3. Both")

    if IN_COLAB:
        print("\n[Auto-selecting Both languages for Colab demo]")
        choice = '3'
    else:
        choice = input("Enter choice (1/2/3): ").strip()

    languages = []
    if choice == '1':
        languages = [('Hindi', 'UD_Hindi-HDTB',
                     'https://github.com/UniversalDependencies/UD_Hindi-HDTB.git')]
    elif choice == '2':
        languages = [('English', 'UD_English-GUM',
                     'https://github.com/UniversalDependencies/UD_English-GUM.git')]
    else:
        languages = [
            ('Hindi', 'UD_Hindi-HDTB',
             'https://github.com/UniversalDependencies/UD_Hindi-HDTB.git'),
            ('English', 'UD_English-GUM',
             'https://github.com/UniversalDependencies/UD_English-GUM.git')
        ]

    for lang_name, lang_dir, repo_url in languages:
        print(f"\n{'='*60}")
        print(f"Processing {lang_name} with Enhanced Model")
        print('='*60)

        download_dataset(repo_url, lang_dir)
        files = get_conllu_files(lang_dir)

        if not files:
            print(f"No CoNLL-U files found in {lang_dir}")
            continue

        print(f"\nFound files:")
        for split, path in files.items():
            print(f"  {split}: {path}")

        print("\nLoading datasets...")
        train_dataset = ConllDataset(files['train'])
        dev_dataset = ConllDataset(files.get('dev', files['train']))
        test_dataset = ConllDataset(files.get('test', files.get('dev', files['train'])))

        train_sentences = train_dataset.get_sentences()
        dev_sentences = dev_dataset.get_sentences()
        test_sentences = test_dataset.get_sentences()

        print(f"Train sentences: {len(train_sentences)}")
        print(f"Dev sentences: {len(dev_sentences)}")
        print(f"Test sentences: {len(test_sentences)}")

        print("\nBuilding vocabulary...")
        vocab = Vocabulary(min_freq=MIN_FREQ)
        vocab.build_vocab(train_sentences)

        print(f"Word vocabulary size: {len(vocab.word2idx)}")
        print(f"Character vocabulary size: {len(vocab.char2idx)}")
        print(f"Number of POS tags: {len(vocab.tag2idx)}")

        train_data = POSDataset(train_sentences, vocab)
        dev_data = POSDataset(dev_sentences, vocab)
        test_data = POSDataset(test_sentences, vocab)

        train_loader = DataLoader(train_data, batch_size=BATCH_SIZE,
                                 shuffle=True, collate_fn=collate_fn)
        dev_loader = DataLoader(dev_data, batch_size=BATCH_SIZE,
                               shuffle=False, collate_fn=collate_fn)
        test_loader = DataLoader(test_data, batch_size=BATCH_SIZE,
                                shuffle=False, collate_fn=collate_fn)

        print("\nInitializing enhanced model...")
        model = EnhancedBiLSTMPOSTagger(
            vocab_size=len(vocab.word2idx),
            embedding_dim=EMBEDDING_DIM,
            char_vocab_size=len(vocab.char2idx),
            char_embed_dim=CHAR_EMBED_DIM,
            char_num_filters=CHAR_NUM_FILTERS,
            char_kernel_sizes=CHAR_KERNEL_SIZES,
            hidden_dim=HIDDEN_DIM,
            num_layers=NUM_LAYERS,
            tagset_size=len(vocab.tag2idx),
            dropout=DROPOUT,
            use_attention=USE_ATTENTION
        ).to(device)

        print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

        trainer = POSTaggerTrainer(model, device, vocab,
                                  lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

        print("\nStarting training with enhanced model...")
        best_dev_acc = 0.0
        patience_counter = 0
        max_patience = 5

        for epoch in range(EPOCHS):
            train_loss = trainer.train_epoch(train_loader)
            dev_acc, _, _ = trainer.evaluate(dev_loader)

            # Update learning rate
            trainer.scheduler.step(dev_acc)

            print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {train_loss:.4f} - Dev Acc: {dev_acc:.4f}")

            if dev_acc > best_dev_acc:
                best_dev_acc = dev_acc
                patience_counter = 0
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'vocab': vocab,
                    'config': {
                        'embedding_dim': EMBEDDING_DIM,
                        'char_embed_dim': CHAR_EMBED_DIM,
                        'char_num_filters': CHAR_NUM_FILTERS,
                        'char_kernel_sizes': CHAR_KERNEL_SIZES,
                        'hidden_dim': HIDDEN_DIM,
                        'num_layers': NUM_LAYERS,
                        'dropout': DROPOUT,
                        'use_attention': USE_ATTENTION
                    }
                }, f'{lang_name.lower()}_enhanced_best_model.pt')
                print(f"  → New best model saved! (Improvement: +{dev_acc - best_dev_acc:.4f})")
            else:
                patience_counter += 1
                if patience_counter >= max_patience:
                    print(f"  → Early stopping triggered after {epoch+1} epochs")
                    break

        print("\nEvaluating on test set...")
        test_acc, predictions, targets = trainer.evaluate(test_loader)
        print(f"Test Accuracy: {test_acc:.4f}")

        filtered_targets = [t for t in targets if t != 0]
        filtered_predictions = [p for p, t in zip(predictions, targets) if t != 0]
        unique_labels = sorted(list(set(filtered_targets)))
        tag_names = [vocab.idx2tag[i] for i in unique_labels]

        print("\nClassification Report:")
        print(classification_report(filtered_targets, filtered_predictions,
                                   labels=unique_labels,
                                   target_names=tag_names,
                                   zero_division=0))

        print("\nExample Predictions:")
        example_sent = test_sentences[0]
        words, tags, chars = vocab.encode_sentence(example_sent)
        words_tensor = torch.tensor([words], dtype=torch.long).to(device)
        chars_tensor = torch.zeros(1, len(chars), max([len(c) for c in chars]), dtype=torch.long).to(device)
        for j, char_ids in enumerate(chars):
            chars_tensor[0, j, :len(char_ids)] = torch.tensor(char_ids, dtype=torch.long)
        lengths = torch.tensor([len(words)]).to(device)

        model.eval()
        with torch.no_grad():
            output = model(words_tensor, chars_tensor, lengths)
            pred_tags = torch.argmax(output, dim=-1).squeeze().cpu().numpy()

        print("\nWord\t\tTrue POS\tPredicted POS")
        print("-" * 50)
        for (word, true_tag), pred_idx in zip(example_sent, pred_tags):
            pred_tag = vocab.idx2tag[pred_idx]
            print(f"{word:15}\t{true_tag:10}\t{pred_tag:10}")

    print("\n" + "="*60)
    print("Enhanced training completed!")
    print("="*60)


if __name__ == "__main__":
    main()

Running in Google Colab
Using device: cuda
GPU: Tesla T4
Memory Available: 15.83 GB

Select language:
1. Hindi
2. English
3. Both

[Auto-selecting Both languages for Colab demo]

Processing Hindi with Enhanced Model
UD_Hindi-HDTB already exists, skipping download.

Found files:
  dev: UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu
  test: UD_Hindi-HDTB/hi_hdtb-ud-test.conllu
  train: UD_Hindi-HDTB/hi_hdtb-ud-train.conllu

Loading datasets...
Train sentences: 13306
Dev sentences: 1659
Test sentences: 1684

Building vocabulary...
Word vocabulary size: 16881
Character vocabulary size: 103
Number of POS tags: 17

Initializing enhanced model...
Model parameters: 9,317,918

Starting training with enhanced model...
Epoch 1/30 - Loss: 1.2078 - Dev Acc: 0.8060
  → New best model saved! (Improvement: +0.0000)
Epoch 2/30 - Loss: 0.4116 - Dev Acc: 0.9322
  → New best model saved! (Improvement: +0.0000)
Epoch 3/30 - Loss: 0.2140 - Dev Acc: 0.9478
  → New best model saved! (Improvement: +0.0000)
Epoch 4/30 - L