In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

In [None]:
def load_data(directory_path, split_lines=False):
    """Collect parallel English/Urdu text from a directory tree.

    Every directory below ``directory_path`` is scanned for ``<name>.en`` and
    ``<name>.ur`` files that share the same base name. A matched pair is
    assigned to the first of ``'train'``, ``'dev'``, ``'test'`` that occurs in
    the base name; pairs matching none of them are ignored, as are files
    without a counterpart in the other language.

    Args:
        directory_path: Root directory to walk.
        split_lines: When False (default) every file pair contributes a single
            tuple holding the complete, stripped contents of both files. When
            True the two files are treated as line-aligned and one
            ``(english_line, urdu_line)`` tuple is produced per line.

    Returns:
        dict: ``{'train': [...], 'dev': [...], 'test': [...]}`` where every
        list holds ``(english_text, urdu_text)`` tuples.

    Raises:
        ValueError: If ``split_lines`` is True and the two files of a pair do
            not have the same number of lines.
    """
    split_names = ('train', 'dev', 'test')
    data = {name: [] for name in split_names}

    def read_text(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read().strip()

    for root, dirs, files in os.walk(directory_path):
        # Directory listings come back in filesystem order. Sorting the file
        # names, and sorting `dirs` in place (which steers os.walk), makes the
        # order of the returned pairs reproducible across machines.
        dirs.sort()

        en_paths = {}
        ur_paths = {}
        for file_name in sorted(files):
            base_name = os.path.splitext(file_name)[0]
            if file_name.endswith('.en'):
                en_paths[base_name] = os.path.join(root, file_name)
            elif file_name.endswith('.ur'):
                ur_paths[base_name] = os.path.join(root, file_name)

        for base_name, en_path in en_paths.items():
            ur_path = ur_paths.get(base_name)
            if ur_path is None:
                continue

            # Same precedence as an if/elif chain: train, then dev, then test.
            split = next((name for name in split_names if name in base_name), None)
            if split is None:
                # Not part of any split, so there is no reason to read it.
                continue

            en_content = read_text(en_path)
            ur_content = read_text(ur_path)

            if not split_lines:
                data[split].append((en_content, ur_content))
                continue

            en_lines = en_content.split('\n') if en_content else []
            ur_lines = ur_content.split('\n') if ur_content else []
            if len(en_lines) != len(ur_lines):
                # Zipping would silently misalign every later sentence.
                raise ValueError(
                    f"Line count mismatch between {en_path} ({len(en_lines)}) "
                    f"and {ur_path} ({len(ur_lines)})"
                )
            data[split].extend(
                (en_line.strip(), ur_line.strip())
                for en_line, ur_line in zip(en_lines, ur_lines)
            )

    return data

In [None]:
# Load every matched .en/.ur file pair found under the corpus directory.
# `data` maps 'train' / 'dev' / 'test' to lists of (english, urdu) tuples and is
# reused by the tokenizer-training and model-training cells.
directory_path = 'umc005-corpus'
data = load_data(directory_path)

In [None]:
# Report how many (english, urdu) pairs ended up in each split.
for label, split_name in (("Training", 'train'), ("Development", 'dev'), ("Test", 'test')):
    print(f"{label} pairs: {len(data[split_name])}")

In [25]:
import sentencepiece as spm
import os

def train_tokenizers(data, prefix='tokenizer', chunk_size=1000):
    """
    Train SentencePiece BPE tokenizers for English and Urdu text pairs.

    The texts are written to temporary chunk files (``chunk_size`` texts per
    file) inside the ``tokenizer/`` directory. One English and one Urdu model
    are then trained on *all* chunk files of their language, so the returned
    tokenizers have seen the whole corpus rather than only its last chunk.
    The models are saved as ``{prefix}_en.model`` / ``{prefix}_ur.model``
    (plus the matching ``.vocab`` files) in the working directory.

    Special ids are fixed to ``<pad>=0, <unk>=1, <s>=2, </s>=3`` so that id 0
    can be used as padding without colliding with the unknown token.

    Args:
        data: List of (english, urdu) text pairs
        prefix: Prefix for the output model files
        chunk_size: Number of text pairs to write into each temporary file

    Returns:
        tuple: (english_tokenizer, urdu_tokenizer)

    Raises:
        ValueError: If ``data`` is empty or ``chunk_size`` is smaller than 1.
    """
    if not data:
        raise ValueError("train_tokenizers() needs at least one (english, urdu) pair")
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    en_texts, ur_texts = zip(*data)

    # Ensure output directory exists
    output_dir = 'tokenizer'
    os.makedirs(output_dir, exist_ok=True)

    # Every temporary file is registered here as soon as it is created so the
    # cleanup below also runs when writing or training fails part-way.
    temp_files = []

    def save_chunked_data(texts, file_prefix):
        file_paths = []
        for i, start in enumerate(range(0, len(texts), chunk_size)):
            file_path = os.path.join(output_dir, f'{file_prefix}_chunk_{i}.txt')
            temp_files.append(file_path)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write("\n".join(texts[start:start + chunk_size]))
            file_paths.append(file_path)
        return file_paths

    def get_safe_vocab_size(file_paths, max_allowed=5000):
        # Count distinct whitespace-separated tokens and distinct characters
        # across all files that will be fed to the trainer.
        unique_tokens = set()
        for file_path in file_paths:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    unique_tokens.update(line.split())
        unique_chars = set().union(*unique_tokens) if unique_tokens else set()

        # Same target as before: distinct tokens minus a small buffer, capped.
        target = min(len(unique_tokens) - 100, max_allowed)

        # On small inputs `target` can be zero or negative, which the trainer
        # cannot use.
        # NOTE(review): the floor below (distinct characters + the word
        # boundary marker + the four reserved ids) is a heuristic for the
        # smallest size the trainer should accept - confirm against
        # SentencePiece if very small corpora matter.
        floor = len(unique_chars) + 1 + 4
        return max(target, floor)

    def train_model(file_paths, language, character_coverage):
        model_prefix = f'{prefix}_{language}'
        spm.SentencePieceTrainer.train(
            # A comma-separated list makes the trainer read every chunk file.
            input=','.join(file_paths),
            model_prefix=model_prefix,
            vocab_size=get_safe_vocab_size(file_paths),
            character_coverage=character_coverage,
            model_type='bpe',
            input_sentence_size=1000000,
            shuffle_input_sentence=True,
            # Reserve id 0 for padding; by default SentencePiece puts <unk>
            # there and defines no padding id at all.
            pad_id=0,
            unk_id=1,
            bos_id=2,
            eos_id=3
        )
        return spm.SentencePieceProcessor(model_file=f'{model_prefix}.model')

    try:
        en_files = save_chunked_data(en_texts, 'en_texts')
        ur_files = save_chunked_data(ur_texts, 'ur_texts')

        # Full character coverage for English.
        en_tokenizer = train_model(en_files, 'en', 1.0)
        # Slightly below full coverage for Urdu, so the rarest characters
        # may be left out of the vocabulary.
        ur_tokenizer = train_model(ur_files, 'ur', 0.9995)
    finally:
        # Cleanup temporary files
        for f in temp_files:
            if os.path.exists(f):
                os.remove(f)

    return en_tokenizer, ur_tokenizer

In [26]:
# Fit the English and Urdu tokenizers on the training split only.
# `data` is the dictionary built by load_data, with 'train', 'dev' and 'test'
# lists of (english, urdu) pairs.
en_tokenizer, ur_tokenizer = train_tokenizers(data['train'])

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A ``(1, max_seq_length, d_model)`` table is precomputed and stored as the
    non-trainable buffer ``pe``: even feature columns hold the sine and odd
    columns the cosine of ``position * 10000^(-2i / d_model)``.

    Args:
        d_model: Size of the feature dimension (may be odd).
        max_seq_length: Number of positions to precompute.
        dropout: Dropout probability applied after adding the encoding.
    """

    def __init__(self, d_model, max_seq_length=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Create positional encoding matrix
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_seq_length, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one odd column fewer than even columns,
        # so the cosine half only takes the first d_model // 2 frequencies.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Registered as a buffer: follows .to(device) and is saved in the
        # state dict, but is never updated by the optimizer.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(1)`` positions.

        ``x`` is indexed as ``(batch, sequence, feature)``: the table is sliced
        along dimension 1 and broadcast over dimension 0.
        """
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are projected with separate linear layers, split
    into ``num_heads`` heads of size ``d_model // num_heads``, attended per
    head, merged again and passed through an output projection.

    Args:
        d_model: Feature size of inputs and outputs; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend ``Q`` over ``K``/``V`` along the last two dimensions.

        Positions where ``mask == 0`` receive no attention weight. ``mask``
        must be broadcastable to the score tensor ``Q @ K^T``.
        """
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # Use the most negative value representable in the score dtype.
            # A hard-coded -1e9 does not fit into float16, which breaks
            # half-precision / autocast runs.
            fill_value = torch.finfo(attention_scores.dtype).min
            attention_scores = attention_scores.masked_fill(mask == 0, fill_value)

        attention_probs = F.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        output = torch.matmul(attention_probs, V)
        return output

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head and merge the heads.

        The inputs are reshaped as ``(batch, sequence, d_model)``; the result
        has the batch size and sequence length of ``Q``.
        """
        batch_size = Q.size(0)

        # Linear transformations, then (batch, seq, d_model) -> (batch, heads, seq, d_k)
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        # Apply attention
        output = self.scaled_dot_product_attention(Q, K, V, mask)

        # Merge the heads back to (batch, seq, d_model) and apply final linear transformation
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)

class FeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Expand to ``d_ff`` features, then project back to ``d_model``."""
        hidden = F.relu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)

class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a feed-forward network.

    Each sub-layer output goes through dropout, is added to its input, and
    the sum is layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run ``x`` through both sub-layers; ``mask`` is handed to the attention."""
        # Sub-layer 1: every position attends over the whole (masked) input.
        attended = self.self_attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward network.
        transformed = self.feed_forward(x)
        return self.norm2(x + self.dropout(transformed))

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Each sub-layer output goes through dropout, is added to its input, and
    the sum is layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Decode ``x`` against ``enc_output``.

        ``tgt_mask`` is applied to the self-attention, ``src_mask`` to the
        attention over the encoder output.
        """
        # Sub-layer 1: attention of the target sequence over itself.
        self_attended = self.self_attention(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attended))

        # Sub-layer 2: queries from the decoder, keys/values from the encoder.
        cross_attended = self.cross_attention(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_attended))

        # Sub-layer 3: position-wise feed-forward network.
        transformed = self.feed_forward(x)
        return self.norm3(x + self.dropout(transformed))

class Transformer(nn.Module):
    """Encoder-decoder Transformer producing target-vocabulary logits.

    Token id 0 is treated as padding on both the source and the target side
    (see ``create_masks``).

    Args:
        src_vocab_size: Number of rows of the source embedding table.
        tgt_vocab_size: Number of rows of the target embedding table and size
            of the output layer.
        d_model: Embedding / hidden feature size.
        num_heads: Attention heads per attention module.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden size of the feed-forward blocks.
        max_seq_length: Number of positions covered by the positional encoding.
        dropout: Dropout probability used throughout the model.
    """

    def __init__(self,
                 src_vocab_size,
                 tgt_vocab_size,
                 d_model=512,
                 num_heads=8,
                 num_layers=6,
                 d_ff=2048,
                 max_seq_length=5000,
                 dropout=0.1):
        super().__init__()

        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        # One positional-encoding module is shared by encoder and decoder.
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length, dropout)

        # Encoder and Decoder layers
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])

        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])

        self.final_layer = nn.Linear(d_model, tgt_vocab_size)

        # Initialize parameters
        self.init_parameters()

    def init_parameters(self):
        """Xavier-uniform initialise every parameter with more than one dimension."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def create_masks(self, src, tgt):
        """Build the boolean attention masks for a batch.

        Returns:
            tuple: ``(src_mask, tgt_mask)``. ``src_mask`` has shape
            ``(batch, 1, 1, src_len)`` and is False at source padding.
            ``tgt_mask`` has shape ``(batch, 1, tgt_len, tgt_len)`` and is
            False at target padding and at every position that lies after
            the querying position.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)

        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
        seq_length = tgt.size(1)
        # Create the causal mask directly on the target's device. The former
        # `.cuda()` call always used the *current* CUDA device and did nothing
        # for other accelerator types.
        future_positions = torch.triu(
            torch.ones(1, seq_length, seq_length, device=tgt.device),
            diagonal=1
        )
        nopeak_mask = future_positions == 0
        tgt_mask = tgt_mask & nopeak_mask

        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return logits of shape ``(batch, tgt_len, tgt_vocab_size)``.

        ``src`` and ``tgt`` are batches of token ids indexed as
        ``(batch, sequence)``.
        """
        src_mask, tgt_mask = self.create_masks(src, tgt)

        # Encoder
        src_embedded = self.positional_encoding(self.encoder_embedding(src))
        enc_output = src_embedded

        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        # Decoder
        tgt_embedded = self.positional_encoding(self.decoder_embedding(tgt))
        dec_output = tgt_embedded

        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.final_layer(dec_output)
        return output

class TransformerTrainer:
    """Bundle a model with its optimizer, loss and optional LR scheduler.

    Args:
        model: Module called as ``model(src, tgt_input)`` that returns logits
            whose last dimension is the vocabulary. It is moved to ``device``.
        optimizer: Optimizer over the model parameters.
        criterion: Loss taking ``(logits_2d, targets_1d)``.
        scheduler: Optional LR scheduler. It is stepped once per
            ``train_step`` call, i.e. once per batch.
        device: Device the model and all batches are placed on.
    """

    def __init__(self,
                 model,
                 optimizer,
                 criterion,
                 scheduler=None,
                 device='cuda'):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.criterion = criterion
        self.scheduler = scheduler
        self.device = device

    def _loss(self, src, tgt):
        # Teacher forcing: the decoder is fed tgt[:, :-1] and has to predict
        # tgt[:, 1:], i.e. the same sequence shifted by one position.
        output = self.model(src, tgt[:, :-1])
        return self.criterion(output.contiguous().view(-1, output.size(-1)),
                              tgt[:, 1:].contiguous().view(-1))

    def train_step(self, src, tgt):
        """Run one optimisation step on a batch and return its loss as a float."""
        self.model.train()
        self.optimizer.zero_grad()

        # Mirror evaluate(): callers may pass batches that still live on
        # another device. `.to` is a no-op when they are already in place.
        src = src.to(self.device)
        tgt = tgt.to(self.device)

        loss = self._loss(src, tgt)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
        self.optimizer.step()

        if self.scheduler is not None:
            self.scheduler.step()

        return loss.item()

    def evaluate(self, val_loader):
        """Return the mean per-batch loss over ``val_loader``.

        ``val_loader`` may be any iterable of ``(src, tgt)`` batches. If it
        yields no batch the loss is undefined and NaN is returned.
        """
        self.model.eval()
        total_loss = 0
        num_batches = 0
        with torch.no_grad():
            for src, tgt in val_loader:
                src = src.to(self.device)
                tgt = tgt.to(self.device)

                total_loss += self._loss(src, tgt).item()
                # Count batches instead of calling len(), so iterables
                # without __len__ are accepted as well.
                num_batches += 1

        if num_batches == 0:
            return float('nan')
        return total_loss / num_batches

ModuleNotFoundError: No module named 'torch'

In [None]:
class TranslationDataset(Dataset):
    """Parallel-text dataset yielding fixed-length id tensors.

    Every item is a ``(src_tokens, tgt_tokens)`` pair of ``torch.long``
    tensors with exactly ``max_length`` entries each, so the default
    DataLoader collation can stack them into a batch.

    * The source is encoded, truncated to ``max_length`` and padded.
    * The target is encoded, wrapped in the tokenizer's BOS/EOS ids (when the
      tokenizer defines them, i.e. the id is not negative), truncated so that
      the wrapped sequence fits into ``max_length``, and padded.

    Args:
        source_texts: Sequence of source sentences.
        target_texts: Sequence of target sentences, aligned with
            ``source_texts``.
        src_tokenizer: Object with ``encode(text) -> list of ids``.
        tgt_tokenizer: Object with ``encode``, ``bos_id()`` and ``eos_id()``.
        max_length: Length of every returned tensor.
        pad_id: Id used for padding. It has to match the id the model masks
            and the loss ignores (0 in this notebook), so the tokenizers
            should reserve that id for padding.
    """

    def __init__(self, source_texts, target_texts, src_tokenizer, tgt_tokenizer, max_length=128, pad_id=0):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.max_length = max_length
        self.pad_id = pad_id

    def __len__(self):
        return len(self.source_texts)

    def _to_padded_tensor(self, token_ids):
        # Truncate, then right-pad so every item has the same length.
        token_ids = token_ids[:self.max_length]
        padding = [self.pad_id] * (self.max_length - len(token_ids))
        return torch.tensor(token_ids + padding, dtype=torch.long)

    def __getitem__(self, idx):
        src_text = self.source_texts[idx]
        tgt_text = self.target_texts[idx]

        src_ids = list(self.src_tokenizer.encode(src_text))

        # The training loop feeds tgt[:, :-1] and predicts tgt[:, 1:], and
        # inference starts from <s> and stops at </s>. The target therefore
        # has to carry both markers, otherwise the model never learns them.
        bos_id = self.tgt_tokenizer.bos_id()
        eos_id = self.tgt_tokenizer.eos_id()
        leading = [bos_id] if bos_id >= 0 else []
        trailing = [eos_id] if eos_id >= 0 else []
        # Truncate the sentence itself so the end marker is not cut off.
        budget = max(self.max_length - len(leading) - len(trailing), 0)
        tgt_ids = leading + list(self.tgt_tokenizer.encode(tgt_text))[:budget] + trailing

        return self._to_padded_tensor(src_ids), self._to_padded_tensor(tgt_ids)

def train_transformer(train_data, val_data, src_tokenizer, tgt_tokenizer, config):
    """Train a Transformer on parallel text with early stopping.

    After every epoch the model is evaluated on ``val_data``. Whenever the
    validation loss improves, the weights are saved to
    ``config['model_path']``; training stops once the loss has failed to
    improve for ``config['early_stopping_patience']`` consecutive epochs.

    Args:
        train_data: Sequence of (source_text, target_text) training pairs.
        val_data: Sequence of (source_text, target_text) validation pairs.
        src_tokenizer: Source tokenizer; must provide ``vocab_size()``.
        tgt_tokenizer: Target tokenizer; must provide ``vocab_size()``.
        config: Dict with the keys ``max_length``, ``batch_size``, ``d_model``,
            ``num_heads``, ``num_layers``, ``d_ff``, ``dropout``,
            ``learning_rate``, ``scheduler_t0``, ``epochs``, ``log_interval``,
            ``early_stopping_patience``, ``device`` and ``model_path``.
            ``num_workers`` is optional and defaults to 2.

    Returns:
        The trained model. If at least one checkpoint was written, the
        weights of the best (lowest validation loss) epoch are restored first.

    Raises:
        ValueError: If ``train_data`` is empty.
    """
    if len(train_data) == 0:
        # Without batches the epoch average below would divide by zero.
        raise ValueError("train_data is empty")

    device = config['device']
    num_workers = config.get('num_workers', 2)

    def build_loader(pairs, shuffle):
        # Split the pairs into parallel source / target lists for the dataset.
        dataset = TranslationDataset(
            [pair[0] for pair in pairs],
            [pair[1] for pair in pairs],
            src_tokenizer,
            tgt_tokenizer,
            config['max_length']
        )
        return DataLoader(
            dataset,
            batch_size=config['batch_size'],
            shuffle=shuffle,
            num_workers=num_workers
        )

    # Create data loaders
    train_loader = build_loader(train_data, shuffle=True)
    val_loader = build_loader(val_data, shuffle=False)

    # Initialize model
    model = Transformer(
        src_vocab_size=src_tokenizer.vocab_size(),
        tgt_vocab_size=tgt_tokenizer.vocab_size(),
        d_model=config['d_model'],
        num_heads=config['num_heads'],
        num_layers=config['num_layers'],
        d_ff=config['d_ff'],
        dropout=config['dropout']
    )

    # Setup optimizer and scheduler
    optimizer = Adam(
        model.parameters(),
        lr=config['learning_rate'],
        betas=(0.9, 0.98),
        eps=1e-9
    )

    # The trainer steps this scheduler once per batch, so T_0 is a number of
    # batches (optimizer steps), not a number of epochs.
    scheduler = CosineAnnealingWarmRestarts(
        optimizer,
        T_0=config['scheduler_t0'],
        T_mult=2
    )

    # Id 0 is the padding id and does not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    trainer = TransformerTrainer(model, optimizer, criterion, scheduler, device)

    # Training loop
    best_val_loss = float('inf')
    early_stopping_counter = 0
    checkpoint_saved = False

    for epoch in range(config['epochs']):
        total_train_loss = 0
        for batch_idx, (src, tgt) in enumerate(train_loader):
            src = src.to(device)
            tgt = tgt.to(device)

            loss = trainer.train_step(src, tgt)
            total_train_loss += loss

            if batch_idx % config['log_interval'] == 0:
                print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss:.4f}')

        # Validation
        val_loss = trainer.evaluate(val_loader)
        print(f'Epoch: {epoch}, Train Loss: {total_train_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}')

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), config['model_path'])
            checkpoint_saved = True
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1

        if early_stopping_counter >= config['early_stopping_patience']:
            print(f'Validation loss did not improve for {config["early_stopping_patience"]} epochs. Early stopping...')
            break

    # The in-memory weights belong to the last epoch, which is not necessarily
    # the best one; hand back the best checkpoint instead.
    if checkpoint_saved:
        model.load_state_dict(torch.load(config['model_path'], map_location=device))

    return model

In [None]:
# Configuration
config = dict(
    # Data
    max_length=128,              # token limit handed to TranslationDataset
    batch_size=64,
    # Model size
    d_model=512,
    num_heads=8,
    num_layers=6,
    d_ff=2048,
    dropout=0.1,
    # Optimisation
    learning_rate=1e-4,
    scheduler_t0=10,             # T_0 of CosineAnnealingWarmRestarts; the trainer steps the scheduler every batch
    epochs=100,
    log_interval=100,            # print the batch loss every N batches
    early_stopping_patience=5,   # epochs without validation improvement before stopping
    # Runtime
    device='cuda' if torch.cuda.is_available() else 'cpu',
    model_path='transformer.pth',
)

# Train the Transformer model
train_transformer(data['train'], data['dev'], en_tokenizer, ur_tokenizer, config)

In [None]:
def translate_sentence(sentence, src_tokenizer, tgt_tokenizer, model, device, max_length=128):
    """Greedily translate one sentence.

    Decoding starts from the target tokenizer's ``<s>`` id. At every step the
    model is run on the source and the tokens generated so far, and the most
    likely next token is appended. Decoding stops when ``</s>`` is produced or
    after ``max_length`` generated tokens.

    Args:
        sentence: Source sentence.
        src_tokenizer: Tokenizer with ``encode(text) -> list of ids``.
        tgt_tokenizer: Tokenizer with ``piece_to_id(piece)`` and
            ``decode(ids)``.
        model: Module called as ``model(src, tgt)`` returning logits indexed
            as ``(batch, position, vocabulary)``. It must already live on
            ``device``; it is switched to eval mode.
        device: Device the input tensors are created on.
        max_length: Truncation length for the source and maximum number of
            generated tokens.

    Returns:
        str: The decoded translation without the ``<s>`` / ``</s>`` markers,
        or ``''`` when the sentence encodes to no tokens.
    """
    model.eval()

    # Tokenize source sentence
    src_ids = list(src_tokenizer.encode(sentence))[:max_length]
    if not src_ids:
        # Nothing to attend over; there is no meaningful translation.
        return ''
    src_tokens = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)

    bos_id = tgt_tokenizer.piece_to_id('<s>')
    eos_id = tgt_tokenizer.piece_to_id('</s>')

    # Keep the generated ids in a plain list and rebuild the decoder input
    # each step. The previous code called .unsqueeze() on the Python int
    # returned by .item(), which raised AttributeError on the first step.
    generated = [bos_id]

    with torch.no_grad():
        for _ in range(max_length):
            tgt_tokens = torch.tensor([generated], dtype=torch.long, device=device)
            output = model(src_tokens, tgt_tokens)

            # Get most likely next token (last position of the only batch item)
            next_token = output[0, -1].argmax(dim=-1).item()
            generated.append(next_token)

            # Break if </s> token is predicted
            if next_token == eos_id:
                break

    # Drop the start marker and, if present, the end marker before decoding.
    translation_ids = generated[1:]
    if translation_ids and translation_ids[-1] == eos_id:
        translation_ids = translation_ids[:-1]

    return tgt_tokenizer.decode(translation_ids)

# Rebuild the architecture used for training and restore the saved weights
model = Transformer(
    src_vocab_size=en_tokenizer.vocab_size(),
    tgt_vocab_size=ur_tokenizer.vocab_size(),
    d_model=config['d_model'],
    num_heads=config['num_heads'],
    num_layers=config['num_layers'],
    d_ff=config['d_ff'],
    dropout=config['dropout']
)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(config['model_path'], map_location=config['device']))
# translate_sentence creates its input tensors on config['device'], so the
# model has to live there as well.
model.to(config['device'])

# Translate a sample sentence
sample_sentence = "I am a student."
translation = translate_sentence(sample_sentence, en_tokenizer, ur_tokenizer, model, config['device'])
print(f"English: {sample_sentence}")
print(f"Urdu: {translation}")