In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class DAEME(nn.Module):
    """Two-branch autoencoder over paired Amharic / Arabic embedding vectors.

    Each language has its own linear encoder and linear decoder. The two
    ReLU-activated encodings are concatenated to form the meta-embedding, and
    each decoder reconstructs its own source vector from its own encoding.

    Args:
        input_dim: two-element sequence ``(amharic_dim, arabic_dim)`` giving
            the size of each source embedding.
        hidden: size of the encoded representation. Either a single int used
            for both branches, or a two-element sequence
            ``(amharic_hidden, arabic_hidden)``.

    Raises:
        ValueError / TypeError: if ``hidden`` is a sequence that does not
            unpack into exactly two values.
    """

    def __init__(self, input_dim, hidden=(300, 300)):
        # Zero-argument super() is required: nn.Module.__init__ must run before
        # any sub-module can be assigned as an attribute.
        super().__init__()

        # nn.Linear needs integer sizes, so resolve `hidden` into one size per branch.
        if isinstance(hidden, int):
            hidden_amharic = hidden_arabic = hidden
        else:
            hidden_amharic, hidden_arabic = hidden

        self.encoder_amharic = nn.Linear(input_dim[0], hidden_amharic)
        self.encoder_arabic = nn.Linear(input_dim[1], hidden_arabic)

        self.decoder_amharic = nn.Linear(hidden_amharic, input_dim[0])
        self.decoder_arabic = nn.Linear(hidden_arabic, input_dim[1])

        self.activation = nn.ReLU()

        # Weight initialization: every encoder and decoder gets N(0, 0.1^2)
        # weights and zero biases.
        for layer in (
            self.encoder_amharic,
            self.encoder_arabic,
            self.decoder_amharic,
            self.decoder_arabic,
        ):
            nn.init.normal_(layer.weight, mean=0.0, std=0.1)
            nn.init.zeros_(layer.bias)

    def forward(self, amharic_vec, arabic_vec):
        """Encode both sources and reconstruct each one from its own encoding.

        Args:
            amharic_vec: tensor whose last dimension is ``input_dim[0]``.
            arabic_vec: tensor whose last dimension is ``input_dim[1]``.
                Both inputs must be 2-D (batch first) because the encodings
                are concatenated along ``dim=1``.

        Returns:
            ``(meta, (s1_hat, s2_hat))`` where ``meta`` is the concatenation
            of the two activated encodings and ``s1_hat`` / ``s2_hat`` are the
            Amharic / Arabic reconstructions.
        """
        # Each source goes through its own encoder.
        s1 = self.encoder_amharic(amharic_vec)
        s2 = self.encoder_arabic(arabic_vec)
        E1 = self.activation(s1)
        E2 = self.activation(s2)
        meta = torch.cat([E1, E2], dim=1)
        s1_hat = self.decoder_amharic(E1)
        s2_hat = self.decoder_arabic(E2)
        return meta, (s1_hat, s2_hat)

    # Apply masking noise so the autoencoder has to recover missing features.
    @staticmethod
    def apply_masking_noise(batch, noise_fraction=0.2):
        """Zero out a random subset of the entries of ``batch``.

        Each entry is kept when an independent uniform [0, 1) draw is strictly
        greater than ``noise_fraction`` and set to zero otherwise.

        Args:
            batch: floating-point tensor (``torch.rand_like`` is drawn in its
                dtype and shape).
            noise_fraction: threshold for dropping an entry.

        Returns:
            A new tensor with the same shape as ``batch``.
        """
        mask = torch.rand_like(batch) > noise_fraction
        return batch * mask.float()

        

In [5]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [8]:
# Amharic embedding checkpoint: load the tokenizer and the encoder, then move
# the encoder onto the selected device.
amharic_tokenizer = AutoTokenizer.from_pretrained(
    "rasyosef/roberta-amharic-text-embedding-base"
)
amharic_model = AutoModel.from_pretrained(
    "rasyosef/roberta-amharic-text-embedding-base"
)
amharic_model = amharic_model.to(device)

# Arabic embedding checkpoint, loaded and placed the same way.
arabic_tokenizer = AutoTokenizer.from_pretrained(
    "asafaya/bert-base-arabic"
)
arabic_model = AutoModel.from_pretrained(
    "asafaya/bert-base-arabic"
)
arabic_model = arabic_model.to(device)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:
def embed_texts(model, tokenizer, texts, device=device):
    """Embed ``texts`` by mean-pooling the model's last hidden states.

    Padding positions (attention mask 0) are excluded from the average.

    Args:
        model: callable invoked as
            ``model(input_ids=..., attention_mask=..., return_dict=True)``
            whose result exposes ``last_hidden_state``.
        tokenizer: callable invoked as
            ``tokenizer(texts, padding=True, truncation=True, return_tensors="pt")``
            whose result is indexable by ``"input_ids"`` and ``"attention_mask"``.
        texts: passed to ``tokenizer`` unchanged.
        device: device the tokenized inputs are moved to before the forward
            pass. The default is the module-level ``device`` as it was when
            this function was defined. ``model`` is not moved, so it must
            already live on this device.

    Returns:
        Tensor of pooled embeddings, one row per input: ``(batch, hidden_dim)``.
    """
    enc = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)

    # Inference only: no autograd graph is needed for the embeddings.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        last_hidden = outputs.last_hidden_state  # (batch, seq_len, hidden_dim)

        # Cast the mask to the hidden-state dtype before pooling. Left as an
        # integer tensor, the 1e-9 floor below depends on clamp's type
        # promotion and can truncate to 0, turning all-padding rows into a
        # division by zero.
        mask = attention_mask.unsqueeze(-1).expand_as(last_hidden).to(last_hidden.dtype)

        summed = (last_hidden * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid dividing by zero
        pooled = summed / counts  # (batch, hidden_dim)
    return pooled
