In [1]:
%cd  ..

/data/home/eak/learning/zindi_challenge/machine_translation/machine-translation-zindi


In [6]:
from transformers import MarianConfig, MarianMTModel
import json

In [7]:
config = MarianConfig.from_json_file("extra_dataset/configs/helsinki.json")

config

MarianConfig {
  "_name_or_path": "/tmp/Helsinki-NLP/opus-mt-yo-fr",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.1,
  "bad_words_ids": [
    [
      59614
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "decoder_vocab_size": 59615,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_eos_token_id": 0,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_length": 

In [8]:
model = MarianMTModel(config)

model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(256206, 512, padding_idx=1)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(256206, 512, padding_idx=1)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05, elem

In [9]:
model.save_pretrained("extra_dataset/configs/model")

Non-default generation parameters: {'max_length': 200, 'num_beams': 6, 'bad_words_ids': [[59614]], 'forced_eos_token_id': 0}


In [1]:
from transformers import AutoTokenizer

src_lang = "dyu_Latn"
tgt_lang = "fra_Latn"
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", model_max_length=200, src_lang=src_lang, tgt_lang=tgt_lang)

In [2]:
tokenizer

NllbTokenizerFast(name_or_path='facebook/nllb-200-distilled-600M', vocab_size=256204, model_max_length=200, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'pes_Arab',

In [3]:
%cd ..

/data/home/eak/learning/zindi_challenge/machine_translation/machine-translation-zindi


In [4]:
tokenizer.save_pretrained("extra_dataset/configs/model")

('extra_dataset/configs/model/tokenizer_config.json',
 'extra_dataset/configs/model/special_tokens_map.json',
 'extra_dataset/configs/model/sentencepiece.bpe.model',
 'extra_dataset/configs/model/added_tokens.json',
 'extra_dataset/configs/model/tokenizer.json')

In [5]:
import torch
from torch import nn
from torch.nn import functional as F

class MTDyuFr(nn.Module):
    def __init__(self, src_len: int, tg_len: int, embedding_dim: int = 128, dropout: float = 0.1, padding_idx=-100):
        super(MTDyuFr, self, ).__init__()
        self.embedding = nn.Embedding(src_len, embedding_dim, max_norm=True, padding_idx=padding_idx)
        # Encoder
        self.encoder_gru1 = nn.GRU(embedding_dim, 128, bidirectional=True)
        self.encoder_gru2 = nn.GRU(256, 128, bidirectional=True,) # 256 because bidirectional doubles the features
        
        # Decoder
        self.decoder_gru = nn.GRU(256, 128, bidirectional=True,)
        
        self.fc = nn.Sequential(
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, tg_len),
            nn.Softmax(dim=-1)  # Correct usage of Softmax
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Embedding layer
        x = self.embedding(x)
        
        # Encoder
        x, _ = self.encoder_gru1(x)
        x, _ = self.encoder_gru2(self.dropout(x))
        
        # Decoder
        x, _ = self.decoder_gru(self.dropout(x))
        
        # Fully connected layers
        x = self.fc(self.dropout(x))
        
        return x

In [6]:
src_lang = 250
tgt_lang = 290
embedding_dim = 128

model = MTDyuFr(
    src_lang, tgt_lang, embedding_dim
)

In [7]:
model

MTDyuFr(
  (embedding): Embedding(250, 128, padding_idx=150, max_norm=True)
  (encoder_gru1): GRU(128, 128, bidirectional=True)
  (encoder_gru2): GRU(256, 128, bidirectional=True)
  (decoder_gru): GRU(256, 128, bidirectional=True)
  (fc): Sequential(
    (0): Linear(in_features=256, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=512, out_features=290, bias=True)
    (4): Softmax(dim=-1)
  )
  (dropout): Dropout(p=0.1, inplace=False)
)

In [8]:
X = torch.randint(0, src_lang, size=(4, 10))
X

tensor([[241, 159,  72, 107,  75,  48, 226, 237, 215, 213],
        [119, 129, 200, 140, 156,  64, 102, 221, 194, 165],
        [ 69, 236,  16, 192, 190,  92,  30, 215,  23, 129],
        [166, 226,  95,  84,  85, 204, 202,  50,  87, 109]])

In [9]:
emb = model.embedding(X)

emb.shape

torch.Size([4, 10, 128])

In [10]:
y_pred = model(X)
y_pred.shape

torch.Size([4, 10, 290])

In [13]:
y_pred[0, 0].argmax()

tensor(60)