In [46]:
!pip install -U torchtext==0.6   


Defaulting to user installation because normal site-packages is not writeable
Collecting torchtext==0.6
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 KB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.18.0
    Uninstalling torchtext-0.18.0:
      Successfully uninstalled torchtext-0.18.0
Successfully installed torchtext-0.6.0


In [53]:
import transformers
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import spacy
import stanza
from torchtext.data import Field

In [None]:
# Disabled Colab / `datasets` setup lines, kept for reference only.
# # !pip install transformers datasets
# # from datasets import load_dataset
# from google.colab import drive
# from google.colab import files
# uploaded = files.upload()
# Read the training CSV from a path relative to the working directory.
data = pd.read_csv("data/train.csv")
# Last expression of the cell, so the column index is displayed.
data.columns

In [None]:
# Pull out the two parallel text columns (Wolof source, French target).
wolof = data["WOLOF"]
francais = data["FRENCH"]
# Keep the preview as the cell's LAST expression: a bare expression in the
# middle of a cell is evaluated and thrown away, so it was never displayed.
data.head()


In [None]:
# Drop trailing newlines from every sentence. Only the Wolof side is
# lowercased here; the French text keeps its original casing.
wolof_sentences = list(map(str.lower, (raw.rstrip('\n') for raw in wolof)))
francais_sentences = list(map(lambda raw: raw.rstrip('\n'), francais))

In [None]:
# Preview the first three cleaned Wolof sentences.
wolof_sentences[:3]

In [None]:
# Preview the first three cleaned French sentences.
francais_sentences[:3]

In [32]:
# Load spaCy's small French pipeline; tokenize_french() below calls it.
spacy_frenc = spacy.load("fr_core_news_sm")
# Fetch the default Stanza resources for Wolof ('wo') before building the pipeline.
stanza.download('wo')

# Initialize the Stanza processing pipeline with the tokenize processor
stanza_wolof = stanza.Pipeline(lang='wo', processors='tokenize')


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-25 18:38:32 INFO: Downloaded file to /home/lahad/stanza_resources/resources.json
2024-04-25 18:38:32 INFO: Downloading default packages for language: wo (Wolof) ...
2024-04-25 18:38:33 INFO: File exists: /home/lahad/stanza_resources/wo/default.zip
2024-04-25 18:38:33 INFO: Finished downloading models and saved to /home/lahad/stanza_resources
2024-04-25 18:38:33 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-25 18:38:33 INFO: Downloaded file to /home/lahad/stanza_resources/resources.json
2024-04-25 18:38:33 INFO: Loading these models for language: wo (Wolof):
| Processor | Package |
-----------------------
| tokenize  | wtb     |
| mwt       | wtb     |

2024-04-25 18:38:33 INFO: Using device: cpu
2024-04-25 18:38:33 INFO: Loading: tokenize
2024-04-25 18:38:33 INFO: Loading: mwt
2024-04-25 18:38:33 INFO: Done loading processors!


In [34]:
def tokenize_wolof(text):
    """Split Wolof ``text`` into a flat list of token strings.

    Runs the module-level ``stanza_wolof`` pipeline on ``text`` and collects
    ``token.text`` for every token of every sentence, in document order.

    Args:
        text: The raw string to tokenize.

    Returns:
        list: Token texts, sentences concatenated in order.
    """
    parsed = stanza_wolof(text)
    pieces = []
    for sent in parsed.sentences:
        pieces.extend(tok.text for tok in sent.tokens)
    return pieces


# Sample sentence for trying the tokenizer by hand.
text = "Dafa am solo motax. ndax dinga dém si biir?"
# tokenize_wolof(text)

['Dafa', 'am', 'solo', 'motax', '.', 'ndax', 'dinga', 'dém', 'si', 'biir', '?']

In [37]:
def tokenize_french(text):
    """Split French ``text`` into a list of token strings.

    Runs the module-level ``spacy_frenc`` pipeline on ``text`` and returns the
    ``.text`` of each token it yields, in order.

    Args:
        text: The raw string to tokenize.

    Returns:
        list: Token texts in document order.
    """
    pieces = []
    for tok in spacy_frenc(text):
        pieces.append(tok.text)
    return pieces


# Sample sentence for trying the tokenizer by hand.
text = "Je suis une étudiante. Est-ce que tu viens avec moi?"
# tokenize_french(text)

In [52]:
# Field objects describing how each side of the corpus is tokenized, lowercased
# and wrapped with <sos>/<eos> markers.
# NOTE: `wolof` named the raw pandas column in an earlier cell; from this cell
# on it refers to the Field, so the sentence-cleaning cell cannot be re-run
# after this one without reloading the column.
wolof = Field(tokenize=tokenize_wolof, lower=True, init_token="<sos>", eos_token="<eos>")
french = Field(tokenize=tokenize_french, lower=True, init_token="<sos>", eos_token="<eos>")

# build_vocab() counts the items of every example it receives. Given raw
# strings it iterates them character by character and produces a
# character-level vocabulary. Run each sentence through the Field's own
# preprocess() (tokenize + lowercase) first so that tokens are counted.
wolof_tokenized = [wolof.preprocess(sentence) for sentence in wolof_sentences]
french_tokenized = [french.preprocess(sentence) for sentence in francais_sentences]
wolof.build_vocab(wolof_tokenized, max_size=10000, min_freq=2)
french.build_vocab(french_tokenized, max_size=10000, min_freq=2)

In [None]:
class Transformer(nn.Module):
    """Sequence-to-sequence Transformer with learned positional embeddings.

    Wraps ``nn.Transformer`` with source/target token embeddings, learned
    position embeddings and a linear projection onto the target vocabulary.
    Inputs are sequence-first: ``src`` is unpacked as ``(src_len, batch)`` and
    ``trg`` as ``(trg_len, batch)``.

    Args:
        embedding_size: Embedding width, also used as ``d_model``.
        src_vocab_size: Number of rows in the source token embedding.
        trg_vocab_size: Number of rows in the target token embedding and the
            width of the output projection.
        src_pad_idx: Index of the padding token in the source vocabulary.
        num_heads: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        forward_expansion: Passed to ``nn.Transformer`` as ``dim_feedforward``
            (an absolute hidden width, not a multiplier).
        dropout: Dropout probability, used both inside ``nn.Transformer`` and
            on the summed token + position embeddings.
        max_length: Number of rows in the position embedding tables; longer
            sequences would index past the end of those tables.
        device: Device on which position indices and masks are created.
    """

    def __init__(self,
                 embedding_size,
                 src_vocab_size,
                 trg_vocab_size,
                 src_pad_idx,
                 num_heads,
                 num_encoder_layers,
                 num_decoder_layers,
                 forward_expansion,
                 dropout,
                 max_length,
                 device):
        super().__init__()
        # Token and learned position embeddings for the source side.
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_length, embedding_size)
        # Token and learned position embeddings for the target side.
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_length, embedding_size)
        self.device = device
        # Keyword arguments make the mapping onto nn.Transformer explicit.
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=forward_expansion,
            dropout=dropout,
        )
        # Projects decoder states onto target-vocabulary logits.
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        """Return a boolean padding mask for ``src``.

        ``src`` is transposed from ``(src_len, batch)`` to ``(batch, src_len)``;
        entries equal to ``src_pad_idx`` are ``True``.
        """
        src_mask = src.transpose(0, 1) == self.src_pad_idx
        return src_mask.to(self.device)

    def forward(self, src, trg):
        """Compute target-vocabulary logits for every target position.

        Args:
            src: Source token indices, unpacked as ``(src_len, batch)``.
            trg: Target token indices, unpacked as ``(trg_len, batch)``.

        Returns:
            The output of ``fc_out`` applied to the transformer output, i.e.
            logits whose last dimension is ``trg_vocab_size``.
        """
        src_seq_len, batch_size = src.shape
        trg_seq_len, batch_size = trg.shape

        # Position indices 0..len-1, repeated for every batch column and
        # created directly on the target device.
        src_positions = (
            torch.arange(0, src_seq_len, device=self.device)
            .unsqueeze(1)
            .expand(src_seq_len, batch_size)
        )
        trg_positions = (
            torch.arange(0, trg_seq_len, device=self.device)
            .unsqueeze(1)
            .expand(trg_seq_len, batch_size)
        )

        embed_src = self.dropout(
            self.src_word_embedding(src) + self.src_position_embedding(src_positions)
        )
        embed_trg = self.dropout(
            self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions)
        )

        src_padding_mask = self.make_src_mask(src)
        # Mask that stops each target position from attending to later ones.
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_len).to(self.device)

        # NOTE(review): no memory_key_padding_mask is passed, so the decoder can
        # still attend to encoder outputs at padded source positions - confirm
        # whether that is intended.
        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        # Previously the method stopped here and implicitly returned None;
        # project onto the target vocabulary and return the logits.
        out = self.fc_out(out)
        return out



