In [None]:
# !pip install tensorflow==2.12.0 tensorflow-probability==0.20.0
# !pip install d2l==1.0.3

# Autores:
**Gabriel Roberto (221020870) e Jean Soares (241033810)**

# IBMModel1 - Método estatístico

In [16]:
import requests
import tarfile
import re
import collections
import math
from random import choices
from d2l import tensorflow as d2l
from nltk import AlignedSent
from nltk.translate import IBMModel1
from nltk.translate.bleu_score import sentence_bleu

In [2]:
def baixar_arquivo(url, endereco):
    resposta = requests.get(url)
    if resposta.status_code == requests.codes.OK:
        with open(endereco, 'wb') as novo_arquivo:
                novo_arquivo.write(resposta.content)
        print("Download finalizado. Arquivo salvo em: {}".format(endereco))
    else:
        resposta.raise_for_status()

In [3]:
URL_arquivo = 'https://www.statmt.org/europarl/v7/pt-en.tgz'

baixar_arquivo(URL_arquivo, './raw_dataset.tgz')

Download finalizado. Arquivo salvo em: ./raw_dataset.tgz


In [4]:
# Caminho para o arquivo .tgz
caminho_arquivo = './raw_dataset.tgz'

# Abrir o arquivo .tgz
with tarfile.open(caminho_arquivo, 'r:gz') as arquivo_tgz:
    # Extrair todos os arquivos do .tgz
    arquivo_tgz.extractall(path='./')

    # Listar os arquivos extraídos
    for membro in arquivo_tgz.getmembers():
        print(membro.name)

europarl-v7.pt-en.en
europarl-v7.pt-en.pt


## Leitura do Dataset

In [3]:
class ParallelCorpus(d2l.DataModule):
    """The Parallel Corpus Portuguese-English dataset."""
    def _download(self, fname):
        with open(fname) as f:
            return f.read()

In [4]:
data = ParallelCorpus()
raw_text_en = data._download('./europarl-v7.pt-en.en')
raw_text_en[:60]

'Resumption of the session\nI declare resumed the session of t'

In [8]:
raw_text_pt = data._download('./europarl-v7.pt-en.pt')
raw_text_pt[:60]

'Reinício da sessão\nDeclaro reaberta a sessão do Parlamento E'

## Pré-processamento e separação das sentenças

In [5]:
@d2l.add_to_class(ParallelCorpus)
def _preprocess(self, text):
    return re.sub(r'[^\w\s]', '', text).lower()

In [10]:
text_en = data._preprocess(raw_text_en).split('\n')

del raw_text_en
text_en[:10]

['resumption of the session',
 'i declare resumed the session of the european parliament adjourned on friday 17 december 1999 and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period',
 'although as you will have seen the dreaded millennium bug failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful',
 'you have requested a debate on this subject in the course of the next few days during this partsession',
 'in the meantime i should like to observe a minute s silence as a number of members have requested on behalf of all the victims concerned particularly those of the terrible storms in the various countries of the european union',
 'please rise then for this minute s silence',
 'the house rose and observed a minute s silence',
 'madam president on a point of order',
 'you will be aware from the press and television that there have been a number of bomb explosi

In [11]:
text_pt = data._preprocess(raw_text_pt).split('\n')

del raw_text_pt
text_pt[:10]

['reinício da sessão',
 'declaro reaberta a sessão do parlamento europeu que tinha sido interrompida na sextafeira 17 de dezembro último e renovo todos os meus votos esperando que tenham tido boas férias',
 'como puderam constatar o grande bug do ano 2000 não aconteceu em contrapartida os cidadãos de alguns dos nossos países foram vítimas de catástrofes naturais verdadeiramente terríveis',
 'os senhores manifestaram o desejo de se proceder a um debate sobre o assunto nos próximos dias durante este período de sessões',
 'entretanto gostaria  como também me foi pedido por um certo número de colegas  que observássemos um minuto de silêncio por todas as vítimas nomeadamente das tempestades nos diferentes países da união europeia que foram afectados',
 'convidoos a levantaremse para um minuto de silêncio',
 'o parlamento de pé guarda um minuto de silêncio',
 'senhora presidente intervenho para um ponto de ordem',
 'certamente que já tomou conhecimento pelas notícias transmitidas na imprensa

In [12]:
sentences_en = [sentence.split() for sentence in text_en]
sentences_pt = [sentence.split() for sentence in text_pt]
del text_en, text_pt

In [14]:
print(len(sentences_en), len(sentences_pt))

1960408 1960408


In [24]:
print(sentences_en[1960406])
print(sentences_pt[1960406])

['the', 'sitting', 'was', 'closed', 'at', '1050', 'am']
['a', 'sessão', 'é', 'suspensa', 'às', '10h50']


In [30]:
s_sent = []

for i in range(len(sentences_en)):
  if len(sentences_en[i]) == len(sentences_pt[i]):
    s_sent.append([sentences_en[i], sentences_pt[i]])

del sentences_en, sentences_pt
len(s_sent)

256335

In [31]:
s_sent[100]

[['we',
  'european',
  'socialists',
  'are',
  'in',
  'favour',
  'of',
  'a',
  'market',
  'economy',
  'with',
  'a',
  'social',
  'purpose'],
 ['nós',
  'socialistas',
  'europeus',
  'somos',
  'a',
  'favor',
  'de',
  'uma',
  'economia',
  'de',
  'mercado',
  'de',
  'finalidade',
  'social']]

In [48]:
source_sentences, target_sentences = zip(*choices(s_sent, k=10000))
source_sentences = list(source_sentences)
target_sentences = list(target_sentences)

In [49]:
print(len(source_sentences), len(target_sentences))

10000 10000


In [51]:
print(source_sentences[:2])
print(target_sentences[:2])

[['this', 'is', 'firstly', 'because', 'the', 'agreement', 'forms', 'part', 'of', 'a', 'broad', 'context', 'of', 'securitarian', 'measures', 'note', 'the', 'programme', 'of', 'illegal', 'phone', 'tapping', 'in', 'the', 'united', 'states', 'on', 'the', 'basis', 'of', 'the', 'socalled', 'fight', 'against', 'terrorism', 'which', 'has', 'been', 'a', 'pretext', 'for', 'unleashing', 'interference', 'and', 'aggression', 'on', 'countries', 'and', 'peoples', 'when', 'us', 'imperialist', 'interests', 'have', 'been', 'at', 'stake'], ['what', 'does', 'it', 'intend', 'to', 'do', 'about', 'this']]
[['desde', 'logo', 'porque', 'esse', 'acordo', 'se', 'integra', 'num', 'amplo', 'conjunto', 'de', 'medidas', 'de', 'cariz', 'securitário', 'refirase', 'o', 'programa', 'de', 'escutas', 'telefónicas', 'ilegais', 'nos', 'eua', 'que', 'tem', 'por', 'base', 'a', 'denominada', 'luta', 'contra', 'o', 'terrorismo', 'pretexto', 'para', 'o', 'desencadeamento', 'da', 'ingerência', 'e', 'da', 'agressão', 'a', 'países'

## Treinamento do modelo

In [53]:
bitext = []
for i in range(len(source_sentences)):
  bitext.append(AlignedSent(source_sentences[i], target_sentences[i]))

ibm_model = IBMModel1(bitext, 5)  # 5 iterations for training

In [66]:
print(round(ibm_model.translation_table['good']['bom'], 3))
print(round(ibm_model.translation_table['president']['presidente'], 3))
print(round(ibm_model.translation_table['parliament']['parlamento'], 3))
print(round(ibm_model.translation_table['european']['europeu'], 3))
print(round(ibm_model.translation_table['european'][None], 3))

0.798
0.763
0.92
0.882
0.001


## Avaliação

In [70]:
def translate_sentence(sentence, ibm_model):
    translated_sentence = []
    for word in sentence:
        if word in ibm_model.translation_table:
            # Escolher a palavra de destino com maior probabilidade
            best_translation = max(ibm_model.translation_table[word], key=ibm_model.translation_table[word].get)
            translated_sentence.append(best_translation)
        else:
            translated_sentence.append(word)  # Palavra desconhecida, mantém original
    return " ".join(translated_sentence)

In [94]:
reference = [['O', 'governo', 'é', 'corrupto']]  # Example reference translation
new_sentence = ['the', 'government', 'is',  'corrupt']
candidate = translate_sentence(new_sentence, ibm_model)  # Example machine output
score = sentence_bleu(reference, candidate)

print('Referência: ', ' '.join(reference[0]))
print('Tradução do modelo: ', candidate)
print('BLEU: ', score)

Referência:  O governo é corrupto
Tradução do modelo:  o governo é corrupto
BLEU:  8.614911585158347e-232


In [98]:
score = 0
reference = []
candidate = ''

for i in range(len(source_sentences)):
  reference = [target_sentences[i]]
  candidate = translate_sentence(source_sentences[i], ibm_model)
  score += sentence_bleu(reference, candidate)

mean_BLEU = score / len(source_sentences)

print('mean_score: ', mean_BLEU)

mean_score:  1.3813326996009179e-157


# Codificador-Decodificador

In [99]:
import tensorflow as tf

class Seq2SeqEncoder(d2l.Encoder):
    """The RNN encoder for sequence-to-sequence learning."""
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_size)
        self.rnn = d2l.GRU(num_hiddens, num_layers, dropout)

    def call(self, X, *args):
        # X shape: (batch_size, num_steps)
        embs = self.embedding(tf.transpose(X))
        # embs shape: (num_steps, batch_size, embed_size)
        outputs, state = self.rnn(embs)
        # outputs shape: (num_steps, batch_size, num_hiddens)
        # state shape: (num_layers, batch_size, num_hiddens)
        return outputs, state

In [100]:
vocab_size, embed_size, num_hiddens, num_layers = 10, 8, 16, 2
batch_size, num_steps = 4, 9
encoder = Seq2SeqEncoder(vocab_size, embed_size, num_hiddens, num_layers)
X = tf.zeros((batch_size, num_steps))
enc_outputs, enc_state = encoder(X)
d2l.check_shape(enc_outputs, (num_steps, batch_size, num_hiddens))

In [101]:
d2l.check_len(enc_state, num_layers)
d2l.check_shape(enc_state[0], (batch_size, num_hiddens))

In [102]:
class Seq2SeqDecoder(d2l.Decoder):
    """The RNN decoder for sequence to sequence learning."""
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_size)
        self.rnn = d2l.GRU(num_hiddens, num_layers, dropout)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def init_state(self, enc_all_outputs, *args):
        return enc_all_outputs

    def call(self, X, state):
        # X shape: (batch_size, num_steps)
        # embs shape: (num_steps, batch_size, embed_size)
        embs = self.embedding(tf.transpose(X))
        enc_output, hidden_state = state
        # context shape: (batch_size, num_hiddens)
        context = enc_output[-1]
        # Broadcast context to (num_steps, batch_size, num_hiddens)
        context = tf.tile(tf.expand_dims(context, 0), (embs.shape[0], 1, 1))
        # Concat at the feature dimension
        embs_and_context = tf.concat((embs, context), -1)
        outputs, hidden_state = self.rnn(embs_and_context, hidden_state)
        outputs = tf.transpose(self.dense(outputs), (1, 0, 2))
        # outputs shape: (batch_size, num_steps, vocab_size)
        # hidden_state shape: (num_layers, batch_size, num_hiddens)
        return outputs, [enc_output, hidden_state]

In [103]:
decoder = Seq2SeqDecoder(vocab_size, embed_size, num_hiddens, num_layers)
state = decoder.init_state(encoder(X))
dec_outputs, state = decoder(X, state)
d2l.check_shape(dec_outputs, (batch_size, num_steps, vocab_size))
d2l.check_len(state[1], num_layers)
d2l.check_shape(state[1][0], (batch_size, num_hiddens))

In [104]:
class Seq2Seq(d2l.EncoderDecoder):
    """The RNN encoder--decoder for sequence to sequence learning."""
    def __init__(self, encoder, decoder, tgt_pad, lr):
        super().__init__(encoder, decoder)
        self.save_hyperparameters()

    def validation_step(self, batch):
        Y_hat = self(*batch[:-1])
        self.plot('loss', self.loss(Y_hat, batch[-1]), train=False)

    def configure_optimizers(self):
        # Adam optimizer is used here
        return tf.keras.optimizers.Adam(learning_rate=self.lr)

In [111]:
@d2l.add_to_class(Seq2Seq)
def loss(self, Y_hat, Y):
    l = super(Seq2Seq, self).loss(Y_hat, Y, averaged=False)
    mask = tf.cast(tf.reshape(Y, -1) != self.tgt_pad, tf.float32)
    return tf.reduce_sum(l * mask) / tf.reduce_sum(mask)

In [125]:
tgt_vocab = d2l.Vocab(target_sentences, min_freq=2,
                      reserved_tokens=['<pad>', '<bos>', '<eos>'])

embed_size, num_hiddens, num_layers, dropout = 256, 256, 2, 0.2
with d2l.try_gpu():
    encoder = Seq2SeqEncoder(
        len(source_sentences), embed_size, num_hiddens, num_layers, dropout)
    decoder = Seq2SeqDecoder(
        len(target_sentences), embed_size, num_hiddens, num_layers, dropout)
    model = Seq2Seq(encoder, decoder, tgt_pad=tgt_vocab['<pad>'], lr=0.005)
trainer = d2l.Trainer(max_epochs=30, gradient_clip_val=1)
trainer.fit(model, data)

AttributeError: 'list' object has no attribute 'train_dataloader'

In [78]:
@d2l.add_to_class(d2l.EncoderDecoder)
def predict_step(self, batch, device, num_steps,
                 save_attention_weights=False):
    src, tgt, src_valid_len, _ = batch
    enc_all_outputs = self.encoder(src, src_valid_len, training=False)
    dec_state = self.decoder.init_state(enc_all_outputs, src_valid_len)
    outputs, attention_weights = [tf.expand_dims(tgt[:, 0], 1), ], []
    for _ in range(num_steps):
        Y, dec_state = self.decoder(outputs[-1], dec_state, training=False)
        outputs.append(tf.argmax(Y, 2))
        # Save attention weights (to be covered later)
        if save_attention_weights:
            attention_weights.append(self.decoder.attention_weights)
    return tf.concat(outputs[1:], 1), attention_weights

In [81]:
def bleu(pred_seq, label_seq, k):
    """Compute the BLEU."""
    pred_tokens, label_tokens = pred_seq.split(' '), label_seq.split(' ')
    len_pred, len_label = len(pred_tokens), len(label_tokens)
    score = math.exp(min(0, 1 - len_label / len_pred))
    for n in range(1, min(k, len_pred) + 1):
        num_matches, label_subs = 0, collections.defaultdict(int)
        for i in range(len_label - n + 1):
            label_subs[' '.join(label_tokens[i: i + n])] += 1
        for i in range(len_pred - n + 1):
            if label_subs[' '.join(pred_tokens[i: i + n])] > 0:
                num_matches += 1
                label_subs[' '.join(pred_tokens[i: i + n])] -= 1
        score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n))
    return score

In [82]:
engs = ['go .', 'i lost .', 'he\'s calm .', 'i\'m home .']
ports = ['vai .', 'eu perdi .', 'ele está calmo .', 'estou em casa .']
preds, _ = model.predict_step(
    data.build(engs, ports), d2l.try_gpu(), data.num_steps)
for en, pt, p in zip(engs, ports, preds):
    translation = []
    for token in data.tgt_vocab.to_tokens(p):
        if token == '<eos>':
            break
        translation.append(token)
    print(f'{en} => {translation}, bleu,'
          f'{sentence_bleu([pt.split()], translation, weights=(1, 0, 0, 0))}')

go . => ['<unk>', '!'], bleu,0
i lost . => ['je', 'suis', '<unk>', '.'], bleu,0.25
he's calm . => ['je', 'suis', '<unk>', '.'], bleu,0.25
i'm home . => ['je', 'suis', '<unk>', '.'], bleu,0.25


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
