In [116]:
import os
from importlib import reload

from lxml import etree

def read_tmx_lxml(file_path):
    """Stream-parse a TMX file and return index-aligned English/French segments.

    Every ``<tu>`` element is inspected for a ``<tuv xml:lang='en'>`` and a
    ``<tuv xml:lang='fr'>`` child, and the first text node of each ``<seg>``
    is collected.  A unit that lacks either side (or whose ``<seg>`` is
    empty) is skipped as a whole, so the two returned lists always have the
    same length and ``en[i]`` corresponds to ``fr[i]``.

    Args:
        file_path: Location of the TMX file; passed straight to
            ``etree.iterparse``.

    Returns:
        tuple[list[str], list[str]]: ``(english_segments, french_segments)``.
    """
    en_arr, fr_arr = [], []
    for _, elem in etree.iterparse(file_path, tag="tu"):  # Stream parse <tu> tags
        en = elem.xpath(".//tuv[@xml:lang='en']/seg/text()")
        fr = elem.xpath(".//tuv[@xml:lang='fr']/seg/text()")
        # Only keep complete pairs; indexing [0] on an empty result would
        # abort the whole parse with IndexError.
        if en and fr:
            # str() drops lxml's "smart string" wrapper, which otherwise keeps
            # a reference to the parent element and prevents it being freed.
            en_arr.append(str(en[0]))
            fr_arr.append(str(fr[0]))
        # clear() empties this element, but already-processed siblings remain
        # attached to the parent; delete them so memory stays bounded.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]
    return en_arr, fr_arr


# Build the corpus path from its components so it resolves on every OS
# (a backslash-separated literal only works on Windows).
english_sentences, french_sentences = read_tmx_lxml(
    os.path.join("data", "en-fr.tmx", "en-fr.tmx")
)

In [117]:
# Peek at the first ten English segments to sanity-check the parse.
english_sentences[:10] 

['VÉNUS lN FURS',
 'Go on!',
 '- Tell me first.',
 'Howthe hell should I know?',
 '- Shall I pick it up?',
 'Feel like going on a journey.',
 'Where to?',
 "I haven' t decided.",
 'Come along.',
 'We made a deal.']

In [118]:
# Both sides of the corpus should report the same number of segments.
(len(english_sentences), len(french_sentences))

(202180, 202180)

In [119]:
import tokenizer as tk
reload(tk)


# Run the project's tokenizer setup on both sides of the corpus; it returns
# the tokenizer (`sp`) and an embedding matrix for its vocabulary.
# NOTE(review): the recorded output of this cell reports 0 pieces found in
# French FastText and 76.1% of the vocabulary counted as "special tokens" —
# that looks like a lookup/counting problem inside `tk.complete_setup`; verify.
sp, embedding_matrix = tk.complete_setup(english_sentences, french_sentences)


# Report the vocabulary size of the tokenizer that was just built.
print(f"New model created with vocab size: {sp.get_piece_size()}")


Created training corpus: C:\Users\hasan\AppData\Local\Temp\tmph6cuniee.txt
Total sentences: 404360

SentencePiece model created:
  Model file: translation_bpe_v2.model
  Vocab file: translation_bpe_v2.vocab
Vocabulary size: 32000
Special tokens - PAD: 0, UNK: 1, BOS: 2, EOS: 3
English FastText loaded
French FastText loaded
Vocabulary size: 32000

Embedding matrix statistics:
  Shape: torch.Size([32000, 300])
  Found in English FastText: 4125 (12.9%)
  Found in French FastText: 0 (0.0%)
  Special tokens (random): 24357 (76.1%)
  Not found (random): 3518 (11.0%)
  Total FastText coverage: 12.9%
  Tokenizer: translation_bpe_v2.model
  Vocab size: 32000
  Embedding shape: torch.Size([32000, 300])
New model created with vocab size: 32000


In [None]:
import model_arch as ma
import torch
import torch.nn as nn
reload(ma)


# Instantiate the model from the embedding matrix and the tokenizer's vocabulary size.
model = ma.create_model(embedding_matrix, sp.get_piece_size())
# Prefer the GPU, but fall back to CPU so this cell does not crash on
# machines without CUDA.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')


# Smoke test before training: the output is expected to be gibberish, this
# only checks that the translate path runs end to end.
input_sentence = "Hello, how are you today?"
translation = model.translate(input_sentence, sp, max_length=50)
print(f"Translation: {translation}")

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Index 0 is excluded from the loss; the tokenizer setup output reports PAD: 0.
criterion = nn.CrossEntropyLoss(ignore_index=0)



Translation: technologie portion repar arrive�dezenfer tuant entendait portion fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers fiers


In [133]:
import data_loader as dl
reload(dl)

# Wrap the parallel corpus in a shuffled, batched loader using the trained tokenizer.
dataloader = dl.create_dataloader(
    english_sentences, french_sentences, sp,
    batch_size=16, max_length=64, shuffle=True,
)

# Print one batch (shapes, token ids, decoded text) as a sanity check.
dl.test_dataloader(dataloader, sp, num_batches=1)

# Number of batches per epoch.
len(dataloader)

Dataset created with 202180 sentence pairs

Batch 1:
Source batch shape: torch.Size([16, 15])
Target batch shape: torch.Size([16, 19])

First example:
Source tokens: [292, 392, 31957, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]...
Target tokens: [2, 292, 6723, 31940, 292, 680, 31957, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]...
Source text: - what?
Target text: - daniel. - quoi?


12637

In [112]:
import trainer as tr 
reload(tr)

num_epochs = 2
# Select the device by availability instead of hard-coding 'cuda', so the
# cell also runs on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hand everything to the project's training loop.
tr.train(model, dataloader, num_epochs, device, optimizer, criterion, sp)


BLEU Score: 0.0000
Epoch 0, Batch 0, Loss: 10.3803
Epoch 0, Batch 100, Loss: 7.0687
Epoch 0, Batch 200, Loss: 5.7968
Epoch 0, Batch 300, Loss: 5.2720
Epoch 0, Batch 400, Loss: 5.5267
Epoch 0, Batch 500, Loss: 5.8053
Epoch 0, Batch 600, Loss: 5.0420
Epoch 0, Batch 700, Loss: 5.8231
Epoch 0, Batch 800, Loss: 4.7605
Epoch 0, Batch 900, Loss: 5.0860
Epoch 0, Batch 1000, Loss: 4.3438
Epoch 0, Batch 1100, Loss: 4.6817
Epoch 0, Batch 1200, Loss: 4.8158
Epoch 0, Batch 1300, Loss: 4.3287
Epoch 0, Batch 1400, Loss: 4.6585
Epoch 0, Batch 1500, Loss: 4.8502
Epoch 0, Batch 1600, Loss: 4.2405
Epoch 0, Batch 1700, Loss: 4.1612
Epoch 0, Batch 1800, Loss: 3.7920
Epoch 0, Batch 1900, Loss: 4.8153
Epoch 0, Batch 2000, Loss: 4.4997
Epoch 0, Batch 2100, Loss: 4.6550
Epoch 0, Batch 2200, Loss: 4.4531
Epoch 0, Batch 2300, Loss: 4.1248
Epoch 0, Batch 2400, Loss: 4.5487
Epoch 0, Batch 2500, Loss: 4.5681
Epoch 0, Batch 2600, Loss: 4.5038
Epoch 0, Batch 2700, Loss: 4.4260
Epoch 0, Batch 2800, Loss: 4.4351
Epoch 

KeyboardInterrupt: 

In [93]:
# Persist model weights together with optimizer state so training can be resumed.
torch.save(
    {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    },
    'best_model.pt',
)

In [124]:
# Remap stored tensors to a device that exists on this machine; without
# map_location a checkpoint saved from CUDA fails to load on a CPU-only host.
checkpoint = torch.load(
    'best_model.pt',
    map_location='cuda' if torch.cuda.is_available() else 'cpu',
)
# Restore the weights and optimizer state saved under these two keys.
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [130]:
import bleu_tester as bt
reload(bt)

# Fetch the evaluation sentences and their reference translations from the helper module.
source_sentences, reference_sentences = bt.get_sentences()

# Translate the sources with the current model and score them against the references.
results = bt.calculate_bleu_score(model, sp, source_sentences, reference_sentences)

print(f"BLEU Score: {results['bleu_score']:.4f}")

# Show each hypothesis directly above its reference for eyeball comparison.
for i, (trans, ref) in enumerate(zip(results['translations'], results['references'])):
    print(f"{i+1}. Translation: {trans}", f"   Reference:  {ref}", sep="\n")

BLEU Score: 0.4109
1. Translation: comment ça va?
   Reference:  comment ça va?
2. Translation: comment t'appelles- tu?
   Reference:  comment tu t'appelles
3. Translation: où vas- tu?
   Reference:  bonjour, où vas-tu?
4. Translation: je ne vous comprends pas.
   Reference:  je suis désolé, je ne vous comprends pas
5. Translation: je suis malade.
   Reference:  je suis malade aujourd'hui
6. Translation: a quelle heure le train part le train?
   Reference:  à quelle heure part le train?
7. Translation: le chat dort sur le canapé.
   Reference:  le chat dort sur le canapé.
8. Translation: je t'aime très fort.
   Reference:  je t'aime beaucoup.
9. Translation: je ne suis pas sûr de pouvoir vous aider.
   Reference:  bien que je ne sois pas sûr de pouvoir vous aider.
10. Translation: j'essaierai de vous avoir bien d'être le meilleur pour vous.
   Reference:  je ferai de mon mieux pour vous aider.
11. Translation: aidez- moi à cette tâche.
   Reference:  s'il vous plaît, aidez-moi avec cet