In [146]:
from lxml import etree
from importlib import reload

def read_tmx_lxml(file_path):
    en_arr , fr_arr = [] , []
    for _, elem in etree.iterparse(file_path, tag="tu"):  # Stream parse <tu> tags
        en = elem.xpath(".//tuv[@xml:lang='en']/seg/text()")[0]
        fr = elem.xpath(".//tuv[@xml:lang='fr']/seg/text()")[0]
        en_arr.append(en)
        fr_arr.append(fr)
        elem.clear()  # Free memory
    return en_arr , fr_arr


english_sentences , french_sentences = read_tmx_lxml(r"data\en-fr.tmx\en-fr.tmx")

In [147]:
english_sentences[:10] 

['VÉNUS lN FURS',
 'Go on!',
 '- Tell me first.',
 'Howthe hell should I know?',
 '- Shall I pick it up?',
 'Feel like going on a journey.',
 'Where to?',
 "I haven' t decided.",
 'Come along.',
 'We made a deal.']

In [148]:
len(english_sentences) , len(french_sentences)

(202180, 202180)

In [None]:
import random

def modify_dataset_punctuation(english_sentences, french_sentences, seed=42):
    random.seed(seed)  # For reproducibility
    
    # Create indices for half the dataset
    total_sentences = len(english_sentences)
    indices_to_modify = random.sample(range(total_sentences), total_sentences // 2)
    
    print(f"Total sentences: {total_sentences:,}")
    print(f"Modifying punctuation for: {len(indices_to_modify):,} sentences")
    
    # Modify the selected sentences
    english_modified = english_sentences.copy()
    french_modified = french_sentences.copy()
    
    for idx in indices_to_modify:
        # Remove ending punctuation from both languages
        english_modified[idx] = english_sentences[idx].rstrip('.!?;:').strip()
        french_modified[idx] = french_sentences[idx].rstrip('.!?;:').strip()
    
    return english_modified, french_modified, indices_to_modify

In [150]:
english_modified , french_modified , _= modify_dataset_punctuation(english_sentences, french_sentences,42) 

Total sentences: 202,180
Modifying punctuation for: 101,090 sentences


In [151]:
len(english_modified)

202180

In [152]:
from sklearn.model_selection import train_test_split

train_english, test_english, train_french, test_french = train_test_split(
   english_sentences, french_sentences, test_size=0.1, random_state=42
)

print(f"Training: {len(train_english):,} pairs")
print(f"Testing: {len(test_english):,} pairs")

Training: 181,962 pairs
Testing: 20,218 pairs


In [153]:
import tokenizer as tk
reload(tk)


sp, embedding_matrix = tk.complete_setup(train_english, train_french)


print(f"New model created with vocab size: {sp.get_piece_size()}")


Created training corpus: C:\Users\hasan\AppData\Local\Temp\tmpud8e4f20.txt
Total sentences: 363924

SentencePiece model created:
  Model file: translation_bpe.model
  Vocab file: translation_bpe.vocab
Vocabulary size: 32000
Special tokens - PAD: 0, UNK: 1, BOS: 2, EOS: 3
English FastText loaded
French FastText loaded
Vocabulary size: 32000

Embedding matrix statistics:
  Shape: torch.Size([32000, 300])
  Found in English FastText: 4262 (13.3%)
  Found in French FastText: 0 (0.0%)
  Special tokens (random): 24059 (75.2%)
  Not found (random): 3679 (11.5%)
  Total FastText coverage: 13.3%
  Tokenizer: translation_bpe.model
  Vocab size: 32000
  Embedding shape: torch.Size([32000, 300])
New model created with vocab size: 32000


In [165]:
import model_arch as ma
import torch
import torch.nn as nn
reload(ma)


model = ma.create_model(embedding_matrix, sp.get_piece_size())
model = model.to('cuda')


input_sentence = "Hello, how are you today?"
translation = model.translate(input_sentence, sp, max_length=50)
print(f"Translation: {translation}")

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=0) 



Translation: lax lax lax lax lax lax lax lax lax lax lax hostiles hostiles hostiles hostiles hostiles hostiles hostiles hostiles hostiles hostiles hostiles hostiles hostiles hostiles hostiles hostiles hostiles hostiles lax lax lax lax lax lax lax lax lax hostiles hostiles hostiles hostiles hostiles hostiles lax lax lax lax lax lax


In [166]:
import data_loader as dl
reload(dl)

train_loader = dl.create_dataloader(
    train_english, 
    train_french, 
    sp,  
    batch_size=16,
    max_length=64, 
    shuffle=True
)

dl.test_dataloader(train_loader, sp, num_batches=1)

test_loader = dl.create_dataloader(
    test_english, 
    test_french, 
    sp,  
    batch_size=16,
    max_length=64, 
    shuffle=False
)

dl.test_dataloader(test_loader , sp , num_batches = 1)

len(train_loader)
len(test_loader)

Dataset created with 181962 sentence pairs

Batch 1:
Source batch shape: torch.Size([16, 22])
Target batch shape: torch.Size([16, 18])

First example:
Source tokens: [353, 741, 290, 4343, 358, 23342, 451, 634, 2870, 1396, 337, 31940, 3, 0, 0, 0, 0, 0, 0, 0]...
Target tokens: [2, 1129, 274, 31947, 7319, 25872, 2085, 546, 536, 1673, 2870, 31940, 3, 0, 0, 0, 0, 0]...
Source text: and then the electrician with his machine came in.
Target text: puis l'électricien arrive avec sa petite machine.
Dataset created with 20218 sentence pairs

Batch 1:
Source batch shape: torch.Size([16, 23])
Target batch shape: torch.Size([16, 27])

First example:
Source tokens: [480, 448, 945, 1635, 31940, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]...
Target tokens: [2, 426, 13845, 31960, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]...
Source text: they are savages.
Target text: des sauvages!


1264

In [None]:
import trainer as tr 
reload(tr)

num_epochs = 5
device = 'cuda'

tr.train(model , train_loader ,test_loader,  num_epochs, device, optimizer, criterion , sp)


Epoch 1/5
Batch 0, Training Loss: 2.4290
Batch 1000, Training Loss: 2.9564
Batch 2000, Training Loss: 2.4287
Batch 3000, Training Loss: 2.7509
Batch 4000, Training Loss: 2.9753
Batch 5000, Training Loss: 2.9637
Batch 6000, Training Loss: 2.3331


In [170]:
torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, 'best_model.pt')

In [None]:
checkpoint = torch.load('best_model.pt')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [169]:
import bleu_tester as bt
reload(bt)

source_sentences , reference_sentences = bt.get_sentences()

results = bt.calculate_bleu_score(model, sp, source_sentences, reference_sentences)

print(f"BLEU Score: {results['bleu_score']:.4f}")

for i, (trans, ref) in enumerate(zip(results['translations'], results['references'])):
    print(f"{i+1}. Translation: {trans}")
    print(f"   Reference:  {ref}")

BLEU Score: 0.2155
1. Translation: comment allez- vous?
   Reference:  comment ça va?
2. Translation: comment s'appelle- t- il?
   Reference:  comment tu t'appelles
3. Translation: bonjour, où vas- tu?
   Reference:  bonjour, où vas-tu?
4. Translation: je suis désolé, je ne comprends pas.
   Reference:  je suis désolé, je ne vous comprends pas
5. Translation: je suis malade aujourd'hui.
   Reference:  je suis malade aujourd'hui
6. Translation: quelle heure fait la partie de la part?
   Reference:  à quelle heure part le train?
7. Translation: le chat est sur le toit.
   Reference:  le chat dort sur le canapé.
8. Translation: je t'aime beaucoup.
   Reference:  je t'aime beaucoup.
9. Translation: mais je ne suis pas sûr si je peux vous aider.
   Reference:  bien que je ne sois pas sûr de pouvoir vous aider.
10. Translation: je vais essayer de vous avoir dérang.
   Reference:  je ferai de mon mieux pour vous aider.
11. Translation: aidez- moi avec cette attaque.
   Reference:  s'il vous p