In [1]:
!pip install torch torchtext spacy tqdm
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm


Collecting torchtext
  Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.18.0
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m150.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import math
from tqdm import tqdm
import spacy


In [4]:
# Load the small English and French spaCy pipelines once at module level.
# The tokenize_* helpers defined below only use the `.tokenizer` attribute of these objects.
spacy_en = spacy.load("en_core_web_sm")
spacy_fr = spacy.load("fr_core_news_sm")

def tokenize_en(text):
    """Split English text with the spaCy English tokenizer and return lower-cased token strings."""
    lowered_tokens = []
    for token in spacy_en.tokenizer(text):
        lowered_tokens.append(token.text.lower())
    return lowered_tokens

def tokenize_fr(text):
    """Split French text with the spaCy French tokenizer and return lower-cased token strings."""
    french_doc = spacy_fr.tokenizer(text)
    return list(map(lambda token: token.text.lower(), french_doc))


In [5]:
!wget https://www.manythings.org/anki/fra-eng.zip
!unzip fra-eng.zip


--2025-12-27 11:17:41--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8186368 (7.8M) [application/zip]
Saving to: ‘fra-eng.zip’


2025-12-27 11:17:41 (27.7 MB/s) - ‘fra-eng.zip’ saved [8186368/8186368]

Archive:  fra-eng.zip
  inflating: _about.txt              
  inflating: fra.txt                 


In [6]:
# Upper bound on the number of sentence pairs kept; a subset keeps training fast.
MAX_PAIRS = 30000

pairs = []

with open("fra.txt", encoding="utf-8") as f:
    for line in f:
        # Lines are tab-separated: English text, French text, then any further fields,
        # which are ignored. Tolerate lines with a different field count instead of
        # crashing on a strict three-way unpack.
        fields = line.strip().split("\t")
        if len(fields) < 2:
            continue  # blank or malformed line: nothing to pair up
        pairs.append((fields[0], fields[1]))
        # Stop reading as soon as the subset is full rather than loading the whole file.
        if len(pairs) >= MAX_PAIRS:
            break

assert pairs, "fra.txt contained no usable sentence pairs"
print(pairs[0])


('Go.', 'Va !')


In [7]:
from collections import Counter

def build_vocab(sentences, tokenizer, min_freq=2):
    """Build a token -> integer id mapping from a collection of sentences.

    Args:
        sentences: iterable of raw sentence strings.
        tokenizer: callable mapping a sentence to an iterable of token strings.
        min_freq: minimum number of occurrences a token needs to receive its own id.

    Returns:
        dict whose first four entries are the reserved tokens ``<pad>``=0, ``<sos>``=1,
        ``<eos>``=2, ``<unk>``=3, followed by every sufficiently frequent token in
        first-seen order with consecutive ids starting at 4.
    """
    counter = Counter()
    for sent in sentences:
        counter.update(tokenizer(sent))

    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
    for word, freq in counter.items():
        # Skip tokens that collide with a reserved entry: re-assigning one would move the
        # reserved id and hand the same index to the next new word as well.
        if freq >= min_freq and word not in vocab:
            vocab[word] = len(vocab)

    return vocab

# Separate the aligned pairs into their English and French sides, then index each side.
english_sentences = [eng_text for eng_text, _ in pairs]
french_sentences = [fr_text for _, fr_text in pairs]

eng_vocab = build_vocab(english_sentences, tokenize_en)
fr_vocab = build_vocab(french_sentences, tokenize_fr)

# Report the two vocabulary sizes (English first, then French).
vocab_sizes = (len(eng_vocab), len(fr_vocab))
print(*vocab_sizes)


3044 4604


In [8]:
def encode(sentence, vocab, tokenizer):
    """Convert a sentence to a list of vocabulary ids.

    Tokens missing from ``vocab`` are replaced by the id stored under ``"<unk>"``.
    No start/end markers are added here.
    """
    ids = []
    for token in tokenizer(sentence):
        ids.append(vocab.get(token, vocab["<unk>"]))
    return ids

# Look up the boundary-marker ids once for each language.
en_sos, en_eos = eng_vocab["<sos>"], eng_vocab["<eos>"]
fr_sos, fr_eos = fr_vocab["<sos>"], fr_vocab["<eos>"]

# One (source_ids, target_ids) tuple per sentence pair, each wrapped in <sos> ... <eos>.
data = [
    (
        [en_sos, *encode(eng, eng_vocab, tokenize_en), en_eos],
        [fr_sos, *encode(fr, fr_vocab, tokenize_fr), fr_eos],
    )
    for eng, fr in pairs
]


In [9]:
# `encode` and `data` are already defined by the preceding cell. Rather than redefining the
# function and tokenizing the whole corpus a second time, this cell only checks the result.
assert len(data) == len(pairs), "every sentence pair should yield exactly one encoded example"
assert all(
    src_ids[0] == eng_vocab["<sos>"] and src_ids[-1] == eng_vocab["<eos>"]
    for src_ids, _ in data
), "source sequences must be wrapped in <sos> ... <eos>"
assert all(
    trg_ids[0] == fr_vocab["<sos>"] and trg_ids[-1] == fr_vocab["<eos>"]
    for _, trg_ids in data
), "target sequences must be wrapped in <sos> ... <eos>"


In [10]:
def _pad_to_longest(seqs, pad_idx):
    """Right-pad every id sequence in ``seqs`` to the longest one and stack them into a LongTensor."""
    width = max(len(seq) for seq in seqs)
    padded = [list(seq) + [pad_idx] * (width - len(seq)) for seq in seqs]
    return torch.tensor(padded, dtype=torch.long)


def pad_batch(batch, pad_idx=0):
    """Collate a list of (source_ids, target_ids) pairs into two padded tensors.

    Args:
        batch: non-empty sequence of ``(src, trg)`` pairs, each side a sequence of int ids.
        pad_idx: id used to fill shorter sequences; defaults to 0.

    Returns:
        ``(src_tensor, trg_tensor)``, each of shape (batch, longest length on that side)
        and dtype ``torch.long``. Source and target are padded independently.
    """
    src, trg = zip(*batch)
    return _pad_to_longest(src, pad_idx), _pad_to_longest(trg, pad_idx)

# Number of sentence pairs per optimisation step.
BATCH_SIZE = 64

# Reshuffle the data on every pass and pad each mini-batch with pad_batch.
loader_options = dict(batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_batch)
train_loader = torch.utils.data.DataLoader(data, **loader_options)


In [11]:
class Encoder(nn.Module):
    """Embeds source token ids and runs them through a bidirectional, batch-first LSTM."""

    def __init__(self, vocab_size, emb_dim, hid_dim, n_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, src):
        """Encode a batch of id sequences.

        Returns the per-position LSTM outputs together with the final hidden and cell
        states exactly as produced by ``nn.LSTM`` (no merging of directions or layers).
        """
        token_vectors = self.embedding(src)
        per_step_states, final_state = self.rnn(token_vectors)
        final_hidden, final_cell = final_state
        return per_step_states, final_hidden, final_cell


In [12]:
class Attention(nn.Module):
    """Additive attention: scores each encoder position against a decoder state.

    The concatenated [state ; encoder output] feature fed to ``attn`` must have width
    ``3 * hid_dim``.
    """

    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(3 * hid_dim, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over dimension 1 of ``encoder_outputs`` (each row sums to 1)."""
        n_positions = encoder_outputs.size(1)
        # Copy the decoder state once per source position so it can be concatenated.
        query = hidden.unsqueeze(1).repeat(1, n_positions, 1)
        features = torch.cat((query, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(features))
        scores = self.v(energy).squeeze(2)
        return scores.softmax(dim=1)


In [13]:
class Decoder(nn.Module):
    """Single-layer LSTM decoder that consumes one target token per call, with attention."""

    def __init__(self, vocab_size, emb_dim, hid_dim, attention):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.attention = attention
        # LSTM input is the token embedding concatenated with the attention context.
        self.rnn = nn.LSTM(emb_dim + 2 * hid_dim, hid_dim, batch_first=True)
        # Output projection sees the LSTM output concatenated with the context.
        self.fc = nn.Linear(3 * hid_dim, vocab_size)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Run one decoding step.

        Args:
            input: 1-D tensor of current token ids, one per batch element.
            hidden, cell: recurrent state; only the first slice along dim 0 seeds the LSTM.
            encoder_outputs: encoder states attended over.

        Returns:
            ``(logits, hidden, cell)`` where the states are those produced by this step.
        """
        step_embedding = self.embedding(input.unsqueeze(1))

        # NOTE(review): the attention query is the LAST slice of `hidden`, while the LSTM
        # below is seeded with the FIRST slice. They coincide once `hidden` has a single
        # slice (every step after the first); on the first step they differ if the
        # encoder returned several slices -- confirm this is intended.
        weights = self.attention(hidden[-1], encoder_outputs)
        context = weights.unsqueeze(1).bmm(encoder_outputs)

        lstm_in = torch.cat((step_embedding, context), dim=2)
        initial_state = (hidden[:1], cell[:1])
        lstm_out, (hidden, cell) = self.rnn(lstm_in, initial_state)

        features = torch.cat((lstm_out.squeeze(1), context.squeeze(1)), dim=1)
        return self.fc(features), hidden, cell


In [18]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder together for training with teacher forcing."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` position by position and collect the logits.

        Args:
            src: batch of source id sequences passed to the encoder.
            trg: batch of target id sequences, shape (batch, trg_len); column 0 is fed
                as the first decoder input.
            teacher_forcing_ratio: probability, drawn once per step for the whole batch,
                of feeding the ground-truth token instead of the model's own prediction.

        Returns:
            Tensor of shape (batch, trg_len, vocab_size) on ``src``'s device. Position 0
            is never written and stays all zeros.
        """
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        vocab_size = self.decoder.fc.out_features

        # Allocate directly on the target device instead of building the buffer on the
        # CPU and copying it over on every forward pass.
        outputs = torch.zeros(batch_size, trg_len, vocab_size, device=src.device)

        encoder_outputs, hidden, cell = self.encoder(src)

        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(
                input, hidden, cell, encoder_outputs
            )
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            input = trg[:, t] if teacher_force else output.argmax(1)

        return outputs


In [19]:
# Hyperparameters. The encoder, attention and decoder must all be built with the same
# hidden size, so it is named once here instead of being repeated as a literal.
SEED = 42
EMB_DIM = 256
HID_DIM = 512
ENC_LAYERS = 2

# Seed the RNGs used for weight initialisation, DataLoader shuffling (torch) and the
# teacher-forcing draw (random) so a fresh Run All starts from the same state.
# NOTE(review): this does not by itself guarantee bit-identical results on GPU.
random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(len(eng_vocab), EMB_DIM, HID_DIM, ENC_LAYERS)
attention = Attention(HID_DIM)
decoder = Decoder(len(fr_vocab), EMB_DIM, HID_DIM, attention)
model = Seq2Seq(encoder, decoder).to(device)

optimizer = optim.Adam(model.parameters())
# Padded target positions must not contribute to the loss; take the id from the
# vocabulary rather than hard-coding it.
criterion = nn.CrossEntropyLoss(ignore_index=fr_vocab["<pad>"])


In [20]:
def train(model, loader):
    """Run one optimisation pass over ``loader`` and return the mean per-batch loss.

    Relies on the module-level ``device``, ``optimizer`` and ``criterion``.
    """
    model.train()
    running_loss = 0

    for src_batch, trg_batch in tqdm(loader):
        src_batch = src_batch.to(device)
        trg_batch = trg_batch.to(device)

        optimizer.zero_grad()
        logits = model(src_batch, trg_batch)

        # Drop position 0 on both sides, then flatten batch and time for the loss.
        vocab_size = logits.shape[-1]
        flat_logits = logits[:, 1:].reshape(-1, vocab_size)
        flat_targets = trg_batch[:, 1:].reshape(-1)
        batch_loss = criterion(flat_logits, flat_targets)

        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

    return running_loss / len(loader)


In [25]:
# Ten passes over the training data, reporting the mean batch loss after each one.
for epoch in range(1, 11):
    loss = train(model, train_loader)
    print(f"Epoch {epoch} Loss: {loss:.4f}")


100%|██████████| 469/469 [00:25<00:00, 18.58it/s]


Epoch 1 Loss: 0.6836


100%|██████████| 469/469 [00:25<00:00, 18.73it/s]


Epoch 2 Loss: 0.6220


100%|██████████| 469/469 [00:26<00:00, 17.99it/s]


Epoch 3 Loss: 0.5885


100%|██████████| 469/469 [00:24<00:00, 18.78it/s]


Epoch 4 Loss: 0.5783


100%|██████████| 469/469 [00:25<00:00, 18.29it/s]


Epoch 5 Loss: 0.5513


100%|██████████| 469/469 [00:24<00:00, 18.80it/s]


Epoch 6 Loss: 0.5448


100%|██████████| 469/469 [00:24<00:00, 18.77it/s]


Epoch 7 Loss: 0.5239


100%|██████████| 469/469 [00:25<00:00, 18.70it/s]


Epoch 8 Loss: 0.5157


100%|██████████| 469/469 [00:25<00:00, 18.74it/s]


Epoch 9 Loss: 0.5113


100%|██████████| 469/469 [00:25<00:00, 18.72it/s]

Epoch 10 Loss: 0.4945





In [26]:
def translate(sentence, max_len=20):
    """Greedily translate an English sentence with the module-level ``model``.

    Args:
        sentence: raw English text; it is tokenized and wrapped in <sos>/<eos> here.
        max_len: maximum number of tokens to generate before giving up on <eos>.

    Returns:
        The predicted French tokens joined by single spaces (the <eos> marker excluded).

    Side effect: puts ``model`` into eval mode.
    """
    model.eval()
    token_ids = [eng_vocab["<sos>"]] + encode(sentence, eng_vocab, tokenize_en) + [eng_vocab["<eos>"]]
    src = torch.tensor(token_ids, device=device).unsqueeze(0)

    eos_id = fr_vocab["<eos>"]
    result = []

    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(src)
        step_input = torch.tensor([fr_vocab["<sos>"]], device=device)

        for _ in range(max_len):
            output, hidden, cell = model.decoder(step_input, hidden, cell, encoder_outputs)
            # Reuse the argmax tensor as the next input instead of rebuilding a tensor
            # from a Python int on every step.
            step_input = output.argmax(1)
            pred = step_input.item()
            if pred == eos_id:
                break
            result.append(pred)

    inv_vocab = {idx: word for word, idx in fr_vocab.items()}
    return " ".join(inv_vocab[i] for i in result)


In [28]:
# Spot-check the trained model on a few short English sentences.
for english_sentence in ("i am a student", "how are you", "this is a good project"):
    print(translate(english_sentence))


je suis étudiant .
ça va ?
c' est une bonne <unk> .


In [29]:
# Destination file for the checkpoint.
MODEL_PATH = "nmt_seq2seq_attention.pth"

# Bundle the encoder and decoder weights together with both vocabularies.
checkpoint = dict(
    encoder_state_dict=model.encoder.state_dict(),
    decoder_state_dict=model.decoder.state_dict(),
    eng_vocab=eng_vocab,
    fr_vocab=fr_vocab,
)
torch.save(checkpoint, MODEL_PATH)

print("Model saved successfully")


Model saved successfully


In [30]:
# Offer the checkpoint as a browser download when running in Google Colab.
# Outside Colab the google.colab package is unavailable, so report where the file is
# instead of failing the cell with an ImportError.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Colab; checkpoint is available at {MODEL_PATH}")
else:
    files.download(MODEL_PATH)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>