In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/language-translation-englishfrench/eng_-french.csv'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install nltk==3.6.5

Collecting nltk==3.6.5
  Downloading nltk-3.6.5-py3-none-any.whl.metadata (3.0 kB)
Downloading nltk-3.6.5-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.4
    Uninstalling nltk-3.2.4:
      Successfully uninstalled nltk-3.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.6.5 which is incompatible.
textblob 0.18.0.post0 requires nltk>=3.8, but you have nltk 3.6.5 which is incompatible.[0m[31m
[0mSuccessfully installed nltk-3.6.5


In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
import string
import re
from collections import Counter,OrderedDict
from tqdm.auto import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.lm import Vocabulary
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import random

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [5]:
df = pd.read_csv('/kaggle/input/language-translation-englishfrench/eng_-french.csv')
df.columns = ["eng","fre"]
df.head()

Unnamed: 0,eng,fre
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [6]:
print(df.isna().sum())
print(df.isnull().sum())

eng    0
fre    0
dtype: int64
eng    0
fre    0
dtype: int64


In [7]:
valid_puncs = "-,'!?."
invalid_puncs = string.punctuation         #returns all sets of punctuations

for punc in valid_puncs:
    invalid_puncs = invalid_puncs.replace(punc,"")
invalid_puncs_re = re.compile("["+ invalid_puncs +"]")   #expression to match any invalid punctuations 

def text_sanitization(text: str) -> str:
    text = invalid_puncs_re.sub("", text)
    text = re.sub(r"\s+", " ", text)
    return text

In [8]:
df["eng"] = df["eng"].apply(text_sanitization)
df["fre"] = df["fre"].apply(text_sanitization)
df.tail(5)

Unnamed: 0,eng,fre
175616,"Top-down economics never works, said Obama. Th...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...
175620,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...


In [9]:
en_words = Counter()
for sentence in tqdm(df["eng"]):
    en_words.update(word_tokenize(sentence, language='english'))
en_words = OrderedDict(sorted(en_words.items(), key=lambda x:-x[1]))

fr_words = Counter()
for sentence in tqdm(df["fre"]):
    fr_words.update(word_tokenize(sentence, language='french'))
fr_words = OrderedDict(sorted(fr_words.items(), key=lambda x:-x[1]))

print(f"No. of english words: {len(en_words)}")
print(f"No. of french words: {len(fr_words)}")

  0%|          | 0/175621 [00:00<?, ?it/s]

  0%|          | 0/175621 [00:00<?, ?it/s]

No. of english words: 16220
No. of french words: 31537


In [10]:
unk_token = '<unk>'  # Unknown word token
sos_token = '<sos>'  # Start of sentence token
eos_token = '<eos>'  # End of sentence token
pad_token = '<pad>'  #padding token

en_words.update({unk_token: 0, eos_token: 1, pad_token: 2})
fr_words.update({unk_token: 0, sos_token: 1, eos_token: 2, pad_token: 3})

en_vocab = Vocabulary(list(en_words.keys()), unk_label=unk_token)
fr_vocab = Vocabulary(list(fr_words.keys()), unk_label=unk_token)

en_token_to_index = {token: index for index, token in enumerate(en_vocab)} #pad_token 16222
fr_token_to_index = {token: index for index, token in enumerate(fr_vocab)} #pad_token 31540
print(len((en_token_to_index)))
print(len((fr_token_to_index)))
print(en_token_to_index['<pad>'],fr_token_to_index['<pad>'])

16223
31541
16222 31540


In [11]:
def en_tokenize(text: str, append_eos=True):
    words = word_tokenize(text, language='english')
    if append_eos:
        words.append(eos_token)
    return [en_token_to_index.get(word, en_token_to_index[unk_token]) for word in words]

def fr_tokenize(text: str, append_eos=True):
    words = [sos_token] + word_tokenize(text, language='french')
    if append_eos:
        words.append(eos_token)
    return [fr_token_to_index.get(word, fr_token_to_index[unk_token]) for word in words]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df["eng"], df["fre"], train_size = 0.8)

In [13]:
X_train_seqs = [en_tokenize(x) for x in X_train]
X_test_seqs = [en_tokenize(x) for x in X_test]

y_train_seqs = [fr_tokenize(y) for y in y_train]
y_test_seqs = [fr_tokenize(y) for y in y_test]

In [14]:
print(en_tokenize("Hi this is Swayam."))
print(en_tokenize("lets talk about project x"))

[4283, 23, 8, 16223, 0, 16221]
[4554, 138, 55, 1217, 16223, 16221]


In [15]:
class SeqDataset(Dataset):
    def __init__(self,en_seqs,fr_seqs):
        super().__init__()
        self.en_seqs = en_seqs
        self.fr_seqs = fr_seqs
        
    def __len__(self):
        return len(self.en_seqs)
    
    def __getitem__(self, index):
        encoder_input = torch.tensor(self.en_seqs[index]).long()
        decoder_input = torch.tensor(self.fr_seqs[index][:-1]).long()     
        answer = torch.tensor(self.fr_seqs[index][1:]).long()     #right shift
        return encoder_input, decoder_input, answer
    
def collate_fn(batch):
    encoder_ip = []
    decoder_ip = []
    answer = []
    for e, d, a in batch:
        encoder_ip.append(e)
        decoder_ip.append(d)
        answer.append(a)
        
    encoder_ip = nn.utils.rnn.pad_sequence(encoder_ip, batch_first= True, padding_value= 16222)
    decoder_ip = nn.utils.rnn.pad_sequence(decoder_ip, batch_first= True, padding_value= 31540)
    answer = nn.utils.rnn.pad_sequence(answer, batch_first= True, padding_value= 31540)
    return encoder_ip, decoder_ip, answer

In [16]:
trainset = SeqDataset(X_train_seqs, y_train_seqs)
testset = SeqDataset(X_test_seqs, y_test_seqs)
len(trainset), len(testset)

trainloader = DataLoader(trainset, batch_size= 8, shuffle= True, collate_fn= collate_fn)
testLoader = DataLoader(testset, batch_size= 8, shuffle= False, collate_fn= collate_fn)

In [17]:
ip_dim = len(en_vocab)
op_dim = len(fr_vocab)
d_model = 512       #default value 512
num_enc_layers = 1  #default value 6
num_dec_layers = 1  #default value 6
n_heads= 8          #default value 8
dropout_val = 0.1   #default value 0.1
src_pad_idx= 16222
tgt_pad_idx= 31540 

In [18]:
class TransformersTranslation(nn.Module):
    def __init__(self, ip_dim, op_dim, d_model, num_enc_layers, num_dec_layers, n_heads, dropout_val, src_pad_idx, tgt_pad_idx):
        super(TransformersTranslation, self).__init__()
        self.embeddings_ip = nn.Embedding(ip_dim, d_model, padding_idx=src_pad_idx)
        self.embeddings_op = nn.Embedding(op_dim, d_model, padding_idx=tgt_pad_idx)
        
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=n_heads,
            num_encoder_layers=num_enc_layers,
            num_decoder_layers=num_dec_layers,
            dropout=dropout_val,
            batch_first=True
        )
        
        self.fc_out = nn.Linear(d_model, op_dim)
        self.dropout = nn.Dropout(dropout_val)
        
        self.layer_norm_src = nn.LayerNorm(d_model)
        self.layer_norm_tgt = nn.LayerNorm(d_model)
        
        self.src_pad_idx = src_pad_idx
        self.tgt_pad_idx = tgt_pad_idx
        
    def create_padding_mask(self, seq, pad_idx):
        # seq shape: [batch_size, seq_len]
        # pad_idx is the index to be masked
        return (seq == pad_idx)  # Shape: [batch_size, seq_len]

    def create_look_ahead_mask(self, size):
        mask = torch.triu(torch.ones(size, size, dtype=torch.bool), diagonal=1)
        return mask.masked_fill(mask == 1, float('-inf'))  # Shape: [seq_len, seq_len]
        
    def forward(self, src, tgt):
        src = self.embeddings_ip(src)  # Shape: [batch_size, src_seq_len, d_model]
        tgt = self.embeddings_op(tgt)  # Shape: [batch_size, tgt_seq_len, d_model]
        
        src = self.dropout(src)
        tgt = self.dropout(tgt)
        
        src = self.layer_norm_src(src)
        src = src.to(device)
        tgt = self.layer_norm_tgt(tgt)
        tgt = tgt.to(device)
        
        src_padding_mask = self.create_padding_mask(src[:, :, 0], self.src_pad_idx)  # Shape: [batch_size, src_seq_len]
        src_padding_mask = src_padding_mask.to(device)
        tgt_padding_mask = self.create_padding_mask(tgt[:, :, 0], self.tgt_pad_idx)  # Shape: [batch_size, tgt_seq_len]
        tgt_padding_mask = tgt_padding_mask.to(device)
        tgt_look_ahead_mask = self.create_look_ahead_mask(tgt.size(1))  # Shape: [tgt_seq_len, tgt_seq_len]
        tgt_look_ahead_mask = tgt_look_ahead_mask.to(device)
        
        output = self.transformer(
            src, tgt,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=src_padding_mask,
            tgt_mask=tgt_look_ahead_mask
        )
        
        output = self.fc_out(output)
        output = output.to(device)
        
        return output

In [19]:
model = TransformersTranslation(ip_dim, op_dim, d_model, num_enc_layers, num_dec_layers, n_heads, dropout_val, src_pad_idx, tgt_pad_idx).to(device)
# model

In [None]:
def train_epoch(model, dataloader, loss_fn, optimizer, device):
    model.train()  
    epoch_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in tqdm(dataloader, desc="Training", leave=False):
        src, tgt, tgt_y = batch
        src, tgt, tgt_y = src.to(device), tgt.to(device), tgt_y.to(device)
        
        optimizer.zero_grad()
        
        output = model(src, tgt)
        output = output.to(device)
        
        loss = loss_fn(output.view(-1, output.size(-1)), tgt_y.contiguous().view(-1))
        loss = loss.to(device)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        preds = output.argmax(dim=-1)  
        correct_predictions += (preds == tgt_y).sum().item()
        total_predictions += tgt_y.numel()
    
    avg_epoch_loss = epoch_loss / len(dataloader)
    accuracy = correct_predictions / total_predictions
    return avg_epoch_loss, accuracy

In [None]:
def test_epoch(model, dataloader, loss_fn, device):
    model.eval()  
    epoch_loss = 0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():  
        for batch in tqdm(dataloader, desc="Testing", leave=False):
            src, tgt, tgt_y = batch
            src, tgt, tgt_y = src.to(device), tgt.to(device), tgt_y.to(device)
            
            output = model(src, tgt)
            output = output.to(device)
            
            loss = loss_fn(output.view(-1, output.size(-1)), tgt_y.contiguous().view(-1))
            loss = loss.to(device)
            
            epoch_loss += loss.item()
            
            preds = output.argmax(dim=-1)  
            correct_predictions += (preds == tgt_y).sum().item()
            total_predictions += tgt_y.numel()
    
    avg_epoch_loss = epoch_loss / len(dataloader)
    accuracy = correct_predictions / total_predictions
    return avg_epoch_loss, accuracy

In [None]:
def save_model(model, epoch, train_acc, train_loss, val_acc, val_loss, directory='/kaggle/working/'):
    filename = (f"ep-{epoch+1:03d}-train_acc-{train_acc:.4f}-train_loss-{train_loss:.4f}-"
                f"val_acc-{val_acc:.4f}-val_loss-{val_loss:.4f}.pth")
    filepath = os.path.join(directory, filename)
    
    torch.save(model.state_dict(), filepath)
    print(f"Model saved to {filepath}")

In [None]:
loss_fn = nn.CrossEntropyLoss(ignore_index= tgt_pad_idx) 
loss_fn = loss_fn.to(device)
optimizer = optim.AdamW(model.parameters(), lr= 3e-4, weight_decay= 1e-2)
EPOCHS= 90

for epoch in range(EPOCHS):
    train_loss, train_accuracy = train_epoch(model, trainloader, loss_fn, optimizer, device)
    test_loss, test_accuracy = test_epoch(model, testLoader, loss_fn, device)
    
    print(f'Epoch {epoch+1}/{EPOCHS}')
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')
    if (epoch + 1) % 5 == 0:
        save_model(model, epoch, train_accuracy, train_loss, test_accuracy, test_loss)
        print(f'Model saved at {epoch+1}')

In [20]:
model.load_state_dict(torch.load('/kaggle/input/85-th-epoch-translation-model-saved/ep-085-train_acc-0.5526-train_loss-0.6162-val_acc-0.4782-val_loss-1.6575.pth'))

  model.load_state_dict(torch.load('/kaggle/input/85-th-epoch-translation-model-saved/ep-085-train_acc-0.5526-train_loss-0.6162-val_acc-0.4782-val_loss-1.6575.pth'))


<All keys matched successfully>

In [34]:
def inference(model, en_sentence):
    model.eval()
    
    encode_seq = torch.tensor([en_tokenize(en_sentence, append_eos=False)]).to(device)
    
    decode_seq = torch.tensor([[fr_token_to_index[sos_token]]]).to(device)
    
    for _ in range(10):  
        with torch.no_grad():
            
            out_logits = model(encode_seq, decode_seq)
            
            out_logits = out_logits[:, -1, :]  
            out_seqs = torch.argmax(out_logits, dim=-1)  
            
            next_token = out_seqs.item()
            
            decode_seq = torch.cat([decode_seq, torch.tensor([[next_token]], device=device)], dim=1)
            
            if next_token == fr_token_to_index[eos_token]:
                break
    
    tokens = [word for token_id in decode_seq[0].tolist() for word, idx in fr_token_to_index.items() if idx == token_id]
    
    result = ' '.join(tokens)
    
    return result.replace(f' {eos_token}', '')

# Prompt user for input
def main():
    while True:
        user_input = input("Enter an English sentence for translation (or type 'exit' to quit): ")
        if user_input.lower() == 'exit':
            break
        translated_sentence = inference(model, user_input)
        print(f"French translation: {translated_sentence}")

if __name__ == "__main__":
    main()


Enter an English sentence for translation (or type 'exit' to quit):  answer this morning


French translation: <sos> ce ce ce ce matin ce ce ce ce ce


Enter an English sentence for translation (or type 'exit' to quit):  answer today morning


French translation: <sos> aujourd'hui ne aujourd'hui quelques aujourd'hui , aujourd'hui , aujourd'hui ,


Enter an English sentence for translation (or type 'exit' to quit):  answer in morning


French translation: <sos> Comme ce matin à Comme il Comme à Comme aujourd'hui


Enter an English sentence for translation (or type 'exit' to quit):  answer by today


French translation: <sos> aujourd'hui ne aujourd'hui , aujourd'hui , aujourd'hui , aujourd'hui ,


Enter an English sentence for translation (or type 'exit' to quit):  roaming in garden


French translation: <sos> Je temps à ce que tu arriva à temps dans


Enter an English sentence for translation (or type 'exit' to quit):  time


French translation: <sos> temps l'heure temps l'heure l'heure l'heure l'heure l'heure l'heure l'heure


Enter an English sentence for translation (or type 'exit' to quit):  in today


French translation: <sos> aujourd'hui , aujourd'hui , aujourd'hui , aujourd'hui , aujourd'hui ,


Enter an English sentence for translation (or type 'exit' to quit):  in


French translation: <sos> dans deux dans dans dans un dans temps pièce pièce


Enter an English sentence for translation (or type 'exit' to quit):  on foot


French translation: <sos> Le temps sur le départ à des sur le pied


Enter an English sentence for translation (or type 'exit' to quit):  exit


In [40]:
def inference(model, en_sentence):
    model.eval()  # Set the model to evaluation mode
    
    # Tokenize and prepare the input sequence
    encode_seq = torch.tensor([en_tokenize(en_sentence, append_eos=False)]).to(device)
    
    # Start with the start-of-sequence token
    decode_seq = torch.tensor([[fr_token_to_index[sos_token]]]).to(device)
    
    result_tokens = []  # List to accumulate tokens
    
    for _ in range(200):  # Limit to 200 tokens for the output
        with torch.no_grad():
            out_logits = model(encode_seq, decode_seq)  # Get model output
            out_logits = out_logits[:, -1, :]  # Shape: [batch_size, vocab_size]
            
            # Get the predicted token IDs
            out_seqs = torch.argmax(out_logits, dim=-1)  # Shape: [batch_size]
            
            # Extract the predicted token
            next_token = out_seqs.item()  # Get the single token ID
            
            # Append the predicted token to the result tokens list
            result_tokens.append(next_token)
            
            # Check for end-of-sequence token
            if next_token == fr_token_to_index[eos_token]:
                break
            
            # Update the decode sequence with the predicted token
            decode_seq = torch.cat([decode_seq, torch.tensor([[next_token]], device=device)], dim=1)
    
    # Convert token IDs to tokens manually using index-to-string mapping
    tokens = [list(fr_token_to_index.keys())[list(fr_token_to_index.values()).index(token_id)] for token_id in result_tokens]
    
    # Join tokens to form the final translation string
    result = ' '.join(tokens)
    
    # Return the result, removing any trailing <eos> token
    return result.replace(f' {eos_token}', '')

# Testing the function
data = list(zip(X_test, y_test))  # Assuming X_test and y_test are the test datasets
random.shuffle(data)

for x, y in data[:10]:
    result = inference(model, x)
    print("English:", x)
    print("Model Translation:", result)
    print("Actual French:", y)
    print()

English: It's sunny, but the water is cold.
Model Translation: Il y a des choses , mais il y a des faut .
Actual French: Il y a du soleil, mais l'eau est froide.

English: Good friends are like stars. You don't always see them, but you know they are always there.
Model Translation: On ne sait pas les amis , mais il y a tout le temps .
Actual French: Les bons amis sont comme les étoiles on ne les voit pas toujours, mais on sait qu'ils sont toujours là.

English: I think you lied to me.
Model Translation: Je pense que tu me voies . Je pense que tu m'as menti .
Actual French: Je pense que tu m'as menti.

English: What did you say?
Model Translation: Qu ' a dit ?
Actual French: Qu'avez-vous dit ?

English: The system is rigged.
Model Translation: Le .
Actual French: Le système est truqué.

English: I always enjoy listening to classical music when I have some free time.
Model Translation: J'ai toujours aimé disposer d'un petit peu de temps lorsque j'aime à Lorsque j'ai un peu de temps en te