In [76]:
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.optim as optim
from torch.utils.data import Dataset
#from torchvision import Datasets
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import random
import spacy
import torch.nn.functional as F
import os

In [3]:
spacy_en = spacy.load('en_core_web_sm')
spacy_fr = spacy.load('fr_core_news_sm')

In [4]:
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

def tokenize_fr(text):
    return [tok.text for tok in spacy_fr.tokenizer(text)]

In [5]:
data=pd.read_csv('engfrench.csv')

In [6]:
print(len(data))

175621


In [7]:
data.iloc[170000]

English words/sentences    The guards found a hacksaw blade in the prison...
French words/sentences     Les gardiens trouvèrent une lame de scie à mét...
Name: 170000, dtype: object

In [8]:
data[10000:10005]

Unnamed: 0,English words/sentences,French words/sentences
10000,Be very careful.,Sois très prudente !
10001,Bees make honey.,Les abeilles font du miel.
10002,Behave yourself.,Comporte-toi bien.
10003,Bite the bullet.,Serre les dents.
10004,Bite the bullet.,Serrez les dents.


from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [9]:
PAD_TOKEN_ID=0
SOS_TOKEN_ID = 1
EOS_TOKEN_ID = 2

In [10]:
txten=data.iloc[0,0]
txtfr=data.iloc[0,1]


In [11]:
txten

'Hi.'

In [12]:
txtfr

'Salut!'

In [13]:
en_check=torch.load('engEmbeddings.pt')
fr_check=torch.load('frEmbeddings.pt')
en_weights=en_check['weights']
en_vocab=en_check['vocab_dict']
fr_weights=fr_check['weights']
fr_vocab=fr_check['vocab_dict']

  en_check=torch.load('engEmbeddings.pt')
  fr_check=torch.load('frEmbeddings.pt')


In [14]:
class TranslationDataset(Dataset):
    def __init__(self, dataframe, en_vocab, fr_vocab):
        self.df = dataframe
        self.en_vocab = en_vocab
        self.fr_vocab = fr_vocab
        
        # Define special token IDs
        # IMPORTANT: Ensure these keys exist in your vocab dictionaries!
        self.unk_idx_en = en_vocab.get('<unk>', 0) # Default to 0 if not found
        self.eos_idx_en = en_vocab.get('<eos>', 2)
        
        self.unk_idx_fr = fr_vocab.get('<unk>', 0)
        self.sos_idx_fr = fr_vocab.get('<sos>', 1)
        self.eos_idx_fr = fr_vocab.get('<eos>', 2)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # 1. Get Raw Text
        src_text = str(self.df.iloc[index, 0])
        trg_text = str(self.df.iloc[index, 1])

        # 2. Tokenize (using your Spacy functions)
        # We use .lower() because FastText matches better with lowercase
        src_tokens = tokenize_en(src_text.lower())
        trg_tokens = tokenize_fr(trg_text.lower())

        # 3. Numericalize Source (English)
        # Use .get() to map unknown words to <unk> index
        src_indices = [self.en_vocab.get(token, self.unk_idx_en) for token in src_tokens]
        
        # 4. Numericalize Target (French)
        trg_indices = [self.fr_vocab.get(token, self.unk_idx_fr) for token in trg_tokens]

        # 5. Add Special Tokens
        # Source usually needs <eos> at the end so the LSTM knows the sentence stopped
        src_indices.append(self.eos_idx_en)
        
        # Target needs <sos> to start generation and <eos> to end it
        trg_indices = [self.sos_idx_fr] + trg_indices + [self.eos_idx_fr]

        return torch.tensor(src_indices, dtype=torch.long), torch.tensor(trg_indices, dtype=torch.long)

In [15]:
datanewdataset=TranslationDataset(data,en_vocab,fr_vocab)

In [16]:
def custom_collate_fn(batch):
    # 'batch' is a list of tuples: [(input_tensor_1, target_tensor_1), (input_tensor_2, target_tensor_2), ...]

    # 1. Separate inputs and targets
    inputs = [torch.tensor(item[0]) for item in batch]
    targets = [torch.tensor(item[1]) for item in batch]

    # 2. Pad the inputs and targets to the length of the longest sequence in the batch
    # batch_first=True makes the output shape (BatchSize, SequenceLength)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=PAD_TOKEN_ID)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=PAD_TOKEN_ID)
    
    return inputs_padded, targets_padded

In [17]:
haha=DataLoader(datanewdataset,batch_size=32,shuffle=True,collate_fn=custom_collate_fn)

SAMPLE TO REMEMBER LSTM 

In [18]:
xa=torch.randn(10,100)

In [19]:
ha=nn.LSTM(100,200)

In [20]:
ya=ha(xa)

In [21]:
ya[0].shape

torch.Size([10, 200])

In [22]:
ya[1][0].shape

torch.Size([1, 200])

In [23]:
ya[1][1].shape

torch.Size([1, 200])

In [24]:
embed_dim=300
#since using pre trained Fast TExt aligned vectors

In [25]:
VOCAB_SIZE_ENC=len(en_vocab)

In [26]:
VOCAB_SIZE_FR=len(fr_vocab)

In [27]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        # The attention linear layer transforms the concatenated hidden states
        self.attn = nn.Linear((enc_hid_dim) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden = [batch_size, dec_hid_dim] (we usually take the last layer of hidden state)
        # encoder_outputs = [batch_size, src_len, enc_hid_dim]
        
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]
        
        # Repeat decoder hidden state src_len times to match encoder output shape
        # hidden shape becomes: [batch_size, src_len, dec_hid_dim]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        # Calculate Energy
        # energy = [batch_size, src_len, dec_hid_dim]
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        
        # Calculate Attention (weights)
        # attention = [batch_size, src_len]
        attention = self.v(energy).squeeze(2)
        
        return F.softmax(attention, dim=1)

In [29]:
class enc(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding.from_pretrained(en_weights, freeze=True)
        self.lstm=nn.LSTM(input_size=embed_dim,hidden_size=1024,num_layers=3,batch_first=True,bidirectional=False)
        self.rel=nn.ReLU()
    def forward(self,x):
        embx=self.emb(x)
        out,(hid,cell)=self.lstm(embx)
        return out,hid,cell
        

        
        

In [30]:
class dec(nn.Module):
    def __init__(self, attention):
        super().__init__()
        self.attention = attention
        self.emb = nn.Embedding.from_pretrained(fr_weights, freeze=True)
        
        # Input to LSTM is now: Embedding Dimension + Encoder Hidden Dimension (Context Vector)
        self.lstm = nn.LSTM(input_size=embed_dim + 1024, hidden_size=1024, num_layers=3, batch_first=True, bidirectional=False)
        
        self.fc_out = nn.Linear(1024, VOCAB_SIZE_FR) 
        
    def forward(self, x, hidden, cell, encoder_outputs):
        # x shape: [batch_size] -> [batch_size, 1]
        x = x.unsqueeze(1)
        embx = self.emb(x) # [batch_size, 1, embed_dim]
        
        # 1. Calculate Attention Weights
        # We use hidden[-1] because we have 3 layers, we want the last layer's state for attention
        attn_weights = self.attention(hidden[-1], encoder_outputs) # [batch, src_len]
        
        # 2. Calculate Context Vector
        # attn_weights shape: [batch, 1, src_len]
        attn_weights = attn_weights.unsqueeze(1)
        
        # Weighted sum of encoder outputs
        # [batch, 1, src_len] x [batch, src_len, hid_dim] = [batch, 1, hid_dim]
        context = torch.bmm(attn_weights, encoder_outputs)
        
        # 3. Concatenate Context + Embedding
        # rnn_input: [batch, 1, embed_dim + hid_dim]
        rnn_input = torch.cat((embx, context), dim=2)
        
        # 4. Feed to LSTM
        output, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))
        
        # prediction shape: [batch_size, VOCAB_SIZE]
        prediction = self.fc_out(output.squeeze(1))
        
        return prediction, hidden, cell

In [31]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, source, target, teacher_forcing_ratio=0.5):
        # source: [batch, src_len]
        # target: [batch, trg_len]
        
        batch_size = source.shape[0]
        target_len = target.shape[1]
        vocab_size = len(fr_vocab) # Assuming fr_vocab is global
        
        outputs = torch.zeros(batch_size, target_len, vocab_size).to(self.device)
        
        # 1. Encode - NOW CAPTURING encoder_outputs
        encoder_outputs, hidden, cell = self.encoder(source)
        
        # 2. First input to decoder (Start Token)
        input_token = torch.full((batch_size,), SOS_TOKEN_ID, dtype=torch.long).to(self.device)
        
        # 3. Decode
        for t in range(1, target_len):
            # Pass encoder_outputs to decoder
            output, hidden, cell = self.decoder(input_token, hidden, cell, encoder_outputs)
            
            outputs[:, t, :] = output
            
            # Teacher forcing logic (same as before)
            top1 = output.argmax(1)
            use_teacher = random.random() < teacher_forcing_ratio
            if t < target_len - 1:
                input_token = target[:, t] if use_teacher else top1
                
        return outputs

In [82]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
attn_layer = Attention(enc_hid_dim=1024, dec_hid_dim=1024)
encoder = enc()
decoder = dec(attn_layer)
model = Seq2Seq(encoder, decoder, device).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_ID)


if os.path.exists("FR_ENGMODELANDALL.pth"):
    checkpoint=torch.load("FR_ENGMODELANDALL.pth")
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    


Using device: cuda


  checkpoint=torch.load("FR_ENGMODELANDALL.pth")


In [83]:
def train_one_epoch(model, iterator, optimizer, criterion,epoch, clip=1):
    model.train()
    epoch_loss = 0
    for kek in range(epoch):
        for i, (src, trg) in enumerate(iterator):
            src = src.to(device)
            trg = trg.to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            output = model(src, trg)
            
            # Reshape for Loss
            # output: [batch, len, vocab] -> [batch*len, vocab]
            # trg: [batch, len] -> [batch*len]
            output_dim = output.shape[-1]
            output = output.view(-1, output_dim)
            trg = trg.view(-1)
            
            loss = criterion(output, trg)
            loss.backward()
            
            # Clip gradients to prevent explosion
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            
            epoch_loss += loss.item()
            
            if i % 500 == 0:
                print(f"Epoch number {kek} batch {i}, Loss: {loss.item()}")
                
        print( epoch_loss / len(iterator))

In [84]:
def predict_translation(model, sentence, max_len=20):
    model.eval()
    
    # ... (Tokenization Logic remains the same) ...
    tokens = tokenize_en(sentence.lower())
    unk_idx = en_vocab.get('<unk>', 3)
    eos_idx = en_vocab.get('<eos>', 2)
    ids = [en_vocab.get(token, unk_idx) for token in tokens]
    ids.append(eos_idx)
    src_tensor = torch.tensor(ids, dtype=torch.long).unsqueeze(0).to(device)

    # 1. Encode (Get outputs too)
    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(src_tensor)

    # ... (Setup inputs loop remains same) ...
    sos_idx = fr_vocab['<sos>']
    eos_idx = fr_vocab['<eos>']
    inputs = [sos_idx]

    for _ in range(max_len):
        input_tensor = torch.tensor([inputs[-1]], dtype=torch.long).to(device)

        with torch.no_grad():
            # Pass encoder_outputs here
            output, hidden, cell = model.decoder(input_tensor, hidden, cell, encoder_outputs)
        
        pred_token = output.argmax(1).item()
        
        if pred_token == eos_idx:
            break
        
        inputs.append(pred_token)

    # ... (Detokenization remains same) ...
    idx_to_word = {v: k for k, v in fr_vocab.items()}
    predicted_tokens = [idx_to_word.get(idx, '<unk>') for idx in inputs[1:]]
    return " ".join(predicted_tokens)

In [85]:
# Example run (Uncomment to train)
print("Starting Training...")
loss = train_one_epoch(model, haha, optimizer, criterion,epoch=10)
print(f"Epoch Loss: {loss}")

Starting Training...


  inputs = [torch.tensor(item[0]) for item in batch]
  targets = [torch.tensor(item[1]) for item in batch]


Epoch number 0 batch 0, Loss: 1.5416104793548584


KeyboardInterrupt: 

In [40]:
torch.save({
    'epoch': 10,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, "FR_ENGMODELANDALL.pth")

In [72]:
torch.save({'just_model':model.state_dict()},"modelkek.pth")

In [41]:
translation = predict_translation(model, "I am doing great , WHat about you?")
print(f"Translation: {translation}")

Translation: je me bien bien bien , ?


In [42]:
translation = predict_translation(model, "Today's a good day.")
print(f"Translation: {translation}")

Translation: aujourd'hui , il fait une bonne journée .


In [43]:
translation = predict_translation(model, "How was prison?")
print(f"Translation: {translation}")

Translation: comment s' est -il été ?


In [44]:
translation = predict_translation(model, "Bite the bullet.")
print(f"Translation: {translation}")

Translation: serre la dents .


In [45]:
translation = predict_translation(model, "what's the time?")
print(f"Translation: {translation}")

Translation: c' est quel son temps ?


In [46]:
translation = predict_translation(model, "what's the time")
print(f"Translation: {translation}")

Translation: c' est quel   ?


In [47]:
translation = predict_translation(model, "whats the time?")
print(f"Translation: {translation}")

Translation: alors est en temps du temps .


In [51]:
translation = predict_translation(model, "behave yourself!")
print(f"Translation: {translation}")

Translation: les - toi bien   !


In [49]:
translation = predict_translation(model, "please stop!")
print(f"Translation: {translation}")

Translation: s' il vous plaît ,


In [50]:
translation = predict_translation(model, "please stop")
print(f"Translation: {translation}")

Translation: s' il vous plaît .


In [56]:
translation = predict_translation(model, "Be very careful of bees.")
print(f"Translation: {translation}")

Translation: soyez très flexibles aux des animaux .


In [62]:
translation = predict_translation(model, "It's all nonsense.")
print(f"Translation: {translation}")

Translation: tout , c' est des sens .


In [66]:

data[22000:22005]

Unnamed: 0,English words/sentences,French words/sentences
22000,Why is this funny?,Pourquoi c'est marrant ?
22001,Why must I suffer?,Pourquoi dois-je souffrir ?
22002,Why must we do it?,Pourquoi devons-nous le faire ?
22003,Why not just quit?,Pourquoi ne pas simplement démissionner ?
22004,Why not just quit?,Pourquoi ne pas simplement arrêter ?


In [64]:
translation = predict_translation(model, "It's a funny country.")
print(f"Translation: {translation}")

Translation: c' est un enfance ensoleillé .


In [65]:
translation = predict_translation(model, "It's a distraction.")
print(f"Translation: {translation}")

Translation: c' est un distraction .


In [70]:
translation = predict_translation(model, "Why not just quit?")
print(f"Translation: {translation}")

Translation: pourquoi ne pas simplement arrêter ?


In [67]:
translation = predict_translation(model, "Why must I suffer?")
print(f"Translation: {translation}")


Translation: pourquoi dois -je souffrir   ?
