In [None]:
# Mount Google Drive (Colab only) so that files under /content/drive are reachable
# from this session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### $\text{Importing libraries}$

In [3]:
import re
import torch
import shutil
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from sklearn.metrics import roc_auc_score
from torch.nn.utils import clip_grad_norm_
from torch.optim import AdamW, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [8]:
# Pretrained multilingual BERT tokenizer; the same instance (and therefore the same
# vocabulary) is used for both the English and the French sentences below.
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [12]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

### $\text{Importing data}$

In [None]:
# Location of the English/French sentence-pair CSV on the mounted Drive.
path = '/content/drive/MyDrive/en_fr.csv'
# Upper bound on tokens per sentence; longer sentences are truncated at tokenisation.
MAX_SEQ_LEN = 20

In [None]:
# Read only the first 15,000 rows of the CSV.
data = pd.read_csv(path, nrows=15000)
data

Unnamed: 0,en_sentence,fr_sentence
0,go,va
1,go,marche
2,go,en route
3,go,bouge
4,hi,salut
...,...,...
14995,he has no money,il na pas dargent
14996,he has ten cows,il possède dix vaches
14997,he has two cats,il a deux chats
14998,he has two dogs,il a deux chiens


### $\text{Preparing data}$

In [None]:
def encode_sentence(sentence):
    """Tokenize one sentence with the shared tokenizer.

    The sentence is truncated to at most ``MAX_SEQ_LEN`` tokens and the
    result is returned as PyTorch tensors (``return_tensors='pt'``).

    Args:
        sentence: Raw text of a single sentence.

    Returns:
        The tokenizer's encoding for ``sentence``; the cells below read its
        ``input_ids`` and ``attention_mask`` entries.
    """
    return tokenizer(
        sentence,
        max_length=MAX_SEQ_LEN,
        truncation=True,
        return_tensors='pt',
    )


# Both languages go through exactly the same encoding routine.
data['en_inf'] = data['en_sentence'].apply(encode_sentence)
data['fr_inf'] = data['fr_sentence'].apply(encode_sentence)

In [None]:
# From each encoding keep the token ids with the leading batch axis squeezed away,
# plus the French token count obtained by summing the attention mask.
data = data.assign(
    en_tokenized=data['en_inf'].apply(lambda enc: enc['input_ids'].squeeze(0)),
    fr_tokenized=data['fr_inf'].apply(lambda enc: enc['input_ids'].squeeze(0)),
    fr_len=data['fr_inf'].apply(lambda enc: enc['attention_mask'].sum().item()),
)

In [None]:

# The raw encodings are not needed once ids and lengths have been extracted.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
drops = ['en_inf', 'fr_inf']
data = data.drop(columns=drops, errors='ignore')

In [None]:
# Peek at the first four prepared rows.
data.head(4)

Unnamed: 0,en_sentence,fr_sentence,en_tokenized,fr_tokenized,fr_len
0,go,va,"[tensor(101), tensor(11783), tensor(102)]","[tensor(101), tensor(10321), tensor(102)]",3
1,go,marche,"[tensor(101), tensor(11783), tensor(102)]","[tensor(101), tensor(56229), tensor(102)]",3
2,go,en route,"[tensor(101), tensor(11783), tensor(102)]","[tensor(101), tensor(10110), tensor(13933), te...",4
3,go,bouge,"[tensor(101), tensor(11783), tensor(102)]","[tensor(101), tensor(94335), tensor(10525), te...",4


In [None]:
# Hold out 20% of the pairs. The French ids and their lengths stay together in
# one frame so they are split consistently with the English inputs.
en_inputs = data['en_tokenized']
fr_targets = data[['fr_tokenized', 'fr_len']]
X_train, X_test, y_train, y_test = train_test_split(
    en_inputs,
    fr_targets,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Peek at the first five training targets.
y_train.head(5)

Unnamed: 0,fr_tokenized,fr_len
9839,"[tensor(101), tensor(18354), tensor(10322), te...",7
9680,"[tensor(101), tensor(19132), tensor(10321), te...",5
7093,"[tensor(101), tensor(49301), tensor(10381), te...",9
11293,"[tensor(101), tensor(10144), tensor(10911), te...",10
820,"[tensor(101), tensor(13621), tensor(23932), te...",6


In [None]:
class EnFrDataset(Dataset):
    """Dataset of (English ids, French ids, French length) triples.

    Args:
        x: Series-like object holding the English token ids; rows are read
            positionally through ``.iloc``.
        y: Frame-like object with ``fr_tokenized`` and ``fr_len`` columns,
            read positionally through ``.iloc`` with the same index as ``x``.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # The number of samples is driven by the input side.
        return len(self.x)

    def __getitem__(self, index):
        source_ids = self.x.iloc[index]
        target_ids = self.y['fr_tokenized'].iloc[index]
        target_len = self.y['fr_len'].iloc[index]
        return source_ids, target_ids, target_len

In [None]:
# Wrap each side of the split in the dataset class.
train_data = EnFrDataset(x=X_train, y=y_train)
test_data = EnFrDataset(x=X_test, y=y_test)

In [None]:
# Samples are served one at a time and in random order. No padding was requested
# at tokenisation, so presumably batch_size=1 is what lets sequences of different
# lengths pass through the default collation.
loader_options = {'batch_size': 1, 'shuffle': True}
train_loader = DataLoader(train_data, **loader_options)
test_loader = DataLoader(test_data, **loader_options)

### $\text{Encoder structure}$

In [4]:
class Encoder(nn.Module):
    """Embedding -> dropout -> single-layer GRU encoder.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        emb_size: Width of each token embedding.
        hidden_size: Width of the GRU hidden state.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, emb_size, hidden_size, dropout):
        super().__init__()
        self.input_size, self.emb_size = input_size, emb_size
        self.hidden_size, self.dropout = hidden_size, dropout

        self.emb = nn.Embedding(input_size, emb_size)
        self.GRU = nn.GRU(emb_size, hidden_size, batch_first=True)
        self.dropout_1 = nn.Dropout(dropout)

    def forward(self, X):
        """Encode token ids ``X`` and return ``(output, hidden)`` exactly as the GRU yields them."""
        embedded = self.emb(X)
        output, hidden = self.GRU(self.dropout_1(embedded))
        return output, hidden

### $\text{Decoder structure}$

In [5]:
class Decoder(nn.Module):
    """GRU decoder with dot-product attention over the encoder outputs.

    At every step the GRU output is used as the attention query, the resulting
    context vector is concatenated with it, projected back to ``hidden_size``
    through ``Wc`` + tanh, and that attentional state is both mapped to
    vocabulary logits and fed to the GRU as the next hidden state.

    Args:
        out_size: Size of the target vocabulary (embedding rows and logits).
        emb_size: Width of each token embedding.
        hidden_size: Width of the GRU hidden state; must equal the width of
            the encoder outputs for the dot-product attention to work.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, out_size, emb_size, hidden_size, dropout):
        super(Decoder, self).__init__()
        self.out_size = out_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size

        self.emb = nn.Embedding(self.out_size, self.emb_size)
        self.GRU = nn.GRU(self.emb_size, self.hidden_size, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, self.out_size)
        self.dropout = nn.Dropout(dropout)
        self.Wc = nn.Linear(self.hidden_size * 2, hidden_size)
        self.tanh = nn.Tanh()

    def forward_step(self, X, hidden, enc_outs):
        """Run one decoding step.

        Args:
            X: Current input token ids, shape ``(batch, 1)``.
            hidden: GRU hidden state, shape ``(1, batch, hidden_size)``.
            enc_outs: Encoder outputs, shape ``(batch, src_len, hidden_size)``.

        Returns:
            ``(logits, new_hidden)`` where ``logits`` is
            ``(batch, 1, out_size)`` and ``new_hidden`` is
            ``(1, batch, hidden_size)``, ready to be passed to the next step.
        """
        emb = self.dropout(self.emb(X))
        # For a single-layer GRU fed one time step, `out` carries the same values
        # as the returned hidden state, but in (batch, 1, hidden) layout. That is
        # the layout attention needs, so it works for any batch size.
        out, _ = self.GRU(emb, hidden)
        context = self._attention_func(enc_outs, out)  # (batch, 1, hidden)
        combined = torch.cat((context, out), dim=-1)   # (batch, 1, 2 * hidden)
        attn_hidden = self.tanh(self.Wc(combined))     # (batch, 1, hidden)
        logits = self.fc(attn_hidden)                  # (batch, 1, out_size)

        # Back to the (layers, batch, hidden) layout the GRU expects as h_0.
        new_hidden = attn_hidden.transpose(0, 1).contiguous()
        return logits, new_hidden

    def forward(self, encoder_outputs, hidden_state,
                y=None, y_len=None, teacher_forcing_rate=0.75,
                max_len=50, sos_token_id=101, eos_token_id=102):
        """Decode a sequence, with teacher forcing when a target is supplied.

        Args:
            encoder_outputs: ``(batch, src_len, hidden_size)`` encoder outputs.
            hidden_state: Initial GRU hidden state, ``(1, batch, hidden_size)``.
            y: Optional target ids ``(batch, tgt_len)`` whose first column is
                the start token. Enables teacher-forced decoding.
            y_len: Integer number of target positions; ``y_len - 1`` steps are
                decoded. Must be at least 2 so that there is something to predict.
            teacher_forcing_rate: Probability of feeding the ground-truth token
                (instead of the model's own prediction) at each step.
            max_len: Hard cap on the number of greedy decoding steps when no
                target is given, so decoding ends even if EOS is never produced.
            sos_token_id: Id of the token that starts decoding.
            eos_token_id: Id of the token that ends greedy decoding.

        Returns:
            Logits of shape ``(batch, steps, out_size)``.
        """
        batch_size = encoder_outputs.size(0)
        target_device = encoder_outputs.device
        # Build the start token on the same device as the incoming tensors rather
        # than relying on a module-level `device` variable.
        step_input = torch.full((batch_size, 1), sos_token_id,
                                dtype=torch.long, device=target_device)
        hidden = hidden_state
        outputs = []

        if y is not None and y_len is not None:
            for i in range(1, y_len):
                out, hidden = self.forward_step(step_input, hidden, encoder_outputs)
                if np.random.rand() < teacher_forcing_rate:
                    step_input = y[:, i].reshape(-1, 1)
                else:
                    step_input = out.argmax(dim=-1).reshape(-1, 1)
                outputs.append(out.squeeze(1))
        else:
            # Greedy decoding: stop once every sequence in the batch has emitted
            # EOS, or after max_len steps, whichever comes first.
            finished = torch.zeros(batch_size, dtype=torch.bool, device=target_device)
            for _ in range(max_len):
                out, hidden = self.forward_step(step_input, hidden, encoder_outputs)
                step_input = out.argmax(dim=-1).reshape(-1, 1)
                outputs.append(out.squeeze(1))
                finished |= step_input.squeeze(1) == eos_token_id
                if finished.all():
                    break

        return torch.stack(outputs, dim=1)

    def _attention_func(self, encoder_outputs, decoder_hidden):
        """Dot-product attention.

        Args:
            encoder_outputs: ``(batch, src_len, hidden_size)``.
            decoder_hidden: Query of shape ``(batch, 1, hidden_size)``.

        Returns:
            Context vector of shape ``(batch, 1, hidden_size)``: the
            attention-weighted sum of the encoder outputs.
        """
        # (batch, src_len, hidden) @ (batch, hidden, 1) -> (batch, src_len, 1)
        attention_scores = torch.bmm(encoder_outputs,
                                     decoder_hidden.transpose(1, 2))
        # Normalise over the source positions.
        attention_weights = torch.softmax(attention_scores, dim=1)
        # (batch, 1, src_len) @ (batch, src_len, hidden) -> (batch, 1, hidden)
        context_vector = torch.bmm(attention_weights.transpose(1, 2), encoder_outputs)

        return context_vector

### $\text{Final model}$

In [6]:
class Seq2Seq(nn.Module):
    """Chains an encoder and a decoder into a single module.

    Args:
        encoder: Module called as ``encoder(X)`` returning ``(outputs, hidden)``.
        decoder: Module called as ``decoder(outputs, hidden[, y, y_len])``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, X, y=None, y_len=None):
        """Encode ``X`` and decode.

        When ``y_len`` is given it is converted to a Python number with
        ``.item()`` and forwarded to the decoder together with ``y``;
        otherwise the decoder is called without a target.
        """
        enc_outs, hidden = self.encoder(X)
        if y_len is None:
            return self.decoder(enc_outs, hidden)
        return self.decoder(enc_outs, hidden, y, y_len.item())

In [9]:
# Training length and model dimensions.
epochs = 10
# Source and target use the same tokenizer, so both sides have the same vocabulary size.
out_size = tokenizer.vocab_size
input_size = tokenizer.vocab_size
emb_size = 300
hidden_size = 256
dropout_p = 0.3

In [13]:
# Build the encoder from the hyperparameters above, then place it on the chosen device.
encoder = Encoder(
    input_size=input_size,
    emb_size=emb_size,
    hidden_size=hidden_size,
    dropout=dropout_p,
)
encoder = encoder.to(device)

In [14]:
# Build the decoder from the hyperparameters above, then place it on the chosen device.
decoder = Decoder(
    out_size=out_size,
    emb_size=emb_size,
    hidden_size=hidden_size,
    dropout=dropout_p,
)
decoder = decoder.to(device)

In [15]:
# Tie encoder and decoder together in one trainable module.
model = Seq2Seq(encoder=encoder, decoder=decoder)
model = model.to(device)

In [16]:
def initialize_weights(model):
    """Re-initialise the parameters of ``model`` in place.

    Parameters whose name contains ``'weight'`` and that have more than one
    dimension get Xavier-uniform values; otherwise, parameters whose name
    contains ``'bias'`` are set to zero. Everything else is left untouched.
    """
    for param_name, tensor in model.named_parameters():
        is_weight_matrix = 'weight' in param_name and tensor.dim() > 1
        if is_weight_matrix:
            nn.init.xavier_uniform_(tensor)
            continue
        if 'bias' in param_name:
            nn.init.constant_(tensor, 0)

In [17]:
# `model` contains both the encoder and the decoder, so a single pass over its
# parameters initialises everything. Separate per-submodule calls are unnecessary
# and would only re-draw weights that were just set.
initialize_weights(model)

### $\text{Training process}$

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=0.01)
# gamma = 0.1 ** 0.1 means ten scheduler steps shrink the learning rate tenfold.
lr_decay = 0.1 ** (0.1)
sheduler = lr_scheduler.ExponentialLR(optimizer, gamma=lr_decay)

In [None]:
model.train()
epoch_losses = []

for epoch in range(epochs):
    batch_losses = []
    for X_batch, y_batch, y_len in tqdm(train_loader):
        # Move the batch to the device once; the target is reused both as the
        # teacher-forcing input and as the loss target.
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        logits = model(X_batch, y_batch, y_len)

        # The decoder predicts positions 1..len-1, so the first (start) token is
        # dropped from the target. reshape(-1) flattens across batch and time for
        # any batch size, unlike squeeze(0), which is only correct for batch size 1.
        targets = y_batch[:, 1:].reshape(-1).to(torch.long)
        logits = logits.reshape(-1, logits.size(-1)).to(torch.float)

        loss = criterion(logits, targets)
        loss.backward()
        clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        batch_losses.append(loss.item())

    epoch_loss = np.mean(batch_losses)
    epoch_losses.append(epoch_loss)
    print(f'epoch_loss = {epoch_loss}')
    sheduler.step()

100%|██████████| 12000/12000 [14:48<00:00, 13.50it/s]


epoch_loss = 1.8283188236717445


100%|██████████| 12000/12000 [14:47<00:00, 13.53it/s]


epoch_loss = 1.7503193752334143


100%|██████████| 12000/12000 [14:53<00:00, 13.43it/s]


epoch_loss = 1.6800989822942454


100%|██████████| 12000/12000 [14:55<00:00, 13.39it/s]


epoch_loss = 1.6457418732300866


100%|██████████| 12000/12000 [14:55<00:00, 13.40it/s]


epoch_loss = 1.5934185882090435


100%|██████████| 12000/12000 [14:50<00:00, 13.48it/s]


epoch_loss = 1.5557692031658565


100%|██████████| 12000/12000 [14:43<00:00, 13.58it/s]


epoch_loss = 1.5397512387759829


100%|██████████| 12000/12000 [14:43<00:00, 13.58it/s]


epoch_loss = 1.5188103967184823


 39%|███▉      | 4682/12000 [05:43<08:44, 13.94it/s]

### $\text{Saving model weights}$

In [None]:
# Dedicated names for the checkpoint files: rebinding `path` here would overwrite
# the dataset path that the data-loading cell reads, breaking a later re-run of it.
weights_path = 'attention_weigths_1.pth'
optimizer_path = 'optimizer_weights.pth'
torch.save(model.state_dict(), weights_path)
torch.save(optimizer.state_dict(), optimizer_path)


gdrive_path = '/content/drive/MyDrive/attention_weigths_1.pth'
gdrive_path_opt = '/content/drive/MyDrive/optimizer_weights.pth'

# Write locally first, then move the finished files onto the mounted Drive.
shutil.move(weights_path, gdrive_path)
shutil.move(optimizer_path, gdrive_path_opt)
print(f"Weights uploaded to Google Drive: {gdrive_path}")

Weights uploaded to Google Drive: /content/drive/MyDrive/attention_weigths_1.pth


### $\text{Import weights}$

In [None]:
# Drive locations of the saved model weights and optimizer state.
gdrive_path = '/content/drive/MyDrive/attention_weigths_1.pth'
gdrive_path_opt = '/content/drive/MyDrive/optimizer_weights.pth'

In [None]:
# map_location lets a checkpoint written on a GPU load on whatever device this
# session has. weights_only=True restricts unpickling to tensors and plain
# containers, which is all a state dict holds.
model.load_state_dict(torch.load(gdrive_path, map_location=device, weights_only=True))
optimizer.load_state_dict(torch.load(gdrive_path_opt, map_location=device, weights_only=True))
model.to(device)

  model.load_state_dict(torch.load(gdrive_path))
  optimizer.load_state_dict(torch.load(gdrive_path_opt))


Seq2Seq(
  (encoder): Encoder(
    (emb): Embedding(119547, 300)
    (GRU): GRU(300, 256, batch_first=True)
    (dropout_1): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (emb): Embedding(119547, 300)
    (GRU): GRU(300, 256, batch_first=True)
    (fc): Linear(in_features=256, out_features=119547, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
    (Wc): Linear(in_features=512, out_features=256, bias=True)
    (tanh): Tanh()
  )
)

### $\text{Inference}$

In [None]:
# Take one pair from the test loader. Separate names are used so that the
# X_test / y_test split created earlier is not overwritten by tensors.
X_sample, y_sample, sample_len = next(iter(test_loader))
print(X_sample)

# Switch off dropout and gradient tracking for greedy decoding.
model.eval()
with torch.no_grad():
    outputs = model(X_sample.to(device))
outputs = outputs.argmax(dim=-1)  # take the highest-scoring token id at every step
print("Predicted Token IDs:", outputs)

# Convert predicted ids to text, dropping the special tokens.
predicted_sentences = [
    tokenizer.decode(output.tolist(), skip_special_tokens=True)
    for output in outputs
]

# Decode the reference sentences the same way.
true_sentences = [
    tokenizer.decode(target.tolist(), skip_special_tokens=True)
    for target in y_sample
]

# Print the results.
for i, predicted_sentence in enumerate(predicted_sentences):
    print(f"Input Sentence: {tokenizer.decode(X_sample[i].tolist(), skip_special_tokens=True)}")
    print(f"Predicted Translation: {predicted_sentence}")
    print(f"True Translation: {true_sentences[i]}")

tensor([[  101, 19132, 11552, 13028,   102]])
Predicted Token IDs: tensor([[19132, 24931,   169, 25419,   102]], device='cuda:0')
Input Sentence: tom called you
Predicted Translation: tom vous a appelé
True Translation: tom vous a appelé
