In [None]:
# Mount Google Drive (Colab only) so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### $\text{Import libraries}$

In [None]:
import re
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from sklearn.metrics import roc_auc_score
from torch.nn.utils import clip_grad_norm_
from torch.optim import AdamW, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

### $\text{In this task I'll use a multilingual tokenizer from Hugging Face}$

In [None]:
# Load the pretrained multilingual BERT tokenizer; the notebook uses this one
# tokenizer (and therefore one shared vocabulary) for both languages.
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### $\text{I will translate sequences from English to French with my model}$

In [None]:
# Alternative English-German corpus (currently unused):
# path = '/content/drive/MyDrive/en_de_sentences.txt'
# English-French sentence pairs stored on the mounted Google Drive.
path = '/content/drive/MyDrive/en_fr.csv'
# Passed as `max_length` to the tokenizer (with truncation enabled) for every sentence.
MAX_SEQ_LEN = 20

In [None]:
# Read only the first 15000 rows of the CSV to keep the experiment small.
data = pd.read_csv(path, nrows=15000)
data

Unnamed: 0,en_sentence,fr_sentence
0,go,va
1,go,marche
2,go,en route
3,go,bouge
4,hi,salut
...,...,...
14995,he has no money,il na pas dargent
14996,he has ten cows,il possède dix vaches
14997,he has two cats,il a deux chats
14998,he has two dogs,il a deux chiens


### $\text{Call .encode_plus to get 3 parameters: ['input_ids', 'token_type_ids', 'attention_mask']}$

In [None]:
def _encode_sentence(sentence):
    """Tokenize one sentence into PyTorch tensors, truncated to MAX_SEQ_LEN."""
    return tokenizer.encode_plus(
        sentence,
        max_length=MAX_SEQ_LEN,
        truncation=True,
        return_tensors='pt',
    )


# Both languages go through the same tokenizer settings.
data['en_inf'] = data['en_sentence'].apply(_encode_sentence)
data['fr_inf'] = data['fr_sentence'].apply(_encode_sentence)

### $\text{Create 3 new columns with new information}$

In [None]:
def _input_ids(encoding):
    """Return the token-id tensor stored in a tokenizer encoding."""
    return encoding['input_ids']


def _attended_token_count(encoding):
    """Return the sum of the attention mask as a Python number."""
    return encoding['attention_mask'].sum().item()


data['en_tokenized'] = data['en_inf'].apply(_input_ids)
data['fr_tokenized'] = data['fr_inf'].apply(_input_ids)
# Only the French (target) length is kept; the decoder uses it as its step count.
data['fr_len'] = data['fr_inf'].apply(_attended_token_count)

In [None]:
# `return_tensors='pt'` yields tensors with a leading dimension of size 1
# (see the nested display below); drop it so each cell holds a flat id tensor.
data['en_tokenized'] = data['en_tokenized'].apply(lambda x: x.squeeze(0))
data['fr_tokenized'] = data['fr_tokenized'].apply(lambda x: x.squeeze(0))

In [None]:
# The raw tokenizer encodings are no longer needed once ids and lengths are extracted.
# NOTE(review): re-running this cell without re-running the cells above raises
# a KeyError because the columns are already gone.
drops = ['en_inf', 'fr_inf']
data = data.drop(columns=drops)

In [None]:
# Show the first four rows to sanity-check the new columns.
data.head(4)

Unnamed: 0,en_sentence,fr_sentence,en_tokenized,fr_tokenized,fr_len
0,go,va,"[tensor(101), tensor(11783), tensor(102)]","[tensor(101), tensor(10321), tensor(102)]",3
1,go,marche,"[tensor(101), tensor(11783), tensor(102)]","[tensor(101), tensor(56229), tensor(102)]",3
2,go,en route,"[tensor(101), tensor(11783), tensor(102)]","[tensor(101), tensor(10110), tensor(13933), te...",4
3,go,bouge,"[tensor(101), tensor(11783), tensor(102)]","[tensor(101), tensor(94335), tensor(10525), te...",4


In [None]:
# 80/20 train/test split with a fixed seed. Inputs are the English id tensors;
# targets carry both the French id tensors and their lengths.
X_train, X_test, y_train, y_test = train_test_split(data['en_tokenized'], data[['fr_tokenized', 'fr_len']],
                                                    test_size=0.2, random_state=42)

In [None]:
# Show the first four training targets (French ids and their lengths).
y_train.head(4)

Unnamed: 0,fr_tokenized,fr_len
9839,"[tensor(101), tensor(18354), tensor(10322), te...",7
9680,"[tensor(101), tensor(19132), tensor(10321), te...",5
7093,"[tensor(101), tensor(49301), tensor(10381), te...",9
11293,"[tensor(101), tensor(10144), tensor(10911), te...",10


### $\text{Making EnFrDataset for our model}$

In [None]:
class EnFrDataset(Dataset):
    """Positional-index dataset over tokenized English/French sentence pairs.

    Args:
        X: pandas Series holding the source (English) token ids.
        y: pandas DataFrame with the columns ``fr_tokenized`` (target ids)
            and ``fr_len`` (target length).
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        """Number of sentence pairs (length of the source series)."""
        return len(self.X)

    def __getitem__(self, index):
        """Return ``(source_ids, target_ids, target_len)`` at position ``index``."""
        source_ids = self.X.iloc[index]
        target_ids = self.y['fr_tokenized'].iloc[index]
        target_len = self.y['fr_len'].iloc[index]
        return source_ids, target_ids, target_len

In [None]:
# Wrap the two splits so a DataLoader can serve (source ids, target ids, target length) triples.
train_data = EnFrDataset(X_train, y_train)
test_data = EnFrDataset(X_test, y_test)

In [None]:
# Batch size 1: sentences have different lengths and no padding collate
# function is defined, so each batch carries exactly one sentence pair.
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1, shuffle=True)

In [None]:
# Peek at one batch: [English ids, French ids, French length], each with a leading batch dimension.
next(iter(train_loader))

[tensor([[  101, 10347, 19509,   102]]),
 tensor([[   101, 103559,  12715,  20694,  10291,  54268,  10171,    102]]),
 tensor([8])]

### $\text{Encoder structure}$

In [None]:
class Encoder(nn.Module):
    """Embedding + single-layer GRU encoder for the source sentence.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_size: embedding dimension (also the GRU input size).
        hidden_size: GRU hidden-state dimension.
        dropout: dropout probability applied to the embeddings and to the
            GRU outputs.
    """

    def __init__(self, vocab_size, emb_size, hidden_size, dropout):
        super().__init__()
        self.vocab_size, self.emb_size, self.hidden_size = vocab_size, emb_size, hidden_size

        self.emb = nn.Embedding(vocab_size, emb_size)
        self.GRU = nn.GRU(input_size=emb_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, X):
        """Encode token ids ``X`` (batch-first).

        Returns:
            ``(out, hidden)``: the dropout-regularised per-step GRU outputs
            and the final hidden state exactly as returned by ``nn.GRU``.
        """
        embedded = self.dropout(self.emb(X))
        states, last_hidden = self.GRU(embedded)
        return self.dropout(states), last_hidden

### $\text{Decoder structure}$

In [None]:
class Decoder(nn.Module):
    """Embedding + single-layer GRU decoder with a linear vocabulary projection.

    Args:
        out_size: target vocabulary size (embedding rows and output logits).
        emb_size: embedding dimension (also the GRU input size).
        hidden_size: GRU hidden-state dimension.
        dropout: dropout probability applied to the input embeddings.
    """

    # Ids that open / close every tokenized sentence in this notebook
    # ([CLS] = 101 and [SEP] = 102 in the BERT vocabulary); they serve as the
    # start-of-sequence and end-of-sequence markers for decoding.
    SOS_TOKEN_ID = 101
    EOS_TOKEN_ID = 102
    # Fallback cap on greedy-decoding steps, so a model that never predicts
    # EOS cannot loop forever.
    DEFAULT_MAX_DECODE_STEPS = 20

    def __init__(self, out_size, emb_size, hidden_size, dropout):
        super(Decoder, self).__init__()
        self.out_size = out_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size

        self.emb = nn.Embedding(out_size, emb_size)
        self.GRU = nn.GRU(input_size=self.emb_size,
                          hidden_size=self.hidden_size,
                          batch_first=True,
                          )
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(self.hidden_size, self.out_size)

    def forward_step(self, X, hidden):
        """Run one decoding step.

        Args:
            X: token ids of shape ``(batch, 1)``.
            hidden: current GRU hidden state.

        Returns:
            ``(logits, hidden)`` where ``logits`` has shape
            ``(batch, 1, out_size)``.
        """
        X = self.dropout(self.emb(X))
        out, hidden = self.GRU(X, hidden)
        out = self.fc(out)

        return out, hidden

    def forward(self, hidden_state, y=None, y_len=None, teacher_forcing_rate=0.5, max_len=None):
        """Decode a sequence of vocabulary logits.

        Training mode (``y`` and ``y_len`` both given): runs ``y_len - 1``
        steps. After each step the next input is the ground-truth token
        ``y[:, i]`` with probability ``teacher_forcing_rate``, otherwise the
        model's own greedy prediction.

        Inference mode (otherwise): decodes greedily from the start token
        until every sequence in the batch has produced EOS or ``max_len``
        steps have been taken.

        Args:
            hidden_state: initial GRU hidden state, shape
                ``(num_layers, batch, hidden_size)``.
            y: target token ids, shape ``(batch, seq_len)``; position 0 is
                the start token and is never predicted.
            y_len: number of target tokens to consume (Python int).
            teacher_forcing_rate: probability in ``[0, 1]`` of feeding the
                ground-truth token at each training step.
            max_len: inference step cap; ``None`` falls back to
                ``DEFAULT_MAX_DECODE_STEPS``.

        Returns:
            Logits of shape ``(batch, steps, out_size)``.

        Raises:
            ValueError: if ``y_len < 2`` in training mode or ``max_len < 1``
                in inference mode (no step would be produced).
        """
        hidden = hidden_state
        batch_size = hidden_state.size(1)
        # Build the start token on the hidden state's device rather than a
        # global `device`, and for every sequence in the batch.
        step_input = torch.full((batch_size, 1), self.SOS_TOKEN_ID,
                                dtype=torch.long, device=hidden_state.device)
        outputs = []

        if y is not None and y_len is not None:
            if y_len < 2:
                raise ValueError("y_len must be at least 2 (start token plus one target token)")
            for i in range(1, y_len):
                out, hidden = self.forward_step(step_input, hidden)
                outputs.append(out.squeeze(1))
                if torch.rand(1).item() < teacher_forcing_rate:
                    step_input = y[:, i].view(-1, 1)
                else:
                    # Feed back the greedy prediction; argmax carries no gradient.
                    step_input = out.argmax(dim=-1).detach()

            return torch.stack(outputs, dim=1)

        if max_len is None:
            max_len = self.DEFAULT_MAX_DECODE_STEPS
        if max_len < 1:
            raise ValueError("max_len must be at least 1")

        finished = torch.zeros(batch_size, dtype=torch.bool, device=hidden_state.device)
        for _ in range(max_len):
            out, hidden = self.forward_step(step_input, hidden)
            outputs.append(out.squeeze(1))
            step_input = out.argmax(dim=-1)
            finished |= step_input.squeeze(1) == self.EOS_TOKEN_ID
            if finished.all():
                break

        return torch.stack(outputs, dim=1)

### $\text{Final model}$

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper: encodes the source, then decodes from its final hidden state.

    Args:
        encoder: module whose ``forward(X)`` returns ``(outputs, hidden)``.
        decoder: module whose ``forward`` accepts
            ``(hidden_state, y=None, y_len=None, teacher_forcing_rate=...)``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, X, y=None, y_len=None, teacher_forcing_rate=0.5):
        """Translate ``X``.

        Args:
            X: source token ids.
            y: target token ids (training only).
            y_len: target length, either a one-element tensor (as produced by
                the DataLoader) or a plain int. ``None`` selects inference.
            teacher_forcing_rate: forwarded to the decoder.

        Returns:
            Whatever the decoder returns (logits per decoding step).
        """
        _, hidden = self.encoder(X)
        if y_len is not None:
            y_len = int(y_len.item()) if torch.is_tensor(y_len) else int(y_len)

        # Keyword arguments: a positional `teacher_forcing_rate` would land in
        # the decoder's `y` parameter.
        return self.decoder(hidden, y=y, y_len=y_len,
                            teacher_forcing_rate=teacher_forcing_rate)

In [None]:
# Training / model hyperparameters.
epochs = 10  # number of passes over the training loader
out_size = tokenizer.vocab_size  # decoder output classes
input_size = tokenizer.vocab_size  # encoder embedding rows (same tokenizer for both languages)
emb_size = 300  # embedding dimension for encoder and decoder
hidden_size = 256  # GRU hidden-state dimension
dropout_p = 0.3  # dropout probability for encoder and decoder

In [None]:
# Build the source-side encoder and place it on the selected device.
encoder = Encoder(vocab_size = input_size,
                  emb_size = emb_size,
                  hidden_size = hidden_size,
                  dropout = dropout_p).to(device)

In [None]:
# Build the target-side decoder with the same embedding and hidden sizes as the encoder.
decoder = Decoder(out_size = out_size,
                  emb_size = emb_size,
                  hidden_size = hidden_size,
                  dropout = dropout_p).to(device)

In [None]:
# Combine encoder and decoder into the full translation model.
model = Seq2Seq(encoder, decoder).to(device)

In [None]:
def initialize_weights(model):
    """Re-initialise the parameters of ``model`` in place.

    Parameters whose name contains ``'weight'`` and that have more than one
    dimension get Xavier-uniform values; any other parameter whose name
    contains ``'bias'`` is set to zero. Everything else is left untouched.
    """
    for param_name, tensor in model.named_parameters():
        is_matrix_weight = tensor.dim() > 1 and 'weight' in param_name
        if is_matrix_weight:
            # Xavier-uniform initialisation for multi-dimensional weights
            nn.init.xavier_uniform_(tensor)
            continue
        if 'bias' in param_name:
            # Biases start at zero
            nn.init.constant_(tensor, 0)

In [None]:
# `model` holds `encoder` and `decoder` as sub-modules, so this single call
# initialises every parameter. Initialising the parts separately first would
# only be overwritten.
initialize_weights(model)

### $\text{Training process}$

In [None]:
# Token-level cross-entropy between decoder logits and target ids.
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=0.01)
# gamma = 0.1 ** 0.1 is applied once per epoch, so the learning rate shrinks
# by a factor of 10 over 10 epochs.
scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.1 ** 0.1)

In [None]:
model.train()  # enable dropout while optimising
epoch_loss = []  # mean training loss of each epoch

for epoch in range(epochs):
    batch_loss = []
    for X_batch, y_batch, y_len_batch in tqdm(train_loader):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()

        # One row of logits per predicted target position.
        logits = model(X_batch, y_batch, y_len_batch).reshape(-1, tokenizer.vocab_size)
        # The start token at position 0 is an input, never a prediction target.
        targets = y_batch[:, 1:].reshape(-1)
        loss = criterion(logits, targets)

        loss.backward()
        clip_grad_norm_(model.parameters(), 1.0)  # cap the gradient norm at 1.0
        optimizer.step()
        batch_loss.append(loss.item())

    scheduler.step()  # decay the learning rate once per epoch
    mean_loss = np.mean(batch_loss)
    print(mean_loss)
    epoch_loss.append(mean_loss)

100%|██████████| 12000/12000 [14:39<00:00, 13.64it/s]


5.906177466270824


100%|██████████| 12000/12000 [14:37<00:00, 13.67it/s]


5.379757589727485


100%|██████████| 12000/12000 [14:37<00:00, 13.68it/s]


3.921056116256242


100%|██████████| 12000/12000 [14:37<00:00, 13.67it/s]


2.935795401330727


100%|██████████| 12000/12000 [14:36<00:00, 13.69it/s]


2.287437022817166


100%|██████████| 12000/12000 [14:35<00:00, 13.71it/s]


1.8298963842905747


100%|██████████| 12000/12000 [14:40<00:00, 13.63it/s]


1.5183114645970055


100%|██████████| 12000/12000 [14:39<00:00, 13.64it/s]


1.2826702057590205


100%|██████████| 12000/12000 [14:34<00:00, 13.73it/s]


1.083929545631516


100%|██████████| 12000/12000 [14:33<00:00, 13.73it/s]

0.9355591819067873





### $\text{Saving weights}$

In [None]:
# Persist only the parameters (state_dict), not the whole pickled model object.
model_path = "model_weights.pth"
torch.save(model.state_dict(), model_path)
print(f"Weights saved locally as {model_path}")

Weights saved locally as model_weights.pth


In [None]:
from google.colab import drive
import shutil

# Mount Google Drive (only a notice is printed when it is already mounted)
drive.mount('/content/drive')

# Destination path on Google Drive
gdrive_path = '/content/drive/MyDrive/model_weights.pth'

# Move the weights file to Drive (shutil.move: the local copy does not remain)
shutil.move(model_path, gdrive_path)
print(f"Weights uploaded to Google Drive: {gdrive_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Weights uploaded to Google Drive: /content/drive/MyDrive/model_weights.pth


In [None]:
weights_path = '/content/drive/MyDrive/model_weights.pth'
# map_location=device keeps this cell working on CPU-only runtimes (the
# checkpoint was saved from a CUDA model). weights_only=True restricts
# unpickling to tensors and plain containers, which is all a state_dict needs.
model.load_state_dict(torch.load(weights_path, map_location=device, weights_only=True))
model.to(device)

  model.load_state_dict(torch.load(weights_path, map_location='cuda'))


Seq2Seq(
  (encoder): Encoder(
    (emb): Embedding(119547, 300)
    (GRU): GRU(300, 256, batch_first=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (emb): Embedding(119547, 300)
    (GRU): GRU(300, 256, batch_first=True)
    (dropout): Dropout(p=0.3, inplace=False)
    (fc): Linear(in_features=256, out_features=119547, bias=True)
  )
)

Let's see how our model translates new sentences.

### $\text{Inference}$

In [None]:
# Draw one random (source, target, target length) batch from the held-out loader.
# Separate names keep the X_test / y_test split objects intact.
X_sample, y_sample, y_sample_len = next(iter(test_loader))
print(X_sample)

model.eval()  # disable dropout so the prediction is deterministic
with torch.no_grad():  # no gradients are needed for inference
    outputs = model(X_sample.to(device))
outputs = outputs.argmax(dim=-1)  # pick the highest-scoring token id at every step
print("Predicted Token IDs:", outputs)

# Convert predicted ids back to text, dropping special tokens such as [CLS] / [SEP]
predicted_sentences = [
    tokenizer.decode(output.tolist(), skip_special_tokens=True)
    for output in outputs
]

# Decode the reference translations the same way
true_sentences = [
    tokenizer.decode(target.tolist(), skip_special_tokens=True)
    for target in y_sample
]

# Print source, prediction and reference side by side
for i in range(len(predicted_sentences)):
    print(f"Input Sentence: {tokenizer.decode(X_sample[i].tolist(), skip_special_tokens=True)}")
    print(f"Predicted Translation: {predicted_sentences[i]}")
    print(f"True Translation: {true_sentences[i]}")

tensor([[  101, 25430, 25923, 10147,   102]])
Predicted Token IDs: tensor([[29177, 22464, 25923, 10627,   102]], device='cuda:0')
Input Sentence: remain calm
Predicted Translation: garde ton calme
True Translation: gardez votre calme
