In [None]:
# Pinned to the version this notebook was last run with; %pip installs into the running kernel's environment.
%pip install -q rouge==1.0.1

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import math, json, random, os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import numpy as np
from tqdm import tqdm
from datetime import datetime
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge
import copy

# Prefer the GPU whenever PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("–í–∏–∫–æ—Ä–∏—Å—Ç–æ–≤—É—î—Ç—å—Å—è –ø—Ä–∏—Å—Ç—Ä—ñ–π:", device)

Використовується пристрій: cuda


In [None]:
# Load the raw corpus from a path relative to the notebook's working directory.
# NOTE(review): latin-1 maps every byte to a character, so a UTF-8 file would load
# without error but with garbled text — confirm the file's real encoding.
data = pd.read_csv("../data/biblie.csv", encoding="latin-1")
# Quick sanity check: first rows and the available column names.
print(data.head())
print(data.columns)

                                               texts       books
0  he was caught up to the third hmmm and in nehe...    nehemiah
1  holy spirit is that he does good it says in ru...   1 timothy
2  is what the bible calls the flesh and when we ...      esther
3  and being led by the spirit of god in colossia...   ephesians
4  in isaiah chapter 41 from verse 10 to 14 isaia...  revelation
Index(['texts', 'books'], dtype='object')


### -- `Data Cleaning` --

In [None]:
# Keep only the text column, drop rows without text, then work on a fixed
# (seeded) 20% random subset of what remains.
data = (
    data.loc[:, ["texts"]]
    .dropna()
    .sample(frac=0.20, random_state=42)
)

### -- `Tokenization` --

In [None]:
# Load the pretrained "bert-base-uncased" tokenizer (downloaded on first use).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Default token budget per text; tokenize_data passes it as max_length with truncation enabled.
MAX_LEN = 64
def tokenize_data(texts, tokenizer, max_len=MAX_LEN):
    """Tokenize a collection of texts into padded, truncated PyTorch tensors.

    Args:
        texts: The texts to encode. Accepts anything with a ``tolist()`` method
            (e.g. a pandas Series or NumPy array), any other iterable of
            strings, or a single string (treated as a one-element batch).
        tokenizer: Callable tokenizer invoked with ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments.
        max_len: Maximum number of tokens per text; longer texts are truncated.

    Returns:
        Whatever ``tokenizer`` returns for the batch, requested as PyTorch
        tensors (``return_tensors="pt"``).
    """
    if isinstance(texts, str):
        # A bare string would otherwise be split into single characters by list().
        texts = [texts]
    elif hasattr(texts, "tolist"):
        texts = texts.tolist()
    else:
        texts = list(texts)

    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt"
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### -- `Dataset and Attention Mask` --

In [None]:
class LMDataset(Dataset):
    """Next-token-prediction dataset built from a collection of texts.

    All texts are tokenized once in ``__init__``. Each item pairs the token ids
    of one text with labels that are the same ids shifted one position to the
    left; the final label slot is filled with the pad token id.

    Args:
        texts: Texts to encode (forwarded to ``tokenize_data``).
        tokenizer: Tokenizer used for encoding; must define ``pad_token_id``.
        max_len: Maximum number of tokens per text.

    Raises:
        ValueError: If ``tokenizer.pad_token_id`` is ``None``.
    """

    def __init__(self, texts, tokenizer, max_len=64):
        if tokenizer.pad_token_id is None:
            raise ValueError("tokenizer must define pad_token_id to build shifted labels")
        self.encodings = tokenize_data(texts, tokenizer, max_len)
        self.input_ids = self.encodings["input_ids"]
        self.attention_mask = self.encodings["attention_mask"]
        # Keep the pad id of the tokenizer that produced these encodings, so
        # __getitem__ does not depend on a notebook-level global `tokenizer`.
        self.pad_token_id = tokenizer.pad_token_id

    def __len__(self):
        """Return the number of encoded texts."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``{"input_ids", "labels"}`` for the text at ``idx``."""
        x = self.input_ids[idx]
        # Start from an all-pad tensor so the last slot (no next token) is pad.
        y = torch.full_like(x, self.pad_token_id)
        y[:-1] = x[1:]
        return {"input_ids": x, "labels": y}

### -- `Model Architecture` --

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The table is precomputed for ``max_len`` positions and stored as the
    non-trainable buffer ``pe`` with shape ``(1, max_len, d_model)``.
    Even feature columns hold sines and odd columns hold cosines.

    Args:
        d_model: Size of the feature dimension (odd values are supported).
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angles to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

class TransformerLM(nn.Module):
    """Causal language model assembled from ``nn.TransformerDecoder`` layers.

    Token ids are embedded, combined with positional encodings, passed through
    the decoder stack under a square subsequent (causal) mask, and projected to
    vocabulary-sized logits. The decoder's ``memory`` input is an all-zero
    tensor of shape ``(batch, 1, d_model)``, since there is no encoder.

    Args:
        vocab_size: Number of rows in the embedding and size of the output layer.
        d_model: Embedding / hidden width.
        nhead: Number of attention heads per decoder layer.
        num_layers: Number of stacked decoder layers.
        dim_ff: Width of each layer's feed-forward sublayer.
        dropout: Dropout probability inside the decoder layers.
        max_len: Number of positions precomputed by the positional encoding.
    """

    def __init__(self, vocab_size, d_model=128, nhead=2, num_layers=5, dim_ff=512, dropout=0.1, max_len=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len)
        layer_options = dict(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True,
        )
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(**layer_options),
            num_layers=num_layers,
        )
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map a batch of token ids to per-position vocabulary logits."""
        hidden = self.pos_enc(self.embedding(x))
        batch_size, seq_len, width = hidden.shape
        # Each position may only attend to itself and earlier positions.
        causal_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(hidden.device)
        # Placeholder memory: the decoder API requires one, but no encoder exists.
        dummy_memory = torch.zeros(batch_size, 1, width, device=hidden.device)
        decoded = self.decoder(hidden, dummy_memory, tgt_mask=causal_mask)
        return self.fc_out(decoded)

### -- `Metrics` --

In [None]:
def compute_perplexity(loss):
    """Convert a mean cross-entropy loss into perplexity.

    Returns ``exp(loss)`` when ``loss`` is below 20 and ``inf`` otherwise.
    """
    # Kept as a positive `<` test so that a NaN loss also yields inf.
    if loss < 20:
        return math.exp(loss)
    return float("inf")

def evaluate_metrics(model, dataloader, tokenizer):
    """Evaluate a language model on a dataloader: perplexity, BLEU and ROUGE-L.

    The model is put in eval mode and run without gradients. Predictions are
    the per-position argmax of the logits for the given inputs (teacher
    forcing), so BLEU / ROUGE-L measure next-token agreement with the labels,
    not free-running generation.

    Args:
        model: Module mapping ``input_ids`` to logits of shape
            ``(batch, seq, vocab)``.
        dataloader: Iterable of batches with ``"input_ids"`` and ``"labels"``.
        tokenizer: Provides ``pad_token_id`` and ``decode``.

    Returns:
        Tuple ``(perplexity, bleu, rouge_l_f)``. Perplexity is ``inf`` when no
        non-pad label token was seen. BLEU and ROUGE-L are ``0.0`` when no
        non-empty reference/hypothesis pair could be decoded.
    """
    model.eval()
    pad_id = tokenizer.pad_token_id
    # Summed loss + explicit token count gives a token-weighted average, so a
    # short final batch does not count as much as a full one.
    criterion = nn.CrossEntropyLoss(ignore_index=pad_id, reduction="sum")

    # Run on the device the model lives on; fall back to the notebook-level
    # `device` only for a parameter-free model.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device

    all_refs, all_hyps = [], []
    total_nll, total_tokens = 0.0, 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(run_device)
            labels = batch["labels"].to(run_device)
            logits = model(input_ids)
            total_nll += criterion(logits.view(-1, logits.size(-1)), labels.view(-1)).item()

            scored = labels != pad_id
            total_tokens += int(scored.sum().item())

            preds = logits.argmax(-1).cpu()
            for ref, hyp, keep in zip(labels.cpu(), preds, scored.cpu()):
                # Drop predictions made at padded positions: they have no
                # reference token and would only add noise to the hypothesis.
                r_text = tokenizer.decode(ref[keep].tolist(), skip_special_tokens=True).strip()
                h_text = tokenizer.decode(hyp[keep].tolist(), skip_special_tokens=True).strip()
                if len(h_text) == 0 or len(r_text) == 0:
                    continue
                all_refs.append([r_text.split()])
                all_hyps.append(h_text.split())

    # Perplexity depends only on the loss, so report it even if no text pair survived.
    ppl = compute_perplexity(total_nll / total_tokens) if total_tokens else float("inf")

    if not all_refs or not all_hyps:
        print(" –£–≤–∞–≥–∞: –ø—ñ–¥ —á–∞—Å –æ—Ü—ñ–Ω–∫–∏ –∑–≥–µ–Ω–µ—Ä–æ–≤–∞–Ω–æ –∑–∞–Ω–∞–¥—Ç–æ –∫–æ—Ä–æ—Ç–∫—ñ –≥—ñ–ø–æ—Ç–µ–∑–∏.")
        return ppl, 0.0, 0.0

    bleu = corpus_bleu(all_refs, all_hyps)
    refs_joined = [" ".join(r[0]) for r in all_refs]
    hyps_joined = [" ".join(h) for h in all_hyps]
    # Rouge.get_scores expects hypotheses first, references second.
    rouge_l = Rouge().get_scores(hyps_joined, refs_joined, avg=True)["rouge-l"]["f"]

    return ppl, bleu, rouge_l


### -- `Hyperparameter Tuning` --

In [None]:
# --- Run configuration -------------------------------------------------------
SEED = 42      # re-applied before every configuration so runs are repeatable and comparable
EPOCHS = 30    # training epochs per configuration

# --- Train / validation split (85% / 15%) ------------------------------------
train_df = data.sample(frac=0.85, random_state=42)
val_df = data.drop(train_df.index)

train_ds = LMDataset(train_df["texts"], tokenizer, MAX_LEN)
val_ds = LMDataset(val_df["texts"], tokenizer, MAX_LEN)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32)

# Candidate model sizes; the smaller ones are currently disabled.
param_grid = [
    # {"d_model": 128, "nhead": 2},
    # {"d_model": 256, "nhead": 4},
    {"d_model": 512, "nhead": 8}
]

results = []
best_perplexity = float('inf')
best_model_state = None
best_conf = None

for conf in param_grid:
    print(f"\n –ù–∞–≤—á–∞–Ω–Ω—è –∫–æ–Ω—Ñ—ñ–≥—É—Ä–∞—Ü—ñ—ó: {conf}")

    # Seed every RNG in play so weight init and batch shuffling are reproducible.
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    model = TransformerLM(
        vocab_size=tokenizer.vocab_size,
        d_model=conf["d_model"],
        nhead=conf["nhead"],
        num_layers=2
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
    # Padded label positions carry no target and are excluded from the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"–ï–ø–æ—Ö–∞ {epoch+1}"):
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            optimizer.zero_grad()
            logits = model(input_ids)
            loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: {avg_loss:.4f}")

    # NOTE(review): validation runs only once, after the last epoch, so the
    # selected weights are always the final-epoch weights of each configuration.
    ppl, bleu, rouge_l = evaluate_metrics(model, val_loader, tokenizer)
    current_result = {
        **conf,
        "perplexity": round(ppl, 3),
        "BLEU": round(bleu, 4),
        "ROUGE-L": round(rouge_l, 4),
    }
    results.append(current_result)

    if ppl < best_perplexity:
        print(f" –ó–Ω–∞–π–¥–µ–Ω–æ –Ω–æ–≤—É –Ω–∞–π–∫—Ä–∞—â—É –º–æ–¥–µ–ª—å –∑ perplexity: {ppl:.3f} (–ø–æ–ø–µ—Ä–µ–¥–Ω—è: {best_perplexity:.3f})")
        best_perplexity = ppl
        # Deep copy so later training of another configuration cannot alter the snapshot.
        best_model_state = copy.deepcopy(model.state_dict())
        best_conf = conf

res = pd.DataFrame(results)
print("\n –†–µ–∑—É–ª—å—Ç–∞—Ç–∏ —Ç—é–Ω—ñ–Ω–≥—É:")
print(res)

print("\n –ù–∞–π–∫—Ä–∞—â–∞ –∫–æ–Ω—Ñ—ñ–≥—É—Ä–∞—Ü—ñ—è:", best_conf)

if best_model_state is not None:
    # Create the output directory first so a missing folder cannot discard the trained weights.
    os.makedirs("../models", exist_ok=True)
    torch.save(best_model_state, "../models/best_transformer_model.pt")
    tokenizer.save_pretrained("../models/my_tokenizer")
    print("\n –ù–∞–π–∫—Ä–∞—â—É –º–æ–¥–µ–ª—å —ñ —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä –∑–±–µ—Ä–µ–∂–µ–Ω–æ!")
else:
    print("\n –ù–µ –≤–¥–∞–ª–æ—Å—è –∑–Ω–∞–π—Ç–∏ –Ω–∞–π–∫—Ä–∞—â—É –º–æ–¥–µ–ª—å –¥–ª—è –∑–±–µ—Ä–µ–∂–µ–Ω–Ω—è.")



🔧 Навчання конфігурації: {'d_model': 512, 'nhead': 8}


–ï–ø–æ—Ö–∞ 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.41it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 4.6066


–ï–ø–æ—Ö–∞ 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.35it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 3.4724


–ï–ø–æ—Ö–∞ 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.54it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 2.8828


–ï–ø–æ—Ö–∞ 4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.42it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 2.4509


–ï–ø–æ—Ö–∞ 5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.47it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 2.1384


–ï–ø–æ—Ö–∞ 6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.44it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.9083


–ï–ø–æ—Ö–∞ 7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.39it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.7366


–ï–ø–æ—Ö–∞ 8: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.41it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.5974


–ï–ø–æ—Ö–∞ 9: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.37it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.4884


–ï–ø–æ—Ö–∞ 10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.39it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.4029


–ï–ø–æ—Ö–∞ 11: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.38it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.3285


–ï–ø–æ—Ö–∞ 12: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.39it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.2659


–ï–ø–æ—Ö–∞ 13: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.41it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.2153


–ï–ø–æ—Ö–∞ 14: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.39it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.1683


–ï–ø–æ—Ö–∞ 15: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.39it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.1269


–ï–ø–æ—Ö–∞ 16: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.44it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.0926


–ï–ø–æ—Ö–∞ 17: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.45it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.0590


–ï–ø–æ—Ö–∞ 18: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.45it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.0300


–ï–ø–æ—Ö–∞ 19: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.43it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 1.0033


–ï–ø–æ—Ö–∞ 20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.45it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 0.9811


–ï–ø–æ—Ö–∞ 21: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.46it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 0.9609


–ï–ø–æ—Ö–∞ 22: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.45it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 0.9410


–ï–ø–æ—Ö–∞ 23: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.43it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 0.9196


–ï–ø–æ—Ö–∞ 24: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.40it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 0.9025


–ï–ø–æ—Ö–∞ 25: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.42it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 0.8871


–ï–ø–æ—Ö–∞ 26: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.38it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 0.8728


–ï–ø–æ—Ö–∞ 27: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.43it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 0.8590


–ï–ø–æ—Ö–∞ 28: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.40it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 0.8457


–ï–ø–æ—Ö–∞ 29: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.41it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 0.8341


–ï–ø–æ—Ö–∞ 30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 531/531 [00:46<00:00, 11.42it/s]


–°–µ—Ä–µ–¥–Ω—è –≤—Ç—Ä–∞—Ç–∞: 0.8224
✨ Знайдено нову найкращу модель з perplexity: 7.800 (попередня: inf)

📊 Результати тюнінгу:
   d_model  nhead  perplexity    BLEU  ROUGE-L
0      512      8         7.8  0.4428   0.6554

🏆 Найкраща конфігурація: {'d_model': 512, 'nhead': 8}

✅ Найкращу модель і токенайзер збережено!
