# Imports

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
from datasets import load_dataset
from tqdm.notebook import tqdm

import random

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Configs

In [3]:
MODEL_NAME = "intfloat/e5-small"
DATASET_NAME = "M-AI-C/quran-en-tafssirs"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

BATCH_SIZE = 4
LEARNING_RATE = 1e-5
BATCH_ACCUMULATION = 24
MAX_LENGTH = 256
TEST_SIZE = 0.1
EPOCHS = 10

CONTRASTIVE_COLUMNS = {"en-tafsir-ibn-kathir-text",
                       "en-tafsir-maarif-text",
                       "en-tafsir-mokhtasar-text",
                       "en-sahih",}

# Dataset

In [4]:
# dataset = load_dataset('M-AI-C/en-tafsir-mokhtasar', split={"train" :'train[:98%]', "validation" :'train[98%:]'})
dataset = load_dataset(DATASET_NAME)

Found cached dataset parquet (/home/mustapha/.cache/huggingface/datasets/M-AI-C___parquet/M-AI-C--quran-en-tafssirs-6bd3437dc9c6bbbd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
def get_ayah_id(row):
    ayah = row["ayah"]
    sorah = row["sorah"]
    ayah_id = f"{sorah}:{ayah}"

    row["ayah_id"] = ayah_id
    return row

dataset = dataset.map(get_ayah_id, batched=False)

Loading cached processed dataset at /home/mustapha/.cache/huggingface/datasets/M-AI-C___parquet/M-AI-C--quran-en-tafssirs-6bd3437dc9c6bbbd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-036ba0ac17ad04f2.arrow


In [6]:
dataset["train"][183]

{'en-ahmedali': 'O believers, ordained for you is retribution for the murdered, (whether) a free man (is guilty) of (the murder of) a free man, or a slave of a slave, or a woman of a woman. But he who is pardoned some of it by his brother should be dealt with equity, and recompense (for blood) paid with a grace. This is a concession from your Lord and a kindness. He who transgresses in spite of it shall suffer painful punishment.',
 'en-ahmedraza': 'O People who Believe! Retribution is made obligatory for you in the matter of those killed unjustly; a freeman for a freeman, and a slave for a slave, and a female for a female; and for him who is partly forgiven by his brother, seek compensation with courtesy and make payment in proper manner; this is a relief and a mercy upon you, from your Lord; so after this, a painful punishment is for whoever exceeds the limits.',
 'en-arberry': 'O believers, prescribed for you is retaliation, touching the slain; freeman for freeman, slave for slave, 

In [7]:
# split the dataset into train and validation
dataset = dataset['train'].train_test_split(test_size=TEST_SIZE)

In [8]:
dataset["test"][1]

{'en-ahmedali': 'Do they have the stores of the mercy of your Lord, the mighty and munificent?',
 'en-ahmedraza': 'Or do they hold the treasures of the mercy of your Lord, the Almighty, the Great Bestower?',
 'en-arberry': "Or have they the treasuries of thy Lord's mercy, the All-mighty, the All-giving?",
 'en-asad': 'Or do they [think that they] own the treasures of thy Sustainer’s grace - [the grace] of the Almighty, the Giver of Gifts?',
 'en-daryabadi': 'Or, are with them the treasures of the mercy of thy Lord, the Bestower?',
 'en-hilali': 'Or have they the treasures of the Mercy of your Lord, the All-Mighty, the Real Bestower?',
 'en-itani': 'Or do they possess the treasuries of the mercy of your Lord—the Majestic, the Giver?',
 'en-maududi': 'Do they possess the treasures of your Lord, the Most Mighty, the Great Bestower?',
 'en-mubarakpuri': 'Or have they the treasures of the mercy of your Lord, the Almighty, the Real Bestower',
 'en-pickthall': 'Or are theirs the treasures of 

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en-ahmedali', 'en-ahmedraza', 'en-arberry', 'en-asad', 'en-daryabadi', 'en-hilali', 'en-itani', 'en-maududi', 'en-mubarakpuri', 'en-pickthall', 'en-qarai', 'en-qaribullah', 'en-sahih', 'en-sarwar', 'en-shakir', 'en-transliterati', 'en-wahiduddi', 'en-yusufali', 'ayah', 'sorah', 'sentence', 'en-tafsir-mokhtasar-html', 'en-tafsir-mokhtasar-text', 'en-tafsir-maarif-html', 'en-tafsir-maarif-text', 'en-tafsir-ibn-kathir-html', 'en-tafsir-ibn-kathir-text', 'ayah_id'],
        num_rows: 5611
    })
    test: Dataset({
        features: ['en-ahmedali', 'en-ahmedraza', 'en-arberry', 'en-asad', 'en-daryabadi', 'en-hilali', 'en-itani', 'en-maududi', 'en-mubarakpuri', 'en-pickthall', 'en-qarai', 'en-qaribullah', 'en-sahih', 'en-sarwar', 'en-shakir', 'en-transliterati', 'en-wahiduddi', 'en-yusufali', 'ayah', 'sorah', 'sentence', 'en-tafsir-mokhtasar-html', 'en-tafsir-mokhtasar-text', 'en-tafsir-maarif-html', 'en-tafsir-maarif-text', 'en-tafsir-

# Model

In [10]:
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# DataClass

In [11]:
class TripletRandomSampler(torch.utils.data.Dataset):
    def __init__(self, dataset, 
                 contrastive_columns=CONTRASTIVE_COLUMNS,
                 tokenizer=tokenizer,
                 seed=42):
        self.dataset = dataset
        self.contrastive_columns = contrastive_columns
        self.tokenizer = tokenizer
        self.tokenized_dataset = self.dataset.map(self.tokenize_data, batched=True, batch_size=BATCH_SIZE)
        self.seed = seed
        random.seed(seed)
        
    def tokenize_data(self, row):
        for column in CONTRASTIVE_COLUMNS:
            tokenized_row = self.tokenizer(row[column], 
                                           truncation=True, 
                                           max_length=MAX_LENGTH, 
                                           padding="max_length")
            row[column+"_input_ids"] = tokenized_row["input_ids"]
            row[column+"_attention_mask"] = tokenized_row["attention_mask"]
        return row
    
    def __len__(self):
        return len(self.tokenized_dataset)

    def __getitem__(self, idx):
        negative_idx = random.randint(0, len(self.tokenized_dataset) - 2)
        if negative_idx >= idx:
            negative_idx += 1
        
        anchor_col = random.choice(list(self.contrastive_columns))
        positive_negative_col = random.choice(list(self.contrastive_columns.difference({anchor_col})))

        return {
            "anchor_input_ids": torch.tensor(self.tokenized_dataset[idx][anchor_col+"_input_ids"]),
            "anchor_attention_mask": torch.tensor(self.tokenized_dataset[idx][anchor_col+"_attention_mask"]),
            "positive_input_ids": torch.tensor(self.tokenized_dataset[idx][positive_negative_col+"_input_ids"]),
            "positive_attention_mask": torch.tensor(self.tokenized_dataset[idx][positive_negative_col+"_attention_mask"]),
            "negative_input_ids": torch.tensor(self.tokenized_dataset[negative_idx][positive_negative_col+"_input_ids"]),
            "negative_attention_mask": torch.tensor(self.tokenized_dataset[negative_idx][positive_negative_col+"_attention_mask"]),
        }


In [12]:
train_data = TripletRandomSampler(dataset["train"])
test_data = TripletRandomSampler(dataset["test"])

train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_data_loader = torch.utils.data.DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True)

Map:   0%|          | 0/5611 [00:00<?, ? examples/s]

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

# Finetuning

In [13]:
model.to(DEVICE);

In [14]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
optimizer.zero_grad()
# loss = torch.nn.TripletMarginLoss(margin=0.2, p=2)
# loss = torch.nn.CosineEmbeddingLoss()
def dist_function(*args):
    return 1 - torch.nn.CosineSimilarity()(*args)
loss_fn = torch.nn.TripletMarginWithDistanceLoss(margin=0.1, distance_function=dist_function)

test

In [15]:
# emb_anchor = model(train_data[0]["anchor_input_ids"].to(DEVICE).unsqueeze(0),
#                                         attention_mask=train_data[0]["anchor_attention_mask"].to(DEVICE).unsqueeze(0)).last_hidden_state[:,0,:]
# emb_positive = model(train_data[0]["positive_input_ids"].to(DEVICE).unsqueeze(0),
#                                         attention_mask=train_data[0]["positive_attention_mask"].to(DEVICE).unsqueeze(0)).last_hidden_state[:,0,:]
# emb_negative = model(train_data[0]["negative_input_ids"].to(DEVICE).unsqueeze(0),
#                                         attention_mask=train_data[0]["negative_attention_mask"].to(DEVICE).unsqueeze(0)).last_hidden_state[:,0,:]
# loss_fn(emb_anchor, emb_positive, emb_negative)

In [17]:
def average_pool(last_hidden_states: torch.Tensor,
                 attention_mask: torch.Tensor) -> torch.Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

def forward_backward(model, batch, loss_fn=loss_fn, testing=False):
    anchor_input, anchor_mask = batch["anchor_input_ids"].to(DEVICE), batch["anchor_attention_mask"].to(DEVICE)
    positive_input, positive_mask = batch["positive_input_ids"].to(DEVICE), batch["positive_attention_mask"].to(DEVICE)
    negative_input, negative_mask = batch["negative_input_ids"].to(DEVICE), batch["negative_attention_mask"].to(DEVICE)

    anchor_embeddings = average_pool(model(anchor_input, attention_mask=anchor_mask).last_hidden_state, anchor_mask)
    positive_embeddings = average_pool(model(positive_input, attention_mask=positive_mask).last_hidden_state, positive_mask)
    negative_embeddings = average_pool(model(negative_input, attention_mask=negative_mask).last_hidden_state, negative_mask)

    batch_loss = loss_fn(anchor_embeddings, positive_embeddings, negative_embeddings)
    batch_loss_item = batch_loss.item()
    
    if not testing:
        batch_loss.backward()
    
    accuracy = dist_function(anchor_embeddings, positive_embeddings) < dist_function(anchor_embeddings, negative_embeddings)
    accuracy = accuracy.sum().item() / accuracy.shape[0]

    return batch_loss_item / anchor_embeddings.shape[0], accuracy

def train_epoch(model, train_data_loader, optimizer, loss_fn=loss_fn):
    model.train()
    epoch_loss = 0
    epoch_accuracy = 0
    in_batch_counter = 0
    for batch in tqdm(train_data_loader, desc="Training epoch"):
        in_batch_counter += 1

        batch_loss, accuracy = forward_backward(model, batch, loss_fn)
        epoch_loss += batch_loss
        epoch_accuracy += accuracy

        if in_batch_counter % BATCH_ACCUMULATION == 0:
            optimizer.step()
            optimizer.zero_grad()
    return epoch_loss / len(train_data_loader), epoch_accuracy / len(train_data_loader)

def test_epoch(model, test_data_loader, loss_fn=loss_fn):
    model.eval()
    epoch_loss = 0
    epoch_accuracy = 0
    with torch.no_grad():
        for batch in tqdm(test_data_loader, desc="Testing epoch"):
            batch_loss, accuracy = forward_backward(model, batch, loss_fn, testing=True)
            epoch_loss += batch_loss
            epoch_accuracy += accuracy
    return epoch_loss / len(test_data_loader), epoch_accuracy / len(test_data_loader)

test_loss, test_accuracy = test_epoch(model, test_data_loader)
print(f"Test loss: {test_loss:.3f}, test accuracy: {test_accuracy:.3f}")

for epoch in range(EPOCHS):
    print(f"#### Epoch : {epoch}")
    train_loss, train_accuracy = train_epoch(model, train_data_loader, optimizer)
    print(f"Train loss: {train_loss:.3f}, train accuracy: {train_accuracy:.3f}")
    test_loss, test_accuracy = test_epoch(model, test_data_loader)
    print(f"Test loss: {test_loss:.3f}, test accuracy: {test_accuracy:.3f}")

Testing epoch:   0%|          | 0/156 [00:00<?, ?it/s]

Test loss: 0.01852982398122549, test accuracy: 0.5721153846153846
#### Epoch : 0


Training epoch:   0%|          | 0/1403 [00:00<?, ?it/s]