In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoModelForMaskedLM,AutoModel,AutoTokenizer

In [2]:
# --- Pretrained checkpoint and tokenisation ---
model_name = 'HooshvareLab/bert-fa-base-uncased'
max_length = 128  # every encoded sequence is truncated/padded to this many ids
batch_size = 32

# --- Optimisation ---
lr = 1e-5
num_epochs = 3

# --- Extra transformer-encoder stack placed on top of the backbone ---
d_model = 768  # assumed to equal the backbone's hidden size -- TODO confirm
nhead = 8
dim_feedforward = 4 * d_model  # 3072
num_encoders = 2

# --- Train / test split ---
test_size = 0.1
random_state = 0

In [3]:
# Read the cleaned corpus and discard every row that contains a missing value.
dataset = pd.read_csv('cleaned_output.csv', encoding='utf-8').dropna()

In [4]:
import pandas as pd

def split_and_save_csv(input_csv, output_dir, n_splits=5, random_state=42):
    """Shuffle a CSV file and write it back out as ``n_splits`` near-equal parts.

    The rows are shuffled with a fixed seed and then cut into consecutive
    slices. When the row count is not divisible by ``n_splits`` the first
    ``len(df) % n_splits`` parts receive one extra row, so no row is dropped.

    Args:
        input_csv: Path of the CSV file to split.
        output_dir: Directory that receives ``subset_1.csv`` ...
            ``subset_<n_splits>.csv``. It is created if it does not exist.
        n_splits: Number of parts to write (default 5).
        random_state: Seed used for the shuffle (default 42).

    Returns:
        The list of file paths written, in subset order.

    Raises:
        ValueError: If ``n_splits`` is smaller than 1.
    """
    import os

    if n_splits < 1:
        raise ValueError("n_splits must be a positive integer")

    # Load the CSV data and shuffle it reproducibly.
    df = pd.read_csv(input_csv)
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # An empty output_dir means "current directory"; there is nothing to create.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    base_size, remainder = divmod(len(df), n_splits)
    written = []
    start = 0
    for i in range(n_splits):
        # Spread the remainder over the leading parts instead of discarding it.
        stop = start + base_size + (1 if i < remainder else 0)
        path = os.path.join(output_dir, f'subset_{i + 1}.csv')
        df.iloc[start:stop].to_csv(path, index=False)
        written.append(path)
        start = stop
    return written

In [5]:
# Hold out `test_size` of the rows for evaluation; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    dataset,
    test_size=test_size,
    random_state=random_state,
)
# Bare tuple so the notebook displays both split sizes.
len(train), len(test)

(261407, 29046)

In [6]:
import random

def dataset_handler(sentences, tokenizer, mask_fraction=0.25, seq_length=None, ignore_index=-100):
    """Build masked-language-modelling examples from raw sentences.

    Each sentence is encoded exactly once (truncated and padded to
    ``seq_length``). Masking is then applied to the resulting ids, so the
    masked input and the labels always have the same length and line up
    position by position. Special tokens and padding are never masked.

    Args:
        sentences: Iterable of raw text strings.
        tokenizer: A Hugging Face style tokenizer. It must be callable with
            ``truncation``, ``max_length``, ``padding`` and
            ``return_special_tokens_mask`` and expose ``mask_token_id``.
        mask_fraction: Probability with which each ordinary token is replaced
            by the mask token (default 0.25).
        seq_length: Fixed length of every encoded sequence. Falls back to the
            notebook-level ``max_length`` when omitted.
        ignore_index: Label written at every position that was not masked.
            -100 is the default ``ignore_index`` of ``nn.CrossEntropyLoss``,
            so those positions do not contribute to the loss.

    Returns:
        A list of ``{"input_ids": [...], "labels": [...]}`` dicts. Sentences
        that contain no maskable token (for example an empty string) are
        skipped because they carry no training signal.
    """
    if seq_length is None:
        seq_length = max_length  # notebook-level configuration value

    mask_token_id = tokenizer.mask_token_id
    mask_dataset = []
    for sentence in sentences:
        encoding = tokenizer(
            sentence,
            truncation=True,
            max_length=seq_length,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            return_special_tokens_mask=True,
        )
        original_ids = list(encoding["input_ids"])

        # Only ordinary tokens may be masked; the special-tokens mask flags
        # [CLS]/[SEP]-style markers and padding.
        candidates = [
            pos
            for pos, is_special in enumerate(encoding["special_tokens_mask"])
            if not is_special
        ]
        if not candidates:
            continue

        chosen = [pos for pos in candidates if random.random() < mask_fraction]
        if not chosen:
            # Guarantee at least one target so the sample always yields a loss.
            chosen = [random.choice(candidates)]

        input_ids = original_ids.copy()
        labels = [ignore_index] * len(original_ids)
        for pos in chosen:
            labels[pos] = original_ids[pos]
            input_ids[pos] = mask_token_id

        mask_dataset.append({"input_ids": input_ids, "labels": labels})
    return mask_dataset

In [7]:
import torch
from torch.utils.data import Dataset

class MaskedLMDataSet(Dataset):
    """Torch dataset over pre-masked language-modelling records.

    Each record is a dict with ``"input_ids"`` and ``"labels"`` lists. Items
    are returned as fixed-length tensors together with an attention mask that
    is zero on padding.
    """

    def __init__(self, mask_dataset, tokenizer, max_length):
        """Store the records and the settings used to shape each item.

        Args:
            mask_dataset: Sequence of ``{"input_ids": [...], "labels": [...]}``.
            tokenizer: Object whose ``pad_token_id`` identifies padding. When
                it is missing or ``None`` the id 0 is used.
            max_length: Length every returned tensor is padded/truncated to.
        """
        self.mask_dataset = mask_dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.mask_dataset)

    def __getitem__(self, idx):
        """Return record ``idx`` as ``input_ids`` / ``attention_mask`` / ``labels`` tensors."""
        sample = self.mask_dataset[idx]

        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0

        # Work on copies: padding must not leak back into the stored record.
        input_ids = list(sample["input_ids"])[: self.max_length]
        labels = list(sample["labels"])[: self.max_length]

        # Records may already be padded upstream, so derive the mask from the
        # ids themselves rather than from the list length.
        attention_mask = [int(token_id != pad_id) for token_id in input_ids]

        shortfall = self.max_length - len(input_ids)
        input_ids.extend([pad_id] * shortfall)
        attention_mask.extend([0] * shortfall)

        # -100 is the default ignore_index of nn.CrossEntropyLoss, so padded
        # label positions never contribute to the loss.
        labels.extend([-100] * (self.max_length - len(labels)))

        return {
            "input_ids": torch.tensor(input_ids),
            "attention_mask": torch.tensor(attention_mask),
            "labels": torch.tensor(labels)
        }


In [35]:
import torch
import torch.nn as nn
from transformers import AutoModel

class MLMNetwork(nn.Module):
    """Pretrained encoder followed by extra transformer layers and an MLM head.

    The backbone loaded with ``AutoModel.from_pretrained`` produces hidden
    states that are refined by ``num_encoders`` additional
    ``nn.TransformerEncoderLayer`` blocks and then projected to vocabulary
    logits.
    """

    def __init__(self, model_name, d_model, nhead, dim_feedforward, vocab_size, num_encoders):
        """Build the network.

        Args:
            model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
            d_model: Feature width of the extra encoder layers and the head.
            nhead: Number of attention heads in each extra encoder layer.
            dim_feedforward: Hidden width of each extra layer's feed-forward part.
            vocab_size: Accepted for interface compatibility but not used; the
                output width is always taken from the backbone's own
                ``config.vocab_size`` so the logits match the checkpoint.
            num_encoders: Number of extra encoder layers to stack.
        """
        super(MLMNetwork, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        encoder_config = {
            "d_model": d_model,
            "nhead": nhead,
            "dim_feedforward": dim_feedforward,
            "dropout": 0.1,
            # The backbone's hidden states are (batch, seq, hidden). Without
            # batch_first the layer would read dim 0 as the sequence axis and
            # attend across different samples instead of across tokens.
            "batch_first": True,
        }

        self.encoders = nn.ModuleList(
            nn.TransformerEncoderLayer(**encoder_config) for _ in range(num_encoders)
        )

        self.lm_head = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.LayerNorm(d_model),
            nn.Linear(d_model, self.bert.config.vocab_size)
        )

    def forward(self, input_ids, attention_mask=None):
        """Return per-token vocabulary logits.

        Args:
            input_ids: Token-id tensor fed to the backbone.
            attention_mask: Optional tensor that is 1 on real tokens and 0 on
                padding. It is forwarded to the backbone and, inverted, to the
                extra encoder layers as their key-padding mask.

        Returns:
            Logits whose last dimension is the backbone's vocabulary size.
        """
        bert_output = self.bert(input_ids, attention_mask=attention_mask)
        hidden = bert_output.last_hidden_state

        # src_key_padding_mask uses the opposite convention: True = ignore.
        padding_mask = None if attention_mask is None else attention_mask == 0
        for encoder in self.encoders:
            hidden = encoder(hidden, src_key_padding_mask=padding_mask)

        return self.lm_head(hidden)

In [26]:
# Pull the raw text column out of each split as a plain Python list.
train_sentences, test_sentences = (
    frame['text'].tolist() for frame in (train, test)
)

In [36]:
from tqdm import tqdm

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = MLMNetwork(
    model_name=model_name,
    d_model=d_model,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    # Take the size from the tokenizer instead of a hard-coded number that
    # does not belong to this checkpoint.
    vocab_size=tokenizer.vocab_size,
    # Use the configured depth rather than repeating the literal.
    num_encoders=num_encoders,
).to(device)

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Mask the raw sentences, then wrap the resulting records as torch datasets.
mask_dataset = MaskedLMDataSet(
    dataset_handler(sentences=train_sentences, tokenizer=tokenizer),
    tokenizer,
    max_length,
)
eval_masked_dataset = MaskedLMDataSet(
    dataset_handler(sentences=test_sentences, tokenizer=tokenizer),
    tokenizer,
    max_length,
)



In [28]:
# Training samples are reshuffled every epoch; evaluation keeps dataset order.
# NOTE(review): both loaders use a literal batch size of 1 -- confirm this is intended.
train_loader = torch.utils.data.DataLoader(
    dataset=mask_dataset,
    batch_size=1,
    shuffle=True,
)
eval_loader = torch.utils.data.DataLoader(
    dataset=eval_masked_dataset,
    batch_size=1,
    shuffle=False,
)

In [39]:
# Moves the model onto the CPU regardless of where it was placed when built.
# NOTE(review): looks like a debugging override -- confirm before a full training run.
model = model.to('cpu')

In [40]:
# Rebinds `device` to the CPU so that batches are moved to the CPU as well.
# NOTE(review): this discards the CUDA auto-detection done earlier -- confirm it is intended.
device = "cpu"

In [44]:
# Positions labelled with this value are skipped by the loss; -100 is also
# CrossEntropyLoss's default, it is spelled out here because the loop relies on it.
ignore_index = -100
criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0.0
    scored_batches = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Score only the positions the model actually has to reconstruct.
        # Unmasked tokens and padding would otherwise dominate the average.
        labels = labels.masked_fill(
            input_ids != tokenizer.mask_token_id, ignore_index
        ).reshape(-1)
        if not (labels != ignore_index).any():
            # Nothing to predict in this batch; a mean over zero targets is NaN.
            continue

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask=attention_mask)
        # Flatten using the head's real output width instead of assuming it
        # equals tokenizer.vocab_size.
        loss = criterion(logits.reshape(-1, logits.size(-1)), labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        scored_batches += 1

    # Average over the batches that produced a loss; max() avoids a zero division.
    average_loss = total_loss / max(scored_batches, 1)
    print(f"Epoch {epoch + 1}, Average Loss: {average_loss:.4f}")


# Masked-token accuracy on the held-out set.
model.eval()
total_correct = 0
total_samples = 0
with torch.no_grad():
    for batch in tqdm(eval_loader, desc="Evaluation"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # The model's forward takes only ids and mask and returns the logits
        # tensor directly.
        logits = model(input_ids, attention_mask=attention_mask)
        predicted = torch.argmax(logits, dim=-1)

        # Count only positions that were masked in the input and carry a real
        # target (-100 marks positions excluded from scoring).
        scored = (input_ids == tokenizer.mask_token_id) & (labels != -100)
        total_correct += (predicted[scored] == labels[scored]).sum().item()
        total_samples += scored.sum().item()

# Guard against an evaluation set without any masked position.
accuracy = total_correct / total_samples if total_samples else 0.0
print(f"Evaluation Accuracy: {accuracy:.2%}")


Epoch 1:   0%|          | 0/261407 [00:00<?, ?it/s]

torch.Size([128, 100000]) torch.Size([128])
8.781203269958496


Epoch 1:   0%|          | 1/261407 [00:03<287:27:39,  3.96s/it]

torch.Size([128, 100000]) torch.Size([128])
8.673307418823242


Epoch 1:   0%|          | 2/261407 [00:07<264:28:17,  3.64s/it]

torch.Size([128, 100000]) torch.Size([128])
8.959466934204102


Epoch 1:   0%|          | 3/261407 [00:10<258:07:42,  3.55s/it]

torch.Size([128, 100000]) torch.Size([128])
5.821967601776123


Epoch 1:   0%|          | 4/261407 [00:14<268:47:16,  3.70s/it]


KeyboardInterrupt: 