# In [None]:  (Jupyter notebook cell marker left over from export)
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertConfig, BertForMaskedLM, AutoTokenizer, AdamW, get_scheduler
from tqdm import tqdm
import math
from peft import LoraConfig, get_peft_model

# Custom Dataset to load tokenized data from a text file
class TokenizedTextDataset(Dataset):
    """Fixed-length token-id sequences read from a pre-tokenized text file.

    Every non-blank line of the file is parsed as one sequence of
    whitespace-separated integer token ids. Ids above the tokenizer's
    vocabulary are clamped to ``tokenizer.vocab_size - 1``; each sequence is
    then truncated or right-padded with ``tokenizer.pad_token_id`` so that it
    is exactly ``max_length`` ids long. The attention mask is 1 for real
    tokens and 0 for padding.

    Args:
        file_path: Path of the UTF-8 text file to read.
        tokenizer: Object exposing ``vocab_size`` and ``pad_token_id``.
        max_length: Length every stored sequence is padded/truncated to.

    Raises:
        ValueError: If a field on a line cannot be parsed as an integer.
    """

    def __init__(self, file_path, tokenizer, max_length: int = 512):
        self.input_ids = []
        self.attention_masks = []

        # Hoisted: both values are constant for the whole file.
        max_token_id = tokenizer.vocab_size - 1
        pad_token_id = tokenizer.pad_token_id

        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # A blank line would otherwise become a sample made of
                    # padding only, with an all-zero attention mask.
                    continue

                # Clamp ids into the vocabulary, then cut to max_length.
                tokens = [min(int(field), max_token_id) for field in fields]
                tokens = tokens[:max_length]
                attention_mask = [1] * len(tokens)

                # Right-pad short sequences (no-op when already max_length).
                padding_length = max_length - len(tokens)
                tokens += [pad_token_id] * padding_length
                attention_mask += [0] * padding_length

                self.input_ids.append(tokens)
                self.attention_masks.append(attention_mask)

    def __len__(self) -> int:
        """Return the number of sequences loaded from the file."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``input_ids`` and ``attention_mask`` tensors for ``idx``."""
        return {
            "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.attention_masks[idx], dtype=torch.long),
        }

# Initialize the tokenizer; it supplies vocab_size and pad_token_id below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Load the pre-tokenized dataset from the .txt file
file_path = "/kaggle/input/tokenized-dataset/Tokenized_Data.txt"
dataset = TokenizedTextDataset(file_path, tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Number of passes over the data; used by the scheduler, loop and progress bar.
num_epochs = 3

# Define BERT configuration
bert_config = BertConfig(
    num_hidden_layers=8,
    hidden_size=200,
    intermediate_size=1024,
    num_attention_heads=8,
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=512,
)

# Initialize the model for Masked Language Modeling
device = "cuda" if torch.cuda.is_available() else "cpu"
model = BertForMaskedLM(bert_config)

# Move the model to the device but keep its weights in float32. Mixed
# precision is handled by autocast/GradScaler in the training loop; calling
# .half() on the model as well leaves fp16 parameters whose gradients
# GradScaler cannot unscale.
model = model.to(device)

# Wrap model in a low-rank adaptation (LoRA) configuration
# NOTE(review): the base model is built from a config rather than loaded from
# a checkpoint, so the adapters sit on top of untrained weights -- confirm
# this is intended.
lora_config = LoraConfig(
    r=8,  # Rank of the low-rank matrices
    lora_alpha=16,  # Scaling parameter
    target_modules=["query", "value"],  # Apply LoRA on attention query/value projections
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, lora_config)

print(f"Total Parameters with LoRA: {sum(p.numel() for p in model.parameters())}")
print(f"Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

# Optimizer: torch.optim.AdamW replaces the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the values the transformers
# implementation used by default so the optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader) * num_epochs,
)

# Mixed-precision training setup (using PyTorch AMP); only active on CUDA so
# the CPU fallback selected above still runs.
use_amp = device == "cuda"
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# Logging
with open("perplexity_log.txt", "w") as f:
    f.write("Epoch, Step, Perplexity\n")

# Number of steps in 0.1 epoch; at least 1 so that the modulo below cannot
# divide by zero when the dataloader has fewer than 10 batches.
steps_per_epoch = len(dataloader)
steps_per_10_percent_epoch = max(1, steps_per_epoch // 10)

# Training Loop
model.train()
for epoch in range(num_epochs):
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")

    total_loss = 0
    total_steps = 0

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Exclude padding positions from the loss (-100 is the ignore index).
        # NOTE(review): no input tokens are replaced by [MASK], so the model
        # is trained to reproduce its own input and the reported perplexity
        # is not a true masked-LM perplexity -- consider adding MLM masking.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        # Forward pass with mixed precision
        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        # Track loss (read the value once; .item() synchronises with the device)
        loss_value = loss.item()
        total_loss += loss_value
        total_steps += 1

        progress_bar.set_postfix({"loss": loss_value})

        # Calculate and print perplexity every 0.1 epoch
        if total_steps % steps_per_10_percent_epoch == 0:
            avg_loss = total_loss / total_steps
            perplexity = math.exp(avg_loss)
            print(f"Epoch {epoch + 1}, Step {total_steps}/{steps_per_epoch}, Perplexity: {perplexity:.2f}")

            # Log perplexity
            with open("perplexity_log.txt", "a") as f:
                f.write(f"{epoch + 1}, {total_steps}, {perplexity:.2f}\n")

    # Perplexity of the running-average loss over the whole epoch
    avg_loss = total_loss / total_steps
    perplexity = math.exp(avg_loss)
    print(f"Epoch {epoch + 1} Perplexity: {perplexity:.2f}")

# Save model
model.save_pretrained("quantized_lora_bert")
print("Training complete! Model saved at 'quantized_lora_bert'.")