In [None]:
"""
Fine-Tuning Script for Multilingual-E5-Large-Instruct Model.

This script provides a complete workflow to fine-tune the
'intfloat/multilingual-e5-large-instruct' embedding model on a custom dataset
generated from a single PDF document.

The process involves:
1.  Setting up the environment and API keys.
2.  Parsing the PDF document to extract text.
3.  Cleaning and preprocessing the extracted text.
4.  Splitting the text into manageable chunks.
5.  Synthetically generating a question-answer dataset from the text chunks using a generator LLM.
6.  Configuring the fine-tuning process using the sentence-transformers library.
7.  Running the training and saving the fine-tuned model.
8.  Demonstrating how to load and use the newly fine-tuned model.

Prerequisites:
- Install all required libraries:
  pip install sentence-transformers torch datasets llama_parse llama-cloud-services langchain langchain-groq python-dotenv rank_bm25
- A LlamaParse API key for parsing the document.
- A Groq API key for the generator model.
- Place these keys in a .env file in the same directory, as LLAMA_CLOUD_API_KEY and GROQ_API_KEY.
"""

import os
import re
import unicodedata
from typing import List

import torch
from torch.utils.data import DataLoader
from datasets import Dataset
from sentence_transformers import SentenceTransformer, InputExample, losses
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_cloud_services import LlamaParse

from langchain_groq import ChatGroq

# --- 1. Environment Setup ---
# Load API keys and environment variables from a .env file

# Check for necessary API keys
# LLAMA_CLOUD_API_KEY = userdata.get("LLAMA_CLOUD_API_KEY")
# GROQ_API_KEY = userdata.get("GROQ_API_KEY")

# if not LLAMA_CLOUD_API_KEY or not GROQ_API_KEY:
#     raise ValueError(
#         "API keys for LlamaParse (LLAMA_CLOUD_API_KEY) and Groq (GROQ_API_KEY) are required. "
#         "Please add them to your .env file."
#     )

# --- 2. PDF Parsing and Text Extraction ---
# NOTE: this .env loading logically belongs to step 1 (environment setup).
# It runs at import time, i.e. before main() constructs any client. No API
# key is passed explicitly anywhere in this file, so LlamaParse / ChatGroq
# presumably read LLAMA_CLOUD_API_KEY / GROQ_API_KEY from the environment
# that load_dotenv() fills in — TODO confirm.
from dotenv import load_dotenv
load_dotenv()


def parse_pdf(file_path: str) -> List[Document]:
    """
    Run LlamaParse over a PDF and wrap every parsed item in a LangChain Document.

    Args:
        file_path: Path of the PDF handed to the parser.

    Returns:
        One Document per item returned by the parser, with that item's
        ``text`` as ``page_content``. If loading or converting raises, the
        error is printed and an empty list is returned instead.
    """
    print(f"Starting to parse the PDF: {file_path}...")

    # Parser configuration kept in one place; passed through unchanged.
    parser_settings = {
        "result_type": "markdown",
        "use_vendor_multimodal_model": True,
        "vendor_multimodal_model_name": "anthropic-sonnet-4.0",
        "verbose": True,
        "language": "bn",
    }
    parser = LlamaParse(**parser_settings)

    try:
        parsed_items = parser.load_data(file_path)
        print("PDF parsing completed successfully.")
        langchain_docs = []
        for item in parsed_items:
            langchain_docs.append(Document(page_content=item.text))
        return langchain_docs
    except Exception as err:
        # Best-effort: the caller treats an empty list as "nothing parsed".
        print(f"An error occurred during PDF parsing: {err}")
        return []


def clean_raw_markdown(raw_text: str) -> str:
    """
    Clean raw markdown text extracted from the PDF.

    Steps, in order:
    1. Normalize the text to Unicode NFC so that canonically equivalent
       spellings of the same Bengali letters compare equal.
    2. Remove boilerplate: branding lines, ``---`` rules and digit-only lines.
    3. Apply a fixed table of spelling / OCR corrections.
    4. Re-join words that were hyphenated across a line break.
    5. Collapse runs of spaces/tabs, strip every line, then reduce three or
       more consecutive newlines to a single blank line.

    Args:
        raw_text: The markdown text to clean.

    Returns:
        The cleaned, NFC-normalized text with leading and trailing
        whitespace removed.
    """
    def _nfc(value: str) -> str:
        return unicodedata.normalize('NFC', value)

    # Normalize first: the patterns and corrections below are plain literal
    # matches, so they only fire when text and literals share one form.
    text = _nfc(raw_text)

    boilerplate_patterns = [
        re.compile(_nfc(p), flags=re.MULTILINE | re.IGNORECASE) for p in [
            r"HSC\s?'?26\s+অনলাইন\s+(?:ব্যাচ|গাইড|স্কুল|ক্যাট)",
            r"বাংলা\s*[•-]\s*ইংরেজি\s*[•-]\s*(?:আইসিটি|গণিত|তথ্য\s+ও\s+যোগাযোগ(?: প্রযুক্তি)?)",
            r'10\s*MINUTE\s*SCHOOL',
            r'---',
            r'^\d+$',
        ]
    ]
    for pattern in boilerplate_patterns:
        text = pattern.sub('', text)

    correction_map = {
        'কন্যাগী': 'কল্যাণী', 'শত্রুনাশ': 'শম্ভুনাথ', 'শুভনাথ': 'শম্ভুনাথ',
        'গুনের': 'গুণের', 'রুপ': 'রূপ', 'বিদ্রোপ': 'বিদ্রূপ', 'কাহিনী': 'কাহিনি',
        'বাড়ী': 'বাড়ি', 'গাড়ী': 'গাড়ি', 'শাড়ী': 'শাড়ি',
    }
    for wrong, right in correction_map.items():
        text = text.replace(_nfc(wrong), _nfc(right))

    # Join words split by a hyphen at a line break.
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)
    text = re.sub(r'[ \t]+', ' ', text)
    # Strip lines BEFORE collapsing blank lines: whitespace-only lines turn
    # into empty ones here and would otherwise leave 3+ newlines behind.
    text = '\n'.join(line.strip() for line in text.split('\n'))
    text = re.sub(r'\n{3,}', '\n\n', text)

    return _nfc(text).strip()


def chunk_documents(
    documents: List[Document],
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    min_chunk_chars: int = 100,
) -> List[str]:
    """
    Merge the documents, clean the text and split it into chunks.

    The page contents are joined with blank lines, passed through
    ``clean_raw_markdown`` and split with a RecursiveCharacterTextSplitter
    that prefers paragraph, line, Bengali danda ("।") and space boundaries,
    in that order.

    Args:
        documents: Documents whose ``page_content`` is chunked.
        chunk_size: Maximum chunk size passed to the splitter.
        chunk_overlap: Overlap between neighbouring chunks passed to the
            splitter.
        min_chunk_chars: Chunks whose stripped length is not greater than
            this value are dropped.

    Returns:
        The chunks that survive the minimum-length filter (possibly empty).
    """
    print("Chunking documents...")
    full_text = "\n\n".join(doc.page_content for doc in documents)
    cleaned_text = clean_raw_markdown(full_text)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", "।", " ", ""],
    )
    raw_chunks = text_splitter.split_text(cleaned_text)
    kept_chunks = [
        chunk for chunk in raw_chunks if len(chunk.strip()) > min_chunk_chars
    ]
    # Report both numbers: the returned list is the filtered one, so logging
    # only the pre-filter count disagrees with every later progress message.
    print(
        f"Created {len(raw_chunks)} text chunks; "
        f"kept {len(kept_chunks)} longer than {min_chunk_chars} characters.")
    return kept_chunks


def generate_finetuning_dataset(chunks: List[str]) -> List[InputExample]:
    """
    Build synthetic (question, passage) training pairs from text chunks.

    For every chunk a generator LLM is asked for one Bengali question that
    the chunk answers; the stripped question and the chunk become one
    ``InputExample(texts=[question, chunk])``.

    Args:
        chunks: Text passages to generate questions for.

    Returns:
        The generated examples. Chunks for which the generator call raises,
        or for which it returns only whitespace, are skipped with a printed
        message, so the result may be shorter than ``chunks``.
    """
    print("Generating synthetic dataset for fine-tuning...")
    generator_llm = ChatGroq(model_name="moonshotai/kimi-k2-instruct")

    prompt_template = """
    Based on the following text passage from a story, please generate one precise and relevant question that can be answered directly from the text.
    The question should be in Bengali.

    Text Passage:
    ---
    {passage}
    ---

    Generated Question:
    """

    training_examples = []
    total_chunks = len(chunks)
    for position, chunk in enumerate(chunks, start=1):
        try:
            full_prompt = prompt_template.format(passage=chunk)
            # Strip surrounding whitespace/newlines the model may add, so the
            # stored query is just the question text.
            question = generator_llm.invoke(full_prompt).content.strip()
        except Exception as e:
            # Best-effort: one failed generation must not abort the run.
            print(
                f"Skipping a chunk due to an error during question generation: {e}")
            continue

        if not question:
            # An empty query paired with a passage is a useless training pair.
            print("Skipping a chunk because the generator returned an empty question.")
            continue

        training_examples.append(InputExample(texts=[question, chunk]))

        if position % 10 == 0:
            print(f"Generated {position}/{total_chunks} examples...")

    print(
        f"Successfully generated {len(training_examples)} training examples.")
    return training_examples


def main():
    """
    Run the end-to-end fine-tuning process.

    Parses the PDF, chunks the text, generates synthetic question/passage
    pairs, fine-tunes the base embedding model with
    MultipleNegativesRankingLoss, saves it, then reloads the saved model and
    prints the cosine similarity of one sample query/passage pair.

    Returns early (after printing the reason) when the PDF is missing or any
    stage produces no data.
    """
    import gc

    pdf_file_path = "../data/raw/HSC26-Bangla1st-Paper.pdf"
    base_model_name = "intfloat/multilingual-e5-large-instruct"
    fine_tuned_model_output_path = "../models"

    num_epochs = 2
    batch_size = 16
    learning_rate = 2e-5

    if not os.path.exists(pdf_file_path):
        print(f"Error: The file '{pdf_file_path}' was not found.")
        # The path is relative to the working directory, not "next to the
        # script", so show where it actually resolves to.
        print(
            f"Please make sure the PDF exists at: {os.path.abspath(pdf_file_path)}")
        return

    documents = parse_pdf(pdf_file_path)
    if not documents:
        print("Could not extract any documents from the PDF. Exiting.")
        return

    text_chunks = chunk_documents(documents)
    if not text_chunks:
        print("No text chunks were created. Exiting.")
        return

    training_dataset = generate_finetuning_dataset(text_chunks)
    if not training_dataset:
        print("Failed to generate a training dataset. Exiting.")
        return

    print("\n--- Starting Model Fine-Tuning ---")

    print(f"Loading base model: {base_model_name}")
    model = SentenceTransformer(base_model_name)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # NOTE(review): the run recorded with this file ended in a CUDA
    # OutOfMemoryError on a 4 GiB GPU; this model/batch size may simply not
    # fit on such a device — confirm the target hardware.
    train_dataloader = DataLoader(
        training_dataset, shuffle=True, batch_size=batch_size)

    train_loss = losses.MultipleNegativesRankingLoss(model)

    # Warm up over the first 10% of all training steps.
    warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

    print(
        f"Training parameters: epochs={num_epochs}, batch_size={batch_size}, warmup_steps={warmup_steps}")

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        # Pass the class itself; fit() builds the optimizer from it, so no
        # throwaway optimizer instance is needed.
        optimizer_class=torch.optim.AdamW,
        optimizer_params={'lr': learning_rate},
        output_path=fine_tuned_model_output_path,
        show_progress_bar=True,
        checkpoint_save_steps=100,
        checkpoint_path=f"{fine_tuned_model_output_path}/checkpoints"
    )

    print(
        f"--- Fine-tuning complete. Model saved to: {fine_tuned_model_output_path} ---")

    # Drop the training-time references before loading the saved copy, so two
    # copies of the model do not have to be resident at the same time.
    del model, train_loss, train_dataloader
    gc.collect()
    if device == "cuda":
        torch.cuda.empty_cache()

    print("\n--- Testing the Fine-Tuned Model ---")
    fine_tuned_model = SentenceTransformer(fine_tuned_model_output_path)

    # NOTE(review): "-instruct" E5 models are presumably meant to receive
    # queries with an instruction prefix; neither the training pairs nor this
    # test query use one — confirm against the model card.
    test_query = "বিবাহ অনুষ্ঠানে মামার আচরণ কেমন ছিল?"
    test_passage = "মামা বিবাহ-বাড়িতে ঢুকিয়া খুশি হইলেন না। একে তো উঠানটাতে বরযাত্রীদের জায়গা সংকুলান হওয়াই শক্ত, তাহার পরে সমস্ত আয়োজন নিতান্ত মধ্যম রকমের।"

    query_embedding = fine_tuned_model.encode(
        test_query, convert_to_tensor=True)
    passage_embedding = fine_tuned_model.encode(
        test_passage, convert_to_tensor=True)

    # Both embeddings are 1-D tensors here, hence dim=0.
    similarity = torch.nn.functional.cosine_similarity(
        query_embedding, passage_embedding, dim=0)

    print(f"Test Query: {test_query}")
    print(f"Test Passage: {test_passage}")
    print(f"Cosine Similarity: {similarity.item():.4f}")


# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Starting to parse the PDF: ../data/raw/HSC26-Bangla1st-Paper.pdf...
Started parsing the file under job_id 016f7b0c-2f9e-4703-85de-7d36de57ccf0
PDF parsing completed successfully.
Chunking documents...
Created 226 text chunks.
Generating synthetic dataset for fine-tuning...
Generated 10/217 examples...
Generated 20/217 examples...
Generated 30/217 examples...
Generated 40/217 examples...
Generated 50/217 examples...
Generated 60/217 examples...
Generated 70/217 examples...
Generated 80/217 examples...
Generated 90/217 examples...
Generated 100/217 examples...
Generated 110/217 examples...
Generated 120/217 examples...
Generated 130/217 examples...
Generated 140/217 examples...
Generated 150/217 examples...
Generated 160/217 examples...
Generated 170/217 examples...
Generated 180/217 examples...
Generated 190/217 examples...
Generated 200/217 examples...
Generated 210/217 examples...
Successfully generated 217 training examples.

--- Starting Model Fine-Tuning ---
Loading base model: int

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 9.85 GiB is allocated by PyTorch, and 68.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)