In [1]:
import io
import os

import fitz  # PyMuPDF
import pytesseract
from PIL import Image

# Location of the Tesseract executable. Set the TESSERACT_CMD environment
# variable to point at a different install; otherwise the original hard-coded
# Windows path is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

def extract_text_from_pdf_pymupdf(pdf_path, lang="eng+ben", output_path="output.txt", dpi=300):
    """Render every page of a PDF to an image, OCR it, and save the text.

    Each page's text is preceded by a ``--- Page N ---`` separator (1-based).

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        lang: Language spec forwarded to ``pytesseract.image_to_string``.
        output_path: UTF-8 text file that receives the combined OCR result.
        dpi: Resolution used when rasterising each page.
    """
    page_texts = []

    # The context manager closes the document even if OCR fails part-way.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=dpi)
            # The pixmap bytes are decoded by PIL from memory
            # (presumably PNG, tobytes()' default per the PyMuPDF docs).
            with Image.open(io.BytesIO(pix.tobytes())) as img:
                text = pytesseract.image_to_string(img, lang=lang)
            page_texts.append(f"\n--- Page {page_number} ---\n{text}\n")

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(page_texts))

    print(f"✅ Text saved to {output_path}")

# Example usage: OCR the sample PDF with the default language setting
extract_text_from_pdf_pymupdf(pdf_path="HSC26-Bangla1st-Paper.pdf")


✅ Text saved to output.txt


In [2]:
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig
import os
import shutil

class JinaEmbedding:
    def __init__(self, model_path="jina-embeddings-v3"):
        self.model_path = model_path
        
        # Verify PyTorch version
        if not hasattr(torch, 'compiler'):
            raise RuntimeError(
                "Jina embeddings require PyTorch 2.2+ with compiler support.\n"
                "Please upgrade with: pip install --upgrade torch torchvision torchaudio"
            )

        # First-time setup: download if directory doesn't exist or is incomplete
        if not os.path.exists(os.path.join(self.model_path, "config.json")):
            os.makedirs(self.model_path, exist_ok=True)
            print("Downloading Jina model...")
            try:
                model = AutoModel.from_pretrained(
                    "jinaai/jina-embeddings-v3",
                    trust_remote_code=True
                )
                model.save_pretrained(self.model_path)
                tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v3")
                tokenizer.save_pretrained(self.model_path)
            except Exception as e:
                shutil.rmtree(self.model_path)
                raise RuntimeError(f"Model download failed: {e}")

    def jina(self):
        if hasattr(torch.backends, "disable_math_sdp"):
            torch.backends.disable_math_sdp = True
        
        tokenizer = AutoTokenizer.from_pretrained(
            self.model_path,
            trust_remote_code=True
        )
        embedding_model = AutoModel.from_pretrained(
            self.model_path,
            trust_remote_code=True,
            torch_dtype=torch.float16
        )
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        embedding_model.to(device)
        return tokenizer, embedding_model, device
    


In [3]:
def read_and_split_text(file_path, max_chunk_length=512):
    """Read a UTF-8 text file and cut it into fixed-size character chunks.

    Leading and trailing whitespace is stripped from the file content before
    splitting. Chunks are measured in characters, so a cut can fall mid-word.

    Args:
        file_path: Path of the text file to read.
        max_chunk_length: Maximum number of characters per chunk.

    Returns:
        List of consecutive chunks; the last one may be shorter. An empty
        (or whitespace-only) file yields an empty list.

    Raises:
        ValueError: If ``max_chunk_length`` is not positive.
    """
    # A negative step would make range() empty and silently return no chunks.
    if max_chunk_length <= 0:
        raise ValueError(
            f"max_chunk_length must be a positive integer, got {max_chunk_length}"
        )

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read().strip()
    return [text[i:i+max_chunk_length] for i in range(0, len(text), max_chunk_length)]


def embed_text_file(file_path="output.txt", model_dir="jina-embeddings-v3", task="text-matching",
                    output_path="output_embeddings.pt", batch_size=16):
    """Embed the chunks of a text file with the Jina model and save the tensor.

    Args:
        file_path: UTF-8 text file to split into chunks and embed.
        model_dir: Directory passed to ``JinaEmbedding`` as ``model_path``.
        task: Value forwarded to the model call as the ``task`` keyword.
        output_path: File that receives the embeddings via ``torch.save``.
        batch_size: Number of chunks tokenised and run through the model at
            once, so memory use does not grow with the document length.

    Raises:
        ValueError: If ``batch_size`` is not positive or the file has no text.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Load texts from file
    texts = read_and_split_text(file_path)
    if not texts:
        # Fail early with a clear message instead of an obscure tokenizer error.
        raise ValueError(f"No text found in {file_path}; nothing to embed")

    # Initialize Jina model
    jina_model = JinaEmbedding(model_path=model_dir)
    tokenizer, model, device = jina_model.jina()

    # NOTE(review): the recorded run of this notebook printed "Flash attention
    # implementation does not support kwargs: task"; confirm that passing
    # `task` to the forward call actually takes effect.
    batch_embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
            outputs = model(**inputs, task=task)
            # First token of each sequence (CLS); moved to CPU right away so
            # device memory is released between batches.
            batch_embeddings.append(outputs.last_hidden_state[:, 0].cpu())

    # Save embeddings
    embeddings = torch.cat(batch_embeddings, dim=0)
    torch.save(embeddings, output_path)
    print(f"✅ Embeddings saved to {output_path}")


# Embed the text file written by the OCR cell, using the default model settings
embed_text_file(file_path="output.txt")


Flash attention implementation does not support kwargs: task


✅ Embeddings saved to output_embeddings.pt
