In [None]:
# Environment setup: installs the Python and system packages used by the cells below.
# NOTE(review): none of the installs are version-pinned, so re-runs may pull
# different releases; consider pinning and using `%pip` so the kernel's own
# environment is targeted.
!pip install langchain_community
!pip install unstructured
!pip install pillow
!pip install pdf2image
# NOTE(review): apt-get is invoked without -y — confirm it does not stop at a
# confirmation prompt in the target environment.
!sudo apt-get install poppler-utils
!pip install pdfminer.six
!pip install pillow_heif
!pip install unstructured_inference
!pip install unstructured_pytesseract
!pip install pytesseract
# NOTE(review): the PyPI package named `tesseract` is presumably not the OCR
# engine (that is installed via apt `tesseract-ocr` below) — confirm this line
# is actually needed.
!pip install tesseract
!sudo apt-get install tesseract-ocr
!pip install langchain-chroma
!pip install sentence-transformers

In [None]:
import os
from pdf2image import convert_from_path
from langchain_community.document_loaders import UnstructuredImageLoader
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from transformers import AutoTokenizer

def convert_pdf_to_images(pdf_path, output_dir):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF to render; passed unchanged to
            ``convert_from_path``.
        output_dir: Directory that receives the PNG files. It is created
            (including missing parents) when it does not exist yet.

    Returns:
        list[str]: Paths of the written images in page order, named
        ``page_1.png``, ``page_2.png``, ... inside ``output_dir``.
    """
    # exist_ok=True replaces the separate exists()/makedirs() pair, which
    # could fail if the directory appeared between the check and the create.
    os.makedirs(output_dir, exist_ok=True)

    # Convert PDF pages to images
    images = convert_from_path(pdf_path)

    # Save each image in the output directory; file names are 1-based
    image_paths = []
    for page_number, image in enumerate(images, start=1):
        image_path = os.path.join(output_dir, f'page_{page_number}.png')
        image.save(image_path, 'PNG')
        image_paths.append(image_path)

    print(f"PDF pages have been converted and saved in '{output_dir}'")
    return image_paths

def extract_text_from_images(image_paths):
    """Extract the text of each page image and return it as one string.

    Args:
        image_paths: Iterable of image file paths, in reading order. Each
            path is loaded with ``UnstructuredImageLoader``.

    Returns:
        str: The non-empty page texts joined by a blank line (``"\\n\\n"``).
        Returns ``""`` when there are no paths or no text was found.
    """
    page_texts = []
    for image_path in image_paths:
        loaded_docs = UnstructuredImageLoader(image_path).load()
        # Iterate instead of indexing [0] so a loader result with no
        # documents does not raise IndexError; empty texts are skipped so
        # they do not produce stray separators.
        page_texts.extend(
            doc.page_content for doc in loaded_docs if doc.page_content
        )
    # Join with a blank line so the last word of one page no longer fuses
    # with the first word of the next page.
    return "\n\n".join(page_texts)

def text_splitter_and_similarity_search(
    documents,
    persist_directory="./drive/MyDrive/book2_embed",
    model_name="BAAI/bge-large-en-v1.5",
    device="cuda",
):
    """Embed documents and store them in a persisted Chroma collection.

    Despite its name, this function neither splits text nor runs a
    similarity search: it builds the embedding function and creates the
    vector store from already-split documents.

    Args:
        documents: Documents to embed; passed to ``Chroma.from_documents``.
        persist_directory: Directory handed to Chroma as the persistence
            location. Defaults to the previously hard-coded path.
        model_name: Sentence-transformer model used for the embeddings.
        device: Device string forwarded to the embedding model. Defaults to
            ``"cuda"``; pass ``"cpu"`` on machines without a GPU.

    Returns:
        The ``Chroma`` vector store returned by ``Chroma.from_documents``.
    """
    embedding_function = SentenceTransformerEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device, "trust_remote_code": True},
        encode_kwargs={"normalize_embeddings": True},
    )

    return Chroma.from_documents(
        documents,
        embedding_function,
        persist_directory=persist_directory,
    )

def get_docs(text):
    """Wrap raw text in a single Document and split it into chunks.

    Args:
        text: The full text to chunk.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for a one-element list holding the wrapped text, using a chunk size
        of 1000 and no overlap. The wrapped document carries the metadata
        ``{"source": "book"}``.
    """
    whole_text = Document(page_content=text, metadata={"source": "book"})
    chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return chunker.split_documents([whole_text])

def check_token_limit(new_documents):
    """Count the tokens the NeuralDaredevil tokenizer produces for the input.

    Note that, despite the name, no limit is enforced here; the function
    only returns the count so the caller can compare it to a limit.

    Args:
        new_documents: Either a single string, or an iterable whose items
            are strings or objects exposing a ``page_content`` string
            attribute (such as the ``Document`` chunks built in this file).

    Returns:
        int: Total number of token ids across all inputs (whatever the
        tokenizer emits, including any special tokens it adds). Returns 0
        for an empty iterable.
    """
    if isinstance(new_documents, str):
        texts = [new_documents]
    else:
        # Document objects must be unwrapped to their text before being
        # handed to the tokenizer; plain strings pass through unchanged.
        texts = [getattr(item, "page_content", item) for item in new_documents]

    if not texts:
        # Nothing to tokenize; also avoids loading the tokenizer needlessly.
        return 0

    tokenizer = AutoTokenizer.from_pretrained("mlabonne/NeuralDaredevil-8B-abliterated")
    tokens = tokenizer(texts)
    # For batched input, input_ids holds one id list per text, so its own
    # length is the number of texts — sum the inner lengths to count tokens.
    return sum(len(ids) for ids in tokens.input_ids)




In [None]:
def main():
    """Run the pipeline: PDF -> page images -> extracted text -> chunks -> Chroma.

    Reads ``book2.pdf`` from the working directory, writes page images to
    ``output_images``, builds the vector store, prints the token count of
    the extracted text, and finally reopens and prints the persisted store.
    """
    pdf_path = 'book2.pdf'
    output_dir = 'output_images'
    # Must be the directory text_splitter_and_similarity_search() persists
    # to; reopening any other path would yield a different, empty store.
    persist_directory = "./drive/MyDrive/book2_embed"

    # Convert PDF to images
    image_paths = convert_pdf_to_images(pdf_path, output_dir)

    # Extract text from images
    text = extract_text_from_images(image_paths)

    # Get documents from text
    documents = get_docs(text)

    # Create vector database
    db = text_splitter_and_similarity_search(documents)

    # Count tokens on the raw text: the tokenizer takes strings, not the
    # Document objects produced by get_docs().
    n_tokens = check_token_limit(text)
    print(f"Number of tokens: {n_tokens}")

    # Reopen the persisted vector database. `embeddings` is the public
    # accessor for the store's embedding function.
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=db.embeddings)
    print(vectordb)

if __name__ == "__main__":
    main()