In [None]:

import os
import uuid
import base64
from unstructured.partition.pdf import partition_pdf
from langchain.chat_models import ChatOllama
from langchain.embeddings import OpenAIEmbeddings
from langchain_experimental.open_clip import OpenCLIPEmbeddings

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.schema.document import Document
from langchain.vectorstores import FAISS

# Credentials must not be hard-coded in the notebook: read the key from the
# environment, keeping the previous placeholder value as the fallback.
openai_api_key = os.getenv("OPENAI_API_KEY", "some")


In [None]:
# Input PDF to process (relative to the notebook's working directory).
BOOK_ROUTE = "../books/libro-cocina.pdf"
# Sub-directory (relative to SAVE_PATH) where extracted images are written/read.
OUTPUT_PATH = "data_final/"
# Project root used as the base for every output path. It is taken from the
# PROJECT_PATH environment variable (e.g. "/home/gilbert/Documentos/experiments/mm-rag/").
# When the variable is unset, os.getenv would return None and every later
# os.path.join(SAVE_PATH, ...) would fail with a TypeError, so fall back to the
# current working directory instead.
SAVE_PATH = os.getenv("PROJECT_PATH", os.getcwd())

In [None]:
# Partition the PDF into chunked elements; the options below are passed to
# partition_pdf unchanged, together with the image output directory built
# from SAVE_PATH and OUTPUT_PATH.
partition_options = {
    "extract_images_in_pdf": True,
    "infer_table_structure": True,
    "chunking_strategy": "by_title",
    "max_characters": 4000,
    "new_after_n_chars": 3800,
    "combine_text_under_n_chars": 2000,
    "image_output_dir_path": os.path.join(SAVE_PATH, OUTPUT_PATH),
}
raw_pdf_elements = partition_pdf(filename=BOOK_ROUTE, **partition_options)

In [None]:
# Display the elements returned by partition_pdf for manual inspection.
raw_pdf_elements

In [None]:
# Original contents and their generated summaries, kept as parallel lists
# (one pair of lists per element kind).
text_elements, table_elements = [], []
text_summaries, table_summaries = [], []

# Prompt template (Spanish) with two placeholders: the element kind and the
# element to summarize.
summary_prompt = """
Eres un asistente encargado de resumir tablas y texto para recuperarlos. \
Estos resúmenes se incrustarán y utilizarán para recuperar el texto sin formato o los elementos de la tabla. \
Proporcione un resumen conciso de la tabla o texto que esté bien optimizado para su recuperación.
Tipo de texto: {element_type}
Texto: {element}:

"""
summary_chain = LLMChain(
    prompt=PromptTemplate.from_template(summary_prompt),
    llm=ChatOllama(model="llama3", max_tokens=1024),
)

for element in raw_pdf_elements:
    # Elements are classified by looking for the class name in their repr;
    # anything that is neither a composite text chunk nor a table is skipped.
    if 'CompositeElement' in repr(element):
        element_type, originals, summaries = 'text', text_elements, text_summaries
    elif 'Table' in repr(element):
        element_type, originals, summaries = 'table', table_elements, table_summaries
    else:
        continue

    # Store the raw text first, then the summary; the element object itself
    # (not only its text) is what gets passed into the prompt.
    originals.append(element.text)
    summaries.append(
        summary_chain.run({'element_type': element_type, 'element': element})
    )

In [None]:
# Display the collected raw text chunks for manual inspection.
text_elements

In [None]:

# Accumulators for the image stage: the images themselves and their summaries,
# filled in parallel by the loop at the end of this cell.
image_elements = []
image_summaries = []

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string.

    The file is read in binary mode and the base64 bytes are decoded as UTF-8.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

def summarize_image(encoded_image):
    """Ask the ``llava`` chat model to summarize a base64-encoded image.

    The system message (Spanish) requests a concise, retrieval-oriented summary
    and instructs the model to answer 'No es una receta' when the image shows
    only text rather than a dish. The image is sent as a ``data:`` URL.

    Args:
        encoded_image: base64 text of the image, as produced by ``encode_image``.

    Returns:
        The ``content`` attribute of the model response.
    """
    system_message = SystemMessage(content="""Eres un asistente encargado de resumir imágenes de recetas de cocina.Ofrezca un resumen conciso de la imagen que esté bien optimizado para su recuperación.
    
    Si en la imagen no hay nada asociado a un plato de comida, es decir esta relacionado solo a texto, responde con el siguiente texto:
    'No es una receta'""")
    image_payload = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
    }
    image_message = HumanMessage(content=[image_payload])

    # A new chat model client is created for each call.
    vision_model = ChatOllama(model="llava", max_tokens=1024)
    return vision_model.invoke([system_message, image_message]).content

# Encode and summarize every image file found in the extraction directory.
images_dir = os.path.join(SAVE_PATH, OUTPUT_PATH)
for i in os.listdir(images_dir):  # `i` is the file name of each directory entry
    if not i.endswith(('.png', '.jpg', '.jpeg')):
        continue  # ignore anything that is not an image file
    image_path = os.path.join(images_dir, i)
    encoded_image = encode_image(image_path)
    # Keep both lists aligned: the encoded image first, then its summary.
    image_elements.append(encoded_image)
    summary = summarize_image(encoded_image)
    image_summaries.append(summary)

In [None]:
# Sanity check: keep only the summaries that are exactly 'No es una receta'
# and show how many there are.
# NOTE(review): this counts the images flagged as non-recipes; if the intent
# was to count the usable images, the comparison should be inverted — confirm.
test_img = [img for img in image_summaries if img == 'No es una receta']
len(test_img)

In [None]:
# Build one Document per summarized element. The summary is the text that is
# embedded (page_content); the original content travels in the metadata so it
# can be returned at retrieval time.
documents = []
retrieve_contents = []

# Text chunks: summary is embedded, raw text is kept as the original content.
for e, s in zip(text_elements, text_summaries):
    doc_id = str(uuid.uuid4())
    documents.append(
        Document(
            page_content=s,
            metadata={
                'id': doc_id,
                'type': 'text',
                'original_content': e,
            },
        )
    )
    retrieve_contents.append((doc_id, e))

# Tables: each one gets its own fresh id. Previously this loop reused whatever
# value `i` held from an earlier loop, so every table shared the same id.
for e, s in zip(table_elements, table_summaries):
    doc_id = str(uuid.uuid4())
    documents.append(
        Document(
            page_content=s,
            metadata={
                'id': doc_id,
                'type': 'table',
                'original_content': e,
            },
        )
    )
    retrieve_contents.append((doc_id, e))

# Images: same id fix as for tables. Images whose summary is exactly the
# rejection text requested in the image prompt are left out of the index.
for e, s in zip(image_elements, image_summaries):
    if s == 'No es una receta':
        continue
    doc_id = str(uuid.uuid4())
    documents.append(
        Document(
            page_content=s,
            metadata={
                'id': doc_id,
                'type': 'image',
                'original_content': e,
            },
        )
    )
    # For images the summary (not the base64 payload) is stored alongside the id.
    retrieve_contents.append((doc_id, s))

# Embed the summaries and build the FAISS index.
vectorstore = FAISS.from_documents(documents=documents, embedding=OpenCLIPEmbeddings())

In [None]:
# Persist the FAISS index under <SAVE_PATH>/vectordb/faiss_index.
vectorstore.save_local(os.path.join(SAVE_PATH, "vectordb/faiss_index"))
