In [1]:
import os
import pandas as pd


def leer_y_combinar_csv(directorio):
    """Read every ``.csv`` file directly inside ``directorio`` and stack them.

    Files are read in sorted filename order so the row order of the result
    does not depend on the arbitrary order returned by ``os.listdir``. The
    first column of each file is used as its index while reading; the
    combined frame then gets a fresh 0..n-1 index (``ignore_index=True``).

    Args:
        directorio: Path of the directory to scan (not recursive).

    Returns:
        A single DataFrame with the rows of all CSV files, or an empty
        DataFrame when the directory contains no ``.csv`` file.
    """
    archivos_csv = sorted(
        archivo for archivo in os.listdir(directorio) if archivo.endswith('.csv')
    )
    if not archivos_csv:
        # pd.concat raises ValueError on an empty list; an empty frame is a
        # friendlier result for "nothing to read".
        return pd.DataFrame()
    dataframes = [
        pd.read_csv(os.path.join(directorio, archivo), index_col=0)
        for archivo in archivos_csv
    ]
    return pd.concat(dataframes, ignore_index=True)


# Example usage: load the CSV that belongs to a single document.
directorio = './data/raw'
file_name = 'Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks.pdf'
# Disabled alternative: combine every CSV found in `directorio` instead.
# df_raw = leer_y_combinar_csv(directorio)
# The CSV is named file_name + '.csv'; its first column is used as the index.
df_raw = pd.read_csv(os.path.join(directorio, file_name + '.csv'), index_col=0)

In [2]:
# List the column names of the loaded frame.
df_raw.columns

Index(['type', 'element_id', 'text', 'coordinates_points',
       'coordinates_system', 'coordinates_layout_width',
       'coordinates_layout_height', 'last_modified', 'filetype', 'languages',
       'page_number', 'file_directory', 'filename', 'detection_class_prob',
       'parent_id', 'image_path'],
      dtype='object')

In [3]:
# Drop the coordinate-system, layout-size, timestamp and directory columns.
# Note that 'coordinates_points' is NOT in the list and is therefore kept.
# errors='ignore' makes the cell idempotent: running it again after the
# columns are already gone no longer raises KeyError.
df_raw = df_raw.drop(
    columns=['coordinates_system', 'coordinates_layout_width',
             'coordinates_layout_height', 'last_modified', 'file_directory'],
    errors='ignore',
)

In [4]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_ollama.chat_models import ChatOllama
from langchain_openai import ChatOpenAI

# Disabled alternative: run the classification with a local Ollama model.
# model = ChatOllama(model='llama3.1', temperature=0)
model = ChatOpenAI(model='gpt-4o', temperature=0.0)

# Optional side-car file "<file_name>.json" in the same directory as the CSV.
# Its raw text is kept in `meta`; when the file is missing or unreadable the
# notebook continues with an empty string (best effort).
try:
    with open(os.path.join(directorio, file_name + '.json'), 'r', encoding='utf-8') as archivo:
        meta = archivo.read()
except (OSError, UnicodeError):
    # Only file-access / decoding problems are tolerated; anything else
    # (including KeyboardInterrupt) is no longer silently swallowed.
    meta = ""


In [5]:
# Structured result requested from the LLM for one text block.
# NOTE(review): documented with `#` comments on purpose. A class docstring would
# presumably be picked up by the pydantic schema and could end up in the
# parser's format instructions - confirm before adding one.
class TextEvaluation(BaseModel):
    # Whether the text is coherent and understandable; defaults to False.
    makes_sense: bool = Field(
        default=False,
        description="Indicates whether the text is coherent and understandable."
    )
    # Free-form category label for the block; defaults to an empty string.
    block_type: str = Field(
        default="",
        description="Specifies the category of the text block, such as 'Title', 'Author Names', 'Bibliography', etc."
    )
    # Short summary/explanation of the block's content; defaults to an empty string.
    description: str = Field(
        default="",
        description="Provides a concise summary or explanation of the text block's content."
    )



In [6]:
# Prompt template used to re-classify one text block.
# {text_to_evaluate} and {old_type} are supplied on every call; {meta} and
# {format_instructions} are bound once below as partial variables.
meta_info = """
Review the previously classified text block:
<Text_to_evaluate>
    {text_to_evaluate}
</Text_to_evaluate>

Classification: {old_type}

Assessment Task:
1. Determine if the text makes sense by itself.
   - If it does not make sense, label it as 'UncategorizedText'.
   - If it does make sense, classify it into one of the following categories based on its content:

Categories:
- 'Title': For main and sectional headings of a document, such as "Annual Report," "Introduction," "Methodology," or more specific parts like "1. Introduction," "2. Methodology."
- 'Author Names': For listing one or more authors of the document.
- 'Bibliography': For parts of the bibliography section, which may include citations of books, articles, and online sources typically found at the end of academic works.
- 'FigureCaption': For labels and descriptions of figures or tables, such as "Figure 1: Population Trends over Time" or "Table 2: Detailed Revenue Comparison by Quarter."
- 'NarrativeText': For extensive text blocks that form the main content of the document. These are usually longer, containing multiple sentences and spanning several paragraphs, providing in-depth information and discussion.
- 'UncategorizedText': For text that does not clearly fit into any other categories or whose meaning is unclear.

Additional Meta Information:
<meta>
{meta}
</meta>

Formatting Instructions:
{format_instructions}

Please follow the formatting guidelines provided to ensure consistency in the classification process.
"""

# JSON parser built from TextEvaluation; its format instructions are injected
# into the prompt template below.
parser = JsonOutputParser(pydantic_object=TextEvaluation)

prompt = PromptTemplate(
    template=meta_info,
    input_variables=["text_to_evaluate", "old_type"],
    partial_variables={
        "meta": meta,
        "format_instructions": parser.get_format_instructions(),
    },
)

# Pipeline: fill the prompt, call the chat model, parse the reply with `parser`.
chain = prompt | model | parser


In [7]:
# Result columns start empty; rows that are skipped or that fail keep None.
df_raw["makes_sense"] = None
df_raw["block_type"] = None
df_raw["description"] = None
# Iterate over the real index labels (not range(len(...))) so the .loc lookups
# stay valid even when the index is not exactly 0..n-1.
for i in df_raw.index:
    old_type = df_raw.loc[i, 'type']
    # Images and tables are not sent to the text classifier.
    if old_type in ['Image', 'Table']:
        continue
    text_to_evaluate = df_raw.loc[i, 'text']

    try:
        result = chain.invoke({"text_to_evaluate": text_to_evaluate, "old_type": old_type})
        print(i, result)
        # Read every key before writing, so a reply with a missing key does
        # not leave a half-filled row behind.
        makes_sense = result["makes_sense"]
        block_type = result["block_type"]
        description = result["description"]
    except Exception as exc:
        # Skip this row (e.g. invalid JSON or a failed request), but report
        # which row failed and why. KeyboardInterrupt is not an Exception
        # subclass, so the loop can still be interrupted.
        print(i, "Failed", repr(exc))
        continue

    df_raw.loc[i, "makes_sense"] = makes_sense
    df_raw.loc[i, "block_type"] = block_type
    df_raw.loc[i, "description"] = description

0 {'makes_sense': False, 'block_type': 'UncategorizedText', 'description': "The text block '1 2 0 2' does not provide coherent or understandable information and does not fit into any of the specified categories."}
1 {'makes_sense': False, 'block_type': 'UncategorizedText', 'description': 'The text block does not form a coherent or understandable sentence or phrase.'}
2 {'makes_sense': False, 'block_type': 'UncategorizedText', 'description': 'The text block does not make sense and does not fit into any of the specified categories.'}
3 {'makes_sense': True, 'block_type': 'Title', 'description': 'The text is a title for a document or section, likely related to a method or approach in natural language processing (NLP) that involves retrieval-augmented generation for tasks requiring extensive knowledge.'}
4 {'makes_sense': True, 'block_type': 'Author Names', 'description': 'The text block lists the names of authors, specifically Patrick Lewis and Ethan Perez.'}
5 {'makes_sense': True, 'bloc

Muchos títulos no son detectados; tal vez haya que preguntar si el bloque puede ser el título de la siguiente sección, o si no se detecta porque es una frase corta o una sola palabra.

El modelo categoriza como fórmula un bloque de texto que incluye o explica un fragmento de una fórmula.

In [8]:
# Show the original `type` next to the new `block_type`, together with the text, description and context columns.
df_raw[['type', "block_type", 'text', "description", "makes_sense", 'languages', 'page_number', 'filename']]

Unnamed: 0,type,block_type,text,description,makes_sense,languages,page_number,filename
0,UncategorizedText,UncategorizedText,1 2 0 2,The text block '1 2 0 2' does not provide cohe...,False,['eng'],1,Retrieval-Augmented Generation for Knowledge-I...
1,Header,UncategorizedText,r p A 2 1 ] L C . s c [,The text block does not form a coherent or und...,False,['eng'],1,Retrieval-Augmented Generation for Knowledge-I...
2,UncategorizedText,UncategorizedText,4 v 1 0 4 1 1 . 5 0 0 2 : v i X r a,The text block does not make sense and does no...,False,['eng'],1,Retrieval-Augmented Generation for Knowledge-I...
3,Title,Title,Retrieval-Augmented Generation for Knowledge-I...,"The text is a title for a document or section,...",True,['eng'],1,Retrieval-Augmented Generation for Knowledge-I...
4,NarrativeText,Author Names,"Patrick Lewis'?, Ethan Perez*,","The text block lists the names of authors, spe...",True,['eng'],1,Retrieval-Augmented Generation for Knowledge-I...
...,...,...,...,...,...,...,...,...
219,Title,UncategorizedText,H Retrieval Collapse,The text 'H Retrieval Collapse' does not provi...,False,['eng'],19,Retrieval-Augmented Generation for Knowledge-I...
220,NarrativeText,NarrativeText,"In preliminary experiments, we observed that f...",The text discusses the behavior of a retrieval...,True,['eng'],19,Retrieval-Augmented Generation for Knowledge-I...
221,Title,Title,I Number of instances per dataset,This text block serves as a sectional heading ...,True,['eng'],19,Retrieval-Augmented Generation for Knowledge-I...
222,NarrativeText,FigureCaption,"The number of training, development and test d...","This text describes the contents of Table 7, w...",True,['eng'],19,Retrieval-Augmented Generation for Knowledge-I...


In [9]:
# Persist the classified frame. The output directory is created first, because
# DataFrame.to_csv does not create missing parent directories.
output_dir = './data/pre-procesamiento/'
os.makedirs(output_dir, exist_ok=True)
df_raw.to_csv(os.path.join(output_dir, file_name + '.1_2_openia.csv'))

In [15]:
import ollama


def describe_img(img_path, model='llava-llama3'):
    """Ask a vision model served by Ollama for a detailed description of an image.

    Args:
        img_path: Image reference forwarded unchanged to ``ollama.generate``
            as the single entry of ``images``.
        model: Name of the Ollama model to query. Defaults to
            ``'llava-llama3'``, the model this function used before the
            parameter existed.

    Returns:
        The value stored under ``'response'`` in the reply of
        ``ollama.generate``.
    """
    query = "Describe in comprehensive detail the contents of the image, focusing intensely on what it represents, its functions, and what it shows. Elaborate on the specific elements, their configurations, and how they contribute to the overall meaning or purpose of the image within the context of a scientific paper. Emphasize the significance of any notable features, colors, or patterns, explaining their relevance in understanding the figure. Ensure the description is thorough, avoiding personal references, and strictly maintain an objective tone by stating facts such as 'the image includes...' or 'the figure demonstrates...'"

    response = ollama.generate(
        model=model,
        prompt=query,
        images=[img_path]
    )
    return response['response']

In [18]:
# Description column starts empty; only rows of type 'Image' get a value.
df_raw["img_description"] = None
# Iterate over the real index labels (not range(len(...))) so the .loc lookups
# stay valid even when the index is not exactly 0..n-1.
for i in df_raw.index:
    if df_raw.loc[i, 'type'] != 'Image':
        continue
    df_raw.loc[i, 'img_description'] = describe_img(df_raw.loc[i, 'image_path'])
    # Progress output: row label and the image that was just described.
    print(i, df_raw.loc[i, 'image_path'])

12 ./images/Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks.pdf/figure-2-1.jpg
79 ./images/Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks.pdf/figure-7-2.jpg
80 ./images/Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks.pdf/figure-7-3.jpg
99 ./images/Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks.pdf/figure-8-4.jpg
198 ./images/Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks.pdf/figure-17-5.jpg


In [20]:
# Print the generated description stored for row 12 (hard-coded row label).
print(df_raw.loc[12, 'img_description'])

The image is a detailed illustration of the "Flickr-End-to-End Speech-to-Text" system, as described in a scientific paper. The system is depicted through a series of diagrams and charts, set against a white background.

At the top, there's a diagram representing the encoder. It consists of three vertical rectangles, each with a different colored label: "Query Encoder", "Memory Encoder", and "Recurrent Neural Network". These labels indicate the various components that make up the encoder part of the system.

Next to this diagram is a flowchart showing the flow of information through the system. The flowchart begins with an input box labeled "query", which represents the initial input to the system. From here, the information flows through three boxes: "Query Encoder", "Memory Encoder", and "Recurrent Neural Network". These boxes represent different stages of processing within the system.

Finally, there's a chart that illustrates the perplexment score at each point in time during the de

In [52]:
def generate_markdown_tables(img_path, model='llava-llama3'):
    """Ask a vision model served by Ollama to transcribe an image's tables as Markdown.

    Args:
        img_path: Image reference forwarded unchanged to ``ollama.generate``
            as the single entry of ``images``.
        model: Name of the Ollama model to query. Defaults to
            ``'llava-llama3'``, the model this function used before the
            parameter existed.

    Returns:
        The value stored under ``'response'`` in the reply of
        ``ollama.generate``.
    """
    query = """
    Transform the tables visible in the image into Markdown table format. 
    Do not include any descriptive text, only provide the tables in Markdown format.
    """

    response = ollama.generate(
        model=model,
        prompt=query,
        images=[img_path]
    )
    return response['response']

In [None]:
from IPython.display import display, Markdown

# Render the model's Markdown transcription of one table image.
# NOTE(review): this cell has no execution count (In [None]) and no recorded output.
display(Markdown(generate_markdown_tables(
    './images/Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks.pdf/table-6-1.jpg')))

In [21]:
import base64


def encode_image(image_path):
    """Return the content of the file at ``image_path`` as base64 text.

    The file is read in binary mode, base64-encoded, and the resulting
    bytes are decoded to a ``str`` using UTF-8.
    """
    with open(image_path, "rb") as fh:
        raw_bytes = fh.read()
    b64_bytes = base64.b64encode(raw_bytes)
    return b64_bytes.decode('utf-8')

# df_raw.loc[12, 'image_path']


In [27]:
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.schema import Prompt
from langchain.vision import LocalImageClassifier

# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'langchain_community'", i.e. it failed
# during the imports and none of the statements below ran.
# NOTE(review): `langchain.vision.LocalImageClassifier`, `langchain.schema.Prompt`
# and the `preprocessors=` argument of LLMChain could not be confirmed against
# LangChain's public API - verify before trying to make this cell work.
# NOTE(review): if it did run, it would rebind `chain` and `prompt`, which an
# earlier cell defines for the text-classification pipeline.

# The OpenAI client is created without explicit arguments.
llm = OpenAI()

# Set up the local image classifier
# This is a placeholder; you need to configure this according to your actual setup
# NOTE(review): `model_path` receives the image path of row 12, not a model file.
image_processor = LocalImageClassifier(model_path=df_raw.loc[12, 'image_path'])

# Create a LangChain with both the language model and the image processor
chain = LLMChain(llm=llm, preprocessors=[image_processor])

# Path of the image stored in row 12
image_path = df_raw.loc[12, 'image_path']

# Create a prompt with your image
prompt = Prompt(image=image_path)

# Process the image and get a description
result = chain.run(prompt)
print(result.output)


ModuleNotFoundError: No module named 'langchain_community'

In [30]:
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

# Multimodal client configured for the "gpt-4o" model.
openai_mm_llm = OpenAIMultiModal( model="gpt-4o")

In [37]:
# Build the image document inside this cell so that it does not rely on a
# cell located further down the notebook having been executed first.
from llama_index.core.schema import ImageDocument

img = ImageDocument(image_path=df_raw.loc[12, 'image_path'])

# Ask the multimodal model for a detailed, objective description of the image.
response_1 = openai_mm_llm.complete(
    prompt="Describe in comprehensive detail the contents of the image, focusing intensely on what it represents, its functions, and what it shows. Elaborate on the specific elements, their configurations, and how they contribute to the overall meaning or purpose of the image within the context of a scientific paper. Emphasize the significance of any notable features, colors, or patterns, explaining their relevance in understanding the figure. Ensure the description is thorough, avoiding personal references, and strictly maintain an objective tone by stating facts such as 'the image includes...' or 'the figure demonstrates...'",
    image_documents=[img],
)

print(response_1)

The image is a detailed schematic diagram illustrating an end-to-end backdrop process through two components, denoted as \( q \) and \( p_{\theta} \). The diagram is divided into three main sections: Query Encoder, Retriever \( p_n \) (Non-Parametric), and Generator \( p_{\theta} \) (Parametric).

1. **Query Encoder**:
   - The leftmost section of the diagram is labeled "Query Encoder."
   - It includes a box labeled \( q \) with an arrow pointing right, indicating the encoding of the input query.
   - Examples of input queries are provided, such as "Barack Obama was born in Hawaii." and "The Divine Comedy."
   - These examples are categorized under different tasks like Question Answering, Fact Verification, and Jeopardy Question Generation.

2. **Retriever \( p_n \) (Non-Parametric)**:
   - The middle section is labeled "Retriever \( p_n \) (Non-Parametric)."
   - It features a process where the encoded query \( q(x) \) is matched against a Document Index.
   - The matching process is

In [39]:
from IPython.display import display, Markdown
# Render the text of the response as Markdown instead of printing it as plain text.
display(Markdown(response_1.text))

The image is a detailed schematic diagram illustrating an end-to-end backdrop process through two components, denoted as \( q \) and \( p_{\theta} \). The diagram is divided into three main sections: Query Encoder, Retriever \( p_n \) (Non-Parametric), and Generator \( p_{\theta} \) (Parametric).

1. **Query Encoder**:
   - The leftmost section of the diagram is labeled "Query Encoder."
   - It includes a box labeled \( q \) with an arrow pointing right, indicating the encoding of the input query.
   - Examples of input queries are provided, such as "Barack Obama was born in Hawaii." and "The Divine Comedy."
   - These examples are categorized under different tasks like Question Answering, Fact Verification, and Jeopardy Question Generation.

2. **Retriever \( p_n \) (Non-Parametric)**:
   - The middle section is labeled "Retriever \( p_n \) (Non-Parametric)."
   - It features a process where the encoded query \( q(x) \) is matched against a Document Index.
   - The matching process is indicated by arrows leading to a box labeled "MIPS" (Maximum Inner Product Search).
   - The Document Index is represented by a series of stacked documents, each with a wave-like pattern, suggesting a collection of textual data.
   - The output of this retrieval process is denoted as \( d(z) \), which is then

In [35]:
from llama_index.core.schema import ImageDocument

# Wrap the image path stored in row 12 in an ImageDocument.
# NOTE(review): `img` is referenced by the openai_mm_llm.complete(...) cell that
# sits earlier in this file (execution counts 35 vs. 37).
img = ImageDocument(image_path=df_raw.loc[12, 'image_path'])