In [6]:
%pip install datasets
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter
from datasets import Dataset
import pandas as pd


# Initial configuration
pdf_folder = '../../reports-pdf'  # Replace with the path to your PDF folder
ollama_model = 'tinyllama'  # Or 'llama2' depending on the model you wish to use



Note: you may need to restart the kernel to use updated packages.


In [None]:

# Initialize the model and embeddings
model = Ollama(model=ollama_model)
embeddings = OllamaEmbeddings(model=ollama_model)

# Define the questions
def generate_questions(based_on):
    return [
        f"¿Cuáles son los hallazgos más relevantes en  {based_on}?",
        f"¿Qué eventos específicos se describen en {based_on}?",
        f"What recommendations are suggested in {based_on}?"
    ]

# Prompt template
template = """
Answer the question based on the following context. If you cannot answer, say "I don't have enough information to answer that question."

Context: {context}

Question: {question}
"""
prompt = PromptTemplate.from_template(template)

# List to store the instances
instances = []

# Iterate over the PDF files in the folder
for file in os.listdir(pdf_folder):
    if file.endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder, file)
        
        # Load and extract text from the PDF
        loader = PyPDFLoader(pdf_path)
        pages = loader.load_and_split()
        
        if pages:  # Check if there are pages with text
            # Create a vector store from the document pages
            vectorstore = DocArrayInMemorySearch.from_documents(pages, embedding=embeddings)
            retriever = vectorstore.as_retriever()

            # Generate questions
            questions = generate_questions(file)
            
            for question in questions:
                # Retrieve relevant context for the question
                relevant_docs = retriever.get_relevant_documents(question)
                context = " ".join([doc.page_content for doc in relevant_docs])
                
                # Create the prompt
                formatted_prompt = prompt.format(context=context, question=question)
                print("search response for "+ file)
                # Get the model's response
                response = model.invoke(formatted_prompt)
                
                # Store the question and response
                instances.append({
                    "context": context,
                    "question": question,
                    "answer": response,
                     "generated_by": ollama_model
                })



Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 49 0 (offset 0)
Ignoring wrong pointing object 137 0 (offset 0)
  relevant_docs = retriever.get_relevant_documents(question)


search response for 058-CI-00233.pdf
search response for 058-CI-00233.pdf
search response for 058-CI-00233.pdf
search response for 058-CI-00235.pdf
search response for 058-CI-00235.pdf
search response for 058-CI-00235.pdf
search response for 058-CI-00627.pdf
search response for 058-CI-00627.pdf
search response for 058-CI-00627.pdf
search response for 058-CI-00628.pdf
search response for 058-CI-00628.pdf
search response for 058-CI-00628.pdf
search response for 058-CI-00661.pdf
search response for 058-CI-00661.pdf
search response for 058-CI-00661.pdf
search response for 058-CI-00662.pdf
search response for 058-CI-00662.pdf
search response for 058-CI-00662.pdf
search response for 058-CI-00771.pdf
search response for 058-CI-00771.pdf
search response for 058-CI-00771.pdf
search response for 058-CI-00772.pdf
search response for 058-CI-00772.pdf
search response for 058-CI-00772.pdf
search response for 058-CI-00776.pdf
search response for 058-CI-00776.pdf
search response for 058-CI-00776.pdf


KeyboardInterrupt: 

In [8]:

# Convert the list of instances to a DataFrame
df = pd.DataFrame(instances)

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Push the dataset to the Hugging Face Hub
# Replace 'your-username' with your Hugging Face username
# Replace 'your-dataset-name' with your desired dataset name
dataset.push_to_hub("jdavit/colombian-conflict-SQA")

print("Dataset uploaded successfully!")

# To view the dataset, you can use the following URL:
print(f"View your dataset at: https://huggingface.co/datasets/your-username/your-dataset-name")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<?, ?ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:10<00:00, 10.58s/it]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Dataset uploaded successfully!
View your dataset at: https://huggingface.co/datasets/your-username/your-dataset-name


: 