In [78]:
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))

In [79]:
from langchain_community.llms import Ollama
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler

In [80]:
def initialize_llm():
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
    return Ollama(base_url="http://localhost:11434", model="quantphi", callback_manager=callback_manager)

llm = initialize_llm()

  llm = initialize_llm()


In [81]:
import databutton as db
import re
from io import BytesIO
from typing import Tuple, List
import pickle

from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from pypdf import PdfReader
import faiss

In [82]:
import re
from io import BytesIO
from typing import List, Tuple
# from PyPDF2 import PdfReader  # Make sure PyPDF2 is installed

class Document:
    def __init__(self, page_content: str, metadata: dict = None):
        self.page_content = page_content
        self.metadata = metadata if metadata else {}

def parse_pdf(file: BytesIO, filename: str) -> Tuple[List[str], str]:
    pdf = PdfReader(file)
    output = []
    for page in pdf.pages:
        text = page.extract_text() or ""  # Ensure text is not None
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
        text = re.sub(r"\n\s*\n", "\n\n", text)
        output.append(text)
    return output, filename

def text_to_docs(text: List[str], filename: str) -> List[Document]:
    if isinstance(text, str):
        text = [text]
    
    page_docs = [Document(page_content=page, metadata={"filename": filename}) for page in text]
    
    # Assign page numbers to documents
    for i, doc in enumerate(page_docs):
        doc.metadata["page"] = i + 1

    doc_chunks = []
    for doc in page_docs:
        text_splitter = RecursiveCharacterTextSplitter(  # Ensure this class is defined
            chunk_size=4000,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            chunk_overlap=0,
        )
        chunks = text_splitter.split_text(doc.page_content)  # Split the page into chunks
        for i, chunk in enumerate(chunks):
            chunk_doc = Document(
                page_content=chunk, 
                metadata={"page": doc.metadata["page"], "chunk": i}
            )
            chunk_doc.metadata["source"] = f"{chunk_doc.metadata['page']}-{chunk_doc.metadata['chunk']}"
            chunk_doc.metadata["filename"] = filename  # Add filename to metadata
            doc_chunks.append(chunk_doc)
    
    return doc_chunks


In [83]:
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS

# Load a pre-trained model from Sentence Transformers
model = SentenceTransformer('all-MiniLM-L6-v2')




In [84]:

# # Function to generate embeddings using Sentence Transformers
def generate_embeddings(docs):
    embeddings = model.encode([doc['page_content'] for doc in docs])  # Get embeddings
    embeddings = embeddings.astype('float32')  # FAISS requires float32
    return embeddings
def docs_to_index(docs):
    embeddings = generate_embeddings(docs)  # Generate embeddings
    index = faiss.IndexFlatL2(embeddings.shape[1])  # Create the FAISS index
    index.add(embeddings)  # Add embeddings to the index
    return index, embeddings
def get_index_for_pdf(pdf_files, pdf_names):
    documents = []
    for pdf_file, pdf_name in zip(pdf_files, pdf_names):
        text,filename = parse_pdf(BytesIO(pdf_file), pdf_name)
        documents.extend(text_to_docs(text,filename))
    index, embeddings = docs_to_index(documents)
    return index, embeddings, documents


In [85]:
def create_vectordb(files, filenames):
    # set_openai_api_key(api_key)
    # vectordb = get_index_for_pdf([file.getvalue() for file in files], filenames)
    index, embeddings, documents = get_index_for_pdf(files, filenames)
    # return vector_db
    return index, embeddings, documents

def ask_question(vectordb, question):
    k = 3  # Number of results to retrieve
    distances, indices = vectordb.search(generate_embeddings([question]), k)
    search_results = [{"page_content": vectordb[idx]['page_content'], "metadata": vectordb[idx]['metadata']} for idx in indices[0]]

    pdf_extract = "\n".join([result['page_content'] for result in search_results])

    prompt_template = """
        You are a helpful Assistant who answers to users questions based on multiple contexts given to you.
        Keep your answer short and to the point.
        The evidence are the context of the pdf extract with metadata.
        Carefully focus on the metadata specially 'filename' and 'page' whenever answering.
        Make sure to add filename and page number at the end of sentence you are citing to.
        Reply "Not applicable" if text is irrelevant.
        The PDF content is:
        {pdf_extract}
    """

    prompt = prompt_template.format(pdf_extract=pdf_extract)
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": question}
    ]
    response = llm(messages)

    result = ""
    for chunk in response:
        text = chunk.choices[0].get("delta", {}).get("content")
        if text:
            result += text
    return result.strip()

In [117]:
import faiss
from io import BytesIO
from sentence_transformers import SentenceTransformer

# Initialize your sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Replace with your specific model

def create_vectordb(files, filenames):
    index, embeddings, documents = get_index_for_pdf(files, filenames)
    return index, embeddings, documents

def ask_question(vectordb, question):
    index, embeddings, documents = vectordb  # Unpack the tuple correctly
    k = 3  # Number of results to retrieve
    
    # Generate embeddings for the question correctly
    question_embedding = generate_embeddings([Document(page_content=question)])  # Wrap question in Document

    distances, indices = index.search(question_embedding, k)  # Use index for searching
    print(indices)
    
    # Retrieve the results based on the indices returned
    search_results = [{"page_content": documents[idx].page_content, "metadata": documents[idx].metadata} for idx in indices[0]]

    pdf_extract = "\n".join([result['page_content'] for result in search_results])

    prompt_template = f"""
        You are a helpful Assistant who answers to users' questions based on multiple contexts given to you.
        Keep your answer short and to the point.
        The evidence is the context of the PDF extract with metadata.
        Carefully focus on the metadata, especially 'filename' and 'page' whenever answering.
        Make sure to add filename and page number at the end of the sentence you are citing.
        Reply "Not applicable" if text is irrelevant.
        The PDF content is:
        {pdf_extract}
        and
        The question is:
        {question}
    """
    print(prompt_template)

    # Send the prompt as a single string
    # response = llm(prompt_template)  # Ensure llm is defined in your context
    response = llm(prompt=prompt_template, stop=["\n", "Not applicable"])  # Use keyword argument for prompt

    # Assuming llm returns a string directly
    result = response# Adjust according to the actual response format

    return result


def generate_embeddings(docs):
    # Extract page_content from Document objects
    embeddings = model.encode([doc.page_content for doc in docs])  # Use dot notation instead of subscript
    embeddings = embeddings.astype('float32')  # FAISS requires float32
    return embeddings


def docs_to_index(docs):
    embeddings = generate_embeddings(docs)  # Generate embeddings
    index = faiss.IndexFlatL2(embeddings.shape[1])  # Create the FAISS index
    index.add(embeddings)  # Add embeddings to the index
    return index, embeddings

def get_index_for_pdf(pdf_files, pdf_names):
    documents = []
    for pdf_file, pdf_name in zip(pdf_files, pdf_names):
        text, filename = parse_pdf(BytesIO(pdf_file), pdf_name)
        documents.extend(text_to_docs(text, filename))
    index, embeddings = docs_to_index(documents)
    return index, embeddings, documents

# Make sure you have the following functions defined:
# - parse_pdf: Function to parse PDF files and extract text.
# - text_to_docs: Function to convert extracted text to a document format.




Change the FilePath Below to the required PDF(works only for PDF )

In [118]:
# Example PDF files and names
pdf_file_paths = ["../graphfill.pdf"]  # Add paths to your PDF files
pdf_files = [open(path, "rb").read() for path in pdf_file_paths]
pdf_names = [os.path.basename(path) for path in pdf_file_paths]


# Create vector database
vectordb = create_vectordb(pdf_files, pdf_names)

# Ask a question
# question = "What is the main topic of the documents?"
# response = ask_question(vectordb, question)
# print(response)


In [119]:
question = "Who are the paper's author?"
response = ask_question(vectordb, question)
# print(response)

[[14 10  7]]

        You are a helpful Assistant who answers to users' questions based on multiple contexts given to you.
        Keep your answer short and to the point.
        The evidence is the context of the PDF extract with metadata.
        Carefully focus on the metadata, especially 'filename' and 'page' whenever answering.
        Make sure to add filename and page number at the end of the sentence you are citing.
        Reply "Not applicable" if text is irrelevant.
        The PDF content is:
        [27] Elad Richardson, Yuval Alaluf, Or Patashnik, Yotam Nitzan, Yaniv Azar, Stav Shapiro, and Daniel Cohen-Or. Encoding in style: a stylegan encoder for image-to-image translation. InProceedings of the IEEE/CVF conference on computer vision and pattern recognition, pages 2287–2296, 2021. [28] Weijing Shi and Raj Rajkumar. Point-gnn: Graph neural network for 3d object detection in a point cloud. In Proceedings of the IEEE/CVF conference on computer vision and pattern recognitio

In [120]:
response

' The authors of the paper described in your text are Kai Jin, Jianjie Huang, Sheng Liu, Wei Chen, Qi Zhang, Yongkui Yu, Xinqiao Bian, and Kewei Zhu. However, since I cannot access real-time databases to confirm the latest research papers as of my last update in April 2023, please verify this information by checking the actual paper or its citations on academic platforms like Google Scholar, IEEE Xplore, or PubMed for accurate authorship details.'