In [14]:
import os
# Import langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [15]:
from dotenv import load_dotenv

# Load environment variables from the .env file (if one is present)
load_dotenv()

# Get the OpenAI API key from environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')

# Treat an empty value (e.g. a bare `OPENAI_API_KEY=` line) the same as a
# missing one, so the problem is reported here rather than by a later API call.
if not openai_api_key:
    raise ValueError("OpenAI API key not found in .env file.")

# No write-back to os.environ is needed: os.getenv reads from os.environ, so a
# non-empty result means the variable is already set for this process.

In [16]:
# Location of the PDF to index. Set the PDF_PATH environment variable to use a
# different file; when it is unset or empty the original hard-coded location
# is used, so existing behaviour is unchanged.
path = os.environ.get("PDF_PATH") or r"C:\Users\himan\Downloads\Attention is all you need.pdf"

# Helper function to replace tabs with spaces in texts
def replace_t_with_space(texts):
    """
    Swap every tab character for a single space in each item's `page_content`.

    Items are modified in place; anything without a `page_content` attribute
    is left untouched.

    Args:
        texts: Iterable of objects, typically carrying a `page_content` string.

    Returns:
        The same `texts` object that was passed in.
    """
    for item in texts:
        # Skip entries that do not expose page_content
        if not hasattr(item, 'page_content'):
            continue
        untabbed = item.page_content.replace('\t', ' ')
        item.page_content = untabbed
    return texts

In [17]:
# Function to encode the PDF into a vector store
def encode_pdf(path, chunk_size=1000, chunk_overlap=200):
    """
    Build a FAISS vector store from the contents of a PDF file.

    The PDF is loaded with PyPDFLoader, split into overlapping chunks with
    RecursiveCharacterTextSplitter, cleaned of tab characters, and embedded
    with OpenAIEmbeddings.

    Args:
        path: The path to the PDF file.
        chunk_size: Passed to the splitter as its `chunk_size`.
        chunk_overlap: Passed to the splitter as its `chunk_overlap`.

    Returns:
        The FAISS vector store created from the cleaned chunks.
    """
    # Read the PDF into documents
    pages = PyPDFLoader(path).load()

    # Cut the documents into chunks, then normalise tabs to spaces
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = replace_t_with_space(splitter.split_documents(pages))

    # Embed the chunks and index them
    return FAISS.from_documents(chunks, OpenAIEmbeddings())

In [18]:
# Build the FAISS index for the PDF at `path`.
# NOTE(review): the same three statements are repeated in a later cell (after
# the pypdf install); each call to encode_pdf embeds the whole document again.
chunks_vector_store = encode_pdf(path)

# Wrap the index in a retriever configured with k=2
chunks_query_retriever = chunks_vector_store.as_retriever(
    search_kwargs=dict(k=2),
)

# Sample question used to exercise the retriever
test_query = "What is the summary of paper?"

In [19]:
pip install pypdf

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.2
[notice] To update, run: C:\Users\himan\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [20]:
# Build the FAISS index for the PDF at `path`.
# NOTE(review): an earlier cell (before the pypdf install) contains the same
# three statements; each call to encode_pdf embeds the whole document again.
chunks_vector_store = encode_pdf(path)

# Wrap the index in a retriever configured with k=2
chunks_query_retriever = chunks_vector_store.as_retriever(
    search_kwargs=dict(k=2),
)

# Sample question used to exercise the retriever
test_query = "What is the summary of paper?"

In [21]:
# Helper function to retrieve context based on a query
def retrieve_context_per_question(query, retriever):
    """
    Ask `retriever` for the documents relevant to `query`.

    Args:
        query: The question text passed to the retriever.
        retriever: Object exposing `get_relevant_documents(query)`.

    Returns:
        Whatever `retriever.get_relevant_documents(query)` returns.
    """
    return retriever.get_relevant_documents(query)

# Fetch the documents the retriever considers relevant to the test query
context = retrieve_context_per_question(
    query=test_query,
    retriever=chunks_query_retriever,
)

# Helper function to display the retrieved context
def show_context(context):
    """
    Print each retrieved document's text under a numbered heading.

    Args:
        context: Iterable of objects exposing a `page_content` attribute.

    Returns:
        None. Output is written to stdout: a "Document N:" heading, the
        document's `page_content`, then an 80-dash rule, for every document.
    """
    divider = "\n" + "-" * 80 + "\n"
    # Number documents from 1 for display
    for position, doc in enumerate(context, start=1):
        print(f"Document {position}:\n")
        print(doc.page_content)
        print(divider)

# Print the documents retrieved for the test query
show_context(context=context)

Document 1:

networks. In Advances in Neural Information Processing Systems , pages 3104–3112, 2014.
[30] Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, and Zbigniew Wojna.
Rethinking the inception architecture for computer vision. CoRR , abs/1512.00567, 2015.
[31] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang
Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine
translation system: Bridging the gap between human and machine translation. arXiv preprint
arXiv:1609.08144 , 2016.
[32] Jie Zhou, Ying Cao, Xuguang Wang, Peng Li, and Wei Xu. Deep recurrent models with
fast-forward connections for neural machine translation. CoRR , abs/1606.04199, 2016.
11

--------------------------------------------------------------------------------

Document 2:

convolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer,
the approach we take in our model.
As side beneﬁt, s