In [1]:
import os
import getpass
# Switch on the LANGCHAIN_TRACING_V2 flag for this kernel process; LangChain reads
# it to enable LangSmith tracing. No tracing credentials are set in this cell --
# presumably they come from the surrounding environment (TODO confirm).
os.environ.update(LANGCHAIN_TRACING_V2='true')


In [2]:
from langchain_openai import ChatOpenAI
# Chat model instance used by the question-answering cell further down.
llm = ChatOpenAI(
    model="gpt-4o-mini",
)

In [3]:
# Directory (relative to the notebook's working directory) that holds the PDFs.
pdf_path = 'pdfs'
# os.listdir() returns entries in arbitrary order, so sort them to get a
# reproducible listing; compare the extension case-insensitively so files such as
# "paper.PDF" are picked up as well.
# NOTE: the following cell reassigns pdf_files to a hand-written list of dicts, so
# this listing only serves as an inventory of what is on disk.
pdf_files = [
    os.path.join(pdf_path, name)
    for name in sorted(os.listdir(pdf_path))
    if name.lower().endswith('.pdf')
]

In [4]:
# Papers to ingest. Every entry is a dict with the keys: path, title, authors.
pdf_files = [
    {
        'path': 'pdfs/1706.03762v7.pdf',
        'title': 'Attention Is All You Need',
        'authors': (
            'Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and '
            'Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and '
            'Kaiser, Lukasz and Polosukhin, Illia'
        ),
    },
    {
        'path': 'pdfs/srivastava14a.pdf',
        'title': 'Dropout: A Simple Way to Prevent Neural Networks from Overfitting',
        'authors': (
            'Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and '
            'Sutskever, Ilya and Salakhutdinov, Ruslan'
        ),
    },
]

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

# Load every PDF listed in pdf_files, tag each page with a human-readable
# "Title/Authors" string, split the pages into overlapping chunks, and embed the
# chunks into a Chroma collection persisted under db/.
gather_pages = []
for pdf_file in pdf_files:
    # PyPDFLoader accepts the file path as a string; passing extract_images=True
    # additionally extracts images.
    loader = PyPDFLoader(pdf_file["path"])
    # load() returns the loader's documents as-is. The previous load_and_split()
    # call first chunked them with a default splitter, so the text was split twice
    # (once implicitly, once by text_splitter below); that method is also
    # documented as deprecated.
    pages = loader.load()
    # Replace the loader's "source" metadata with the citation text that
    # format_context() later prints in front of each chunk.
    metadata = f"Title: {pdf_file['title']}\nAuthors: {pdf_file['authors']}"
    for page in pages:
        page.metadata["source"] = metadata
    gather_pages.extend(pages)

# 500-character chunks with a 100-character overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
splits = text_splitter.split_documents(gather_pages)

# NOTE(review): no ids are passed, so re-running this cell against an existing db/
# appears to add the same chunks again -- confirm, and clear db/ before re-ingesting.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(model="text-embedding-3-small"), persist_directory="db/")

In [50]:
# Open the Chroma store persisted under db/ with the text-embedding-3-small
# embedding model, without re-ingesting any documents.
vectorstore = Chroma(
    persist_directory="db/",
    embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
)

In [51]:
def format_context(context, page_offset=1):
    """Format retrieved chunks into the context section of the prompt.

    Each chunk is rendered as its ``source`` metadata, a ``Page:`` line and a
    ``Page content:`` line, followed by an empty line.

    Args:
        context: list of tuples (document, score) from the retriever; only the
            document (first element of each tuple) is used.
        page_offset: value added to integer ``page`` metadata before printing.
            PyPDFLoader stores 0-based page indices, so the default of 1 prints
            the page number a reader would cite. Pass 0 to print the stored
            value unchanged. Non-integer page values are printed as they are.
    Returns:
        str: formatted context (an empty string for an empty list)
    Raises:
        KeyError: if a document has no "source" or "page" metadata entry.
    """
    sections = []
    for entry in context:
        doc = entry[0]
        source = doc.metadata["source"]
        page = doc.metadata["page"]
        if isinstance(page, int):
            # Convert the stored page index into a citable page number.
            page = page + page_offset
        sections.append(
            f"{source}\nPage: {page}\nPage content: {doc.page_content}\n\n"
        )
    # Join once instead of growing a string inside the loop.
    return "".join(sections)


In [52]:
# Retrieve the chunks most similar to the question, build one prompt from them,
# and ask the chat model for an answer with footnote-style references.
query = "Is reproduction an important concept in computer science?"
num_context_chunks = 10
# Each result is a (document, score) tuple; format_context() uses only the document.
results = vectorstore.similarity_search_with_score(query, k=num_context_chunks)

formatted_context = format_context(results)
# Adjacent string literals are concatenated by the parser, so the prompt text is
# the same as before without backslash continuations.
prompt = (
    "Systemprompt:\nYou are an assistant for question-answering tasks. Use the following pieces of "
    "retrieved context to answer the question and give reference to the publication title, authors "
    "and page as footnotes but without duplicate references. If you don't know the answer, just say "
    "that you don't know. Please keep the answer concise and precise.\n\n"
    f"Question:\n{query}\n\n"
    f"Context:\n{formatted_context}"
)

# call_as_llm() is deprecated on chat models; invoke() is the supported entry point
# and returns a message whose .content holds the answer text.
answer = llm.invoke(prompt).content
print(answer)

Reproduction is an important concept in computer science, particularly in the context of evolutionary algorithms and neural network training. The principles of sexual reproduction, such as gene mixing and mutation, are often applied to optimize algorithms, enhancing robustness and adaptability in machine learning models. This concept is exemplified in the work that discusses dropout techniques in neural networks, drawing parallels between biological evolution and algorithmic improvement to prevent overfitting and enhance performance^1.

---
^1 Srivastava, Nitish et al. "Dropout: A Simple Way to Prevent Neural Networks from Overfitting," page 3.
