In [1]:
import os
import glob
from pathlib import Path
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# LangChain imports
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.schema import Document

In [2]:
class Config:
    PDF_DIRECTORY = "papers/"  # Directory where PDF papers are stored
    VECTOR_DB_PATH = "./vector_db"  # Directory where vectorial data base is stored
    MODEL_NAME = "llama3.2"  # Ollama Model
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # Embedding Model
    CHUNK_SIZE = 1000 # Size for the chunk decomposition
    CHUNK_OVERLAP = 200
    SEARCH_K = 4  # number of chunks to retrieve

config = Config()


In [3]:
class ResearchAssistant:
    def __init__(self, config):
        self.config = config
        self.vectorstore = None
        self.qa_chain = None
        self.setup_embeddings()
        
    def setup_embeddings(self):
        """Configures model embeddings"""
        self.embeddings = HuggingFaceEmbeddings(
            model_name=self.config.EMBEDDING_MODEL,
            model_kwargs={'device': 'cuda'},  # Use 'cuda' if you have a GPU, otherwise use 'cpu'
            encode_kwargs={'normalize_embeddings': True}
        )
        print(f"Embeddings correctly configured: {self.config.EMBEDDING_MODEL}")
    
    def load_and_process_pdfs(self):
        """Load and process all the PDFs in the directory"""
        pdf_files = glob.glob(os.path.join(self.config.PDF_DIRECTORY, "*.pdf"))
        
        if not pdf_files:
            raise ValueError(f"PDFs not found in {self.config.PDF_DIRECTORY}")
        
        print(f"{len(pdf_files)} PDF files found")
        
        all_documents = []
        for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
            try:
                loader = PyPDFLoader(pdf_file)
                documents = loader.load()
                
                # Add file's metadata
                for doc in documents:
                    doc.metadata['source_file'] = os.path.basename(pdf_file)
                
                all_documents.extend(documents)
                print(f"✓ {os.path.basename(pdf_file)}: {len(documents)} pages")
                
            except Exception as e:
                print(f"✗ Error with {pdf_file}: {str(e)}")
        
        return all_documents
    
    def split_documents(self, documents):
        """Divide the documents in chunks"""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.config.CHUNK_SIZE,
            chunk_overlap=self.config.CHUNK_OVERLAP,
            length_function=len,
        )
        
        chunks = text_splitter.split_documents(documents)
        print(f"Documents divided in {len(chunks)} chunks")
        return chunks
    
    def create_vectorstore(self, chunks, use_chroma=True):
        """Create the vectorial database"""
        if use_chroma:
            self.vectorstore = Chroma.from_documents(
                documents=chunks,
                embedding=self.embeddings,
                persist_directory=self.config.VECTOR_DB_PATH
            )
            self.vectorstore.persist()
        else:
            self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
            self.vectorstore.save_local(self.config.VECTOR_DB_PATH)
        
        print("Vector store created and saved")
        return self.vectorstore
    
    def load_vectorstore(self, use_chroma=True):
        """Load an existent vectorstore"""
        if use_chroma:
            self.vectorstore = Chroma(
                persist_directory=self.config.VECTOR_DB_PATH,
                embedding_function=self.embeddings
            )
        else:
            self.vectorstore = FAISS.load_local(
                self.config.VECTOR_DB_PATH, 
                self.embeddings, 
                allow_dangerous_deserialization=True
            )
        print("Vector store loaded")
        return self.vectorstore
    
    def setup_qa_chain(self):
        """Configure the string question-answer"""
        if not self.vectorstore:
            raise ValueError("Create or load vectorstore first")
        
        retriever = self.vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": self.config.SEARCH_K}
        )
        
        # Initialize the Ollama model
        llm = Ollama(
            model=self.config.MODEL_NAME,
            temperature=0.1,  # Low temperatrure for precisse answers
            num_ctx=4096  # Wide context
        )
        
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"verbose": False}
        )
        
        print("QA configured correctly")
        return self.qa_chain
    
    def ask_question(self, question, verbose=False):
        """Makes a question to the assistant"""
        if not self.qa_chain:
            raise ValueError("Configure the QA first")
        
        result = self.qa_chain.invoke({"query": question})
        
        if verbose:
            print(f"Question: {question}")
            print(f"Answer: {result['result']}")
            print("\nSources consulted:")
            for i, doc in enumerate(result['source_documents']):
                print(f"{i+1}. {doc.metadata['source_file']} - Page {doc.metadata.get('page', 'N/A')}")
            print("=" * 60)
        
        return result

In [4]:
assistant = ResearchAssistant(config)

  self.embeddings = HuggingFaceEmbeddings(


Embeddings correctly configured: sentence-transformers/all-MiniLM-L6-v2


In [5]:
pdf_files = glob.glob(os.path.join(config.PDF_DIRECTORY, "*.pdf"))
print("PDF files found:")
for pdf in pdf_files:
    print(f"  - {os.path.basename(pdf)}")

PDF files found:
  - 2022 Roadmap on integrated quantum photonics.pdf
  - 4H-SiC microring resonators for nonlinear integrated photonics.pdf
  - A solid-state light–matter interface at the single-photon level.pdf
  - An integrated photonic circuit for color qubit preparation by third‑order nonlinear interactions.pdf
  - An Introduction to Quantum Computing for Non-Physicists.pdf
  - Analysis of the Steady-state and Switch-on Characteristics of a Nonlinear Fibre Optic Ring Resonator.pdf
  - Analytical Approaches to the Description of Optical Microresonator Devices.pdf
  - Approaches for a quantum memory at telecommunication wavelengths.pdf
  - Bi-photon spectral correlation measurements from a silicon nanowire in the quantum and classical regimes.pdf
  - Broadband waveguide quantum memory for entangled photons.pdf
  - Characterizing and tailoring the spectro-temporal mode of photon pairs generated in few-mode fiber.pdf
  - Chip-integrated visible–telecom entangled photon pair source for

In [6]:
print("Processing PDFs...")
documents = assistant.load_and_process_pdfs()
chunks = assistant.split_documents(documents)
vectorstore = assistant.create_vectorstore(chunks, use_chroma=True)

Processing PDFs...
93 PDF files found


Processing PDFs:   0%|          | 0/93 [00:00<?, ?it/s]

✓ 2022 Roadmap on integrated quantum photonics.pdf: 86 pages
✓ 4H-SiC microring resonators for nonlinear integrated photonics.pdf: 4 pages
✓ A solid-state light–matter interface at the single-photon level.pdf: 5 pages
✓ An integrated photonic circuit for color qubit preparation by third‑order nonlinear interactions.pdf: 12 pages
✓ An Introduction to Quantum Computing for Non-Physicists.pdf: 36 pages
✓ Analysis of the Steady-state and Switch-on Characteristics of a Nonlinear Fibre Optic Ring Resonator.pdf: 10 pages
✓ Analytical Approaches to the Description of Optical Microresonator Devices.pdf: 27 pages
✓ Approaches for a quantum memory at telecommunication wavelengths.pdf: 12 pages
✓ Bi-photon spectral correlation measurements from a silicon nanowire in the quantum and classical regimes.pdf: 9 pages
✓ Broadband waveguide quantum memory for entangled photons.pdf: 4 pages
✓ Characterizing and tailoring the spectro-temporal mode of photon pairs generated in few-mode fiber.pdf: 5 pages
✓ 

Ignoring wrong pointing object 2 65536 (offset 0)
Ignoring wrong pointing object 9 65536 (offset 0)
Ignoring wrong pointing object 15 65536 (offset 0)
Ignoring wrong pointing object 21 65536 (offset 0)
Ignoring wrong pointing object 27 65536 (offset 0)
Ignoring wrong pointing object 33 65536 (offset 0)
Ignoring wrong pointing object 39 65536 (offset 0)
Ignoring wrong pointing object 45 65536 (offset 0)
Ignoring wrong pointing object 51 65536 (offset 0)
Ignoring wrong pointing object 57 65536 (offset 0)
Ignoring wrong pointing object 63 65536 (offset 0)
Ignoring wrong pointing object 69 65536 (offset 0)


✓ Theory of cavity-enhanced spontaneous four wave mixing.pdf: 12 pages
✓ Theory of two-photon entanglement in type-II optical parametric down-conversion.pdf: 12 pages
✓ Time-bin entangled photon pair generation from Si micro-ring resonator.pdf: 11 pages
✓ Two-photon interference the Hong–Ou–Mandel effect.pdf: 26 pages
✓ Ultranarrow-Band Photon-Pair Source Compatible with Solid State Quantum Memories and Telecommunication Networks.pdf: 5 pages
✓ Wavelength conversion in GaAs micro-ring resonators.pdf: 3 pages
✓ What are single photons good for .pdf: 8 pages
Documents divided in 8068 chunks
Vector store created and saved


  self.vectorstore.persist()


In [7]:
assistant.setup_qa_chain()

print("Configuration complete")
print(f"Model: {config.MODEL_NAME}")
print(f"PDFs loaded: {len(pdf_files)}")
print(f"PDFs Directory: {os.path.abspath(config.PDF_DIRECTORY)}")

QA configured correctly
Configuration complete
Model: llama3.2
PDFs loaded: 93
PDFs Directory: C:\Users\Gerardo\Documents\LLM\RAG\papers


  llm = Ollama(


In [8]:
questions = [
    "Why are microresonators useful?",
    "Quantum memories have been implemented?",
    "What is counter propagating SFWM?"
]

In [9]:
response = assistant.ask_question(questions[1], verbose=True)

Question: Quantum memories have been implemented?
Answer: Yes, quantum memories have been implemented. For example, experimental demonstrations of memory-enhanced quantum communication were performed in Nature (2020) and other publications. Additionally, a review based on the European integrated project "Qubit Applications" discussed the implementation of quantum memories.

Sources consulted:
1. Room temperature caesium quantum memory for quantum information applications.pdf - Page 11
2. Quantum Information processing with integrated silicon carbide photonics.pdf - Page 27
3. Gigahertz-Bandwidth Optical Memory in Pr3 Y2SiO5.pdf - Page 6
4. Quantum memories.pdf - Page 0


In [10]:
def interactive_chat(assistant):
    print("Interactive mode - Research assistant")
    print("Type 'quit' to exit or 'reset' to start a new conversation")
    print("=" * 60)
    
    while True:
        question = input("\nQuestion: ").strip()
        
        if question.lower() == 'quit':
            break
        if question.lower() == 'reset':
            print("Conversation reset")
            continue
        if not question:
            continue
        
        try:
            response = assistant.ask_question(question, verbose=True)
        except Exception as e:
            print(f"Error: {str(e)}")

In [11]:
interactive_chat(assistant)

Interactive mode - Research assistant
Type 'quit' to exit or 'reset' to start a new conversation



Question:  Why are microresonators useful?


Question: Why are microresonators useful?
Answer: Microresonators are useful because they greatly enhance light-matter interaction by spatially or temporally confining and enhancing radiation by several orders of magnitude. This is particularly true for highly nonlinear materials, such as silicon or III-V compounds, where cavities offer extreme enhancements in efficiency that can result in parametric fluorescence with pump powers on the order of microwatts only.

Sources consulted:
1. 2022 Roadmap on integrated quantum photonics.pdf - Page 48
2. Integrated sources of photon quantum states based on nonlinear optics.pdf - Page 6
3. Dispersion engineering and measurement of whispering gallery mode microresonator for Kerr frequency comb generation.pdf - Page 5
4. Photon pair generation from compact silicon microring resonators using microwatt-level pump powers.pdf - Page 13



Question:  What is counter propagating SFWM?


Question: What is counter propagating SFWM?
Answer: Counter-propagating SFWM (Second-Order Frequency Conversion) refers to a type of Superfluorescent White Light Emission (SFWM) process where two pump photons are launched from opposite ends of the nonlinear medium, such as a fiber or waveguide. In this process, one daughter photon (signal) is emitted in a direction that backpropagates with respect to one of the pumps, while the conjugate idler photon backpropagates with respect to the other pump.

Sources consulted:
1. Fiber-based photon-pair generation tutorial.pdf - Page 8
2. Counter-propagating spontaneous four wave mixing photon-pair factorability and ultra-narrowband single photons.pdf - Page 2
3. Counter-propagating spontaneous four wave mixing photon-pair factorability and ultra-narrowband single photons.pdf - Page 2
4. Counter-propagating spontaneous four wave mixing photon-pair factorability and ultra-narrowband single photons.pdf - Page 6



Question:  quit
