In [1]:
from dotenv import load_dotenv
load_dotenv()


True

In [5]:
import os
test = os.getenv("NEO4J_URI")
print(test)

neo4j+s://39ad3c3f.databases.neo4j.io


In [6]:
!python chatbot_api/src/generate_vector_store.py

Vector store created successfully.


In [7]:
# test_vector_retrieval.py (Run this locally)
import os
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

embed = OpenAIEmbeddings()
vector_store = Chroma(persist_directory="./vector_store", embedding_function=embed)

query = "Explain the methodology of the paper related to machine learning."
results = vector_store.similarity_search_with_score(query, k=5)

print("Raw retrieval results:")
for doc, score in results:
    print(f"SCORE: {score}, METADATA: {doc.metadata}, CONTENT: {doc.page_content[:100]}...")

# Check if you're applying threshold correctly
threshold = 0.5
filtered_docs = [d for d, s in results if s > threshold]  # If you're assuming higher=better
print("Filtered documents:", len(filtered_docs))


Raw retrieval results:
SCORE: 0.3464084565639496, METADATA: {'page': 0, 'source': 'chatbot_api/pdfs/12.pdf'}, CONTENT: of precision and analytical depth19–25. The majority of machine
learning-based on cell classificatio...
SCORE: 0.35743772983551025, METADATA: {'page': 8, 'source': 'chatbot_api/pdfs/12.pdf'}, CONTENT: in comparison to standalone implementations of the CNN or NN
models. This improvement underscores th...
SCORE: 0.36846837401390076, METADATA: {'page': 4, 'source': 'chatbot_api/pdfs/12.pdf'}, CONTENT: of several classification models, specifically Logistic Regression
(LR), Support Vector Machine (SVM...
SCORE: 0.3690042495727539, METADATA: {'page': 12, 'source': 'chatbot_api/pdfs/9.pdf'}, CONTENT: 33. Kourou K, Exarchos TP , Exarchos KP , Karamouzis MV, Fotiadis DI. Machine 
learning applications...
SCORE: 0.3694433867931366, METADATA: {'page': 7, 'source': 'chatbot_api/pdfs/12.pdf'}, CONTENT: plication in machine learning where models tailor themselves too
closely to the

## Debug the RetrievalQA Chain

In [9]:
# test_papers_qa_chain.py (Run this locally)
import os
from langchain.chains import RetrievalQA
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

model_name = os.getenv("AGENT_MODEL", "gpt-3.5-turbo")
chat_model = ChatOpenAI(model=model_name, temperature=0)

embeddings = OpenAIEmbeddings()
vector_store = Chroma(persist_directory="./vector_store", embedding_function=embeddings)

papers_qa_chain = RetrievalQA.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    return_source_documents=True
)

query = "What is the main contribution of the paper titled 'Multiplex Image Machine Learning'?"
result = papers_qa_chain({"query": query})

print("Papers QA chain result:")
print("Answer:", result.get('result', 'No result'))
print("Source documents:", result.get('source_documents', []))


  result = papers_qa_chain({"query": query})


Papers QA chain result:
Answer: The main contribution of the paper titled 'Multiplex Image Machine Learning' is the development of a novel machine learning architecture called the Multiplex Image Machine Learning (MIML) Architecture. This architecture enhances cell classification by integrating label-free cell images with biomechanical property data, allowing for a more comprehensive understanding of cellular properties. The MIML model achieves a remarkable accuracy of 98.3% in cell classification, addressing limitations in specificity and speed found in existing techniques. This advancement has the potential to transform cell biology and biomedical imaging by enabling less invasive studies and new breakthroughs in cell classification.
Source documents: [Document(metadata={'page': 7, 'source': 'chatbot_api/pdfs/12.pdf'}, page_content='2.4 Multiplex Image machine learning for cell detection\nTo enhance the accuracy of cell classification, we have developed\na novel architectural model, 

## Check Metadata Integration

In [None]:
# In generate_vector_store.py, after you fetch metadata:

async def fetch_metadata(self):
    with self.driver.session(database="neo4j") as session:
        query = """
        MATCH (p:Paper)
        OPTIONAL MATCH (p)-[:UTILIZES]->(s:Skill)
        WITH p, collect(s.skill) AS skills
        RETURN p.id AS id, properties(p) AS properties, skills
        """
        result = session.run(query)
        id_to_metadata = {}
        for record in result:
            properties = record["properties"]
            skills_str = ', '.join(record["skills"]) if record["skills"] else "No skills listed"
            properties["skills"] = skills_str
            # IMPORTANT: Actually store the metadata in the dictionary
            paper_id = record["id"]
            id_to_metadata[paper_id] = properties

        print("Fetched Metadata:", id_to_metadata)  # Debug print
        return id_to_metadata


In [None]:
# After splitting text_chunks
for doc in text_chunks:
    source_path = doc.metadata.get('source', '')
    filename = os.path.basename(source_path)
    file_id, _ = os.path.splitext(filename)
    json_meta = id_to_metadata.get(file_id, {})
    doc.metadata.update(json_meta)

print("Sample doc metadata after update:", text_chunks[0].metadata if text_chunks else "No documents")
