In [1]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is what supplies the OpenAI credentials used by
# the embedding/chat clients created in later cells — confirm.
load_dotenv()

True

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import Literal, Optional


In [3]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import models
from process_documents.prepare_professors_info import prepare_professors_info
from process_documents.prepare_labs_info import prepare_labs_info
# Professors: crawled JSON file -> documents (later chunked by split_documents).
professors_info_filepath = "crawl_NU/professors_info.json"
professors_docs = prepare_professors_info(professors_info_filepath)

# Labs: same flow with its own source file and loader.
labs_info_filepath = "crawl_NU/labs_info.json"
labs_docs = prepare_labs_info(labs_info_filepath)

In [4]:
# Token-based splitters: chunk_size / chunk_overlap are counted in tiktoken tokens.
# Professor documents are cut into large chunks, lab documents into much smaller ones.
prof_text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=100)
lab_text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap=30)

prof_splits = prof_text_splitter.split_documents(professors_docs)
lab_splits = lab_text_splitter.split_documents(labs_docs)

# One combined corpus (professor chunks first, then lab chunks) for indexing.
all_documents = [*prof_splits, *lab_splits]

In [5]:
# Embedding client used to vectorize every chunk (no model is named, so the
# library default applies).
embeddings = OpenAIEmbeddings()
# Disabled alternative kept for reference: an on-disk collection (path=...) instead
# of an in-memory one. As written it references `documents`, which no visible cell
# defines, so it would need `all_documents` before being re-enabled.
# vector_db = QdrantVectorStore.from_documents(
#     documents,
#     embeddings,
#     path="./qdrant_db",  
#     collection_name="nu_professors",
# )

# Active store: an in-memory Qdrant collection holding professor and lab chunks.
# Nothing is persisted, so running this cell embeds and indexes all chunks again.
vector_db = QdrantVectorStore.from_documents(
    all_documents,
    embeddings,
    location=":memory:", 
    collection_name="nu_research", 
)

In [6]:
# define retriever and llm
# The retriever asks the vector store for the top 5 chunks per query.
retriever = vector_db.as_retriever(search_kwargs={"k": 5})
# One chat model instance, created with temperature=0, is reused by every chain in
# this notebook: query structuring, query rewriting, routing and final answering.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [7]:
# Query structuring for metadata filters
from typing import Literal, Optional, Tuple

class ProfandLabSearch(BaseModel):
    """Search over a database of professors and labs information."""

    # NOTE: this class is passed to `llm.with_structured_output`, so the class
    # docstring and every Field description below are part of the schema shown to
    # the model. They are prompt text, not just documentation: edit deliberately.

    # Free-text query used for the similarity search itself.
    query: str = Field(..., description="The optimized search query for semantic similarity search. Focus on research topics or technical terms.")

    # Optional metadata constraints. `query_logic` turns source_type, professor_name,
    # position, lab_name, lab_leader and department into filter conditions.
    source_type: Optional[Literal["professor", "lab"]] = Field(
        None,
        description="Filter results by source type. Use 'professor' for faculty and 'lab' for groups. Leave None for general queries."
    )
    professor_name: Optional[str] = Field(None, description="The specific name of a professor.")
    position: Optional[str] = Field(None, description="The academic position (e.g., Assistant Professor, Associate Professor).")

    lab_name: Optional[str] = Field(None, description="The formal name of the research laboratory.")
    lab_leader: Optional[str] = Field(None, description="The name of the faculty member heading the lab.")

    department: Optional[str] = Field(None, description="The department name (e.g., Computer Science, Electrical Engineering).")
    # Extracted by the model but not referenced by `query_logic` in this notebook.
    research_area: Optional[str] = Field(None, description="Broad research category (e.g., AI, Robotics, HCI).")

    def pretty_print(self) -> None:
        """Print every populated field as ``name: value``.

        Fields whose value is None or an empty string are skipped.
        """
        # `BaseModel.dict()` is deprecated in Pydantic v2 in favour of `model_dump()`.
        # Prefer the new method and fall back so the class still works on Pydantic v1.
        dump = self.model_dump if hasattr(self, "model_dump") else self.dict
        for field, value in dump().items():
            if value is not None and value != "":
                print(f"{field}: {value}")

In [8]:
# System prompt for query structuring: turn a user question into a database query.
system = """You are an expert at converting user questions into database queries. \
You have access to a database of Northwestern University professors and labs information. \
Given a question, return a database query optimized to retrieve the most relevant results.

If there are acronyms or words you are not familiar with, do not try to rephrase them."""
# Two-message chat prompt; the user's text fills the {question} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)
# Bind the ProfandLabSearch schema so the model's reply is parsed into that class.
structured_llm = llm.with_structured_output(ProfandLabSearch)
# question -> ProfandLabSearch instance (consumed by query_logic via retrieval_chain).
query_analyzer = prompt | structured_llm

In [9]:
# query translation
# Multi-query rewriting: ask the model for several phrasings of the question so
# retrieval is less sensitive to the exact wording of a single query.
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_queries = ChatPromptTemplate.from_template(template)

# question -> list[str] of alternative queries.
# Models often put blank lines between alternatives or end with a trailing newline.
# A bare split('\n') would turn those into empty "queries" that are then sent to
# the retriever, so strip each line and drop the blank ones.
generated_queries = (
    prompt_queries
    | llm
    | StrOutputParser()
    | (lambda x: [line.strip() for line in x.split('\n') if line.strip()])
)

In [10]:
from langchain_core.load import dumps, loads

def get_unique_union(documents: list[list]):
    """Return the de-duplicated union of several retrieval result lists.

    Args:
        documents: One list of retrieved documents per query.

    Returns:
        A flat list containing each distinct document once, in first-seen order
        (result lists in the order given, hits in the order they appear). Two
        documents are considered the same when their `dumps` serialization is
        equal. The returned objects are rebuilt from that serialization via `loads`.
    """
    # Serialize each document so it can be compared and hashed.
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while preserving insertion order. A set would
    # hand the documents back in an arbitrary order that can differ between runs,
    # which makes the context given to the LLM non-reproducible.
    return [loads(doc) for doc in dict.fromkeys(serialized)]

# retrieval_chain = generated_queries | retriever.map() | get_unique_union

In [15]:
def _build_metadata_filter(structured_obj):
    """Translate the populated fields of a structured query into a Qdrant filter.

    Returns:
        A `models.Filter` whose conditions must all hold, or None when the query
        carries no filterable field.
    """
    def on_metadata(field, match):
        # QdrantVectorStore stores Document.metadata nested under its metadata
        # payload key ("metadata" by default, and vector_db does not override it),
        # so a condition has to address "metadata.<field>" to match anything.
        return models.FieldCondition(key=f"metadata.{field}", match=match)

    conditions = []
    if structured_obj.source_type:
        conditions.append(on_metadata("source_type", models.MatchValue(value=structured_obj.source_type)))

    # Professor and lab names are both looked up under the same "name" key.
    # Requiring both values at once could never match, so accept either one.
    names = [name for name in (structured_obj.professor_name, structured_obj.lab_name) if name]
    if names:
        conditions.append(on_metadata("name", models.MatchAny(any=names)))

    # NOTE(review): the metadata field names below are taken from the original
    # filter keys; the loaders that write them are not visible here — confirm.
    exact_fields = (
        ("position", structured_obj.position),
        ("leader", structured_obj.lab_leader),
        ("department", structured_obj.department),
    )
    for field, value in exact_fields:
        if value:
            conditions.append(on_metadata(field, models.MatchValue(value=value)))

    return models.Filter(must=conditions) if conditions else None


def query_logic(structured_obj):
    """Retrieve documents for a structured query using several query rewrites.

    The semantic query is expanded into alternative phrasings by
    `generated_queries`, each phrasing is run against the vector store with the
    metadata filter derived from `structured_obj`, and the hits are merged with
    `get_unique_union`.

    Args:
        structured_obj: A ProfandLabSearch-like object exposing `query` and the
            optional filter fields.

    Returns:
        The de-duplicated list of retrieved documents.
    """
    query = structured_obj.query
    rewrites = generated_queries.invoke({"question": query})
    # Drop blank rewrites; if nothing usable came back, search with the query itself.
    queries = [q.strip() for q in rewrites if q and q.strip()] or [query]

    q_filter = _build_metadata_filter(structured_obj)

    # The filter must be part of the retriever's search_kwargs. Passing it through
    # config={"configurable": ...} has no effect because `retriever` was not
    # created with configurable fields, so such a filter is silently ignored.
    active_retriever = retriever
    if q_filter is not None:
        active_retriever = retriever.vectorstore.as_retriever(
            search_type=retriever.search_type,
            search_kwargs={**retriever.search_kwargs, "filter": q_filter},
        )

    all_docs = [active_retriever.invoke(q) for q in queries]

    # NOTE(review): the filters are exact matches on values extracted by the LLM,
    # which may not equal the stored spelling. When the filtered search finds
    # nothing at all, fall back to the unfiltered search rather than returning no
    # context — confirm this best-effort behaviour is wanted.
    if q_filter is not None and not any(all_docs):
        all_docs = [retriever.invoke(q) for q in queries]

    return get_unique_union(all_docs)

In [16]:
# End-to-end retrieval: question -> structured query (query_analyzer)
# -> merged, de-duplicated documents (query_logic).
retrieval_chain = query_analyzer | RunnableLambda(query_logic) 

In [26]:
def format_docs(docs):
    """Render retrieved documents as the plain-text context block for the prompt.

    Each document becomes six labelled lines (name, department, contact,
    position, website, content); documents are separated by one blank line.
    Missing name/department/contact render as the text ``None``, while missing
    position/website render as an empty value.
    """
    rendered = []
    for doc in docs:
        meta = doc.metadata
        lines = [
            f"Name: {meta.get('name')}",
            f"Dept: {meta.get('department')}",
            f"Contact: {meta.get('contact')}",
            f"Position: {meta.get('position', '')}",
            f"Website: {meta.get('website', '')}",
            f"Related Content: {doc.page_content}",
        ]
        rendered.append("\n".join(lines))
    return "\n\n".join(rendered)

# Structured-output schema for the router: the model must answer with exactly one
# of the two `target` literals.
# NOTE: this class is passed to `llm.with_structured_output`, so its docstring and
# the Field description are part of the schema shown to the model (prompt text).
class RouteQuery(BaseModel):
    """Route a user query to the most relevant prompt based on their intent."""
    
    # `choose_prompt` maps this value to one of the two system prompts.
    target: Literal["academic_search", "general_research"] = Field(
        ...,
        description="""Choose the destination based on the user's intent:
        - 'academic_search': Select this if the query relates to Northwestern University (NU) specifically. 
          This includes searching for specific professors, identifying research labs, inquiring about 
          departmental faculty, or finding which groups at NU work on a particular technology.
        - 'general_research': Select this if the query is a general scientific or technical question 
          that does NOT require Northwestern-specific data. This includes explaining terminologies 
          (e.g., 'What is Cross-Entropy?'), helping with general research methodology, or 
          conceptual explanations that apply universally regardless of the institution.
        """
    )
# Router model: replies are parsed into a RouteQuery instance.
# NOTE: this rebinds the module-level names `structured_llm` and (below) `system`,
# which an earlier cell assigned for query structuring. `query_analyzer` was built
# from the earlier objects, so it is unaffected as long as cells run top to bottom.
structured_llm = llm.with_structured_output(RouteQuery)

# System prompt describing the two routing targets and the tie-break rule.
system = """You are an expert at routing a user's question to the most relevant pipeline.
1. 'academic_search': For questions about Northwestern University's McCormick School of Engineering, 
   including specific professors, labs, departments, or finding local experts in a field.
2. 'general_research': For general scientific knowledge, terminology explanations, or 
   research concepts that are independent of any specific institution.

If the user mentions a name or a lab that sounds like it belongs to an institution, default to 'academic_search'."""
# Two-message chat prompt; the user's text fills the {question} placeholder.
router_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

# System prompt returned by `choose_prompt` when the router picks 'academic_search':
# answer strictly from the retrieved context about professors and labs.
system_content_choice_1 = """You are an expert academic research assistant specializing in the Northwestern University McCormick School of Engineering faculty and research laboratories.

Your goal is to provide comprehensive information about professors and their associated labs based on the provided context.

### Guidelines:
1. **Entity Linking**: Always identify the connection between professors and labs. If the context mentions a professor leads a specific lab (e.g., 'AquaLab'), ensure both are mentioned together.
2. **Contextual Retrieval**:
    - **For Professor Queries**: Provide their name, department, position, and the primary research areas they focus on.
    - **For Lab Queries**: Provide the lab name, the faculty director (Professor), and the specific projects or technologies the lab is developing.
3. **Technical Mapping**: If a user's technical term (e.g., 'LLM') doesn't have a literal match, bridge the gap by referring to broader context terms like 'Natural Language Processing', 'Machine Learning', or 'Artificial Intelligence'.
4. **Structured Response**:
    - Mention the **Department** and **Contact/Website** info if available in the context.
    - If multiple professors/labs are relevant, list them clearly with brief descriptions of their distinct focus.
5. **Strict Grounding & Integrity**: 
    - Only answer based on the **Context** provided. 
    - If the context lacks a specific professor or lab for a topic, state: "Based on the current Northwestern database, I couldn't find a specific professor or lab researching [Topic]." 
    - Do not hallucinate names or affiliations not present in the context.
"""

# System prompt returned by `choose_prompt` for every other routing result
# (i.e. 'general_research'): a general-purpose research explainer.
system_content_choice_2 = """ You are a professional research scientist. 
Your expertise is in general concepts. If the user mentions specific universities, professors, or faculty, do not try to search for local data; instead, explain the scientific concepts behind their inquiry
You are great at answering general research-related question, explain terminologies and concepts, and help plan experiments
You will answer all questions in a concise and easy to understand manner, explain in detail if the user asks.
When you don't know the answer to a question you admit that you don't know.
"""

def choose_prompt(result):
    """Map a routing decision to the system prompt text for that pipeline.

    Args:
        result: Object with a string `target` attribute (a RouteQuery).

    Returns:
        `system_content_choice_1` when the target contains "academic_search"
        (case-insensitive), otherwise `system_content_choice_2`.
    """
    wants_academic_search = "academic_search" in result.target.lower()
    return system_content_choice_1 if wants_academic_search else system_content_choice_2

# Router: question -> RouteQuery -> system prompt text for the chosen pipeline.
selected_prompt = router_prompt | structured_llm | RunnableLambda(choose_prompt)


# Final answering prompt. NOTE: this rebinds the module-level name `prompt` that an
# earlier cell assigned for query structuring; `query_analyzer` already holds that
# earlier prompt, so it is unaffected as long as cells run top to bottom.
prompt = ChatPromptTemplate.from_messages([
    ("system", "{system_message}"),
    ("human", "Context:\n{context}\n\nQuestion: {question}")
])


def _extract_question(chain_input):
    """Return the question text from the chain input.

    The chain is invoked with ``{"question": "..."}``. Passing that dict straight
    through would render the prompt as ``Question: {'question': '...'}``, so pull
    out the text. A plain string input is returned unchanged.
    """
    if isinstance(chain_input, dict):
        return chain_input["question"]
    return chain_input


# Every branch of the mapping receives the same chain input; their results fill
# the three placeholders of `prompt`.
rag_chain = (
    {
        "system_message": selected_prompt,
        "context": retrieval_chain | format_docs,
        "question": RunnableLambda(_extract_question),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [18]:
import warnings
from langchain_core._api import LangChainBetaWarning

# Suppress LangChain beta-API warnings from this point on.
# NOTE(review): presumably aimed at the `loads` helper used in get_unique_union —
# confirm. The filter only affects warnings raised after this cell has run.
warnings.filterwarnings("ignore", category=LangChainBetaWarning)

In [27]:
# Stream the answer and print each chunk as it arrives; end="" keeps the chunks
# on one continuous line and flush=True shows them immediately.
for chunk in rag_chain.stream({"question": "What labs are related to Large Language Models?"}):
    print(chunk, end="", flush=True)

Several labs focus on research related to Large Language Models (LLMs) and their applications. Here are a few notable ones based on the context provided:

1. **Machine Learning and Language Lab**: This lab develops intelligent language models that integrate with various domains, such as vision and robotics. Their work likely involves LLMs as they explore how these models can reason, plan, and interact with the physical world.

2. **Language and Computation Lab** (led by Klinton Bicknell): This lab investigates how the human brain processes language and may utilize LLMs to build computational models of language behaviors. Their research could involve analyzing how LLMs can mimic or enhance human language processing.

3. **Research by Yiping Lu**: While not a lab per se, Yiping Lu's research on scaling laws in machine learning is highly relevant to LLMs. His work focuses on understanding how performance improves as resources are scaled, which is a critical aspect of developing and optimi