In [1]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably the OpenAI API key needed by the cells below — confirm).
load_dotenv()

True

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langsmith import Client
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import Literal


In [3]:
from langchain_qdrant import QdrantVectorStore
from langchain_core.documents import Document
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
import json

# Ordered (keywords, canonical_field) rules used to collapse the inconsistently
# named source keys. The first rule with a keyword contained in the key wins.
_KEY_RULES = [
    (("research",), "research"),
    (("publications", "books", "project", "patents"), "publications"),
    (("courses", "lectures"), "courses"),
    (("teaching", "students", "classroom"), "teaching"),
    (("experience",), "experience"),
    (("website",), "website"),
    (("news",), "news"),
]

# (label, canonical fields) for each section of the searchable text, in output order.
_SEARCH_SECTIONS = [
    ("Research & Expertise", ("research",)),
    ("Publications & Works", ("publications",)),
    ("Biography", ("biography",)),
    ("Experience", ("experience",)),
    ("Courses & Teaching", ("courses", "teaching")),
]


def _canonical_key(key):
    """Map a raw key from the JSON file to its canonical field name.

    Keys are lower-cased and stripped first. Keys matching no rule are
    returned in that normalized form.
    """
    k = key.lower().strip()
    for keywords, target_key in _KEY_RULES:
        if any(keyword in k for keyword in keywords):
            return target_key
    # "position" is matched exactly rather than by substring.
    if k in ("positions", "position"):
        return "position"
    return k


def _clean_record(item):
    """Return a copy of one raw record with canonical keys and no empty values.

    When several raw keys map to the same canonical key their values are
    joined with newlines; a value already contained in the accumulated text
    is skipped so repeated content is not stored twice.
    """
    cleaned_item = {}
    for key, value in item.items():
        if not value:
            continue
        target_key = _canonical_key(key)
        if target_key not in cleaned_item:
            cleaned_item[target_key] = value
        elif value not in cleaned_item[target_key]:
            cleaned_item[target_key] += "\n" + value
    return cleaned_item


def _build_search_text(cleaned_item):
    """Assemble the text to embed from the descriptive fields of a record.

    A section is included whenever it has any content. The decision is made on
    the content itself rather than on the length of label + content, so a short
    entry is not dropped just because its label happens to be short.
    """
    parts = []
    for label, fields in _SEARCH_SECTIONS:
        body = " ".join(f"{cleaned_item.get(field, '')}" for field in fields)
        if body.strip():
            parts.append(f"{label}: {body}")
    return "\n".join(parts)


with open('output.json', 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# One Document per record: descriptive fields become the searchable text,
# identifying fields travel along as metadata.
documents = []
for item in raw_data:
    cleaned_item = _clean_record(item)
    full_search_text = _build_search_text(cleaned_item)
    if not full_search_text:
        # Nothing descriptive to embed for this record.
        continue

    metadata = {
        "name": cleaned_item.get("name"),
        "position": cleaned_item.get("position"),
        "department": cleaned_item.get("departments"),
        "contact": cleaned_item.get("contact"),
        "website": cleaned_item.get("website")
    }
    documents.append(Document(page_content=full_search_text, metadata=metadata))

In [4]:
# Chunk limits are counted in tiktoken tokens: chunk_size=1000 with a
# chunk_overlap of 100 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=100)
splits = text_splitter.split_documents(documents)

In [5]:
embeddings = OpenAIEmbeddings()

# Persistent on-disk alternative — use instead of the in-memory store below to
# keep the index between kernel restarts:
# vector_db = QdrantVectorStore.from_documents(
#     splits,
#     embeddings,
#     path="./qdrant_db",
#     collection_name="nu_professors",
# )

# Index the token-bounded chunks produced by the text splitter (`splits`), not
# the unsplit `documents`; otherwise the splitting step has no effect and long
# records are embedded as a single vector.
vector_db = QdrantVectorStore.from_documents(
    splits,
    embeddings,
    location=":memory:",
    collection_name="nu_professors",
)

In [None]:
# Free alternative: local HuggingFace embeddings instead of OpenAI (uncomment to use)
# from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_qdrant import QdrantVectorStore

# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# vector_db = QdrantVectorStore.from_documents(
#     documents,
#     embeddings,
#     location=":memory:", 
#     collection_name="nu_professors",
# )

In [10]:
# Chat model (temperature 0) shared by query rewriting, routing and answering,
# plus a retriever configured with k=5 for each query.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
retriever = vector_db.as_retriever(search_kwargs={"k": 5})

In [7]:
# Query translation: ask the LLM for several rephrasings of the user question
# so retrieval is less sensitive to the exact wording.
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_queries = ChatPromptTemplate.from_template(template)


def _parse_queries(text):
    """Split the model's answer into one query per line.

    Blank lines are dropped: the model may separate alternatives with empty
    lines, and each empty string would otherwise be sent to the retriever as
    a query of its own.
    """
    return [line.strip() for line in text.split('\n') if line.strip()]


generated_queries = prompt_queries | llm | StrOutputParser() | _parse_queries

In [8]:
from langchain_core.load import dumps, loads

def get_unique_union(documents: list[list]):
    """Return the de-duplicated union of several retrieval result lists.

    Args:
        documents: One list of retrieved documents per generated query.

    Returns:
        A flat list containing each distinct document once, in the order in
        which it was first encountered.
    """
    # Key every document by its serialized form so the same document returned
    # for different queries collapses into one entry. A dict keeps insertion
    # order, whereas a set of strings iterates in an order that changes between
    # interpreter runs, which would shuffle the context passed to the LLM.
    unique_docs = {}
    for sublist in documents:
        for doc in sublist:
            # Keep the first occurrence; the original object is returned as-is,
            # so no deserialization round trip is needed.
            unique_docs.setdefault(dumps(doc), doc)
    return list(unique_docs.values())

# Multi-query retrieval: rewrite the question, run the retriever once per
# rewritten query, then merge the per-query results into one list.
retrieval_chain = (
    generated_queries
    | retriever.map()
    | get_unique_union
)

In [11]:
def format_docs(docs):
    """Render retrieved documents as the context string for the answer prompt.

    Each document becomes a block of labelled lines (metadata fields followed
    by its text); blocks are separated by one blank line.
    """
    rendered = []
    for doc in docs:
        meta = doc.metadata
        lines = [
            f"Name: {meta.get('name')}",
            f"Dept: {meta.get('department')}",
            f"Contact: {meta.get('contact')}",
            f"Position: {meta.get('position', '')}",
            f"Website: {meta.get('website', '')}",
            f"Related Content: {doc.page_content}",
        ]
        rendered.append("\n".join(lines))
    return "\n\n".join(rendered)

# Schema handed to `llm.with_structured_output` below: it constrains the
# routing decision to exactly one of two labels.
# NOTE(review): the class docstring and the field description are presumably
# forwarded to the model as part of the schema, so treat both as prompt text —
# confirm before rewording them.
class RouteQuery(BaseModel):
    """Route a user query to the most relevant prompt based on their intent."""
    # The chosen route; `choose_prompt` maps it to a system prompt.
    target: Literal["academic_search", "general_research"] = Field(
        ...,
        description="""Choose the destination for the user's query:
        - 'academic_search': Select this if the user is asking about specific professors at Northwestern University, their labs, departments, or seeking specific faculty members.
        - 'general_research': Select this if the user is asking general scientific questions, explaining terminologies (like what is LLM), or needs help with research concepts without referring to a specific person.
        """
    )

# Instructions for the routing call; the user's question is supplied as the
# human message.
system = """ You are an expert at routing a user's question to the most relevant prompt based on their intent. 
Choose the prompt based on whether the user is asking about specific professors and their research in Northwestern University or general questions
"""
router_prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{question}")])

# Same chat model, configured to answer with a RouteQuery object.
structured_llm = llm.with_structured_output(RouteQuery)

# System prompt for the 'academic_search' route (faculty lookup over the retrieved context).
system_content_choice_1 = """You are an expert academic research assistant specializing in the faculty of Northwestern University's McCormick School of Engineering.

Your goal is to help users find the most relevant professors based on the provided context. 

Guidelines:
1. **Identify Professors**: The context contains faculty names, departments, and research interests. Always mention the professor's name clearly.
2. **Handle Technical Terms**: If a user asks about a specific term like 'RAG', search the context for related fields like 'Natural Language Processing', 'Information Retrieval', or 'Artificial Intelligence' if a direct match isn't found.
3. **Be Specific**: Mention the department or specific lab if available.
4. **Admit Ignorance**: If the context does not contain information about a professor doing that specific research, explicitly state: "Based on the current database, no professor was found specifically researching [Topic]." Do not hallucinate or list irrelevant professors.
"""

# System prompt for every other route (general research questions).
system_content_choice_2 = """ You are a professional research scientist. 
Your expertise is in general concepts. If the user mentions specific universities, professors, or faculty, do not try to search for local data; instead, explain the scientific concepts behind their inquiry
You are great at answering general research-related question, explain terminologies and concepts, and help plan experiments
You will answer all questions in a concise and easy to understand manner, explain in detail if the user asks.
When you don't know the answer to a question you admit that you don't know.
"""

def choose_prompt(result):
    """Pick the system prompt that matches a routing decision.

    Args:
        result: Object exposing a string ``target`` attribute (the router's
            structured output).

    Returns:
        The faculty-search prompt when ``target`` contains "academic_search"
        (case-insensitive); the general-research prompt otherwise.
    """
    wants_faculty_lookup = "academic_search" in result.target.lower()
    return system_content_choice_1 if wants_faculty_lookup else system_content_choice_2

def _extract_question(chain_input):
    """Return the question text for either supported input shape.

    Callers may pass the question as a bare string or wrapped in a
    ``{"question": ...}`` mapping. Forwarding the mapping unchanged would make
    the final prompt render the dict's repr instead of the question itself.
    """
    if isinstance(chain_input, dict):
        return chain_input["question"]
    return chain_input


# Router: classify the question, then map the decision to a system prompt.
selected_prompt = router_prompt | structured_llm | RunnableLambda(choose_prompt)

# Final answer prompt: routed system message + retrieved context + question.
prompt = ChatPromptTemplate.from_messages([
    ("system", "{system_message}"),
    ("human", "Context:\n{context}\n\nQuestion: {question}")
])

# The three branches all receive the same chain input; their outputs fill the
# placeholders of `prompt`.
rag_chain = (
    {
        "system_message": selected_prompt,
        "context": retrieval_chain | format_docs,
        "question": RunnableLambda(_extract_question),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [12]:
import warnings
from langchain_core._api import LangChainBetaWarning

# Hide LangChain's beta-API warnings so they do not clutter the cell outputs below.
warnings.filterwarnings("ignore", category=LangChainBetaWarning)

In [None]:
# Stream the answer piece by piece. The question is passed as a plain string,
# the same way the `invoke` call in the next cell does it: wrapping it in a
# dict makes the pass-through "question" branch forward the whole dict, so the
# final prompt would show the dict's repr instead of the question text.
for chunk in rag_chain.stream("Who is doing research on CS + Biology?"):
    print(chunk, end="", flush=True)

Based on the current database, no professor was found specifically researching under the name "Lizhen Shi." However, there is a Professor Naichen Shi, who leads the Integrative Artificial Intelligence Lab, focusing on advancing integrative and generative AI methods for aligning heterogeneous data and knowledge across complex engineering systems, particularly in advanced manufacturing. If you meant Naichen Shi, please let me know!

In [15]:
# Non-streaming call with a general-knowledge question (expected to be routed
# to the general research prompt rather than the faculty search — the router
# decides at run time).
response = rag_chain.invoke("What is RAG in NLP?")
print(response)

RAG, or Retrieval-Augmented Generation, is a technique in natural language processing (NLP) that combines the strengths of retrieval-based and generation-based models. The primary goal of RAG is to enhance the quality and relevance of generated text by incorporating external knowledge from a retrieval system.

Here's how it works:

1. **Retrieval Component**: When a query or prompt is given, the system first retrieves relevant documents or pieces of information from a large corpus or database. This retrieval is typically done using techniques like vector similarity search or traditional keyword matching.

2. **Generation Component**: After retrieving the relevant information, a generative model (often based on transformer architectures like GPT or BERT) uses this information to produce a coherent and contextually relevant response. The generative model can leverage the retrieved content to provide more accurate, informative, and contextually appropriate answers.

3. **Integration**: Th