In [3]:
import os
from dotenv import load_dotenv
# Load settings from a local .env file into the process environment.
# Kept as the last expression of the cell so its return value is displayed.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# embedding/chat clients created in later cells -- confirm.
load_dotenv()

True

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langsmith import Client
from langchain_core.prompts import ChatPromptTemplate


In [5]:
from langchain_qdrant import QdrantVectorStore
from langchain_core.documents import Document
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
import json

# Load the scraped faculty records.
# NOTE(review): assumes output.json is a list of flat {heading: text} dicts,
# one per professor, with string values -- confirm against the scraper.
with open('output.json', 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Ordered (canonical key, substrings) rules that fold the many scraped heading
# variants into a small set of fields. The first matching rule wins, so the
# order here is significant.
_KEY_RULES = [
    ("research", ("research",)),
    ("publications", ("publications", "books", "project", "patents")),
    ("courses", ("courses", "lectures")),
    ("teaching", ("teaching", "students", "classroom")),
    ("experience", ("experience",)),
    ("website", ("website",)),
    ("news", ("news",)),
]


def _canonical_key(raw_key):
    """Map a scraped heading to its canonical field name."""
    k = raw_key.lower().strip()
    for target_key, needles in _KEY_RULES:
        if any(needle in k for needle in needles):
            return target_key
    # "position(s)" must match exactly; anything else keeps its own
    # lower-cased, stripped name.
    if k in ("positions", "position"):
        return "position"
    return k


def _clean_record(item):
    """Return a copy of one record with canonical keys and merged values."""
    cleaned = {}
    for key, value in item.items():
        if not value:
            continue
        target_key = _canonical_key(key)
        if target_key not in cleaned:
            cleaned[target_key] = value
        elif value not in cleaned[target_key]:
            # Several headings can map to the same field: append new text on
            # its own line and skip text that is already contained.
            cleaned[target_key] += "\n" + value
    return cleaned


def _build_search_text(cleaned):
    """Assemble the text to embed from the searchable fields of a record.

    A section is kept whenever its content is non-empty. The previous
    `len(part) > 25` test measured label + content, so short but valid
    content (e.g. "Research & Expertise: NLP") was silently dropped and the
    effective threshold differed per label.
    """
    # .strip() removes the stray separator space when only one of the two
    # fields is present.
    courses_and_teaching = (
        f"{cleaned.get('courses', '')} {cleaned.get('teaching', '')}".strip()
    )
    sections = [
        ("Research & Expertise", cleaned.get('research', '')),
        ("Publications & Works", cleaned.get('publications', '')),
        ("Biography", cleaned.get('biography', '')),
        ("Experience", cleaned.get('experience', '')),
        ("Courses & Teaching", courses_and_teaching),
    ]
    return "\n".join(
        f"{label}: {content}"
        for label, content in sections
        if str(content).strip()
    )


# Build one Document per professor: searchable text as page_content,
# identifying fields as metadata.
documents = []
for item in raw_data:
    cleaned_item = _clean_record(item)
    full_search_text = _build_search_text(cleaned_item)
    if not full_search_text:
        # Nothing searchable for this record; do not index it.
        continue

    # Missing fields are stored as None.
    metadata = {
        "name": cleaned_item.get("name"),
        "position": cleaned_item.get("position"),
        "department": cleaned_item.get("departments"),
        "contact": cleaned_item.get("contact"),
        "website": cleaned_item.get("website")
    }
    documents.append(Document(page_content=full_search_text, metadata=metadata))

In [6]:
# Token-based chunking: sizes below are measured in tiktoken tokens.
splitter_kwargs = {"chunk_size": 1000, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_kwargs)

# Break every professor document into chunks of at most `chunk_size` tokens.
splits = text_splitter.split_documents(documents)

In [7]:
embeddings = OpenAIEmbeddings()

# Index the token-bounded chunks (`splits`) rather than the whole-profile
# `documents`: previously the splitter output was computed but never used.
# format_docs reads the professor metadata from each retrieved item.
# NOTE(review): relies on split_documents carrying the parent document's
# metadata onto each chunk -- confirm.
#
# location=":memory:" keeps the collection in RAM: nothing is written to disk,
# which sidesteps the on-disk file lock. For a persistent store, replace it
# with path="./qdrant_db".
vector_db = QdrantVectorStore.from_documents(
    splits,
    embeddings,
    location=":memory:",
    collection_name="nu_professors",
)

In [None]:
# free version
# from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_qdrant import QdrantVectorStore

# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# vector_db = QdrantVectorStore.from_documents(
#     documents,
#     embeddings,
#     location=":memory:", 
#     collection_name="nu_professors",
# )

In [8]:
# Retriever returning the 5 nearest entries per query, plus a deterministic
# (temperature 0) chat model shared by query rewriting and answering.
retriever = vector_db.as_retriever(search_kwargs=dict(k=5))
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

In [15]:
# Multi-query rewriting: ask the LLM for several phrasings of the user's
# question so retrieval is less sensitive to the exact wording.
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_queries = ChatPromptTemplate.from_template(template)


def _split_queries(text):
    """Split the LLM reply into one query per line, dropping blank lines.

    Models often separate the alternatives with empty lines or end with a
    trailing newline; a plain split('\\n') would turn those into empty-string
    queries that are then sent to the retriever.
    """
    return [line.strip() for line in text.split('\n') if line.strip()]


generated_queries = prompt_queries | llm | StrOutputParser() | _split_queries

In [17]:
from langchain_core.load import dumps, loads

def get_unique_union(documents: list[list]):
    """Merge per-query retrieval results into one duplicate-free list.

    Args:
        documents: One list of retrieved documents per generated query.

    Returns:
        The distinct documents in order of first appearance, so the result
        (and therefore the prompt context built from it) is the same on
        every run.
    """
    # Documents are compared through their serialized form, so flatten the
    # list of lists and serialize each entry.
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while keeping first-seen order. The previous
    # set() discarded the order, which varies between processes because
    # string hashing is randomized.
    unique_serialized = dict.fromkeys(serialized)
    return [loads(doc) for doc in unique_serialized]

# question -> alternative queries -> one retrieval per query -> merged,
# duplicate-free document list.
retrieval_chain = (
    generated_queries
    | retriever.map()
    | get_unique_union
)

In [None]:
def format_docs(docs):
    """Render retrieved documents as the context block of the answer prompt.

    Args:
        docs: Iterable of objects exposing a `metadata` dict and a
            `page_content` string.

    Returns:
        One text section per document, separated by blank lines. Metadata
        fields that are absent, None or empty are shown as "N/A".
    """
    def field(doc, key):
        # Metadata keys can be present with a None value, so a .get() default
        # alone is not enough: without this the literal text "None" ends up
        # in the prompt.
        value = doc.metadata.get(key)
        return value if value else "N/A"

    return "\n\n".join(
        f"Name: {field(d, 'name')}\n"
        f"Dept: {field(d, 'department')}\n"
        f"Contact: {field(d, 'contact')}\n"
        f"Position: {field(d, 'position')}\n"
        f"Website: {field(d, 'website')}\n"
        f"Related Content: {d.page_content}"
        for d in docs
    )

# System instructions for the answering model: stay within the retrieved
# faculty context and say so explicitly when nothing relevant is found.
system_content = """You are an expert academic research assistant specializing in the faculty of Northwestern University's McCormick School of Engineering.

Your goal is to help users find the most relevant professors based on the provided context. 

Guidelines:
1. **Identify Professors**: The context contains faculty names, departments, and research interests. Always mention the professor's name clearly.
2. **Handle Technical Terms**: If a user asks about a specific term like 'RAG', search the context for related fields like 'Natural Language Processing', 'Information Retrieval', or 'Artificial Intelligence' if a direct match isn't found.
3. **Be Specific**: Mention the department or specific lab if available.
4. **Admit Ignorance**: If the context does not contain information about a professor doing that specific research, explicitly state: "Based on the current database, no professor was found specifically researching [Topic]." Do not hallucinate or list irrelevant professors.
"""

# The human turn carries the formatted retrieval results and the question.
human_content = "Context:\n{context}\n\nQuestion: {question}"
prompt = ChatPromptTemplate.from_messages(
    [("system", system_content), ("human", human_content)]
)

# The incoming question is used twice: passed through the retrieval chain to
# build the context, and forwarded unchanged as the question itself.
chain_inputs = {
    "context": retrieval_chain | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()


In [20]:
import warnings
from langchain_core._api import LangChainBetaWarning

# Silence LangChain beta-API warnings so they do not clutter cell output.
warnings.filterwarnings(
    action="ignore",
    category=LangChainBetaWarning,
)

In [24]:
# Example query run through the full pipeline.
question = (
    "Who should I reach out to if I want to do research in distributed system? "
    "I want the professor to be friendly and patient to starter"
)
response = rag_chain.invoke(question)
print(response)

Based on the current database, the best professor to reach out to for research in distributed systems, who is known to be friendly and patient with starters, would be **Peter Dinda**. Professor Dinda specializes in experimental computer systems, particularly parallel and distributed systems. His research involves virtualization and operating systems for distributed and parallel computing, making him a suitable choice for your research interests. You can find more information about him on his website at [Peter Dinda's Homepage](http://pdinda.org/).
