In [16]:
from dotenv import load_dotenv, find_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from pathlib import Path
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

load_dotenv(find_dotenv())

# Shared NLTK resources, built once and reused by every preprocess_fn call.
lemmatizer = WordNetLemmatizer()
stopword_corpus = set(stopwords.words("english"))


def preprocess_fn(text: str):
    """Turn raw text into a list of normalized tokens.

    The text is lower-cased, every character listed in ``string.punctuation``
    is dropped, the remainder is tokenized with ``word_tokenize``, tokens
    present in ``stopword_corpus`` are discarded and the survivors are
    lemmatized with ``lemmatizer``.
    """
    without_punctuation = text.lower().translate(
        str.maketrans("", "", string.punctuation)
    )
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(without_punctuation)
        if token not in stopword_corpus
    ]

# Markdown corpus location (absolute, machine-specific path).
docs_folder = Path(r"D:\HKU\Inno Wing RA\UBC Exchange\code\output\Objectifying_China\docs")

# Read every *.md file under the folder (recursively) as UTF-8 text.
loader = DirectoryLoader(
    str(docs_folder.absolute()),
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
docs = loader.load()

# BM25 retriever over the loaded documents, tokenized with preprocess_fn.
retriever = BM25Retriever.from_documents(docs, preprocess_func=preprocess_fn)

In [4]:
### Retrieval Grader
import os
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI
from pydantic import BaseModel, Field

# Data model
# Output schema passed to `llm.with_structured_output(...)` later in this cell.
# NOTE(review): the class docstring and the Field description presumably become
# part of the schema shown to the model, so treat them as prompt text — confirm
# before rewording either.
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    # Declared as a plain `str`; nothing in this block restricts it to 'yes'/'no'.
    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )


# LLM with function call
# Azure OpenAI chat model shared by every grader and chain in this notebook.
# All connection settings are read from environment variables; a missing
# required variable raises KeyError here rather than failing later on a request.
llm = AzureChatOpenAI(
    api_version=os.environ["AZURE_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    # The original key is misspelled ("DEPLOYEMENT"). Prefer the correctly
    # spelled variable when it is set, and fall back to the legacy spelling so
    # existing .env files keep working unchanged.
    azure_deployment=(
        os.environ.get("AZURE_OPENAI_DEPLOYMENT")
        or os.environ["AZURE_OPENAI_DEPLOYEMENT"]
    ),
)

# Grader variant of the model whose output is parsed into GradeDocuments.
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt
# System instructions for the relevance grader. The "\n" escapes inside the
# triple-quoted string add newlines on top of the literal line breaks.
system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
# Two-message template; expects the `document` and `question` input keys.
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

# Pipe the formatted prompt into the structured-output model.
retrieval_grader = grade_prompt | structured_llm_grader
# Smoke test. `question` and `docs` assigned here are reused by later cells.
question = "Introduce to me some vases"
# NOTE: rebinds `docs` (also assigned by the corpus-loading cell) to the retrieval result.
docs = retriever.invoke(question)
# Grades only the second retrieved document; this indexing fails if fewer than
# two results come back (assuming the retriever returns a list).
doc_txt = docs[1].page_content
print(retrieval_grader.invoke({"question": question, "document": doc_txt}))

binary_score='yes'


In [5]:
### Generate
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

# Prompt
# Obtained through langchain's `hub` module under the id "rlm/rag-prompt"
# (presumably a network fetch — confirm). The chain below supplies it with the
# `context` and `question` keys.
prompt = hub.pull("rlm/rag-prompt")

# Post-processing
def format_docs(docs):
    """Return the ``page_content`` of every item in ``docs`` as one string.

    Consecutive contents are separated by a blank line; an empty iterable
    yields an empty string.
    """
    contents = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(contents)

# Chain
# prompt -> chat model -> plain string.
rag_chain = prompt | llm | StrOutputParser()

# Run
# Pass the retrieved documents through format_docs so the prompt's context slot
# receives only the page text joined by blank lines. Handing over the raw list
# would interpolate its repr (metadata, escaped newlines) into the prompt and
# leave format_docs unused.
generation = rag_chain.invoke({"context": format_docs(docs), "question": question})
print(generation)

Here are examples of vases:

1. Longquan celadon vases were highly valued in East Asia, particularly in Japan, known for their translucent glaze and mallet shapes, often used for burial purposes during the Southern Song and Yuan dynasties.

2. Mounted porcelain vases and bowls from Jingdezhen, China (Kangxi period), featured intricate designs like pierced "linglong" patterns and were later adorned with precious metal mounts in Europe to demonstrate wealth.

These highlight the cultural significance and craftsmanship of vases across civilizations.


In [6]:
### Hallucination Grader


# Data model
# Output schema passed to `llm.with_structured_output(...)` later in this cell.
# NOTE(review): the class docstring and the Field description presumably become
# part of the schema shown to the model, so treat them as prompt text — confirm
# before rewording either.
class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in generation answer."""

    # Declared as a plain `str`; nothing in this block restricts it to 'yes'/'no'.
    binary_score: str = Field(
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )


# LLM with function call
# NOTE: rebinds `structured_llm_grader`, which the retrieval-grader cell also
# assigns; chains built earlier keep the object they were composed with.
structured_llm_grader = llm.with_structured_output(GradeHallucinations)

# Prompt
# NOTE: `system` is likewise reassigned by every grader cell.
system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n 
     Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts."""
# Two-message template; expects the `documents` and `generation` input keys.
hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"),
    ]
)

hallucination_grader = hallucination_prompt | structured_llm_grader
# `docs` is handed over as-is (not through format_docs), so the prompt
# presumably receives the list's string form — confirm this is intended.
hallucination_grader.invoke({"documents": docs, "generation": generation})

GradeHallucinations(binary_score='yes')

In [7]:
### Answer Grader


# Data model
# Output schema passed to `llm.with_structured_output(...)` later in this cell.
# NOTE(review): the class docstring and the Field description presumably become
# part of the schema shown to the model, so treat them as prompt text — confirm
# before rewording either.
class GradeAnswer(BaseModel):
    """Binary score to assess answer addresses question."""

    # Declared as a plain `str`; nothing in this block restricts it to 'yes'/'no'.
    binary_score: str = Field(
        description="Answer addresses the question, 'yes' or 'no'"
    )


# LLM with function call
# Grader variant of the shared model whose output is parsed into GradeAnswer.
structured_llm_grader = llm.with_structured_output(GradeAnswer)

# Prompt
# System instructions for the answer grader. The quoting around 'Yes' is
# balanced here (the earlier text read "Yes'" with no opening quote), matching
# the wording used by the hallucination grader prompt.
system = """You are a grader assessing whether an answer addresses / resolves a question \n 
     Give a binary score 'yes' or 'no'. 'Yes' means that the answer resolves the question."""
# Two-message template; expects the `question` and `generation` input keys.
answer_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "User question: \n\n {question} \n\n LLM generation: {generation}"),
    ]
)

answer_grader = answer_prompt | structured_llm_grader
# Smoke test against the question and generation produced by earlier cells.
answer_grader.invoke({"question": question, "generation": generation})

GradeAnswer(binary_score='yes')

In [8]:
### Question Re-writer

# Prompt
# System instructions for the query rewriter. The opening sentence now has its
# verb ("You are a ..."); the earlier text read "You a question re-writer".
system = """You are a question re-writer that converts an input question to a better version that is optimized \n 
     for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
# Two-message template; expects the `question` input key.
re_write_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        (
            "human",
            "Here is the initial question: \n\n {question} \n Formulate an improved question.",
        ),
    ]
)

# prompt -> chat model -> plain string holding the rewritten question.
question_rewriter = re_write_prompt | llm | StrOutputParser()
# Smoke test against the question defined in an earlier cell.
question_rewriter.invoke({"question": question})

'Can you suggest different types or styles of vases and their unique features?'

In [9]:
from typing import List

from typing_extensions import TypedDict


class GraphState(TypedDict):
    """
    Represents the state of our graph.

    This TypedDict is the schema handed to ``StateGraph(...)`` when the
    workflow is built.

    Attributes:
        question: question
        generation: LLM generation
        documents: list of documents
    """

    question: str
    generation: str
    # NOTE(review): annotated as strings, but the retrievers in this notebook
    # return Document objects; the node functions that fill this key are not
    # visible here — confirm which type is actually stored.
    documents: List[str]

In [11]:
from langgraph.graph import END, StateGraph, START

# Graph wiring: retrieve -> grade_documents -> (generate | transform_query),
# with conditional routing after grading and after generation.
# NOTE(review): `retrieve`, `grade_documents`, `generate`, `transform_query`,
# `decide_to_generate` and `grade_generation_v_documents_and_question` are not
# defined in any visible cell (the execution counter jumps from 9 to 11), so
# this cell fails on a fresh kernel unless the missing cell is restored.
workflow = StateGraph(GraphState)

# Define the nodes
workflow.add_node("retrieve", retrieve)  # retrieve
workflow.add_node("grade_documents", grade_documents)  # grade documents
workflow.add_node("generate", generate)  # generate
workflow.add_node("transform_query", transform_query)  # transform_query

# Build graph
workflow.add_edge(START, "retrieve")
workflow.add_edge("retrieve", "grade_documents")
# After grading, the router's return value is looked up in this mapping to pick
# the next node.
workflow.add_conditional_edges(
    "grade_documents",
    decide_to_generate,
    {
        "transform_query": "transform_query",
        "generate": "generate",
    },
)
# A rewritten query always goes back through retrieval.
workflow.add_edge("transform_query", "retrieve")
# After generation: "not supported" retries generation, "useful" ends the run,
# "not useful" rewrites the query.
# NOTE(review): both retry paths form cycles and no retry cap is visible in this
# cell — presumably only the framework's recursion limit stops them; confirm.
workflow.add_conditional_edges(
    "generate",
    grade_generation_v_documents_and_question,
    {
        "not supported": "generate",
        "useful": END,
        "not useful": "transform_query",
    },
)

# Compile
app = workflow.compile()

In [1]:
import os
from langchain_openai import AzureOpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
from pinecone_text.sparse import BM25Encoder
from langchain_community.retrievers import PineconeHybridSearchRetriever

# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

# create the index
index_name = "umag-hybrid-search"
# `.get` yields None instead of raising when the key is unset, unlike the
# os.environ[...] lookups used for the embedding settings below.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
# Create the index only when it is not already listed, so re-running the cell
# does not attempt to create it twice.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=3072,  # dimensionality of dense model
        metric="dotproduct",  # sparse values supported only for dotproduct
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

# Sparse encoder built from the library's default parameters; it is not fitted
# on this corpus anywhere in this cell.
bm25_encoder = BM25Encoder.default()

# Dense embedding model; deployment and endpoint come from the environment.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"],
    azure_endpoint=os.environ["AZURE_OPENAI_EMBEDDING_ENDPOINT"],
)

# Hybrid (dense + sparse) retriever over the Pinecone index.
# NOTE: rebinds `retriever`, the same name the BM25 cell assigns.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings, sparse_encoder=bm25_encoder, index=index, top_k=10
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sample query against the retriever; the result is shown as the cell output.
retriever.invoke("How's the porcelain made of?")

[Document(metadata={'score': 0.598115683}, page_content="# Porcelain terminology  \n\nPorcelain terminologyThe Chinese term 'ci' (translated as porcelain in English) refers to all ceramics that are fired at high temperatures, including porcelain and stoneware. In the West, the term porcelain refers specifically to white ceramics made with a special type of clay called kaolin and fired to a temperature of about  $1300^{\\circ}\\mathrm{C}$ , which results in a translucent, glassy material that makes a ringing sound when struck. Stoneware is used to refer to related ceramics that are similarly hard and dense, but which are made with grey or brown clay, may or may not be white- bodied, do not transmit light, and are fired to a slightly lower temperature of 1000 to  $1250^{\\circ}\\mathrm{C}$ . Ceramics fired below this temperature range are called earthenwares. The terms 'poro- porcelain' or 'porcellaneous' are sometimes used to describe early ceramics made with some of the same ingredient

In [7]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from pathlib import Path

# Markdown corpus location (absolute, machine-specific path).
# NOTE(review): this path goes through `self_rag\output`, while the BM25 cell
# reads from `code\output` — confirm which folder is the intended corpus.
docs_folder = Path(
    r"D:\HKU\Inno Wing RA\UBC Exchange\self_rag\output\Objectifying_China\docs"
)
# Read every *.md file under the folder (recursively) as UTF-8 text.
loader = DirectoryLoader(
    str(docs_folder.absolute()),
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
# NOTE: rebinds `docs`, a name other cells also use for retrieval results.
docs = loader.load()

In [None]:
import re

# Collect, per loaded document, the markdown image references of the form
# `![](<url>)` whose URL consists only of letters, digits and the characters
# `:`, `/`, `-` and `.`. The result is a list of lists: one (possibly empty)
# list of matches for each entry of `docs`, in the same order.
image_links = []
for doc in docs:
    # Raw string literal: the pattern is full of backslash escapes (`\[`, `\(`,
    # `\-`, `\.`) that are invalid escape sequences in a normal string and make
    # Python emit a warning when the cell is compiled.
    matches = re.findall(r"!\[\]\([a-zA-Z:/\-\.0-9]+\)", doc.page_content)
    image_links.append(matches)

print(image_links)

In [9]:
# Number of entries currently bound to `docs`; shown as the cell output.
len(docs)

164

In [10]:
# Push the text of every loaded document into the retriever's index.
# Only `page_content` is passed; each document's `metadata` is not forwarded.
# NOTE(review): whether re-running this cell duplicates entries depends on how
# the retriever assigns ids — confirm before re-executing.
retriever.add_texts([
    d.page_content for d in docs
])

100%|██████████| 6/6 [00:16<00:00,  2.75s/it]


In [15]:
# Sample query run after the corpus was added; the result is the cell output.
retriever.invoke("Please introduce some Jingdezhen porcelains")

[Document(metadata={'score': 0.456929594}, page_content='![](https://cdn-mineru.openxlab.org.cn/result/2025-07-27/26ec8c02-599c-4b79-9876-e092d6287e02/0135e38ba2b52933cfe06032bd440b643e0ea5f7622891c690c30c256dec39f8.jpg)  \n\nPlate, Jingdezhen, China, 1739- 1743. Porcelain with overglaze enamels, diameter  $22.9 \\mathrm{~cm}$ . The Metropolitan Museum of Art, New York  碟·中國景德鎮·一七三九年至一七四三年·釉上彩瓷，直徑22.9厘米。紐約，大都會藝術博物館  '),
 Document(metadata={'score': 0.442890882}, page_content='![](https://cdn-mineru.openxlab.org.cn/result/2025-07-27/26ec8c02-599c-4b79-9876-e092d6287e02/6287610049ad775024f987714a2df9afac918f938997e628dbc261362ba0ff5b.jpg)  \n\njug with Portuguese arms. Jingdezhen, China, ca. 1520- 1540. Porcelain with underglaze blue, height  $18.7 \\mathrm{~cm}$ . The Metropolitan Museum of Art, New York  箱莓牙徽章纹壶·中國景德鎮·約一五二零年至一五四零年，青花瓷，高18.7厘米。紐約，大都會藝術博物館  '),
 Document(metadata={'score': 0.39922142}, page_content="# Tankard  \n\nJingdezhen, China (Ming dynasty), early 15th century Porc

In [7]:
from langgraph_sdk import get_client

# SDK client pointed at a hardcoded host and port.
# NOTE(review): consider reading the URL from configuration instead.
client = get_client(url="http://10.147.19.97:8000")
# Top-level `await` only works in an environment that runs cells inside an
# event loop (as a notebook kernel does).
assistants = await client.assistants.search()

In [8]:
# Use the first assistant returned by the search; this indexing fails if the
# search came back empty (assuming a list is returned).
assistant = assistants[0]

In [4]:
# Create a new thread; its "thread_id" entry is used by the streaming cell.
thread = await client.threads.create()

In [None]:
# Stream a run of the selected assistant on the thread and print the event name
# of every chunk as it arrives.
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'str' object has no attribute 'data'", although the code
# below never accesses `.data` — the output may be stale or raised inside the
# SDK; re-run and confirm before relying on this cell.
async for chunk in client.runs.stream(
    thread["thread_id"],
    assistant["assistant_id"],
    # The input dict carries a single "question" key, matching the `question`
    # field of the graph state defined earlier in this notebook.
    input={
        "question": "Introduce the chinese porcelain"
    },
):
    print(chunk.event)

AttributeError: 'str' object has no attribute 'data'