In [None]:
from dotenv import load_dotenv, find_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from pathlib import Path
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

# Load variables from the .env file located by find_dotenv().
load_dotenv(find_dotenv())

# Module-level NLP helpers shared by preprocess_fn.
lemmatizer = WordNetLemmatizer()
stopword_corpus = set(stopwords.words("english"))
def preprocess_fn(text: str):
    """Turn raw text into a list of normalised tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, split it with ``word_tokenize``, drop tokens
    found in ``stopword_corpus`` and lemmatize the rest with ``lemmatizer``.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens, lemmatized, in their original order.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(strip_punctuation)
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in stopword_corpus
    ]

# Folder holding the Markdown corpus for the in-memory BM25 retriever.
docs_folder = Path(r"D:\HKU\Inno Wing RA\UBC Exchange\code\output\Objectifying_China\docs")

# Load every file matching **/*.md under the folder as UTF-8 text.
loader = DirectoryLoader(
    str(docs_folder.absolute()),
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
docs = loader.load()

# preprocess_fn is supplied as the retriever's preprocess_func.
retriever = BM25Retriever.from_documents(docs, preprocess_func=preprocess_fn)

In [None]:
### Retrieval Grader
import os
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI
from pydantic import BaseModel, Field

# Data model
# Schema passed to llm.with_structured_output(...) for the retrieval grader.
# NOTE(review): the class docstring and the Field description may be sent to
# the model as part of the schema — confirm before rewording either of them.
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    # Declared as a plain str: nothing in this class restricts the value to
    # 'yes'/'no'; the description only asks for it.
    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )


# LLM with function call
# Azure OpenAI chat model configured from environment variables; indexing
# os.environ directly raises KeyError if any of the three is unset.
llm = AzureChatOpenAI(
    api_version=os.environ["AZURE_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    # NOTE(review): "DEPLOYEMENT" looks like a misspelling of "DEPLOYMENT".
    # The name has to match the .env file exactly, so confirm before renaming.
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYEMENT"]
)

# Grader model constrained to the GradeDocuments schema.
# The name structured_llm_grader is rebound by the later grader cells.
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt
# System instructions for the relevance grader. The "\n" sequences are escape
# sequences inside a triple-quoted string, so each one adds a line break on
# top of the literal line break that follows it.
system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
# Two-message chat template with {document} and {question} placeholders.
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

# Chain: fill the template, then call the structured grader.
retrieval_grader = grade_prompt | structured_llm_grader
# Smoke test against the current retriever.
question = "Introduce to me some vases"
# Rebinds `docs`: from here on it holds the retrieved documents, not the full corpus.
docs = retriever.invoke(question)
# Grades the second hit only; raises IndexError if fewer than two documents come back.
doc_txt = docs[1].page_content
print(retrieval_grader.invoke({"question": question, "document": doc_txt}))

In [None]:
### Generate
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

# Prompt
# Pulls the "rlm/rag-prompt" template via langchain's hub module.
# The chain built from it is invoked with "context" and "question" inputs.
prompt = hub.pull("rlm/rag-prompt")

# Post-processing
def format_docs(docs):
    """Join the ``page_content`` of every document, separated by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the contents in input order ("" for no documents).
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Chain: prompt template -> chat model -> plain string.
rag_chain = prompt | llm | StrOutputParser()

# Run
# Render the retrieved documents to plain text with format_docs before
# filling the prompt. Passing the Document list itself would interpolate the
# list's repr (metadata included) into the "context" slot.
generation = rag_chain.invoke({"context": format_docs(docs), "question": question})
print(generation)

In [None]:
### Hallucination Grader


# Data model
# Schema passed to llm.with_structured_output(...) for the hallucination grader.
# NOTE(review): the class docstring and the Field description may be sent to
# the model as part of the schema — confirm before rewording either of them.
class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in generation answer."""

    # Declared as a plain str: nothing in this class restricts the value to
    # 'yes'/'no'; the description only asks for it.
    binary_score: str = Field(
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )


# LLM with function call
# Rebinds structured_llm_grader to the GradeHallucinations schema.
structured_llm_grader = llm.with_structured_output(GradeHallucinations)

# Prompt
# Rebinds `system`; the "\n" escape adds a line break on top of the literal one.
system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n 
     Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts."""
# Two-message chat template with {documents} and {generation} placeholders.
hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"),
    ]
)

# Chain: fill the template, then call the structured grader.
hallucination_grader = hallucination_prompt | structured_llm_grader
# NOTE(review): `docs` is passed as the raw list of retrieved documents, not as
# formatted text — confirm this is the intended rendering of {documents}.
hallucination_grader.invoke({"documents": docs, "generation": generation})

In [None]:
### Answer Grader


# Data model
# Schema passed to llm.with_structured_output(...) for the answer grader.
# NOTE(review): the class docstring and the Field description may be sent to
# the model as part of the schema — confirm before rewording either of them.
class GradeAnswer(BaseModel):
    """Binary score to assess answer addresses question."""

    # Declared as a plain str: nothing in this class restricts the value to
    # 'yes'/'no'; the description only asks for it.
    binary_score: str = Field(
        description="Answer addresses the question, 'yes' or 'no'"
    )


# LLM with function call
# Rebinds structured_llm_grader to the GradeAnswer schema.
structured_llm_grader = llm.with_structured_output(GradeAnswer)

# Prompt
# The quoting of 'Yes' is balanced here, matching the hallucination grader prompt.
system = """You are a grader assessing whether an answer addresses / resolves a question \n 
     Give a binary score 'yes' or 'no'. 'Yes' means that the answer resolves the question."""
# Two-message chat template with {question} and {generation} placeholders.
answer_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "User question: \n\n {question} \n\n LLM generation: {generation}"),
    ]
)

# Chain: fill the template, then call the structured grader.
answer_grader = answer_prompt | structured_llm_grader
answer_grader.invoke({"question": question, "generation": generation})

In [None]:
### Question Re-writer

# Prompt
# System instructions for the re-writer ("You are a ..." — the verb was missing).
system = """You are a question re-writer that converts an input question to a better version that is optimized \n 
     for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
# Two-message chat template with a single {question} placeholder.
re_write_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        (
            "human",
            "Here is the initial question: \n\n {question} \n Formulate an improved question.",
        ),
    ]
)

# Chain: prompt template -> chat model -> plain string.
question_rewriter = re_write_prompt | llm | StrOutputParser()
question_rewriter.invoke({"question": question})

In [None]:
from typing import List

from typing_extensions import TypedDict


class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        question: the question text
        generation: LLM generation
        documents: list of documents
    """

    question: str
    generation: str
    # NOTE(review): annotated as List[str], but the retrievers in this notebook
    # return objects with a .page_content attribute — confirm what the graph
    # nodes actually store here.
    documents: List[str]

In [None]:
from langgraph.graph import END, StateGraph, START

# Graph whose state follows the GraphState schema.
workflow = StateGraph(GraphState)

# Define the nodes
# NOTE(review): retrieve, grade_documents, generate, transform_query and the
# two routing functions used below are not defined in the visible cells; they
# must already exist in the kernel for this cell to run.
workflow.add_node("retrieve", retrieve)  # retrieve
workflow.add_node("grade_documents", grade_documents)  # grade documents
workflow.add_node("generate", generate)  # generate
workflow.add_node("transform_query", transform_query)  # transform_query

# Build graph
workflow.add_edge(START, "retrieve")
workflow.add_edge("retrieve", "grade_documents")
# After grading, the router's return value is looked up in the mapping to
# choose between rewriting the query and generating an answer.
workflow.add_conditional_edges(
    "grade_documents",
    decide_to_generate,
    {
        "transform_query": "transform_query",
        "generate": "generate",
    },
)
# A rewritten query goes back through retrieval.
workflow.add_edge("transform_query", "retrieve")
# After generation: "not supported" regenerates, "useful" ends the run and
# "not useful" rewrites the query.
# NOTE(review): no retry cap is visible here, so repeated "not supported" or
# "not useful" verdicts would keep cycling — confirm a limit exists elsewhere.
workflow.add_conditional_edges(
    "generate",
    grade_generation_v_documents_and_question,
    {
        "not supported": "generate",
        "useful": END,
        "not useful": "transform_query",
    },
)

# Compile
app = workflow.compile()

In [None]:
import os
from langchain_openai import AzureOpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
from pinecone_text.sparse import BM25Encoder
from langchain_community.retrievers import PineconeHybridSearchRetriever

# Load variables from the .env file located by find_dotenv().
load_dotenv(find_dotenv())

# Create the index only if it is not listed yet.
index_name = "umag-hybrid-search"
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
    serverless = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=3072,  # dimensionality of dense model
        metric="dotproduct",  # sparse values supported only for dotproduct
        spec=serverless,
    )
index = pc.Index(index_name)

# Sparse encoder obtained from BM25Encoder.default().
bm25_encoder = BM25Encoder.default()

# Dense embeddings; deployment and endpoint come from the environment.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"],
    azure_endpoint=os.environ["AZURE_OPENAI_EMBEDDING_ENDPOINT"],
)

# Hybrid retriever over the index. This rebinds `retriever`, which held the
# BM25Retriever built in the first cell.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
    top_k=10,
)

In [None]:
# Ad-hoc query against the current retriever; the result is the cell output.
retriever.invoke("How's the porcelain made of?")

In [None]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from pathlib import Path

# Reloads the Markdown corpus, this time from the "self_rag" directory (the
# first cell reads from a "code" directory). Rebinds docs_folder, loader and
# docs, so `docs` holds loaded files again rather than retrieval results.
docs_folder = Path(
    r"D:\HKU\Inno Wing RA\UBC Exchange\self_rag\output\Objectifying_China\docs"
)
# Load every file matching **/*.md under the folder as UTF-8 text.
loader = DirectoryLoader(
    str(docs_folder.absolute()),
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
docs = loader.load()

In [None]:
import re

# Markdown image tags with an empty alt text: ![](<url>).
# Written as a raw string so the backslashes reach the regex engine untouched;
# the non-raw form relied on invalid string escapes, which Python warns about.
# The URL character class is narrow (letters, digits, ':', '/', '-', '.'), so
# links containing other characters such as '_' or '%' are not matched.
image_link_pattern = re.compile(r"!\[\]\([a-zA-Z:/\-\.0-9]+\)")

# One list of matches per document, in the same order as `docs`.
image_links = [image_link_pattern.findall(doc.page_content) for doc in docs]

print(image_links)

In [None]:
# Number of items currently bound to `docs`.
len(docs)

In [None]:
# Send the page text of every item in `docs` to the retriever's add_texts.
page_texts = [d.page_content for d in docs]
retriever.add_texts(page_texts)

In [None]:
# Ad-hoc query against the current retriever; the result is the cell output.
retriever.invoke("Please introduce some Jingdezhen porcelains")

In [None]:
from langgraph_sdk import get_client

# Client for a LangGraph server at a hard-coded address.
client = get_client(url="http://10.147.19.97:8000")
# Top-level await: relies on the notebook's running event loop.
assistants = await client.assistants.search()

In [None]:
# Use the first assistant returned by the search; raises IndexError if none came back.
assistant = assistants[0]

In [None]:
# Create a thread; its "thread_id" is used by the streaming cell.
thread = await client.threads.create()

In [None]:
async for chunk in client.runs.stream(
    thread["thread_id"],
    assistant["assistant_id"],
    input={
        "question": "Introduce the chinese porcelain"
    },
):
    print(chunk.event)

In [None]:
import wikipedia


# Ask Wikipedia for a suggested title for this free-text query.
# NOTE(review): presumably this can return nothing when there is no
# suggestion — verify before passing `title` on to wikipedia.page.
title = wikipedia.suggest("Kangxi-era potters Jingdezhen")
title

In [3]:
from agent.web_search import keyword_generator

# Run the project's keyword_generator on a sample user query.
keywords = keyword_generator.invoke({
    "user_query": "What specific techniques did Kangxi-era potters in Jingdezhen develop that made their overglaze enamel decoration so detailed and vibrant, and how did imperial support contribute to these advancements?"
})
# Display the generated keywords, which the result exposes as `.keywords`.
keywords.keywords

['Kangxi-era pottery techniques',
 'Jingdezhen overglaze enamel',
 'imperial support for ceramics',
 'Chinese porcelain decoration',
 'Kangxi porcelain advancements']

In [None]:
import wikipedia
# Collect the distinct titles returned for each keyword (results=1 per search).
# NOTE(review): the saved output lists more titles than five searches with
# results=1 can return, so it appears to come from an earlier version of this cell.
titles = set()
for kw in keywords.keywords:
    print(f"Searching for keyword: {kw}")
    top_hits = wikipedia.search(kw, results=1)
    titles.update(top_hits)

list(titles)

Searching for keyword: Kangxi-era pottery techniques
Searching for keyword: Jingdezhen overglaze enamel
Searching for keyword: imperial support for ceramics
Searching for keyword: Chinese porcelain decoration
Searching for keyword: Kangxi porcelain advancements


['Qing dynasty',
 'Imperial Porcelain Factory, Saint Petersburg',
 'Blue and white pottery',
 'Famille jaune, noire, rose, verte',
 'Jingdezhen porcelain',
 'Japanese pottery and porcelain',
 'Hard-paste porcelain',
 'Qing handicrafts',
 'Maya ceramics',
 '18th century',
 'Overglaze decoration',
 'Chinese ceramics']

In [5]:
from langchain_core.documents import Document
from wikipedia.exceptions import PageError
from concurrent.futures import ThreadPoolExecutor, as_completed

documents = []


def get_document(title):
    """Fetch one Wikipedia page and wrap it in a ``Document``.

    The document text is the page content followed by one Markdown image tag
    (``![](<url>)``) per entry in ``page.images``. The page's own title and
    URL are stored in the metadata.

    Args:
        title: Wikipedia page title to look up.

    Returns:
        A ``Document``, or ``None`` when the title does not resolve to a
        single page (no such page, or a disambiguation page).
    """
    try:
        page = wikipedia.page(title=title)
        image_tags = [f"![]({image})" for image in page.images]
        return Document(
            page_content=page.content + "\n" + "\n".join(image_tags),
            metadata={"title": page.title, "url": page.url},
        )
    except (PageError, wikipedia.exceptions.DisambiguationError):
        # Skip titles that do not correspond to a valid page. Ambiguous titles
        # are skipped too: left uncaught, the error would surface from
        # future.result() and abort the whole fetch loop.
        return None

# Fetch all titles concurrently and keep the documents that were found.
# Results are appended in completion order, not in the order of `titles`.
with ThreadPoolExecutor() as pool:
    pending = [pool.submit(get_document, title=page_title) for page_title in titles]
    for finished in as_completed(pending):
        fetched = finished.result()
        if fetched:
            documents.append(fetched)

In [16]:
# Total number of characters across all fetched documents.
sum(len(d.page_content) for d in documents)

292881

In [19]:
# Display the fetched documents.
# NOTE(review): this echoes every page's full text into the cell output; the
# saved output is cut off mid-document.
documents

[Document(metadata={'title': 'Hard-paste porcelain', 'url': 'https://en.wikipedia.org/wiki/Hard-paste_porcelain'}, page_content='Hard-paste porcelain, sometimes called "true porcelain", is a ceramic material that was originally made from a compound of the feldspathic rock petuntse and kaolin fired at a very high temperature, usually around 1400 °C. It was first made in China around the 7th or 8th century and has remained the most common type of Chinese porcelain.  \nFrom the Middle Ages onwards, it was very widely exported and admired by other cultures and fetched huge prices on foreign markets. Eventually Korean porcelain developed in the 14th century and Japanese porcelain in the 17th, but other cultures were unable to learn or reproduce the secret of its formula in terms of materials and firing temperature until it was worked out in Europe in the early 18th century and suitable mineral deposits of kaolin, feldspar, and quartz were discovered. This soon led to a large production in f

In [17]:
from agent.web_search import wiki_search

# Run the project's wiki_search helper on the same sample query used for
# keyword_generator. Rebinds `docs` once more.
docs = wiki_search("What specific techniques did Kangxi-era potters in Jingdezhen develop that made their overglaze enamel decoration so detailed and vibrant, and how did imperial support contribute to these advancements?")

In [20]:
# Total number of characters across the wiki_search results.
sum(len(d.page_content) for d in docs)

123267

In [None]:
# Look up the page for the module-level `title`, which is assigned in the
# wikipedia.suggest cell (the `title` inside get_document is only a local).
# NOTE(review): confirm `title` holds a usable page title before running this.
wikipedia.page(title)