In [22]:
import os
import dotenv
from pathlib import Path

from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders import (
    WebBaseLoader,
    PyPDFLoader,
    Docx2txtLoader,
)
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Load variables from a .env file into the process environment before any
# client is constructed (presumably the API keys used by the models below — confirm).
dotenv.load_dotenv()

True

In [23]:
# Load local documents.
#
# Each path is dispatched to a loader based on its file extension. Unsupported
# types and loader failures are reported and skipped, so one bad file does not
# stop the remaining files from loading.

doc_paths = [
    "docs/testdoc.pdf"
]

docs = []
for doc_file in doc_paths:
    file_path = Path(doc_file)
    # Compare on the lower-cased suffix so "REPORT.PDF" is treated like "report.pdf".
    suffix = file_path.suffix.lower()

    try:
        if suffix == ".pdf":
            loader = PyPDFLoader(file_path)
        elif suffix == ".docx":
            loader = Docx2txtLoader(file_path)
        elif suffix in (".txt", ".md"):
            loader = TextLoader(file_path)
        else:
            # `doc_file` is a plain str and has no `.type`; report the suffix instead.
            print(f"Document type {suffix or '(none)'} not supported.")
            continue

        docs.extend(loader.load())

    except Exception as e:
        # Use the Path object for the name: a str has no `.name`, so the previous
        # handler raised AttributeError itself and hid the real error.
        print(f"Error loading document {file_path.name}: {e}")

    
# Fetch the web page and add its documents to the ones loaded from disk.
# A failure is reported and skipped so the locally loaded documents are kept.

url = "https://minecraft.fandom.com/wiki/Minecraft_Wiki"

try:
    loader = WebBaseLoader(url)
    docs += loader.load()
except Exception as e:
    print(f"Error loading document from {url}: {e}")

In [25]:
# Inspect the loaded documents; as the cell's last expression, the list is rendered as the cell output.
docs

[Document(metadata={'producer': 'Skia/PDF m140 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'testdoc', 'source': 'docs/testdoc.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='1.  Adi  likes  to  eat  chicken  nuggets,  frozen  pizzas,  and  mayonnaise  2.  The  plane  has  177  passengers'),
 Document(metadata={'source': 'https://minecraft.fandom.com/wiki/Minecraft_Wiki', 'title': 'Minecraft Wiki – The Ultimate Resource for Minecraft', 'language': 'en'}, page_content='\n\n\n\nMinecraft Wiki – The Ultimate Resource for Minecraft\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\tSign In\t\n\n\n\n\t\tRegister\t\n\n\n\n\n\n\n \n\n\n\n\n\n\tMinecraft Wiki\n\n\n\n\n\n Explore\n\n \n\n\n\n\n Main Page\n\n\n\n\nAll Pages\n\n\n\n\nInteractive Maps\n\n\n\n\n\n\n\n\nGames\n\n \n\n\n\n\nMinecraft\n\n\n\n\nMinecraft Dungeons\n\n\n\n\nMinecraft Legends\n\n\n\

In [31]:
# Break the loaded documents into overlapping chunks before embedding.
# The overlap keeps text that straddles a chunk boundary present in both neighbours.

text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=1000)
document_chunks = text_splitter.split_documents(docs)

In [33]:
# Embed every chunk with OpenAI embeddings and index the vectors in a Chroma store.

vector_db = Chroma.from_documents(document_chunks, embedding=OpenAIEmbeddings())

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [36]:
def _get_context_retriever_chain(vector_db, llm):
    """Build a retriever that derives its search query from the conversation.

    Args:
        vector_db: Vector store exposing ``as_retriever()``.
        llm: Chat model used to turn the conversation into a search query.

    Returns:
        A runnable that takes ``{"messages": [...], "input": str}`` and returns
        the documents retrieved for the generated query. When ``messages`` is
        empty, ``input`` is sent to the retriever unchanged.
    """
    retriever = vector_db.as_retriever()
    prompt = ChatPromptTemplate.from_messages([
        MessagesPlaceholder(variable_name = "messages"),
        ("user", "{input}"),
        ("user", "Given the above conversation, generate a search query to look up information relevant to the conversation, focusing on the most recent messages.")
    ])
    history_aware_retriever = create_history_aware_retriever(llm, retriever, prompt)

    # create_history_aware_retriever only runs the query-rewriting prompt when the
    # input carries a non-empty "chat_history" key; otherwise it forwards "input"
    # straight to the retriever. Callers pass the history as "messages", so mirror
    # it under "chat_history" — without this the prompt above is never used.
    # An explicitly supplied "chat_history" still takes precedence.
    def _with_chat_history(inputs):
        return {"chat_history": inputs.get("messages", []), **inputs}

    retriever_chain = _with_chat_history | history_aware_retriever

    return retriever_chain

In [37]:
def get_conversation_rag_chain(llm):
    """Assemble the conversational RAG chain for the given chat model.

    The chain retrieves documents through ``_get_context_retriever_chain`` (using
    the module-level ``vector_db``), stuffs them into the system prompt as
    ``{context}``, and answers the user's ``{input}`` with the prior ``messages``
    as conversation history.

    Args:
        llm: Chat model used for both retrieval and answering.

    Returns:
        The runnable produced by ``create_retrieval_chain``.
    """
    history_retriever = _get_context_retriever_chain(vector_db, llm)

    system_text = """
            You are a helpful assistant. You will have to answer to user's queries.
            You will have some context to help with your answers, but they will not always be completely related or helpful.
            You can also use your knowledge to assist answering the user's queries.\n {context}
            """
    answer_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_text),
            MessagesPlaceholder(variable_name="messages"),
            ("user", "{input}"),
        ]
    )

    return create_retrieval_chain(
        history_retriever,
        create_stuff_documents_chain(llm, answer_prompt),
    )

In [38]:
# Augmented Generation
#
# Build the chat model, replay a short conversation through the RAG chain and
# stream the answer to the cell output.

llm_stream_openai = ChatOpenAI(
    model = "gpt-4o",
    temperature = 0.3,
    streaming = True,
)

llm_stream = llm_stream_openai

messages = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hi there! How can I assist you today?"},
    {"role": "user", "content": "What is the latest update of minecraft?"}
]

# Convert the role/content dicts into LangChain message objects.
messages = [HumanMessage(content=m["content"]) if m["role"] == "user" else AIMessage(content=m["content"]) for m in messages]

conversation_rag_chain = get_conversation_rag_chain(llm_stream)
response_message = "*(RAG Response)*\n"
# Everything before the last message is history; the last message is the question.
for chunk in conversation_rag_chain.pick("answer").stream({"messages": messages[:-1], "input": messages[-1].content}):
    response_message += chunk
    print(chunk, end = "", flush = True)

# `messages` now holds LangChain message objects, so record the reply as an
# AIMessage too. Appending a plain dict would leave a mixed list and break
# `messages[-1].content` on the next turn.
messages.append(AIMessage(content=response_message))

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


The latest updates for Minecraft as of the information available are:

- Java Edition 1.20.2, released on September 21, 2023.
- Bedrock Edition 1.20.30, released on September 19, 2023.

If you need more details about these updates or anything else, feel free to ask!