## Environment Setup:

In [1]:
from functools import partial
from rich.console import Console
from rich.style import Style
from rich.theme import Theme

# Notebook-wide rich console. `pprint` is `console.print` with the default
# style (bold, colour #60B000) already applied.
base_style = Style(bold=True, color="#60B000")
console = Console()
pprint = partial(console.print, style=base_style)

In [2]:
import os

# All connection settings come from the environment; a missing variable
# raises KeyError right here rather than failing later in the notebook.
model_name, embed_model_name, ollama_address, ollama_port = (
    os.environ['OLLAMA_MODEL'],
    os.environ['OLLAMA_EMBEDDING_MODEL'],
    os.environ['OLLAMA_ADDRESS'],
    os.environ['OLLAMA_PORT'],
)

# Echo the configuration so the rendered notebook records what it ran against.
print(f'models: {model_name}, {embed_model_name}')
print(f'ollama hosted at: {ollama_address}:{ollama_port}')

models: mistral:7b, mxbai-embed-large
ollama hosted at: http://ollama:11434


In [3]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

# Prompt with a single {question} slot; the answer is primed with
# "Let's think step by step."
template = """Question: {question}

Answer: Let's think step by step."""

prompt = ChatPromptTemplate.from_template(template)

# Full address of the Ollama server, built from the two environment values
# (for example "http://ollama:11434").
ollama_url = ollama_address + ":" + ollama_port
print(ollama_url)

# The server address must be passed as `base_url` (the same keyword
# OllamaEmbeddings takes); `url` is not a field of OllamaLLM.
model = OllamaLLM(base_url=ollama_url, model=model_name)

# Pipeline: fill in the prompt, then send it to the model.
chain = prompt | model

def invoke(question):
    """Run `question` through the module-level `chain` and return its result.

    `chain` is looked up when the function is called, so re-running the cell
    that builds it changes what this helper uses.
    """
    payload = {"question": question}
    return chain.invoke(payload)

# invoke("What is LangChain? and why should I use it?")

http://ollama:11434


In [4]:
from langchain_ollama import OllamaEmbeddings

# Embedding client pointed at the Ollama address taken from the environment.
embed = OllamaEmbeddings(
    model=embed_model_name,
    base_url=f"{ollama_address}:{ollama_port}",
)

def embed_text(text):
    """Return what the module-level `embed` client's `embed_query` gives for `text`."""
    return embed.embed_query(text)

# Sample sentence for trying `embed_text` by hand; the call itself is left
# commented out so running the notebook does not hit the embedding server here.
input_text = "The meaning of life is 42"
# print(embed_text(input_text)[:10])

In [5]:
# Unpack the HTML archive next to the notebook. Presumably this produces the
# `htmls/` directory that later cells list - TODO confirm the archive layout.
!tar -xzf htmls2.tar.gz

# add data

In [6]:
# DELETES DATA
# Safety switch: set DELETE_DATA to True and run this cell to drop the
# "annotated_data" collection from the Chroma server. Keep it False otherwise.

DELETE_DATA = False

if DELETE_DATA:
    import chromadb

    database_address = os.environ['IP_ADDRESS']
    database_port = os.environ['DATABASE_PORT']

    # A single client is sufficient for the one call below.
    chroma_client = chromadb.HttpClient(host=database_address, port=database_port)

    new_collection = "annotated_data"

    chroma_client.delete_collection(name=new_collection)

In [5]:
from os import listdir
from os.path import isfile, join
# Gather the regular files that sit directly inside `htmls` (directories are
# skipped) and build their relative paths.
path = "htmls"
onlyfiles = list(filter(lambda name: isfile(join(path, name)), listdir(path)))
filepaths = [path + "/" + name for name in onlyfiles]
print(filepaths[:5])

['htmls/A.D.A.M._Unit_(5e_Race).html', 'htmls/A.I._(5e_Race).html', 'htmls/Aarongar_(5e_Race).html', 'htmls/Abdelian_(5e_Race).html', 'htmls/Aberrant_Hybrid_(5e_Race).html']


In [6]:
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_community.document_loaders import BSHTMLLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
from langchain_text_splitters import HTMLSemanticPreservingSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import LocalFileStore
from langchain.storage._lc_store import create_kv_docstore

from bs4 import Tag, BeautifulSoup

from langchain_chroma import Chroma
import chromadb

# --- Chroma server connection ------------------------------------------------
database_address = os.environ['IP_ADDRESS']
database_port = os.environ['DATABASE_PORT']

chroma_client = chromadb.HttpClient(host=database_address, port=database_port)

# --- Collection names and working state --------------------------------------
new_collection = "annotated_data"
old_collection = "data"
collection = None

# Document store kept on local disk under ./docstore.
filestore = LocalFileStore("./docstore")
docstore = create_kv_docstore(filestore)

part = 0
html_strings = []
docs = []

# --- Ingestion switches ------------------------------------------------------
# Change to use different base and text splitter:
#   True  -> BSHTMLLoader branch, collection "data"
#   False -> semantic HTML splitter branch, collection "annotated_data"
useRecurveCharacterSplitter = False

Batched = True
BATCH_SIZE = 100

collection_name = old_collection if useRecurveCharacterSplitter else new_collection

collection = chroma_client.get_or_create_collection(name=collection_name)

# LangChain vector store bound to the selected collection and the Ollama
# embedding client.
vector_store_client = Chroma(
    embedding_function=embed,
    collection_name=collection_name,
    client=chroma_client,
)


def extract_contents(html_string):
    """Return the main article markup of a wiki page as an HTML string.

    Drills down ``div#content`` -> ``div#mw-content-text`` ->
    ``div.mw-parser-output`` and removes every ``table`` whose class is
    ``messagebox protected`` from the result.

    Args:
        html_string: Full HTML source of one page.

    Returns:
        The serialized ``mw-parser-output`` element, or ``None`` when one of
        the expected containers is missing. Callers must handle ``None``.
    """
    soup = BeautifulSoup(html_string, 'html.parser')

    node = soup
    for attrs in ({"id": "content"}, {"id": "mw-content-text"}, {"class": "mw-parser-output"}):
        node = node.find("div", attrs)
        if node is None:
            # Report a short preview only: printing the whole page floods the
            # notebook output and hides the progress log.
            print(f"extract_contents: no <div> matching {attrs}; page starts with {html_string[:200]!r}")
            return None

    for table in node.find_all("table", {'class': 'messagebox protected'}):
        table.decompose()

    return str(node.extract())
    

# (tag, metadata key) pairs for the header levels the semantic splitter
# splits on: <h2> and <h3>.
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in (2, 3)]

def code_handler(element: Tag) -> str:
    """Render a tag as ``<code:LANG>text</code>``.

    LANG is the tag's ``data-lang`` attribute (``None`` when absent) and
    ``text`` is the tag's text content.
    """
    return f"<code:{element.get('data-lang')}>{element.get_text()}</code>"

# Header-aware splitter used on the extracted article markup. Tables, lists
# and code are kept whole, and <code> tags go through `code_handler`.
semantic_splitter = HTMLSemanticPreservingSplitter(
    headers_to_split_on=headers_to_split_on,
    max_chunk_size=1000,
    separators=["\n\n", "\n", ". ", "! ", "? "],
    elements_to_preserve=["table", "ul", "ol", "code"],
    custom_handlers={"code": code_handler},
    denylist_tags=["script", "style", "head"],
    preserve_images=True,
    preserve_videos=True,
)


def _html_character_splitter(chunk_size, chunk_overlap):
    """Build an HTML-language RecursiveCharacterTextSplitter with the given sizes."""
    return RecursiveCharacterTextSplitter.from_language(
        language=Language.HTML,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )


# Larger splitter; defined here but not passed to the retriever below.
parent_text_splitter = _html_character_splitter(chunk_size=4000, chunk_overlap=800)

# Small splitter used by the retriever as its child splitter.
text_splitter = _html_character_splitter(chunk_size=400, chunk_overlap=80)

retriever = ParentDocumentRetriever(
    child_splitter=text_splitter,
    docstore=docstore,
    vectorstore=vector_store_client,
)

# Ingest the HTML pages into the vector store.
#
# Batched=True  : process BATCH_SIZE files at a time and index each batch
#                 through `retriever`.
# Batched=False : load everything first, then index once through
#                 `vector_store_client`.
# Pages that are (almost) empty, or whose article container cannot be found,
# are reported and skipped instead of being passed to the splitter.
all_splits = []

if Batched:
    num_batches = (len(filepaths) + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division
    for part in range(num_batches):
        print(f"part: {part} progress:{min(100, part*BATCH_SIZE / len(filepaths) * 100)}%")
        all_splits = []
        # Slicing already clamps at the end of the list.
        batch_files = filepaths[part * BATCH_SIZE:(part + 1) * BATCH_SIZE]

        if useRecurveCharacterSplitter:
            # NOTE(review): this branch only accumulates loaded documents in
            # `docs`; nothing is written to the vector store here - confirm
            # whether an add_documents call is missing.
            for file in batch_files:
                loader = BSHTMLLoader(file)
                docs.extend(loader.load())

        else:
            html_strings = []
            for file in batch_files:
                # Context manager so the handle is closed even if reading fails.
                with open(file) as f:
                    # NOTE(review): the first line of the file is used as the
                    # page URL and keeps its trailing newline - confirm the
                    # file format and whether it should be stripped.
                    first_line = f.readline()
                    html_strings.append((first_line, f.read()))

            # Add metadata and cut unnecessary content
            for html_string in html_strings:
                if len(html_string[1]) < 5:
                    # Practically empty page: report it and move on.
                    print("https://www.dandwiki.com/wiki/" + html_string[0])
                    continue
                parsed_string = extract_contents(html_string[1])
                if parsed_string is None:
                    # extract_contents already reported the problem.
                    continue
                for new_split in semantic_splitter.split_text(parsed_string):
                    new_split.metadata['url'] = html_string[0]
                    new_split.metadata['section'] = "race"
                    all_splits.append(new_split)

            # Fix images: turn site-relative image links into absolute ones.
            for split in all_splits:
                split.page_content = split.page_content.replace("![image:/", "![image:https://www.dandwiki.com/")
            docs = list(all_splits)

            if docs:
                retriever.add_documents(docs)

    print("DONE")

else:
    if useRecurveCharacterSplitter:
        # NOTE(review): documents are loaded and counted but never stored in
        # this branch - confirm whether that is intended.
        for file in filepaths:
            loader = BSHTMLLoader(file)
            docs.extend(loader.load())

        print(len(docs))

    else:
        for file in filepaths:
            with open(file) as f:
                # file[6:] drops the leading "htmls/" directory prefix.
                html_strings.append((file[6:], f.read()))

        print(len(html_strings))

        for html_string in html_strings:
            # [:-5] drops the ".html" suffix to rebuild the page URL.
            page_url = "https://www.dandwiki.com/wiki/" + html_string[0][:-5]
            if len(html_string[1]) < 5:
                print(page_url)
                continue
            parsed_string = extract_contents(html_string[1])
            if parsed_string is None:
                continue
            # The splitter defined in this notebook is `semantic_splitter`;
            # no object named `splitter` exists.
            for new_split in semantic_splitter.split_text(parsed_string):
                new_split.metadata['url'] = page_url
                all_splits.append(new_split)

        # Fix images: turn site-relative image links into absolute ones.
        for split in all_splits:
            split.page_content = split.page_content.replace("![image:/", "![image:https://www.dandwiki.com/")
        docs = list(all_splits)

        if all_splits:
            _ = vector_store_client.add_documents(documents=all_splits)


  semantic_splitter = HTMLSemanticPreservingSplitter(


part: 0 progress:0.0%
part: 1 progress:4.44642063139173%
part: 2 progress:8.89284126278346%
part: 3 progress:13.33926189417519%
part: 4 progress:17.78568252556692%
part: 5 progress:22.23210315695865%
part: 6 progress:26.67852378835038%
part: 7 progress:31.124944419742107%
part: 8 progress:35.57136505113384%
part: 9 progress:40.01778568252557%
part: 10 progress:44.4642063139173%
part: 11 progress:48.910626945309026%
part: 12 progress:53.35704757670076%
part: 13 progress:57.80346820809249%
part: 14 progress:62.249888839484214%
part: 15 progress:66.69630947087595%
part: 16 progress:71.14273010226768%
part: 17 progress:75.58915073365941%
part: 18 progress:80.03557136505114%
part: 19 progress:84.48199199644286%
part: 20 progress:88.9284126278346%
part: 21 progress:93.37483325922632%
part: 22 progress:97.82125389061805%
DONE


In [13]:
# NOTE(review): after the batched ingestion cell, `all_splits` holds only the
# chunks of the last batch, which were already indexed through `retriever`.
# The recorded run of this cell ended in KeyboardInterrupt - confirm whether
# the cell is still needed.
_ = vector_store_client.add_documents(documents=all_splits)

KeyboardInterrupt: 

In [9]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

# Define state for application
class State(TypedDict):
    """State dictionary handed from one graph step to the next."""
    # The user's question; read by `retrieve` and `generate`.
    question: str
    # Documents written by `retrieve` and read by `generate`.
    context: List[Document]
    # Model output written by `generate`.
    answer: str
        
# Define application steps
def retrieve(state: State):
    """Graph step: similarity-search the vector store with the question.

    Returns a partial state update whose "context" entry is the list of
    documents returned by the search.
    """
    question = state["question"]
    return {"context": vector_store_client.similarity_search(question)}

# RAG prompt text with {question} and {context} slots. The pieces below
# concatenate into a three-line string whose middle line is empty.
new_template = (
    "You are a helper for a DnD homebrew character creation. "
    "You need to answear a question, using given context. "
    "Question: {question} Context: {context}\n"
    "\n"
    "Answer: Let's think think about this one..."
)

# Prompt object built from `new_template`; `generate` invokes it with the
# "question" and "context" values.
new_prompt = ChatPromptTemplate.from_template(new_template)


def generate(state: State):
    """Graph step: answer the question from the retrieved documents.

    Joins the page contents of the context documents with blank lines, fills
    the RAG prompt and sends it to the model. The model's return value is
    stored unchanged under "answer".
    """
    joined_context = "\n\n".join([doc.page_content for doc in state["context"]])
    prompt_value = new_prompt.invoke({"question": state["question"], "context": joined_context})
    return {"answer": model.invoke(prompt_value)}


# Compile application and test
# Two-step pipeline: START -> retrieve -> generate.
graph_builder = StateGraph(State)
graph_builder = graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [21]:
# Stream the graph step by step and print each update followed by a separator.
question_payload = {"question": "What races would be suitable for cyberpunk advanture?"}
for step in graph.stream(question_payload, stream_mode="updates"):
    print(f"{step}\n\n----------------\n")

{'retrieve': {'context': [Document(id='b3c1aea1-ecb7-457d-afab-a833079dab89', metadata={'source': 'htmls/Cyborg.html'}, page_content='A race whose physical abilities are extended beyond normal limitations by mechanical elements built into the body. Ability Score Increase . Your Intelligence score increases by 2. Age . Cyborgs can live up to 500 years. After that, their human parts start to fail and can no longer be kept on life support from the robot parts. Alignment . Cyborgs don’t tend to lean to any particular alignment unless they are programmed to. Size . Cyborgs are a little taller than humans on average and can weigh over 400 lb. Your size is medium. Speed . Your walking speed is equal to 30 ft. Thermal Vision . You can see in dim light within 60 feet of you as if it were bright light, and in darkness as if it were dim light, in addition to being able to see through objects in this range. You can see different colors, the hotter the object the more on the warm spectrum of the co