# My own Knowledge worker

In [16]:
# First, import the standard-library helpers, dotenv and Gradio used throughout the notebook
import glob
import os
from dotenv import load_dotenv
import gradio as gr



In [17]:
# imports for langchain, plotly and Chroma

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

## Creating Environment

In [18]:

# Chat model name and the directory handed to Chroma as its persist directory.
MODEL = "gpt-4o-mini"
vector_database = "vector_db"

# Load variables from a .env file, letting them override existing environment values.
load_dotenv(override=True)

# Fail early with a readable message: assigning a missing key back into os.environ
# would raise an opaque TypeError (environment values must be strings).
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it before running this notebook."
    )



In [19]:

# To generate sample data you can adapt my GitHub project "Generator-of-synthetic-data".
# This knowledge base is deliberately small and simple; go ahead and add more files.

# Every sub-folder of material/ is treated as one document type by the loading loop.
# Keep only directories (a stray file such as .DS_Store is not a valid folder to load)
# and sort the result so the load order is the same on every run.
know_base = sorted(path for path in glob.glob("material/*") if os.path.isdir(path))



def add_metadata(doc, doc_type):
    """Tag a loaded document with its type and file name.

    Args:
        doc: object exposing a mutable ``metadata`` dict; it is modified in place.
        doc_type: label stored under ``metadata["doc_type"]``.

    Returns:
        The same ``doc`` object, now carrying ``doc_type`` and ``filename`` entries.
        ``filename`` is the base name of ``metadata["source"]``, or ``"unknown.md"``
        when no source is recorded.
    """
    source_path = doc.metadata.get("source", "unknown.md")
    doc.metadata.update(doc_type=doc_type, filename=os.path.basename(source_path))
    return doc


# Read every markdown file as UTF-8 text.
text_loader_kwargs = {'encoding': 'utf-8'}

# Walk each knowledge-base folder; the folder name becomes the document type.
documents = []
for folder in know_base:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    folder_docs = loader.load()
    for doc in folder_docs:
        documents.append(add_metadata(doc, doc_type))

# Split the documents into chunks; chunk_overlap must not exceed chunk_size.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

found_types = {doc.metadata['doc_type'] for doc in documents}
print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {found_types}")




Total number of chunks: 10
Document types found: {'others', 'animals'}


In [20]:
# Embedding model used for every chunk.
embeddings = OpenAIEmbeddings()

# Alternative embedding backend (inactive):
#   embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Start clean: if the store directory already exists, drop its collection first.
if os.path.exists(vector_database):
    previous_store = Chroma(persist_directory=vector_database, embedding_function=embeddings)
    previous_store.delete_collection()

# Build the vector store from the chunks, using vector_database as persist directory.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=vector_database,
)
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

Vectorstore created with 10 documents


In [21]:
# Underlying Chroma collection and the number of stored vectors.
collection = vectorstore._collection
count = collection.count()

# Fetch a single record with its embedding to read off the vector length.
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)

print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 10 vectors with 1,536 dimensions in the vector store


## Visualization of Vector Database

In [22]:
# Pull everything needed for plotting out of the collection.
# NOTE: `documents` is rebound here to the stored chunk texts (it previously held the
# loaded Document objects).
result = collection.get(include=["metadatas", "embeddings", "documents"])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]

# One colour per document type. The two original types keep their colours; any other
# type found in the store gets the next palette entry instead of raising ValueError.
color_by_type = {'animals': 'red', 'others': 'blue'}
extra_palette = ['green', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan']
for position, extra_type in enumerate(sorted(set(doc_types) - set(color_by_type))):
    # Cycle through the palette if there are more types than colours.
    color_by_type[extra_type] = extra_palette[position % len(extra_palette)]

colors = [color_by_type[doc_type_name] for doc_type_name in doc_types]

## Visualization of Vector Database in 2D

In [23]:
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding).
#
# scikit-learn requires perplexity < number of samples, so the default of 30 fails on a
# small knowledge base. Cap it at n - 1 so the projection works for any size.
n_vectors = len(vectors)
if n_vectors > 2:
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, n_vectors - 1))
    reduced_vectors = tsne.fit_transform(vectors)
else:
    # Too few points for a meaningful t-SNE fit; fall back to the first two raw coordinates.
    reduced_vectors = vectors[:, :2]

fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],  # point coordinates in 2D space
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# 2D axes are titled via xaxis_title / yaxis_title; `scene` only applies to 3D plots.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=400,
    height=300,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()




## Visualization of Vector Database in 3D

In [24]:

# Reduce the dimensionality of the vectors to 3D using t-SNE.
# scikit-learn requires perplexity < number of samples, so cap it at n - 1 to keep the
# projection working on a small knowledge base.
n_vectors = len(vectors)
if n_vectors > 2:
    tsne = TSNE(n_components=3, random_state=42, perplexity=min(30, n_vectors - 1))
    reduced_vectors = tsne.fit_transform(vectors)
else:
    # Too few points for a meaningful t-SNE fit; fall back to the first three raw coordinates.
    reduced_vectors = vectors[:, :3]

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=450,
    height=350,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [25]:
# Building blocks of the question-answering pipeline: the chat model, a retriever over
# the vector store, and a buffer memory stored under the 'chat_history' key.
retriever = vectorstore.as_retriever()
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Combine the three pieces into a single conversational retrieval chain.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [26]:
# Quick smoke test: send one question through the chain and print its answer.
query = "What is a cat"
result = conversation_chain.invoke({"question": query})
answer_text = result["answer"]
print(answer_text)

A cat is a popular pet known for its independence and playfulness. They have excellent hearing and vision, making them great hunters. Cats love comfort and often sleep more than 15 hours a day. They are also very clean animals and spend a lot of time grooming their fur.


In [27]:
# Rebuild the memory and the chain before starting the chat UI
# (presumably so the earlier test question is not part of the chat history).
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
)

conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [28]:
def chat(question, history):
    """Answer one chat message using the module-level ``conversation_chain``.

    Args:
        question: the user's message; passed to the chain under the ``"question"`` key.
        history: accepted for the chat-callback signature but not used here; only
            ``question`` is forwarded to the chain.

    Returns:
        The value stored under ``"answer"`` in the chain's result.
    """
    payload = {"question": question}
    return conversation_chain.invoke(payload)["answer"]

## Gradio for User interface

In [29]:
# Build the chat UI around `chat`, then launch it and open it in the browser.
# `view` receives whatever `launch()` returns; `chat_ui` keeps a handle on the interface itself.
chat_ui = gr.ChatInterface(chat, type="messages")
view = chat_ui.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.
