In [None]:
# Import required libraries for file operations, environment management, and UI.
import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [None]:
# Import LangChain modules for document loading, splitting, embeddings, and conversational AI.
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np  # For numerical operations
from sklearn.manifold import TSNE  # For dimensionality reduction
import plotly.graph_objects as go  # For visualization
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [None]:
# Notebook-wide configuration.
# MODEL is the chat model name handed to ChatOpenAI further down; db_name is
# used as Chroma's persist_directory, so it is a path relative to the current
# working directory.
MODEL = "gpt-4o-mini"
db_name = "vector_db"

In [None]:
# Load environment variables (values from a local .env file win over existing
# ones because override=True) and make sure an OpenAI API key is available.
load_dotenv(override=True)

# Fail fast with a clear message instead of exporting a placeholder key: a fake
# key only surfaces later as an opaque authentication error from the API, and a
# "paste your key here" fallback invites committing a real secret.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it in your "
        "environment before running this notebook."
    )

In [None]:
# Load markdown documents from the knowledge base folders and assign metadata.
# Each immediate sub-directory of knowledge-base/ is treated as one document
# type; its name is stored on every loaded document as metadata["doc_type"].
# Only directories are kept (a stray file such as a top-level README would make
# DirectoryLoader fail), and they are sorted so the load order is reproducible
# rather than depending on filesystem enumeration order.
folders = sorted(path for path in glob.glob("knowledge-base/*") if os.path.isdir(path))
text_loader_kwargs = {'encoding': 'utf-8'}
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)  # Folder name doubles as the document type
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type  # Tag each document with its type
    documents.extend(folder_docs)

In [None]:
# Break the loaded documents into chunks for embedding and retrieval.
# The splitter is configured with chunk_size=1000 and chunk_overlap=200, so
# neighbouring chunks may share some text.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

In [None]:
# Sanity check: show how many chunks the splitter produced (rendered as the
# cell's output because it is the last expression).
len(chunks)

In [None]:
# Collect the distinct doc_type values carried by the chunks and report them.
# The names are sorted before printing because set iteration order is
# arbitrary, which would otherwise make this output differ between runs.
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
print(f"Document types found: {', '.join(sorted(doc_types))}")

In [None]:
# Embed the chunks and (re)build the persisted Chroma vector store.
embeddings = OpenAIEmbeddings()

# If a store already exists on disk, drop its collection first so the fresh
# build below does not pile new vectors on top of the previous run's.
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    ).delete_collection()

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)

# NOTE: _collection is a private attribute of the Chroma wrapper; the count is
# the number of stored entries (one per chunk passed in above).
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Report how many dimensions each stored embedding vector has.
# A single record is fetched from the underlying collection (private
# attribute of the Chroma wrapper) and the length of its vector is measured.
collection = vectorstore._collection
sample_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = sample_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

In [None]:
# Wire together the pieces of the conversational retrieval chain:
#   - llm:       the chat model named by MODEL, sampled at temperature 0.7
#   - memory:    a buffer kept under the 'chat_history' key, returned as messages
#   - retriever: the vector store exposed through its retriever interface
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
)
retriever = vectorstore.as_retriever()
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [None]:
# Smoke-test the chain: ask one question and print the "answer" field of the
# result it returns.
query = "Can you describe Insurellm in a few sentences"
result = conversation_chain.invoke(
    {"question": query}
)
print(result["answer"])

In [None]:
# Rebuild the memory and the chain from scratch. The new memory object starts
# empty, so the chain used from here on does not carry over any history held
# by the previous memory instance (e.g. from the sample query).
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
)
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [None]:
# Callback handed to the Gradio chat interface.
def chat(message, history):
    """Answer one user message via the module-level conversation chain.

    Args:
        message: The text of the user's latest message.
        history: Prior turns as supplied by the caller. It is accepted to
            satisfy the callback signature but is not read here; the chain is
            invoked with the message only.

    Returns:
        The value stored under "answer" in the chain's result.
    """
    payload = {"question": message}
    return conversation_chain.invoke(payload)["answer"]

In [None]:
# Build the Gradio chat UI around chat() and start it with inbrowser=True.
# Note: `view` receives whatever launch() returns, not the ChatInterface object.
chat_ui = gr.ChatInterface(chat, type="messages")
view = chat_ui.launch(inbrowser=True)