In [None]:
# imports
import os
import glob
from dotenv import load_dotenv
import gradio as gr

# imports for langchain, plotly and Chroma
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

# Agentization
from langchain.agents import initialize_agent, Tool
from langchain_community.utilities import SQLDatabase
from langchain_experimental.sql import SQLDatabaseChain
from sqlalchemy import create_engine, text
import pyodbc

# main variables
MODEL = "gpt-4o-mini"
db_name = "vector_db"


# load api key
# Values from a .env file (if present) are loaded into the process environment.
load_dotenv(override=True)
# os.environ only accepts str values, so assigning the None that os.getenv
# returns for a missing key would raise an opaque TypeError. Fail early with
# an actionable message instead.
api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "in the environment before running."
    )
os.environ['OPENAI_API_KEY'] = api_key

# read documents from knowledge-base
# Keep directories only: every entry is later treated as a folder of Markdown
# files, so stray files sitting directly under knowledge-base/ are skipped.
folders = [path for path in glob.glob("knowledge-base/*") if os.path.isdir(path)]

def add_metadata(doc, doc_type):
    """Tag `doc` with its document type and hand the same object back.

    Stores `doc_type` under the "doc_type" key of `doc.metadata`, mutating
    the document in place, and returns that same `doc`.
    """
    metadata = doc.metadata
    metadata["doc_type"] = doc_type
    return doc

# Keyword arguments forwarded to each TextLoader (files are decoded as UTF-8).
text_loader_kwargs = {'encoding': 'utf-8'}

# Load every Markdown file below each folder; the folder's base name is
# recorded on each document as its doc_type.
documents = []
for folder in folders:
    category = os.path.basename(folder)
    markdown_loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    for loaded_doc in markdown_loader.load():
        documents.append(add_metadata(loaded_doc, category))

# Split the loaded documents into overlapping chunks in preparation for RAG
# (chunk_size=1000, chunk_overlap=200).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

print(f"DEBUG: Total number of chunks: {len(chunks)}")
# Distinct doc_type tags across the source documents (not the chunks).
doc_types_found = {doc.metadata['doc_type'] for doc in documents}
print(f"DEBUG: Document types found: {doc_types_found}")

# embedding - each chunk of text is mapped to a vector that represents its meaning
embeddings = OpenAIEmbeddings()

# If the persistence directory already exists, drop the collection stored
# there before rebuilding it from the current chunks.
if os.path.exists(db_name):
    existing_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    existing_store.delete_collection()

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
collection = vectorstore._collection
print(f"DEBUG: Vectorstore created with {collection.count()} documents")

count = collection.count()

# Pull a single stored embedding to find out the vector dimensionality.
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"DEBUG: There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

# langchain: chat model, conversation memory and vector-store retriever
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key='chat_history',
)
# k=25: number of chunks requested from the vector store per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})

# RAG retriever tool
def retrieve_knowledge(query: str) -> str:
    """Return the text of the knowledge-base chunks retrieved for `query`.

    The page contents of the retrieved documents are concatenated verbatim,
    separated by a blank line; no summarization is performed here.

    Args:
        query: Search string passed to the module-level retriever.

    Returns:
        The retrieved chunk texts joined by a blank line, or an empty string
        when the retriever returns no documents.
    """
    # invoke() is the Runnable entry point that supersedes the deprecated
    # get_relevant_documents(); it returns the same list of documents.
    docs = retriever.invoke(query)
    return "\n\n".join(doc.page_content for doc in docs)

# Wrap the retrieval function as a LangChain Tool.
rag_tool = Tool(
    func=retrieve_knowledge,
    name="KnowledgeBaseSearch",
    description="Retrieve information from the Markdown knowledge-base.",
)

# Conversational retrieval chain built from the LLM, the retriever and the
# shared chat memory.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

# Gradio
def chat(question, history):
    """Send `question` through the conversation chain and return its answer.

    `history` is part of the signature but is not read in this function.
    The answer is also echoed to stdout with a DEBUG prefix.
    """
    chain_output = conversation_chain.invoke({"question": question})
    reply = chain_output["answer"]
    print("\nDEBUG: Answer:", reply)
    return reply

# Build the chat UI around `chat`, then launch it (opening a browser tab);
# `view` holds whatever launch() returns.
chat_interface = gr.ChatInterface(chat, type="messages")
view = chat_interface.launch(inbrowser=True)