In [None]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [None]:
# imports for langchain and Chroma and plotly
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [None]:
# price is a factor for our company, so we're going to use a low cost model
# (MODEL is the chat model name handed to ChatOpenAI when the LLM is created)

MODEL = "gpt-4o-mini"

# Each dataset is a (db_name, path) pair: db_name is the directory the Chroma
# vector store is persisted to, and path is the glob pattern used to find the
# input files. Exactly one pair should be active; the others are kept
# commented out as alternatives.

# db_name = "fixwing_vector_db"
# path = "/home/hoda/Documents/Hooma/Fixed-wing/my_papers/*"

# db_name = "dbs/farsi3_pdf_vector_db"
# path = "/home/hoda/Desktop/llms_test/farsi3/*"

# NOTE(review): absolute, machine-specific path - adjust before running elsewhere
db_name = "dbs/txt_vector_db"
path = "/home/hoda/Desktop/llms_test/input_texts/*"

In [None]:
# Load environment variables from a file called .env (if present), then make
# sure OPENAI_API_KEY exists in the process environment.

# Fallback used when no key is configured; replace it or (better) set
# OPENAI_API_KEY in .env / the shell.
PLACEHOLDER_API_KEY = 'your-key-if-not-using-env'

load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', PLACEHOLDER_API_KEY)

if os.environ['OPENAI_API_KEY'] == PLACEHOLDER_API_KEY:
    # Report the misconfiguration here rather than letting it surface later as
    # an authentication error from the first OpenAI request.
    print("Warning: OPENAI_API_KEY is not set - add it to your .env file or "
          "environment; calls to OpenAI will fail with the placeholder key.")


In [None]:
# Read in documents using LangChain's loaders.
# `path` is a glob pattern: it matches the entries directly inside the folder
# (it does not descend into sub-folders).

# Sort the matches so every file keeps the same doc_type index from run to run
# (glob returns them in arbitrary order), and skip directories, which the
# file loaders cannot read.
docs = sorted(p for p in glob.glob(path) if os.path.isfile(p))

# With thanks to CG and Jon R, students on the course, for this fix needed for some users
text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

documents = []
for i, doc in enumerate(docs):
    # PDFs go through the PDF loader; every other file is read as plain text
    # using the encoding settings chosen above.
    if doc.lower().endswith(".pdf"):
        loader = PyPDFLoader(doc)
    else:
        loader = TextLoader(doc, **text_loader_kwargs)
    texts = loader.load()
    for text in texts:
        # Tag each loaded document with the index of the file it came from
        text.metadata["doc_type"] = str(i)
        documents.append(text)

In [None]:
# Number of LangChain documents collected by the loading loop
len(documents)

In [None]:
# Peek at the first loaded document (raises IndexError if nothing was loaded)
documents[0]

In [None]:
# Break the loaded documents into overlapping chunks ready for embedding
CHUNK_SIZE = 1000     # chunk_size handed to the splitter
CHUNK_OVERLAP = 200   # chunk_overlap handed to the splitter

text_splitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(documents)

In [None]:
# Number of chunks produced by the text splitter
len(chunks)

In [None]:
# Gather the distinct doc_type labels carried by the chunks and list them
doc_types = {chunk.metadata['doc_type'] for chunk in chunks}
doc_types_joined = ', '.join(doc_types)
print(f"Document types found: {doc_types_joined}")

In [None]:
# Embedding function (constructed with default settings) that is passed to Chroma
# when the vector store is opened and when it is built from the chunks
embeddings = OpenAIEmbeddings()

In [None]:
# Rebuild the vector store from scratch on every run.
#
# If a persisted Chroma datastore already exists at db_name, open it and delete
# its collection first so no entries from a previous run are left behind.
# (To reuse an existing store instead of rebuilding it, open it with
#  Chroma(persist_directory=db_name, embedding_function=embeddings) and skip
#  the from_documents call.)
if os.path.exists(db_name):
    previous_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    previous_store.delete_collection()

# Embed the chunks and persist them under db_name
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

In [None]:
# Fetch a single stored record, embeddings included, and report the vector length

collection = vectorstore._collection
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

In [None]:
# Assemble the conversational retrieval pipeline from its three parts:
#   retriever - retriever view over the vector store
#   memory    - buffer holding the conversation under the 'chat_history' key
#   llm       - the chat model named by MODEL
retriever = vectorstore.as_retriever()
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [None]:
# Smoke-test the chain with a single question and print the answer it returns
query = "How to use reinforcement learning for fixed wing landing"
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print(answer)

In [None]:
# Start over with an empty history: create a new memory buffer and build a new
# chain around it, reusing the existing llm and retriever
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

conversation_chain = ConversationalRetrievalChain.from_llm(
    memory=memory,
    retriever=retriever,
    llm=llm,
)

In [None]:
def chat(message, history):
    """Answer one user message through the conversational retrieval chain.

    Args:
        message: The user's latest message.
        history: Chat history supplied by the caller. It is not used here;
            only `message` is forwarded to `conversation_chain`.

    Returns:
        The value stored under "answer" in the chain's result.
    """
    return conversation_chain.invoke({"question": message})["answer"]

In [None]:
# Wrap `chat` in a Gradio chat interface and launch it with inbrowser=True
chat_interface = gr.ChatInterface(chat, type="messages")
view = chat_interface.launch(inbrowser=True)
# optional: also pass share=True to launch()