## LangChain libraries

In [None]:
#! pip install -U langchain_community langchain langchain_huggingface langchain_openai
# ! pip install openai
#! pip install "unstructured[all-docs]"
#! pip install langchain chromadb unstructured openai

In [None]:
from langchain.document_loaders import UnstructuredEPubLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import AzureChatOpenAI
from langchain.vectorstores import Chroma

from markdown import markdown
import os

In [None]:
from getpass import getpass

from openai import OpenAI

# OpenAI() looks up OPENAI_API_KEY while it is being constructed, so the key
# must already be in the environment when this cell runs. Prompt for it when it
# is missing (or still the notebook's placeholder) instead of failing on a
# fresh kernel or silently building a client with an unusable key.
if os.environ.get("OPENAI_API_KEY") in (None, "", "<your_api_key>"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

client = OpenAI()

In [None]:
# Keep a key that is already exported in the environment; only fall back to the
# placeholder when nothing is set. Supply the real key through the environment
# rather than pasting it into the notebook.
os.environ.setdefault("OPENAI_API_KEY", "<your_api_key>")

In [None]:
# Embedding model shared by the notebook; vector_store() passes it to Chroma.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [None]:
import pypandoc

# Only fetch the pandoc installer when pypandoc cannot locate a pandoc binary;
# get_pandoc_path() raises OSError in that case. This avoids a network download
# on every run when pandoc is already installed (e.g. through the system
# package manager).
try:
    pypandoc.get_pandoc_path()
except OSError:
    pypandoc.download_pandoc()

À ce niveau, certains packages doivent être présents en local. Sous Linux, il faut installer **pandoc** (`sudo apt install pandoc`).

Ou bien, depuis Python :

`import pypandoc` <br>
`pypandoc.download_pandoc()`

In [None]:
def vector_store(path, chunk_size=500, chunk_overlap=10, embedding=None):
    """Build a Chroma vector store from the contents of an EPUB file.

    Args:
        path: Location of the EPUB file to load.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to
            ``RecursiveCharacterTextSplitter``.
        embedding: Embedding model handed to ``Chroma.from_documents``. When
            omitted, the notebook-level ``embeddings`` object is used.

    Returns:
        The Chroma vector store created from the split documents.

    Raises:
        FileNotFoundError: If ``path`` does not point to an existing file.
    """
    # Fail early with a clear message instead of an error from deep inside the
    # EPUB loading stack.
    if not os.path.isfile(path):
        raise FileNotFoundError(f"EPUB file not found: {path}")

    docs = UnstructuredEPubLoader(path).load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    documents = text_splitter.split_documents(docs)

    # Resolve the global lazily so it is only required when no explicit
    # embedding model is supplied.
    embedding_model = embeddings if embedding is None else embedding
    return Chroma.from_documents(documents, embedding_model)

In [None]:
def _llm(question, context, model="gpt-4o-mini"):
    """Send a question plus retrieved context to the chat model and return the answer.

    The request is made in streaming mode through the notebook-level ``client``
    and the streamed pieces are concatenated into one string.

    Args:
        question: The user's question.
        context: Text inserted after the question as supporting context.
        model: Name of the chat model to call.

    Returns:
        The full response text assembled from the streamed chunks.
    """
    formatted_prompt = f"question: {question}\n\n context: {context}"

    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": formatted_prompt}],
        stream=True,
    )

    pieces = []
    for chunk in stream:
        # A streamed chunk can arrive with an empty `choices` list; indexing
        # [0] on it would raise IndexError, so skip such chunks.
        if not chunk.choices:
            continue
        delta_text = chunk.choices[0].delta.content
        if delta_text is not None:
            pieces.append(delta_text)

    return "".join(pieces)

def combine_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

def rag_chain(question, vectorStore):
    """Answer ``question`` using documents retrieved from ``vectorStore``.

    The store's retriever is invoked with the question, the retrieved documents
    are merged with ``combine_docs``, and the result is passed to ``_llm`` as
    context.
    """
    matches = vectorStore.as_retriever().invoke(question)
    return _llm(question, combine_docs(matches))

In [None]:
# Index the sample EPUB; the resulting store is queried in the cells below.
epub_path = "docs/39419251_rtl.epub"
vectorStore = vector_store(path=epub_path)

In [None]:
# Ask a question about the indexed book; the model's answer is run through
# markdown() before being stored.
question = "talk me about document"
response = markdown(
    rag_chain(question, vectorStore)
)

In [None]:
# Show the value returned by markdown() for the model's answer as this cell's output.
response