In [None]:
!pip install cohere gradio PyMuPDF langchain_community hnswlib pypdf langchain



In [None]:
import cohere
import gradio as gr
import fitz  # PyMuPDF
import os
import hnswlib
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from cohere import ChatMessage


**Initializing the Cohere client**

In [None]:
# Initialize the Cohere client.
# The API key is read from the COHERE_API_KEY environment variable so that no
# secret is stored in the notebook. Any key that was ever pasted into this cell
# must be treated as leaked and rotated in the Cohere dashboard.
cohere_api_key = os.environ.get("COHERE_API_KEY")
if not cohere_api_key:
    raise RuntimeError(
        "COHERE_API_KEY is not set; define it in the environment "
        "(or via os.environ['COHERE_API_KEY'] = ...) before running this cell."
    )
co = cohere.Client(cohere_api_key)

**Loading PDF Files**

In [None]:
def load_pdfs(pdf_directory):
    """Extract the plain text of every PDF file located directly in a directory.

    Args:
        pdf_directory: Path of the directory to scan. Sub-directories are not
            descended into.

    Returns:
        A list with one string per PDF file: the concatenated text of all of
        its pages. Files are processed in sorted filename order, so the
        position of a document in the list is stable between runs.

    Raises:
        OSError: If ``pdf_directory`` cannot be listed (propagated from
            ``os.listdir``).
    """
    texts = []
    # os.listdir() returns entries in arbitrary order; sort so that document
    # indices derived from this list are reproducible.
    for filename in sorted(os.listdir(pdf_directory)):
        # Match the extension case-insensitively so "REPORT.PDF" is not skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        filepath = os.path.join(pdf_directory, filename)
        # A directory whose name happens to end in ".pdf" is not a document.
        if not os.path.isfile(filepath):
            continue
        with fitz.open(filepath) as pdf:
            # Join once instead of growing a string with += for every page.
            texts.append("".join(page.get_text() for page in pdf))
    return texts

pdf_texts = load_pdfs('/content/')  # Replace '/content/' with the path of the directory that holds your PDF files.

**Chunking the documents**

In [None]:
def split_texts(texts, chunk_size=1000, overlap=20):
    """Break full document texts into smaller chunks.

    Args:
        texts: Sequence of document strings, one per source document.
        chunk_size: Value forwarded as ``chunk_size`` to the splitter.
        overlap: Value forwarded as ``chunk_overlap`` to the splitter.

    Returns:
        A flat list of chunk strings. The per-document ``source`` metadata is
        only used while splitting and is not part of the returned value.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )

    # Wrap each raw text in a Document tagged with its position in `texts`.
    wrapped_docs = []
    for position, body in enumerate(texts):
        wrapped_docs.append(
            Document(page_content=body, metadata={"source": f"Document_{position}"})
        )

    pieces = text_splitter.split_documents(wrapped_docs)
    return [piece.page_content for piece in pieces]

# Split every extracted PDF text into chunks using the default chunk size and overlap.
pdf_chunks = split_texts(pdf_texts)
# Show how many chunks were produced.
len(pdf_chunks)

2325

**Create embeddings for the chunks**

In [None]:
def embed_texts(texts, model='embed-english-v3.0', input_type="search_document"):
    """Embed a batch of texts with the module-level Cohere client ``co``.

    Args:
        texts: Sequence of strings to embed.
        model: Name of the Cohere embedding model to use.
        input_type: Value forwarded as ``input_type`` to ``co.embed``. The
            default, ``"search_document"``, is the one used for the chunks
            stored in the index.

    Returns:
        The ``embeddings`` attribute of the ``co.embed`` response, or an empty
        list when ``texts`` is empty (no API call is made in that case).
    """
    # Nothing to embed: avoid a pointless (and failing) API round-trip.
    if not texts:
        return []
    return co.embed(texts=texts, model=model, input_type=input_type).embeddings

# Embed every chunk (this calls the Cohere embed endpoint through embed_texts).
pdf_embeddings = embed_texts(pdf_chunks)

**Store the embedded chunks in a vector store**

In [None]:
# Derive the vector dimensionality from the first embedding instead of hardcoding it
embedding_dimension = len(pdf_embeddings[0])

# Create an HNSW index that uses cosine distance, sized to hold one vector per chunk
idx = hnswlib.Index(space="cosine", dim=embedding_dimension)
idx.init_index(max_elements=len(pdf_chunks), ef_construction=512, M=64)

# Add all embeddings; each label is the position of the chunk in pdf_chunks,
# which is how query results are mapped back to their text
idx.add_items(pdf_embeddings, list(range(len(pdf_embeddings))))

# Persist the index to disk
idx.save_index("pdf_index.bin")

# To reload the saved index later, use the same space and dimension it was built with:
# loaded_idx = hnswlib.Index(space="cosine", dim=embedding_dimension)
# loaded_idx.load_index("pdf_index.bin", max_elements=len(pdf_chunks))


**Retrieve candidate chunks from the vector store, then rerank them**

In [None]:
def retrieve_and_rerank(query, n=10, k=5):
    """Fetch the chunks most relevant to a query and rerank them with Cohere.

    The query is embedded, the ``n`` nearest chunks are looked up in the
    module-level HNSW index ``idx``, and those candidates are reranked so that
    only the best ``k`` are kept.

    Args:
        query: The search query text.
        n: Number of nearest neighbours to pull from the index.
        k: Number of documents to keep after reranking.

    Returns:
        A list of dicts with keys ``"title"`` (``"Document 1"``, ``"Document 2"``,
        ... in reranked order) and ``"description"`` (the chunk text). The list
        is empty when the query is blank, when ``n`` or ``k`` is not positive,
        or when the index holds no vectors.
    """
    # Guard against inputs that cannot produce a result before spending API calls.
    if not query or not query.strip() or n <= 0 or k <= 0:
        return []

    # knn_query raises when asked for more neighbours than the index contains.
    n = min(n, idx.get_current_count())
    if n == 0:
        return []

    query_emb = co.embed(texts=[query], model="embed-english-v3.0", input_type="search_query").embeddings[0]
    # knn_query returns (labels, distances), each shaped (num_queries, k); take the labels of the single query.
    relevant_doc_ids = idx.knn_query(query_emb, k=n)[0][0]
    docs_to_rerank = [pdf_chunks[doc_id] for doc_id in relevant_doc_ids]

    rerank_results = co.rerank(
        query=query,
        documents=docs_to_rerank,
        # Never request more results than there are candidates.
        top_n=min(k, len(docs_to_rerank)),
        model="rerank-english-v3.0",
        return_documents=True
    )

    return [
        {"title": f"Document {rank}", "description": item.document.text}
        for rank, item in enumerate(rerank_results.results, start=1)
    ]


**The gradio interface**

In [None]:
import gradio as gr

# Conversation turns accumulated by run_chatbot; module-level, so it is shared by every call in this kernel.
global_chat_history = []
# Text passed as the `preamble` argument of the streaming chat calls.
preamble = "You are an expert assistant having a great knowledge of UNICA services"

def run_chatbot(message):
    """Answer one user message, grounding the reply in retrieved chunks when possible.

    The model is first asked only for search queries. If it proposes any, the
    matching chunks are retrieved and reranked and then supplied as documents
    to a streaming chat call; otherwise the streaming call is made without
    documents. The streamed text is echoed to stdout as it arrives, and the
    exchange is appended to the module-level ``global_chat_history``.

    Args:
        message: The user's message.

    Returns:
        A tuple ``(chatbot_response, citations)`` where ``chatbot_response`` is
        the full generated text and ``citations`` is the citation list of the
        final stream event (an empty list when there are none).
    """
    # Step 1: let the model decide which search queries (if any) the message needs.
    response = co.chat(message=message, model="command-r-plus", search_queries_only=True, chat_history=global_chat_history)

    # `search_queries` can be missing/None on the response; treat that as "no queries".
    search_queries = [query.text for query in (response.search_queries or [])]

    # Arguments shared by the grounded and the ungrounded streaming call.
    stream_kwargs = dict(
        message=message,
        model="command-r-plus",
        preamble=preamble,
        chat_history=global_chat_history,
    )

    if search_queries:
        print("Retrieving information...", end="")
        documents = []
        seen_chunks = set()
        for query in search_queries:
            for document in retrieve_and_rerank(query, 10, 5):
                # Different queries often surface the same chunk; send each chunk only once.
                if document["description"] not in seen_chunks:
                    seen_chunks.add(document["description"])
                    documents.append(document)
        stream_kwargs["documents"] = documents

    # Step 2: stream the answer.
    response = co.chat_stream(**stream_kwargs)

    chatbot_response = ""
    citations = []
    print("\nChatbot:")

    for event in response:
        if event.event_type == "text-generation":
            print(event.text, end="")
            chatbot_response += event.text
        if event.event_type == "stream-end":
            # The final event carries the citations and cited documents, if any.
            if event.response.citations:
                citations = event.response.citations
                print("\n\nCITATIONS:")
                for citation in event.response.citations:
                    print(citation)
            if event.response.documents:
                print("\nCITED DOCUMENTS:")
                for document in event.response.documents:
                    print(document)

    # Record the turn only after the stream has been fully consumed.
    global_chat_history.extend(
        [ChatMessage(role="USER", message=message),
         ChatMessage(role="CHATBOT", message=chatbot_response)])

    return chatbot_response, citations

def gradio_chat(query):
    """Gradio callback: route one textbox submission to the chatbot.

    Args:
        query: Text typed by the user. ``"quit"`` (any letter case, surrounding
            whitespace ignored) clears the chat history instead of asking the
            model.

    Returns:
        A 3-tuple ``(response_text, citations_text, html)`` matching the three
        output components of the interface:

        * on ``"quit"``: two empty strings and an HTML closing banner;
        * on blank input: a prompt to enter a question, ``"Not applicable"``
          and an empty string (the model is not called);
        * otherwise: the chatbot answer, the formatted citations (or
          ``"Not applicable"``) and an empty string.
    """
    global global_chat_history

    normalized = (query or "").strip()

    if normalized.lower() == "quit":
        global_chat_history = []  # Clear chat history
        # Return the markup itself (not a component instance) so every branch
        # yields plain strings for the HTML output.
        return "", "", "<h1 style='text-align: center; margin-top: 20%;'>Chat has been closed all the chat history has been deleted.</h1>"

    # An empty message would only produce an API error; answer locally instead.
    if not normalized:
        return "Please enter a question.", "Not applicable", ""

    chatbot_response, citations = run_chatbot(query)

    # Render each citation as "<cited text> (from <document ids>)", one per line.
    if citations:
        citations_text = "\n\nCITATIONS:\n" + "\n".join(
            f"{citation.text} (from {citation.document_ids})" for citation in citations
        )
    else:
        citations_text = "Not applicable"

    return chatbot_response, citations_text, ""

# HTML note, passed as the interface description, telling the user how to end the chat
note_html = "<p style='text-align: center; color: gray;'>Type <strong>'quit'</strong> to end the chat.</p>"

# One input textbox; the three values returned by gradio_chat map, in order,
# to the Response textbox, the Citations textbox and the HTML area.
interface = gr.Interface(
    fn=gradio_chat,
    inputs=gr.Textbox(label="User Query"),
    outputs=[
        gr.Textbox(label="Response"),
        gr.Textbox(label="Citations"),
        gr.HTML()
    ],
    title="UNICA Chatbot",
    description=note_html
)
# debug=True keeps this cell running so that errors and logs stay visible; set debug=False to turn that off.
interface.launch(debug=True)


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://ce3c7d7575426acd91.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)



Chatbot:
Hello! I am an expert assistant with a great knowledge of UNICA services. How can I help you today?Retrieving information...
Chatbot:
I don't have a clear definition of what UNICA is, but I can tell you that UNICA is a platform that supports UTF-8 as the default character set encoding, which allows users to enter data in any language. UNICA applications must be deployed on a dedicated Java™ virtual machine (JVM) and products customize the JVM that is used by the web application server. UNICA can be integrated with Windows™ Active Directory or another LDAP (Lightweight Directory Access Protocol) server.

CITATIONS:
start=85 end=93 text='platform' document_ids=['doc_0']
start=99 end=151 text='supports UTF-8 as the default character set encoding' document_ids=['doc_0']
start=159 end=202 text='allows users to enter data in any language.' document_ids=['doc_0']
start=209 end=281 text='applications must be deployed on a dedicated Java™ virtual machine (JVM)' document_ids=['doc_1']
