In [14]:
import re
import os
import csv
import datetime
from datetime import datetime
import pandas as pd

# vector store set up 

import chromadb

from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

#Langchain
from langchain.chains import RetrievalQAWithSourcesChain
from IPython.core.display import display, HTML

  from IPython.core.display import display, HTML


In [63]:

def format_document_retriever(document, score):
    """
    Formats a retrieved document and its score into an HTML string.

    :param document: Object exposing ``page_content`` and a ``metadata`` mapping.
        The ``source`` and ``file_path`` metadata keys are read; when a key is
        missing, 'No Title' / 'No File Path' is used instead.
    :param score: Numeric score, rendered with two decimals.
    :return: HTML snippet (a ``<div>`` holding a heading, a paragraph and a link).
    """
    from html import escape  # local import: the notebook's import cell does not provide it

    # Escape every interpolated value: raw '<', '&' or quote characters coming
    # from the document would otherwise corrupt the markup or the href attribute.
    title = escape(str(document.metadata.get('source', 'No Title')))
    content = escape(str(document.page_content))
    file_path = escape(str(document.metadata.get('file_path', 'No File Path')))
    return f"""
    <div style='margin-bottom: 20px;'>
        <h3>{title} (Score: {score:.2f})</h3>
        <p>{content}</p>
        <a href='{file_path}'>Source</a>
    </div>
    """

def format_document(document, score):
    """
    Formats a document and its score into an HTML string.

    :param document: Object exposing ``page_content`` and a ``metadata`` mapping.
        The ``source`` and ``file_path`` metadata keys are read; when a key is
        missing, 'No Title' / 'No File Path' is used instead.
    :param score: Numeric score, rendered with two decimals.
    :return: HTML snippet (a ``<div>`` holding a heading, a paragraph and a link).
    """
    from html import escape  # local import: the notebook's import cell does not provide it

    # Escape every interpolated value: raw '<', '&' or quote characters coming
    # from the document would otherwise corrupt the markup or the href attribute.
    title = escape(str(document.metadata.get('source', 'No Title')))
    content = escape(str(document.page_content))
    file_path = escape(str(document.metadata.get('file_path', 'No File Path')))
    return f"""
    <div style='margin-bottom: 20px;'>
        <h3>{title} (Score: {score:.2f})</h3>
        <p>{content}</p>
        <a href='{file_path}'>Source</a>
    </div>
    """

def display_documents(retrieved_docs):
    """
    Renders (document, score) pairs as one HTML block in the cell output.

    :param retrieved_docs: Iterable of (document, score) tuples.
    :return: None; the HTML is shown through IPython's ``display``.
    """
    cards = "".join(
        format_document_retriever(entry, relevance)
        for entry, relevance in retrieved_docs
    )
    wrapper = "<div style='font-family: Arial, sans-serif;'>" + cards + "</div>"
    display(HTML(wrapper))

# Chroma

In [95]:
# Persistent Chroma client. NOTE(review): "Path" looks like a redacted placeholder;
# point it at the directory that holds the on-disk Chroma store.
chroma_client = client = chromadb.PersistentClient(path="Path")

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Passing a Chroma Client into Langchain

langchain_chroma = Chroma(
    client=chroma_client,
    collection_name="Skyminer-T",
    embedding_function=embedding_function,
)
# Sanity check: report how many entries the collection currently holds.
print("There are", langchain_chroma._collection.count(), "in the collection")

There are 473 in the collection


# Retriever

In [113]:
def ask(question, k=3):
    """
    Runs a vector similarity search for a question and displays the hits as HTML.

    :param question: The query text.
    :param k: Number of documents to retrieve (defaults to 3, the previous fixed value).
    :return: Whatever ``display_documents`` returns (it has no return statement,
        so this is None); the useful effect is the rendered output.
    """
    retrieved_docs = langchain_chroma.similarity_search_with_score(question, k=k)
    return display_documents(retrieved_docs)

In [126]:
# Example query: show the raw vector-search hits (no re-ranking).
ask('how to install skyminer')

# Cross encoder 

In [26]:
from sentence_transformers import CrossEncoder

In [104]:
# Sample query for the manual cross-encoder experiment in the following cells.
question = "what is skyminer"
# Each entry is a (Document, score) tuple, as the cell output shows.
# NOTE(review): the score is presumably a distance (lower = closer) — confirm against the LangChain Chroma docs.
retrieved_docs = langchain_chroma.similarity_search_with_score(question, k=3)
retrieved_docs

[(Document(page_content='Context : (Documentation = Administrastion Manual, Title = Skyminer Introduction) Skyminer system is a Big Data storage and analytics engine integrated with our corporate products , systems and solutions . It is capable of storing billions of samples with different data types , while maintaining efficient storage and outstanding write and read performances . Skyminer provides features to analyze data over time , organisational , or geospatial dimensions within and/or between data series .', metadata={'documentation': 'Administrastion Manual', 'file_path': 'http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/administration-manual-html/administration-manual-html/skyminer-introduction.html', 'source': '/Skyminer Introduction', 'word_count': 64}),
  0.24023085832595825),
 (Document(page_content='Context : (Documentation = User Manual, Title = FAQ, Chapter = What is Skyminer?) Skyminer system is a Big Data storage and analytics engine int

In [88]:
# Text of the first two hits; index 0 of each tuple is the Document, index 1 its score.
page_content1 = retrieved_docs[0][0].page_content

page_content2 = retrieved_docs[1][0].page_content

In [91]:
# `model` was previously used here before any cell above defined it, so a fresh
# kernel raised NameError. Load the re-ranking checkpoint up front.
# NOTE(review): this is the checkpoint assigned to `model` in a later cell; it is
# presumably the one the recorded scores came from — confirm.
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Score each (query, passage) pair; reuse `question` so the query text cannot
# drift from the one that produced `retrieved_docs`.
scores = model.predict([[question, page_content1],
                        [question, page_content2]])

In [92]:
# Show the cross-encoder scores, one per (query, passage) pair in the order passed to predict.
scores

array([0.9654035 , 0.01056262], dtype=float32)

## test pipeline 

In [53]:
from typing import List, Tuple

In [84]:
# Alternative cross-encoder checkpoint; not referenced by the other cells visible here.
model_1 = CrossEncoder('cross-encoder/stsb-roberta-large') # Duplicate detection (author's label — TODO confirm intended use)

Downloading config.json: 100%|██████████| 629/629 [00:00<?, ?B/s] 
Downloading pytorch_model.bin: 100%|██████████| 1.42G/1.42G [01:07<00:00, 21.0MB/s]
Downloading tokenizer_config.json: 100%|██████████| 139/139 [00:00<?, ?B/s] 
Downloading vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 2.55MB/s]
Downloading merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 14.7MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 772/772 [00:00<?, ?B/s] 


In [None]:
# Alternative cross-encoder checkpoint; not referenced by the other cells visible here.
# NOTE(review): the "stsb" in the name suggests a semantic-textual-similarity model rather than NLI — confirm the label.
model_2 = CrossEncoder('cross-encoder/stsb-TinyBERT-L-4') # language inference

In [106]:
# Cross-encoder bound to the notebook-level name `model`, which rerank_with_cross_encoder reads.
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2') # semantic search

Downloading config.json: 100%|██████████| 794/794 [00:00<?, ?B/s] 
Downloading pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:05<00:00, 15.8MB/s]
Downloading tokenizer_config.json: 100%|██████████| 316/316 [00:00<?, ?B/s] 
Downloading vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 4.06MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<?, ?B/s] 


In [None]:
[more models]()

In [130]:
def rerank_with_cross_encoder(question: str, retrieved_docs: List[Tuple], top_k: int = 3, cross_encoder=None) -> List[Tuple]:
    """
    Reranks the retrieved documents using a CrossEncoder model.

    :param question: The query question.
    :param retrieved_docs: A list of (document, initial_score) tuples; only the
        document's ``page_content`` is used, the initial score is discarded.
    :param top_k: Number of top documents to return after re-ranking. Values
        <= 0 yield an empty list.
    :param cross_encoder: Optional object with a ``predict(pairs)`` method. When
        omitted, the notebook-level ``model`` is used (the previous behaviour).
    :return: Up to ``top_k`` (document, cross_encoder_score) tuples, highest score first.
    """
    # Nothing to rank or nothing requested: skip the model call entirely.
    if not retrieved_docs or top_k <= 0:
        return []

    # Fall back to the global `model` so existing callers keep working.
    encoder = model if cross_encoder is None else cross_encoder

    documents = [doc for doc, _ in retrieved_docs]

    # Prepare pairs of question and document content for the CrossEncoder
    question_doc_pairs = [(question, doc.page_content) for doc in documents]

    # Predict the relevancy scores using the CrossEncoder
    cross_encoder_scores = encoder.predict(question_doc_pairs)

    # Pair every document with its new score and order by score, descending.
    # sorted() is stable, so ties keep their original retrieval order.
    ranked_docs = sorted(
        zip(documents, cross_encoder_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Return the top k documents
    return ranked_docs[:top_k]


def ask_question(question, x=3, fetch_k=None):
    """
    Pipeline to ask a question, retrieve candidates, re-rank them and display
    the top ``x`` documents in HTML format.

    :param question: The query text.
    :param x: Number of re-ranked documents to display (default 3).
    :param fetch_k: Number of candidates pulled from the vector store before
        re-ranking. When None, the current size of the collection is used, so
        every stored entry is re-ranked. This replaces the previous fixed value
        of 473, which was the collection size when the notebook was last run.
    :return: None; the result is rendered through IPython's ``display``.
    """
    if fetch_k is None:
        fetch_k = langchain_chroma._collection.count()

    # Perform initial retrieval
    initial_retrieved_docs = langchain_chroma.similarity_search_with_score(question, k=fetch_k)

    # Re-rank the documents
    reranked_docs = rerank_with_cross_encoder(question, initial_retrieved_docs, top_k=x)

    # Format and display the results
    cards = "".join(format_document(doc, score) for doc, score in reranked_docs)
    html_content = "<div style='font-family: Arial, sans-serif;'>" + cards + "</div>"

    display(HTML(html_content))

In [131]:
# Example query: run the retrieve-then-rerank pipeline and show the five best documents.
ask_question("how to install skyminer", x=5)