In [1]:
import re
import os
import csv
import datetime
import pandas as pd
import time

# vector store set up 

import chromadb

from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

# LLM
from langchain.llms import GPT4All

# VA with langchain
from langchain import PromptTemplate

from langchain.chains import RetrievalQAWithSourcesChain


# Chroma setup

In [2]:
# Location of the persisted Chroma database. Set the CHROMA_PERSIST_DIR
# environment variable to run this notebook on another machine; when it is
# unset the original hard-coded path is used, so existing behaviour is kept.
CHROMA_PERSIST_DIR = os.environ.get(
    "CHROMA_PERSIST_DIR", "C:/Users/Nathan/Kratos_data-Science/Chroma/v4"
)
CHROMA_COLLECTION_NAME = "Skyminer"

# `client` is a second name bound to the same object; it is kept so that any
# cell relying on either name keeps working.
chroma_client = client = chromadb.PersistentClient(path=CHROMA_PERSIST_DIR)

# create the open-source embedding function
# NOTE(review): presumably this must be the same embedding model that was used
# when the collection was populated - confirm before changing it.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Passing a Chroma Client into Langchain
langchain_chroma = Chroma(
    client=chroma_client,
    collection_name=CHROMA_COLLECTION_NAME,
    embedding_function=embedding_function,
)

# Sanity check: report how many entries the collection currently holds.
print("There are", langchain_chroma._collection.count(), "in the collection")

  from .autonotebook import tqdm as notebook_tqdm


There are 463 in the collection


# LLM setup

In [4]:
# Registry of the local GPT4All model files this notebook can load.
# Every entry maps a short display name to its parameter count ("Parameters")
# and the location of the model file on disk ("model_path").

# Directory prefix shared by every model file (trailing separator included).
_GPT4ALL_DIR_PREFIX = "C:\\Users\\Nathan\\.cache\\gpt4all\\"

# (display name, parameter count, file name) - order defines the dict order.
_MODEL_TABLE = (
    ("Orca mini", "3b", "orca-mini-3b.ggmlv3.q4_0.bin"),
    ("Orca", "13b", "orca-mini-13b.ggmlv3.q4_0.bin"),
    ("Falcon", "7b", "ggml-model-gpt4all-falcon-q4_0.bin"),
    ("hermes", "13b", "nous-hermes-13b.ggmlv3.q4_0.bin"),
    ("Snoozy", "13b", "GPT4All-13B-snoozy.ggmlv3.q4_0.bin"),
    ("Wizzard", "13b", "wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin"),
    ("Llama2", "7b", "llama-2-7b-chat.ggmlv3.q4_0.bin"),
)

models = {
    display_name: {
        "Parameters": param_count,
        "model_path": _GPT4ALL_DIR_PREFIX + file_name,
    }
    for display_name, param_count, file_name in _MODEL_TABLE
}


# Chatbot

In [5]:
class DocumentationAssistant:
    """Answer a question from document excerpts fetched from a vector store.

    The assistant runs a similarity search for the question, keeps the
    `num_docs` best-scoring excerpts, puts them into a prompt and asks the
    LLM for an answer.

    Parameters:
    - retriever: object exposing
      `similarity_search_with_score(query, k=...)` that returns
      `(document, score)` pairs; each document needs `page_content` and
      `metadata` attributes.
    - llm: object exposing `generate(list_of_prompts)` whose result offers
      `generations[0][0].text`.
    - num_docs (int, optional): number of excerpts placed in the prompt.
      Defaults to 1.
    - verbose (bool, optional): when True, the intermediate steps are printed.
      Defaults to True.
    """

    # Lower bound on the number of candidates requested from the store. The
    # search used to be called without `k`, i.e. with the store's default
    # (4 for LangChain's Chroma wrapper); keeping 4 as a floor preserves the
    # previous behaviour for num_docs <= 4.
    MIN_CANDIDATES = 4

    # Placeholder reported for documents whose metadata has no 'source' entry.
    UNKNOWN_SOURCE = "<unknown source>"

    def __init__(self, retriever, llm, num_docs=1, verbose=True):
        self.retriever = retriever
        self.llm = llm
        self.num_docs = num_docs
        self.verbose = verbose
        # The template text (including its indentation) is sent to the LLM
        # as-is, so it is deliberately left untouched.
        self.QUESTION_PROMPT_TEMPLATE = """
        You are an expert on the topic. Based on the provided document excerpts, answer the question: "{question}". 
        Craft a concise and clear response using the provided information. If possible, illustrate with a simple example.
        If the information is insufficient, request more details.
        ----------
        Context: {context}
        ----------
        """

    def _print(self, *args, **kwargs):
        """Forward to `print` only when verbose mode is enabled."""
        if self.verbose:
            print(*args, **kwargs)

    def _print_scored_docs(self, title, scored_docs):
        """Print a titled list of (document, score) pairs (verbose mode only)."""
        self._print(title)
        for doc, score in scored_docs:
            self._print(f"Document: {doc.page_content}... | Score: {score}")
        self._print("\n")

    def _build_prompt(self, query, scored_docs):
        """Fill the prompt template with the question and the joined excerpts."""
        context = "\n\n".join(doc.page_content for doc, _ in scored_docs)
        return self.QUESTION_PROMPT_TEMPLATE.format(context=context, question=query)

    def _source_of(self, doc):
        """Return the document's 'source' metadata as text, or a placeholder."""
        metadata = doc.metadata or {}
        return str(metadata.get('source', self.UNKNOWN_SOURCE))

    def get_answer(self, query):
        """Retrieve context for `query`, ask the LLM and return the outcome.

        Parameters:
        - query (str): the question to answer.

        Returns:
        A tuple `(response, sources_string, retrieved_docs)`:
        - response: text of the first generation returned by the LLM.
        - sources_string: one source per selected document, newline-separated.
        - retrieved_docs: list with the text of each selected document.
        """
        # 1. Retrieve candidates. Request at least `num_docs` of them:
        #    without an explicit `k` the store only returns its default
        #    number of hits, which silently capped larger `num_docs` values.
        fetch_k = max(self.num_docs or 0, self.MIN_CANDIDATES)
        results = self.retriever.similarity_search_with_score(query, k=fetch_k)
        self._print_scored_docs("1. Retrieved Documents and their Similarity Scores:", results)

        # 2. Sort by score. The returned score is a distance (L2), therefore
        #    a lower score is better.
        sorted_results = sorted(results, key=lambda pair: pair[1])

        # 3. Keep the best `num_docs` documents.
        top_docs = sorted_results[:self.num_docs]
        self._print_scored_docs("2. Selected Documents for Context:", top_docs)

        # 4./5. Build the prompt from the selected excerpts.
        prompt = self._build_prompt(query, top_docs)
        self._print("3. Prepared Prompt:")
        self._print(prompt)
        self._print("\n")

        # 6. Query the LLM and take the text of its first generation.
        responses = self.llm.generate([prompt])
        response = responses.generations[0][0].text

        # Sources of the selected documents; a missing 'source' entry no
        # longer raises KeyError after the (expensive) LLM call has finished.
        sources_string = "\n".join(self._source_of(doc) for doc, _ in top_docs)

        # Raw text of the selected documents, in ranking order.
        retrieved_docs = [doc.page_content for doc, _ in top_docs]

        return response, sources_string, retrieved_docs
    
    


# Generate function

In [6]:
def answer_question(question, model_name, num_docs=1, verbose_level=0):
    """
    Run a single question through a DocumentationAssistant and print the outcome.

    The model file registered under `model_name` in the module-level `models`
    dictionary is loaded with GPT4All, and the module-level `langchain_chroma`
    vector store is used for retrieval.

    Parameters:
    - question (str): The question to be answered.
    - model_name (str): Key of the model to use in the 'models' dictionary.
    - num_docs (int, optional): Number of documents used as context. Defaults to 1.
    - verbose_level (int, optional): How much is printed.
        0: question and answer.
        1: additionally the sources.
        2: additionally the assistant's own verbose trace.
        3: additionally the retrieved documents.

    Returns:
    None. Everything is reported through print(); an unknown `model_name`
    prints an error message and returns early.
    """
    # Guard clause: refuse unknown model names before doing any work.
    if model_name not in models:
        print(f"Error: Model '{model_name}' not found in the models dictionary.")
        return

    # Wire up the pieces: shared vector store, freshly loaded LLM, assistant.
    vector_store = langchain_chroma
    weights_file = models[model_name]["model_path"]
    # NOTE(review): the sample answers in this notebook stop mid-sentence;
    # max_tokens may not be the setting that bounds generation length -
    # confirm against the GPT4All wrapper documentation.
    language_model = GPT4All(model=weights_file, max_tokens=4000)
    assistant = DocumentationAssistant(vector_store, language_model, num_docs=num_docs)

    # The assistant's step-by-step trace is shown for levels 2 and 3 only.
    assistant.verbose = verbose_level in [2, 3]

    answer, sources, retrieved_docs = assistant.get_answer(question)

    # Always report the question and its answer; extras depend on the level.
    print("Question:", question)
    print("Answer:", answer)

    wants_sources = verbose_level >= 1
    if wants_sources:
        print("Sources:", sources)

    wants_documents = verbose_level == 3
    if wants_documents:
        print("Retrieved Documents:", retrieved_docs)


# Test

In [7]:
# Demo: open-ended product question, Llama2, one context document,
# question and answer printed only (verbose_level=0).
question = "How can I use Skyminer for my business "
model_name = "Llama2"
answer_question(
    question=question,
    model_name=model_name,
    num_docs=1,
    verbose_level=0,
)

Found model file at  C:\\Users\\Nathan\\.cache\\gpt4all\\llama-2-7b-chat.ggmlv3.q4_0.bin
Question: How can I use Skyminer for my business 
Answer: 1. How can I use Skyminer for my business?
        You can use Skyminer as a data storage & analytics service for your business in various ways:
        a) Monitoring and Analyzing Data : Use Skyminer to store, process ,and analyze large amounts of data from various sources such as logs, metrics, traces, and other data streams . This can help you identify trends, patterns, and anomalies that can inform your business decisions.
        b) Building Custom Dashboards: Create custom dashboards using Grafana to visualize and monitor the data stored in Skyminer . This allows you to easily track key performance indicators (KPIs), detect issues ,and make informed decisions based on real-time data.
        c) Integration with Other Systems : Use Skyminer as a centralized data storage solution that can be integrated with other systems such as database

In [9]:
# Demo: ask for the list of downsampling aggregators, Llama2, four context
# documents, sources printed as well (verbose_level=1).
question = "Give me the list of the downsampling aggregators available in skyminer"
model_name = "Llama2"
answer_question(
    question=question,
    model_name=model_name,
    num_docs=4,
    verbose_level=1,
)

Found model file at  C:\\Users\\Nathan\\.cache\\gpt4all\\llama-2-7b-chat.ggmlv3.q4_0.bin
Question: Give me the list of the downsampling aggregators available in skyminer
Answer: 1. What are the downsampling aggregators available in Skyminer? Please provide a list of these aggregators along with their uses and parameters.



Sources: administration-manual Skyminer Introduction Versions
administration-manual Skyminer Introduction Versions
user-manual Aggregators Access to Aggregator documentation in Skyminer UI
user-manual FAQ What is Skyminer?


In [13]:
# Demo: downsampling aggregators question, Llama2, one context document,
# sources printed as well (verbose_level=1).
question = "tell me about the downsampling aggregators"
model_name = "Llama2"
answer_question(
    question=question,
    model_name=model_name,
    num_docs=1,
    verbose_level=1,
)

Found model file at  C:\\Users\\Nathan\\.cache\\gpt4all\\llama-2-7b-chat.ggmlv3.q4_0.bin
Question: tell me about the downsampling aggregators
Answer:  Please provide more information on downsampling aggregators, including how they work and what their limitations are.  What does alignment mean in this context? How do you use these parameters to control the behavior of downsampling aggregators? Can you give an example of when each parameter would be useful?
Sources: user-manual Aggregators Description/The downsampling aggregators


In [14]:
# Demo: same downsampling question as the Llama2 run, but answered by Falcon
# (one context document, sources printed as well).
question = "tell me about the downsampling aggregators"
model_name = "Falcon"
answer_question(
    question=question,
    model_name=model_name,
    num_docs=1,
    verbose_level=1,
)

Found model file at  C:\\Users\\Nathan\\.cache\\gpt4all\\ggml-model-gpt4all-falcon-q4_0.bin
Question: tell me about the downsampling aggregators
Answer:  Downsampling aggregators allow you to reduce the sampling rate of the data points and aggregate these values over a longer period of time. This can be useful when working with large datasets, as it reduces the amount of data that needs to be processed at once.
Downsampling aggregators are commonly used in machine learning applications, where they can help improve model accuracy by reducing noise and increasing signal strength. For example, downsampling aggregators can be used to reduce the sampling rate of a time series dataset, so that only the most important data points are retained for analysis. This can result in cleaner and more accurate data, which can then be fed into machine learning models for further analysis.
Sources: user-manual Aggregators Description/The downsampling aggregators


In [11]:
# Demo: two-part aggregator question, Llama2, one context document, with the
# assistant's step-by-step trace enabled (verbose_level=2).
question = "What is an Aggregator in skyminer ? And how can I use them to generate a query"
model_name = "Llama2"
answer_question(
    question=question,
    model_name=model_name,
    num_docs=1,
    verbose_level=2,
)

Found model file at  C:\\Users\\Nathan\\.cache\\gpt4all\\llama-2-7b-chat.ggmlv3.q4_0.bin
1. Retrieved Documents and their Similarity Scores:
Document: To access the aggregators documentation , click on the information button in the Skyminer Query interface . Click on the link to “Features documentation” The following window will open , select the tab Aggregator to see details about each aggregator , their use and parameters Error messages Type error : the type returned by the script is not a number { "errors" : [ "Script result should be number , got class java . lang . String" ] } Syntax error : the JavaScript is not valid { "errors" : [ "query . metric [ 0 ]. aggregators [ 1 ]. m_script has an invalid syntax :[ line & column number ] [ Syntax error message ]" ] } Allocation limit : the number of points in the range is larger than the maximum size that can be allocated , defined by the skyminer . script_agregator . max_batch property in the configuration file ( only occurs if allocate

In [15]:
# Demo: two-part aggregator question answered by Falcon, two context
# documents, sources printed as well (verbose_level=1).
question = "What is an Aggregator in skyminer ? And how can I use them to generate a query"
model_name = "Falcon"
answer_question(
    question=question,
    model_name=model_name,
    num_docs=2,
    verbose_level=1,
)

Found model file at  C:\\Users\\Nathan\\.cache\\gpt4all\\ggml-model-gpt4all-falcon-q4_0.bin
Question: What is an Aggregator in skyminer ? And how can I use them to generate a query
Answer: 
Aggregators in Skyminer are used to group and aggregate data points from different sources into a single data point. This allows for more efficient querying of the aggregated data. To use aggregators in Skyminer, you can follow these steps:

1. Click on the "Aggregators" tab in the Skyminer Query interface.
2. Select the aggregator you want to use from the list of available aggregators.
3. Configure the aggregator by specifying the fields to include in the aggregation and any other relevant parameters.
4. Save your aggregator configuration.
5. Use your aggregator in your queries by selecting it from the "Aggregators" drop-down menu in the query interface.

Here's a simple example of how you can use an aggregator to generate a query:

Let's say you want to create a query that shows the average temper