```
docker compose -f backend.yaml up

docker compose -f frontend.yaml up

conda env list
conda activate py3.11
source pipeline_venv/bin/activate
./start.sh
```

## Mongo

In [4]:
from pymongo import MongoClient as mc

def mongo_is_client(
    storage_client: any
) -> bool:
    """Return True when ``storage_client`` is an instance of ``mc``.

    The previous check referenced ``mc.Connection``; the client class has
    no such attribute, so every call raised ``AttributeError`` instead of
    answering the question.
    """
    # ``mc`` is this file's alias for ``pymongo.MongoClient``.
    return isinstance(storage_client, mc)

def mongo_setup_client(
    username: str,
    password: str,
    address: str,
    port: str
) -> any:
    """Build a client for ``mongodb://username:password@address:port/``.

    Args:
        username: Account name; percent-escaped before it enters the URI.
        password: Account password; percent-escaped before it enters the URI.
        address: Host name or IP address of the server.
        port: Server port (rendered with ``str()`` formatting).

    Returns:
        The object created by ``mc(host=<connection URI>)``.
    """
    # Imported locally so this notebook cell stays self-contained.
    from urllib.parse import quote_plus

    # Reserved characters in credentials (':', '/', '@', '%', ...) would
    # otherwise change how the URI is parsed. Building the URI in one step
    # also avoids the chained ``.replace`` calls rewriting a credential that
    # happens to contain a placeholder such as '(port)'.
    connection_address = (
        f'mongodb://{quote_plus(username)}:{quote_plus(password)}'
        f'@{address}:{port}/'
    )
    return mc(host=connection_address)

def mongo_get_database(
    mongo_client: any,
    database_name: str
) -> any:
    """Fetch ``database_name`` from ``mongo_client`` by subscription.

    Returns:
        ``mongo_client[database_name]``, or ``None`` if the lookup raises.
    """
    try:
        return mongo_client[database_name]
    except Exception:
        return None

def mongo_check_database(
    mongo_client: any,
    database_name: str
) -> bool:
    """Tell whether ``database_name`` is among the client's database names.

    Returns:
        The membership result, or ``False`` when listing the names raises.
    """
    try:
        return database_name in mongo_client.list_database_names()
    except Exception:
        return False

def mongo_list_databases(
    mongo_client: any
) -> any:
    """Return the client's database names, or ``[]`` if listing raises."""
    try:
        return mongo_client.list_database_names()
    except Exception:
        return []

def mongo_remove_database(
    mongo_client: any,
    database_name: str
) -> bool:
    """Drop ``database_name`` through the client.

    Returns:
        ``True`` when ``drop_database`` completes, ``False`` when it raises.
    """
    try:
        mongo_client.drop_database(database_name)
    except Exception:
        return False
    return True

def mongo_get_collection(
    mongo_client: any,
    database_name: str,
    collection_name: str
) -> any:
    """Fetch ``collection_name`` from ``database_name``.

    The return annotation used to read ``bool``; the function actually
    returns the collection object.

    Returns:
        ``database[collection_name]`` where ``database`` comes from
        ``mongo_get_database``, or ``None`` when any step raises (including
        the case where the database lookup itself returned ``None``).
    """
    try:
        database = mongo_get_database(
            mongo_client=mongo_client,
            database_name=database_name,
        )
        return database[collection_name]
    except Exception:
        return None
    
def mongo_check_collection(
    mongo_client: any,
    database_name: any,
    collection_name: any
) -> bool:
    """Tell whether ``collection_name`` exists in ``database_name``.

    Returns:
        The membership result against ``list_collection_names()``, or
        ``False`` when the lookup or listing raises.
    """
    try:
        target_database = mongo_client[database_name]
        return collection_name in target_database.list_collection_names()
    except Exception:
        return False

def mongo_update_collection(
    mongo_client: any,
    database_name: str,
    collection_name: str,
    filter_query: any,
    update_query: any
) -> any:
    """Run ``update_many(filter_query, update_query)`` on a collection.

    Returns:
        The value returned by ``update_many``, or ``None`` when resolving
        the collection or running the update raises.
    """
    try:
        target = mongo_get_collection(
            mongo_client=mongo_client,
            database_name=database_name,
            collection_name=collection_name,
        )
        return target.update_many(filter_query, update_query)
    except Exception:
        return None

def mongo_list_collections(
    mongo_client: any,
    database_name: str
) -> any:
    """List the collection names of ``database_name``.

    The return annotation used to read ``bool``; the function returns the
    list of names.

    Returns:
        The value of ``database.list_collection_names()``, or ``[]`` when
        resolving the database or listing raises.
    """
    try:
        database = mongo_get_database(
            mongo_client=mongo_client,
            database_name=database_name,
        )
        return database.list_collection_names()
    except Exception:
        return []

def mongo_remove_collection(
    mongo_client: any,
    database_name: str,
    collection_name: str
) -> bool:
    """Drop ``collection_name`` from ``database_name``.

    Returns:
        ``True`` when ``drop_collection`` completes, ``False`` when
        resolving the database or dropping raises.
    """
    try:
        owner = mongo_get_database(
            mongo_client=mongo_client,
            database_name=database_name,
        )
        owner.drop_collection(collection_name)
    except Exception:
        return False
    return True

def mongo_create_document(
    mongo_client: any,
    database_name: str,
    collection_name: str,
    document: any
) -> any:
    """Insert ``document`` into a collection with ``insert_one``.

    Returns:
        The value returned by ``insert_one``, or ``None`` when resolving
        the collection or inserting raises.
    """
    try:
        target = mongo_get_collection(
            mongo_client=mongo_client,
            database_name=database_name,
            collection_name=collection_name,
        )
        return target.insert_one(document)
    except Exception:
        return None

def mongo_get_document(
    mongo_client: any,
    database_name: str,
    collection_name: str,
    filter_query: any
):
    """Return ``find_one(filter_query)`` from a collection.

    Any exception is printed and turned into a ``None`` result.
    """
    try:
        target = mongo_get_collection(
            mongo_client=mongo_client,
            database_name=database_name,
            collection_name=collection_name,
        )
        return target.find_one(filter_query)
    except Exception as error:
        print(error)
        return None
    
def mongo_collection_number(
    mongo_client: any,
    database_name: str,
    collection_name: str
) -> any:
    """Count every document in a collection.

    Returns:
        ``count_documents({})`` for the collection, or ``None`` when
        resolving the collection or counting raises.
    """
    try:
        target = mongo_get_collection(
            mongo_client=mongo_client,
            database_name=database_name,
            collection_name=collection_name,
        )
        return target.count_documents({})
    except Exception:
        return None

def mongo_list_documents(
    mongo_client: any,
    database_name: str,
    collection_name: str,
    filter_query: any,
    sorting_query: any
) -> any:
    """Return the documents matching ``filter_query`` as a sorted list.

    Returns:
        ``list(find(filter_query).sort(sorting_query))``, or ``[]`` when
        any step raises.
    """
    try:
        target = mongo_get_collection(
            mongo_client=mongo_client,
            database_name=database_name,
            collection_name=collection_name,
        )
        cursor = target.find(filter_query)
        ordered = cursor.sort(sorting_query)
        return list(ordered)
    except Exception:
        return []

def mongo_update_document(
    mongo_client: any,
    database_name: any,
    collection_name: any,
    filter_query: any,
    update_query: any
) -> any:
    """Run ``update_one(filter_query, update_query)`` on a collection.

    Returns:
        The value returned by ``update_one``, or ``None`` when resolving
        the collection or updating raises.
    """
    try:
        target = mongo_get_collection(
            mongo_client=mongo_client,
            database_name=database_name,
            collection_name=collection_name,
        )
        return target.update_one(filter_query, update_query)
    except Exception:
        return None

def mongo_remove_document(
    mongo_client: any,
    database_name: str,
    collection_name: str,
    filter_query: any
) -> any:
    """Delete the first document matching ``filter_query``.

    The return annotation used to read ``bool``; the function returns the
    result object of ``delete_one`` (or ``None``), never a boolean.

    Returns:
        The value returned by ``delete_one``, or ``None`` when resolving
        the collection or deleting raises.
    """
    try:
        collection = mongo_get_collection(
            mongo_client=mongo_client,
            database_name=database_name,
            collection_name=collection_name,
        )
        return collection.delete_one(filter_query)
    except Exception:
        return None


## Qdrant

In [50]:
from qdrant_client import QdrantClient as qc

def qdrant_is_client(
    storage_client: any
) -> bool:
    """Return True when ``storage_client`` is an instance of ``qc``.

    The previous check used ``qc.Connection``; the attribute lookup failed,
    the exception was swallowed, and the function returned ``False`` for
    every input, including real clients.
    """
    # ``qc`` is this file's alias for ``qdrant_client.QdrantClient``.
    return isinstance(storage_client, qc)

def qdrant_setup_client(
    api_key: str,
    address: str,
    port: str
) -> any:
    """Create a Qdrant client for ``address:port`` with HTTPS disabled.

    ``port`` is converted with ``int()`` before it is passed on.

    Returns:
        The constructed client, or ``None`` when conversion or
        construction raises.
    """
    try:
        return qc(
            host=address,
            port=int(port),
            api_key=api_key,
            https=False,
        )
    except Exception:
        return None

def qdrant_create_collection(
    qdrant_client: any,
    collection_name: str,
    configuration: any
) -> any:
    """Create ``collection_name`` using ``configuration`` as vectors config.

    Any exception is printed and turned into a ``None`` result.
    """
    try:
        return qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=configuration,
        )
    except Exception as error:
        print(error)
        return None

def qdrant_get_collection(
    qdrant_client: any,
    collection_name: str
) -> any:
    """Return ``get_collection(collection_name)``, or ``None`` on error."""
    try:
        return qdrant_client.get_collection(collection_name=collection_name)
    except Exception:
        return None

def qdrant_collection_number(
    qdrant_client: any,
    collection_name: str,
    count_filter: any
) -> any:
    """Return the exact number of points matching ``count_filter``.

    Returns:
        The ``count`` attribute of the client's ``count(...)`` result
        (requested with ``exact=True``). Any exception is printed and
        turned into ``None``.
    """
    try:
        tally = qdrant_client.count(
            collection_name=collection_name,
            count_filter=count_filter,
            exact=True,
        )
        return tally.count
    except Exception as error:
        print(error)
        return None

def qdrant_list_collections(
    qdrant_client: any
) -> any:
    """Return the names of all collections known to the client.

    Returns:
        The ``name`` of every entry in ``get_collections().collections``,
        or ``[]`` when the call raises.
    """
    try:
        overview = qdrant_client.get_collections()
        return [entry.name for entry in overview.collections]
    except Exception:
        return []
    
def qdrant_remove_collection(
    qdrant_client: any,
    collection_name: str
) -> bool:
    """Delete ``collection_name``.

    Returns:
        ``True`` when ``delete_collection`` completes, ``False`` when it
        raises.
    """
    try:
        qdrant_client.delete_collection(collection_name)
    except Exception:
        return False
    return True

def qdrant_upsert_points(
    qdrant_client: qc,
    collection_name: str,
    points: any
) -> any:
    """Upsert ``points`` into ``collection_name``.

    Any exception is printed and turned into a ``None`` result.
    """
    try:
        return qdrant_client.upsert(
            collection_name=collection_name,
            points=points,
        )
    except Exception as error:
        print(error)
        return None

def qdrant_search_data(
    qdrant_client: qc,
    collection_name: str,
    scroll_filter: any,
    limit: int,
    offset: any
) -> any:
    """Scroll through the points of a collection with payloads included.

    Returns:
        Whatever ``qdrant_client.scroll(...)`` returns. Any exception is
        printed and turned into ``[]``.
    """
    scroll_arguments = {
        'collection_name': collection_name,
        'scroll_filter': scroll_filter,
        'limit': limit,
        'with_payload': True,
        'offset': offset,
    }
    try:
        return qdrant_client.scroll(**scroll_arguments)
    except Exception as error:
        print(error)
        return []

def qdrant_search_vectors(
    qdrant_client: qc,
    collection_name: str,
    query_vector: any,
    limit: int
) -> any:
    """Run a vector similarity search with payloads included.

    ``limit`` was annotated ``str``; the caller in this file passes the
    integer ``configuration['top-k']``.

    Returns:
        The hits returned by ``qdrant_client.search(...)``, or ``[]`` when
        the search raises.
    """
    try:
        return qdrant_client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=limit,
            with_payload=True,
        )
    except Exception:
        return []

def qdrant_remove_points(
    qdrant_client: qc,
    collection_name: str,
    points_selector: any
) -> any:
    """Delete the points chosen by ``points_selector``.

    The return annotation used to read ``bool``; the function returns the
    result object of ``delete`` (or ``None``), never a boolean.

    Returns:
        The value returned by ``qdrant_client.delete(...)``. Any exception
        is printed and turned into ``None``.
    """
    try:
        return qdrant_client.delete(
            collection_name=collection_name,
            points_selector=points_selector,
        )
    except Exception as e:
        print(f"Error removing document: {e}")
        return None

## Meili

In [6]:
import meilisearch as ms

def meili_is_client(
    storage_client: any
) -> bool:
    """Return True when ``storage_client`` is an instance of ``ms.Client``.

    The previous check used ``ms.Connection``; the attribute lookup failed,
    the error was printed, and the function returned ``False`` for every
    input. ``ms.Client`` is the class ``meili_setup_client`` instantiates.
    """
    return isinstance(storage_client, ms.Client)

def meili_setup_client(
    api_key: str,
    host: str
) -> any:
    """Create a Meilisearch client for ``host`` using ``api_key``.

    Any exception is printed and turned into a ``None`` result.
    """
    try:
        return ms.Client(url=host, api_key=api_key)
    except Exception as error:
        print(error)
        return None

def meili_get_index(
    meili_client: any,
    index_name: str
) -> any:
    """Return ``meili_client.index(uid=index_name)``.

    Any exception is printed and turned into a ``None`` result.
    """
    try:
        return meili_client.index(uid=index_name)
    except Exception as error:
        print(error)
        return None
    
def meili_check_index(
    meili_client: any,
    index_name: str
) -> bool:
    """Tell whether ``get_index(uid=index_name)`` succeeds.

    Returns:
        ``True`` when the call completes; when it raises, the exception is
        printed and ``False`` is returned.
    """
    try:
        meili_client.get_index(uid=index_name)
    except Exception as error:
        print(error)
        return False
    return True
    
def meili_remove_index(
    meili_client: any,
    index_name: str
) -> any:
    """Delete the index called ``index_name``.

    The previous version called ``index(index_name=...)``; the keyword the
    client accepts is ``uid`` (as ``meili_get_index`` already uses), so the
    call always raised and the function always returned ``None``. The
    return annotation was also ``bool`` although a response object is
    returned.

    Returns:
        The value returned by ``index.delete()``. Any exception is printed
        and turned into ``None``.
    """
    try:
        return meili_client.index(uid=index_name).delete()
    except Exception as e:
        print(e)
        return None
    
def meili_list_indexes(
    meili_client: any
) -> any:
    """Return the ``uid`` of every index reported by the client.

    The return annotation used to read ``bool``; the function returns a
    list of names.

    Returns:
        The ``uid`` of each entry in ``get_indexes()['results']``. Any
        exception is printed and turned into ``None``, so callers that
        iterate the result need to handle ``None``.
    """
    try:
        listing = meili_client.get_indexes()
        return [index.uid for index in listing['results']]
    except Exception as e:
        print(e)
        return None

def meili_add_documents(
    meili_client: any,
    index_name: str,
    documents: any
) -> any:
    """Add ``documents`` to the index called ``index_name``.

    Any exception (including a failed index lookup) is printed and turned
    into a ``None`` result.
    """
    try:
        target = meili_get_index(
            meili_client=meili_client,
            index_name=index_name,
        )
        return target.add_documents(documents=documents)
    except Exception as error:
        print(error)
        return None

def meili_set_filterable(
    meili_client: any,
    index_name: str,
    attributes: any
) -> any:
    """Set the filterable attributes of the index called ``index_name``.

    Any exception (including a failed index lookup) is printed and turned
    into a ``None`` result.
    """
    try:
        target = meili_get_index(
            meili_client=meili_client,
            index_name=index_name,
        )
        return target.update_filterable_attributes(attributes)
    except Exception as error:
        print(error)
        return None

def meili_search_documents(
    meili_client: any,
    index_name: str,
    query: any,
    options: any
) -> any:
    """Search the index called ``index_name``.

    ``query`` and ``options`` are forwarded positionally to
    ``index.search``. Any exception is printed and turned into ``None``.
    """
    try:
        target = meili_get_index(
            meili_client=meili_client,
            index_name=index_name,
        )
        return target.search(query, options)
    except Exception as error:
        print(error)
        return None
    
def meili_update_documents(
    meili_client: any,
    index_name: str,
    documents: any
) -> any:
    """Update ``documents`` in the index called ``index_name``.

    The previous version called ``index(index_name=...)``; the keyword the
    client accepts is ``uid`` (as ``meili_get_index`` already uses), so the
    call always raised and the function always returned ``None``.

    Returns:
        The value returned by ``index.update_documents``. Any exception is
        printed and turned into ``None``.
    """
    try:
        index = meili_client.index(uid=index_name)
        return index.update_documents(documents=documents)
    except Exception as e:
        print(e)
        return None

def meili_delete_documents(
    meili_client: any,
    index_name: str,
    ids: any
) -> any:
    """Delete the documents with the given ``ids`` from an index.

    Two keyword mistakes made the previous version always fail: the index
    was requested with ``index_name=`` instead of ``uid=``, and the ids
    were passed as ``document_ids=`` although the parameter is named
    ``ids``. The ids are now passed positionally.

    Returns:
        The value returned by ``index.delete_documents``. Any exception is
        printed and turned into ``None``.
    """
    try:
        index = meili_client.index(uid=index_name)
        return index.delete_documents(ids)
    except Exception as e:
        print(e)
        return None

## Langchain

In [7]:
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
)

def langchain_chunk_prompt(
    configuration: any,
    prompt: str
) -> any:
    """Split ``prompt`` into text chunks.

    Chunk size and overlap are read from ``configuration['chunk-size']``
    and ``configuration['chunk-overlap']``; length is measured with
    ``len``.

    Returns:
        The ``page_content`` of every document the splitter creates.
    """
    splitter_options = {
        'chunk_size': configuration['chunk-size'],
        'chunk_overlap': configuration['chunk-overlap'],
        'length_function': len,
        'is_separator_regex': False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)

    chunk_documents = splitter.create_documents([prompt])
    return [chunk.page_content for chunk in chunk_documents]

## SpaCy

In [3]:
def spacy_search_keywords(
    language_model: any,
    text: str
):
    """Extract the distinct lemmas of the meaningful tokens in ``text``.

    ``text`` is lower-cased and passed to ``language_model``. Stop words,
    punctuation, whitespace tokens and tokens of length one are skipped.

    Returns:
        A list of unique lemmas; the order follows set iteration and is
        therefore not meaningful.
    """
    parsed = language_model(text.lower())

    lemmas = []
    for token in parsed:
        if token.is_stop:
            continue
        if token.is_punct:
            continue
        if token.is_space:
            continue
        if len(token) > 1:
            lemmas.append(token.lemma_)

    return list(set(lemmas))

## Queries

In [18]:
import re

def clean_prompt(
    prompt: str
) -> str:
    """Normalise ``prompt`` for chunking.

    Lower-cases the text, removes every character that is neither a word
    character nor whitespace, collapses whitespace runs to a single space
    and trims both ends.

    Punctuation is removed before whitespace is collapsed. The previous
    order collapsed first, so a punctuation mark surrounded by spaces
    (``'a - b'``) left a double space behind (``'a  b'``).
    """
    prompt = prompt.lower()
    prompt = re.sub(r'[^\w\s]', '', prompt)
    prompt = re.sub(r'\s+', ' ', prompt)
    return prompt.strip()

def create_chunk_keywords(
    language_model: any,
    chunks: any
) -> any:
    """Build one keyword filter expression per chunk.

    Each chunk's keywords (from ``spacy_search_keywords``) become
    ``keywords = "<keyword>"`` terms joined with `` OR ``.

    Returns:
        The filter strings, in the same order as ``chunks``.
    """
    def as_filter(chunk):
        found = spacy_search_keywords(
            language_model=language_model,
            text=chunk,
        )
        terms = [f'keywords = "{keyword}"' for keyword in found]
        return ' OR '.join(terms)

    return [as_filter(chunk) for chunk in chunks]

def generate_prompt_queries(
    configuration: any,
    embedding_model: any,
    language_model: any,
    prompt: str
) -> any:
    """Turn ``prompt`` into embedding and keyword queries.

    The prompt is cleaned and chunked; every chunk is embedded with
    ``embedding_model.embed_documents`` and converted to a keyword filter.

    Returns:
        A dict with ``'embeddings'`` and ``'keywords'`` entries, each
        holding one item per chunk.
    """
    chunks = langchain_chunk_prompt(
        configuration=configuration,
        prompt=clean_prompt(prompt=prompt),
    )
    chunk_embeddings = embedding_model.embed_documents(texts=chunks)
    chunk_filters = create_chunk_keywords(
        language_model=language_model,
        chunks=chunks,
    )
    return {
        'embeddings': chunk_embeddings,
        'keywords': chunk_filters,
    }

In [11]:
# Sample prompt used by the example cells below.
test_prompt = 'What is python?'

In [10]:
# Settings shared by the example pipeline calls:
#   chunk-size / chunk-overlap: passed to the text splitter in
#       langchain_chunk_prompt.
#   top-k: result limit for each vector and keyword search.
#   alpha: weight of the vector score in the hybrid score; the keyword
#       score is weighted by (1 - alpha).
#   context-amount: bound on the documents kept by select_context_documents.
pipeline_configuration = {
    'chunk-size': 50,
    'chunk-overlap': 0,
    'top-k': 10,
    'alpha': 0.5,
    'context-amount': 5
}

In [16]:
import spacy
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model handed to generate_prompt_queries to embed prompt chunks.
embedding_model = HuggingFaceEmbeddings(
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
)

# spaCy pipeline handed to the keyword extraction helpers.
language_model = spacy.load(
    name = 'en_core_web_sm'
)

  from tqdm.autonotebook import tqdm, trange


In [25]:
# Example clients for the three stores, all on localhost.
# NOTE(review): credentials and API keys are hard-coded here; presumably
# these are local development values -- confirm before sharing the notebook.
mongo_client = mongo_setup_client(
    username = 'mongo123',
    password = 'mongo456',
    address = '127.0.0.1',
    port = '27017'
)

qdrant_client = qdrant_setup_client(
    api_key = 'qdrant_key',
    address = '127.0.0.1', 
    port = '7201'
)

meili_client = meili_setup_client(
    host = 'http://127.0.0.1:7202', 
    api_key = 'meili_key'
)



In [23]:
# Build the embedding and keyword queries for the sample prompt.
example_queries = generate_prompt_queries(
    configuration = pipeline_configuration,
    embedding_model = embedding_model, 
    language_model = language_model,
    prompt = test_prompt
)

## Hits

In [81]:
def calculate_keyword_score(
    keyword_query: str,
    keyword_list: any
) -> any:
    """Score how well a keyword filter matches a document's keywords.

    ``keyword_query`` is split on ``OR``; each piece is reduced to the bare
    keyword by removing ``keywords =``, double quotes and spaces. The
    number of pieces found in ``keyword_list`` is divided by
    ``sqrt(number of pieces * len(keyword_list))``.

    Returns:
        ``0.0`` when nothing matches, otherwise the normalised score.
    """
    def bare_keyword(piece):
        return piece.replace('keywords =', '').replace('"', '').replace(' ', '')

    pieces = keyword_query.split('OR')
    matched = sum(1 for piece in pieces if bare_keyword(piece) in keyword_list)

    piece_total = len(pieces)
    keyword_total = len(keyword_list)

    if matched == 0:
        return 0.0
    return matched / ((piece_total * keyword_total) ** 0.5)

def get_vector_hits(
    vector_client: any,
    configuration: any,
    embedding_queries: any
) -> any:
    """Search every Qdrant collection with every embedding.

    Each search is limited to ``configuration['top-k']`` results.

    Returns:
        One dict per hit with ``source`` set to ``'vector'``, the
        ``database``, ``collection``, ``document`` and ``type`` payload
        fields, and the hit's ``score``.
    """
    collected = []
    collection_names = qdrant_list_collections(qdrant_client=vector_client)

    for collection_name in collection_names:
        for query_vector in embedding_queries:
            matches = qdrant_search_vectors(
                qdrant_client=vector_client,
                collection_name=collection_name,
                query_vector=query_vector,
                limit=configuration['top-k'],
            )
            collected.extend(
                {
                    'source': 'vector',
                    'database': match.payload['database'],
                    'collection': match.payload['collection'],
                    'document': match.payload['document'],
                    'type': match.payload['type'],
                    'score': match.score,
                }
                for match in matches
            )
    return collected

def get_search_hits(
    search_client: any,
    configuration: any,
    keyword_queries: any
) -> any:
    """Search every Meilisearch index with every keyword filter.

    Each search uses an empty query string, the keyword expression as
    filter, and ``configuration['top-k']`` as limit. Hits are scored with
    ``calculate_keyword_score`` against their ``keywords`` field.

    Both Meili helpers return ``None`` when the underlying call fails.
    Previously that crashed this function with a ``TypeError``; a failed
    listing now yields no hits and a failed search is skipped.

    Returns:
        One dict per hit with ``source`` set to ``'search'``, the
        ``database``, ``collection``, ``document`` and ``type`` fields, and
        the computed ``score``.
    """
    search_hits = []

    # ``None`` (listing failed) is treated like "no indexes".
    collections = meili_list_indexes(
        meili_client=search_client
    ) or []

    for collection in collections:
        for keywords in keyword_queries:
            results = meili_search_documents(
                meili_client=search_client,
                index_name=collection,
                query="",
                options={
                    'filter': keywords,
                    'attributesToRetrieve': ['database', 'collection', 'document', 'type', 'keywords'],
                    'limit': configuration['top-k']
                }
            )

            # ``None`` means this search failed; keep going with the rest.
            if not results:
                continue

            for result in results.get('hits', []):
                score = calculate_keyword_score(
                    keyword_query=keywords,
                    keyword_list=result['keywords']
                )
                search_hits.append({
                    'source': 'search',
                    'database': result['database'],
                    'collection': result['collection'],
                    'document': result['document'],
                    'type': result['type'],
                    'score': score
                })
    return search_hits
    
def get_hits(
    vector_client: any,
    search_client: any,
    configuration: any,
    queries: any
) -> any:
    """Collect vector hits followed by keyword-search hits.

    ``queries['embeddings']`` drives the vector search and
    ``queries['keywords']`` the keyword search.

    Returns:
        The vector hits concatenated with the search hits, unsorted.
    """
    from_vectors = get_vector_hits(
        vector_client=vector_client,
        configuration=configuration,
        embedding_queries=queries['embeddings'],
    )
    from_keywords = get_search_hits(
        search_client=search_client,
        configuration=configuration,
        keyword_queries=queries['keywords'],
    )
    return from_vectors + from_keywords

In [82]:
# Run both searches for the example queries against the local clients.
example_hits = get_hits(
    vector_client = qdrant_client,
    search_client = meili_client,
    configuration = pipeline_configuration,
    queries = example_queries
)

## Top

In [93]:
import pandas as pd

def match_hit_documents(
    configuration: any,
    filtered_df: any
) -> any:
    """Pair vector hits with keyword hits that point at the same document.

    Two hits match when their ``database``, ``collection``, ``type`` and
    ``document`` values are all equal. Every matching pair produces one
    row with ``source`` set to ``'hybrid'`` and
    ``score = vector_score * alpha + search_score * (1 - alpha)``, where
    ``alpha`` is ``configuration['alpha']``.

    Args:
        configuration: Mapping that provides ``'alpha'``.
        filtered_df: DataFrame of hits with the columns ``source``,
            ``database``, ``collection``, ``document``, ``type``, ``score``.

    Returns:
        A DataFrame with the columns ``source``, ``database``,
        ``collection``, ``document`` and ``score``. The columns are present
        even when nothing matched, so callers can sort by ``score`` without
        a ``KeyError``.
    """
    alpha = configuration['alpha']
    identity_columns = ('database', 'collection', 'type', 'document')
    output_columns = ['source', 'database', 'collection', 'document', 'score']

    vector_rows = filtered_df[filtered_df['source'] == 'vector']
    # Materialised once; the search subset does not depend on the vector
    # row, so re-filtering it inside the outer loop was wasted work.
    search_rows = [
        row for _, row in filtered_df[filtered_df['source'] == 'search'].iterrows()
    ]

    matched_documents = []
    for _, vector_row in vector_rows.iterrows():
        for search_row in search_rows:
            same_document = all(
                vector_row[column] == search_row[column]
                for column in identity_columns
            )
            if not same_document:
                continue

            hybrid_score = (
                vector_row['score'] * alpha + search_row['score'] * (1 - alpha)
            )
            matched_documents.append({
                'source': 'hybrid',
                'database': search_row['database'],
                'collection': search_row['collection'],
                'document': search_row['document'],
                'score': hybrid_score
            })

    return pd.DataFrame(matched_documents, columns=output_columns)

def select_context_documents(
    configuration: any,
    matched_df: any
) -> any:
    """Pick the best-scoring distinct documents, highest score first.

    At most ``configuration['context-amount']`` rows are returned. The
    previous ``<=`` comparison let one extra row through (six rows for a
    limit of five). An empty or missing frame now yields ``[]`` instead of
    failing inside ``sort_values``.

    Args:
        configuration: Mapping that provides ``'context-amount'``.
        matched_df: DataFrame with a ``score`` column and a document id
            column.

    Returns:
        A list of rows (each a list of column values); only the first row
        seen for each document id is kept.
    """
    limit = configuration['context-amount']
    if matched_df is None or matched_df.empty or limit <= 0:
        return []

    sorted_df = matched_df.sort_values('score', ascending=False)

    # Locate the id column by name; position 3 is the original fallback and
    # matches the column order produced by match_hit_documents.
    columns = list(sorted_df.columns)
    id_position = columns.index('document') if 'document' in columns else 3

    seen_documents = set()
    context_documents = []
    for row in sorted_df.values.tolist():
        row_id = row[id_position]
        if row_id in seen_documents:
            continue
        seen_documents.add(row_id)
        context_documents.append(row)
        if len(context_documents) >= limit:
            break
    return context_documents

#def filter_hits(
#    hits: any
#) -> any:
    

def get_top_documents(
    hits: any,
    configuration: any
) -> any:
    """Debug stub: print the hits as a DataFrame and return ``None``.

    ``hits`` was annotated ``str``; it is passed straight to
    ``pd.DataFrame`` and the example cell supplies the list of hit dicts
    from ``get_hits``. ``configuration`` is not used yet.
    """
    print(pd.DataFrame(hits))

    # TODO: finish the selection pipeline that is currently disabled:
    #   1. keep only documents found by more than one source:
    #        ids_with_both = df.groupby('document')['source'].nunique()
    #        ids_with_both = ids_with_both[ids_with_both > 1].index
    #        filtered_df = df[df['document'].isin(ids_with_both)]
    #   2. matched_df = match_hit_documents(
    #          configuration = configuration, filtered_df = filtered_df)
    #   3. return select_context_documents(
    #          configuration = configuration, matched_df = matched_df)
    return None

In [104]:
# Load the collected hits into a DataFrame for inspection.
df = pd.DataFrame(example_hits)

In [105]:
# Group the 'source' values of the hits by document id.
grouped = df.groupby('document')['source']

In [106]:
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7c081e9b8a60>

In [96]:
df

Unnamed: 0,source,database,collection,document,type,score
0,vector,K123AsJ0k1|cloud-hpc-oss-mlops-platform|md,de|ku|ma|co|ks|README,673b0e5b6ddead426e62ca83,markdown,0.500455
1,vector,K123AsJ0k1|cloud-hpc-oss-mlops-platform|md,de|ku|ma|co|ks|README,673b0e5b6ddead426e62ca83,markdown,0.500455
2,vector,K123AsJ0k1|cloud-hpc-oss-mlops-platform|md,de|ku|ma|co|ks|README,673b0e5b6ddead426e62ca83,markdown,0.500455
3,vector,K123AsJ0k1|cloud-hpc-oss-mlops-platform|md,de|ku|ma|co|ks|README,673b0e5b6ddead426e62ca83,markdown,0.500455
4,vector,K123AsJ0k1|cloud-hpc-oss-mlops-platform|md,de|ku|ma|co|ks|README,673b0e5b6ddead426e62ca83,markdown,0.500455
...,...,...,...,...,...,...
70,search,K123AsJ0k1|cloud-hpc-oss-mlops-platform|yaml,te|re|kf|pipeline,673b0e9a76c6682abb11ddba,yaml,0.147442
71,search,K123AsJ0k1|cloud-hpc-oss-mlops-platform|yaml,de|ku|ma|ap|kf|up|v1|th|ar|up|ma|qu|ba|ov|work...,673b0f5edf3ef8d0a63ff2ab,yaml,0.125000
72,search,K123AsJ0k1|cloud-hpc-oss-mlops-platform|yaml,de|ku|ma|ap|ju|ju|up|ba|co|spawner_ui_config,673b0ef0755c0da6c9c6dfbd,yaml,0.129099
73,search,K123AsJ0k1|cloud-hpc-oss-mlops-platform|yaml,de|ku|ma|ap|kf|up|th|ar|up|ma|qu|ba|ov|workflo...,673b0f3f755c0da6c9c6e233,yaml,0.125000


In [94]:
# get_top_documents currently prints the hits DataFrame and returns None.
example_documents = get_top_documents(
    hits = example_hits,
    configuration = pipeline_configuration
)

    source                                      database  \
0   vector    K123AsJ0k1|cloud-hpc-oss-mlops-platform|md   
1   vector    K123AsJ0k1|cloud-hpc-oss-mlops-platform|md   
2   vector    K123AsJ0k1|cloud-hpc-oss-mlops-platform|md   
3   vector    K123AsJ0k1|cloud-hpc-oss-mlops-platform|md   
4   vector    K123AsJ0k1|cloud-hpc-oss-mlops-platform|md   
..     ...                                           ...   
70  search  K123AsJ0k1|cloud-hpc-oss-mlops-platform|yaml   
71  search  K123AsJ0k1|cloud-hpc-oss-mlops-platform|yaml   
72  search  K123AsJ0k1|cloud-hpc-oss-mlops-platform|yaml   
73  search  K123AsJ0k1|cloud-hpc-oss-mlops-platform|yaml   
74  search  K123AsJ0k1|cloud-hpc-oss-mlops-platform|yaml   

                                           collection  \
0                               de|ku|ma|co|ks|README   
1                               de|ku|ma|co|ks|README   
2                               de|ku|ma|co|ks|README   
3                               de|ku|ma|co|ks|READ

In [88]:
example_documents

Unnamed: 0,source,database,collection,document,type,score


In [None]:
def get_context(
    self,
    prompt: str
) -> str:
    """Create the three storage clients and run the context pipeline.

    Connection settings are read from ``self.valves``; the pipeline
    settings are fixed inside this method. Progress is reported with
    ``print``.

    Returns:
        Whatever ``self.context_pipeline`` returns for ``prompt``.
    """
    print('Creating clients')
    valves = self.valves
    document_store = self.mongo_setup_client(
        username=valves.MONGO_USER,
        password=valves.MONGO_PASSWORD,
        address=valves.MONGO_ADDRESS,
        port=valves.MONGO_PORT,
    )
    vector_store = self.qdrant_setup_client(
        api_key=valves.QDRANT_KEY,
        address=valves.QDRANT_ADDRESS,
        port=valves.QDRANT_PORT,
    )
    search_store = self.meili_setup_client(
        host=valves.MEILI_HOST,
        api_key=valves.MEILI_KEY,
    )
    print('Clients setup')

    settings = dict([
        ('chunk-size', 50),
        ('chunk-overlap', 0),
        ('model-name', 'sentence-transformers/all-MiniLM-L6-v2'),
        ('top-k', 10),
        ('alpha', 0.5),
        ('context-amount', 5),
    ])

    print('Running context pipeline')
    outcome = self.context_pipeline(
        document_client=document_store,
        vector_client=vector_store,
        search_client=search_store,
        configuration=settings,
        prompt=prompt,
    )
    print('Context pipeline run')

    return outcome

In [None]:
def mongo_setup_client(
    self,
    username: str,
    password: str,
    address: str,
    port: str
) -> any:
    """Build a client for ``mongodb://username:password@address:port/``.

    Args:
        username: Account name; percent-escaped before it enters the URI.
        password: Account password; percent-escaped before it enters the URI.
        address: Host name or IP address of the server.
        port: Server port (rendered with ``str()`` formatting).

    Returns:
        The object created by ``mc(host=<connection URI>)``.
    """
    # Imported locally so the method does not depend on a file-level import.
    from urllib.parse import quote_plus

    # Reserved characters in credentials (':', '/', '@', '%', ...) would
    # otherwise change how the URI is parsed. Building the URI in one step
    # also avoids the chained ``.replace`` calls rewriting a credential that
    # happens to contain a placeholder such as '(port)'.
    connection_address = (
        f'mongodb://{quote_plus(username)}:{quote_plus(password)}'
        f'@{address}:{port}/'
    )
    return mc(host=connection_address)

def mongo_get_database(
    self,
    mongo_client: any,
    database_name: str
) -> any:
    """Fetch ``database_name`` from ``mongo_client`` by subscription.

    Returns:
        ``mongo_client[database_name]``, or ``None`` if the lookup raises.
    """
    try:
        return mongo_client[database_name]
    except Exception:
        return None

def mongo_get_collection(
    self,
    mongo_client: any,
    database_name: str,
    collection_name: str
) -> any:
    """Fetch ``collection_name`` from ``database_name``.

    The return annotation used to read ``bool``; the method actually
    returns the collection object.

    Returns:
        ``database[collection_name]`` where ``database`` comes from
        ``self.mongo_get_database``, or ``None`` when any step raises
        (including the case where the database lookup returned ``None``).
    """
    try:
        database = self.mongo_get_database(
            mongo_client=mongo_client,
            database_name=database_name,
        )
        return database[collection_name]
    except Exception:
        return None
    
def mongo_get_document(
    self,
    mongo_client: any,
    database_name: str,
    collection_name: str,
    filter_query: any
):
    """Return ``find_one(filter_query)`` from a collection.

    The collection is resolved through ``self.mongo_get_collection``. Any
    exception is printed and turned into a ``None`` result.
    """
    try:
        target = self.mongo_get_collection(
            mongo_client=mongo_client,
            database_name=database_name,
            collection_name=collection_name,
        )
        return target.find_one(filter_query)
    except Exception as error:
        print(error)
        return None

def qdrant_setup_client(
    self,
    api_key: str,
    address: str,
    port: str
) -> any:
    """Create a Qdrant client for ``address:port`` with HTTPS disabled.

    ``port`` is converted with ``int()`` before it is passed on.

    Returns:
        The constructed client, or ``None`` when conversion or
        construction raises.
    """
    try:
        return qc(
            host=address,
            port=int(port),
            api_key=api_key,
            https=False,
        )
    except Exception:
        return None

def qdrant_list_collections(
    self,
    qdrant_client: any
) -> any:
    """Return the names of all collections known to the client.

    Returns:
        The ``name`` of every entry in ``get_collections().collections``,
        or ``[]`` when the call raises.
    """
    try:
        overview = qdrant_client.get_collections()
        return [entry.name for entry in overview.collections]
    except Exception:
        return []

def qdrant_search_vectors(
    self,
    qdrant_client: qc,
    collection_name: str,
    query_vector: any,
    limit: int
) -> any:
    """Run a vector similarity search on ``collection_name``.

    ``limit`` was annotated ``str`` although it is forwarded as the result
    limit of the search -- NOTE(review): presumably the integer ``top-k``
    setting; confirm against the caller.

    Returns:
        The hits returned by ``qdrant_client.search(...)``, or ``[]`` when
        the search raises.
    """
    try:
        return qdrant_client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=limit,
        )
    except Exception:
        return []

def meili_setup_client(
    self,
    host: str,
    api_key: str
) -> any:
    """Create a Meilisearch client for ``host`` using ``api_key``.

    Any exception is printed and turned into a ``None`` result.
    """
    try:
        return ms.Client(url=host, api_key=api_key)
    except Exception as error:
        print(error)
        return None

def meili_get_index(
    self,
    meili_client: any,
    index_name: str
) -> any:
    """Return ``meili_client.index(uid=index_name)``.

    Any exception is printed and turned into a ``None`` result.
    """
    try:
        return meili_client.index(uid=index_name)
    except Exception as error:
        print(error)
        return None
    
def meili_list_indexes(
    self,
    meili_client: any
) -> any:
    """Return the ``uid`` of every index reported by the client.

    The return annotation used to read ``bool``; the method returns a list
    of names.

    Returns:
        The ``uid`` of each entry in ``get_indexes()['results']``. Any
        exception is printed and turned into ``None``, so callers that
        iterate the result need to handle ``None``.
    """
    try:
        listing = meili_client.get_indexes()
        return [index.uid for index in listing['results']]
    except Exception as e:
        print(e)
        return None

def meili_search_documents(
    self,
    meili_client: any,
    index_name: str,
    query: any,
    options: any
) -> any:
    """Search one Meilisearch index.

    The index handle is obtained through ``self.meili_get_index`` and then
    queried with ``query`` and ``options``.

    Args:
        meili_client: Client forwarded to ``meili_get_index``.
        index_name: Name of the index to search.
        query: Query passed as the first argument of ``search``.
        options: Options passed as the second argument of ``search``.

    Returns:
        The search response, or ``None`` (after printing the error) when
        any step raises.
    """
    try:
        target_index = self.meili_get_index(
            meili_client=meili_client,
            index_name=index_name
        )
        return target_index.search(query, options)
    except Exception as error:
        print(error)
    return None

def langchain_chunk_prompt(
    self,
    configuration: any,
    prompt: str
) -> any:
    """Split a prompt into text chunks.

    Args:
        configuration: Mapping providing ``'chunk-size'`` and
            ``'chunk-overlap'`` for the splitter.
        prompt: Text to split.

    Returns:
        A list with the ``page_content`` of every chunk produced by
        ``RecursiveCharacterTextSplitter.create_documents``.
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=configuration['chunk-size'],
        chunk_overlap=configuration['chunk-overlap'],
        length_function=len
    )

    chunk_texts = []
    for chunk_document in chunker.create_documents([prompt]):
        chunk_texts.append(chunk_document.page_content)
    return chunk_texts
    
def langchain_chunk_embeddings(
    self,
    configuration: any,
    chunks: any
) -> any:
    """Embed text chunks with a HuggingFace embedding model.

    Args:
        configuration: Mapping providing the ``'model-name'`` to load.
        chunks: Texts to embed.

    Returns:
        The result of ``embed_documents`` for ``chunks``.
    """
    # NOTE(review): the model object is constructed on every call.
    embedder = HuggingFaceEmbeddings(model_name=configuration['model-name'])
    return embedder.embed_documents(texts=chunks)

def spacy_find_keywords(
    self,
    text: str
):
    """Extract the unique keyword lemmas of a text with spaCy.

    The text is lower-cased and processed with ``en_core_web_sm``. Stop
    words, punctuation, whitespace tokens and tokens of length one are
    dropped; the lemmas of the remaining tokens are de-duplicated.

    Args:
        text: Text to analyse.

    Returns:
        A list of distinct lemmas (order follows set iteration).
    """
    # NOTE(review): the spaCy model is loaded on every call, which the
    # per-chunk caller repeats for each chunk.
    pipeline = spacy.load("en_core_web_sm")
    parsed = pipeline(text.lower())

    lemmas = []
    for token in parsed:
        if token.is_stop or token.is_punct or token.is_space:
            continue
        if len(token) <= 1:
            continue
        lemmas.append(token.lemma_)

    return list(set(lemmas))

def spacy_create_chunk_keywords(
    self,
    chunks: any
) -> any:
    """Build one keyword filter expression per chunk.

    Each chunk's keywords come from ``self.spacy_find_keywords`` and are
    rendered as ``keywords = "<keyword>"`` terms joined by ``' OR '``.

    Args:
        chunks: Iterable of chunk texts.

    Returns:
        A list of filter strings, one per chunk, in chunk order.
    """
    def as_filter(chunk):
        terms = self.spacy_find_keywords(text=chunk)
        return ' OR '.join(f'keywords = "{term}"' for term in terms)

    return [as_filter(chunk) for chunk in chunks]

def clean_prompt(
    self,
    prompt: str
) -> str:
    """Normalise a prompt: lower-case, no punctuation, single spaces.

    Args:
        prompt: Raw prompt text.

    Returns:
        The lower-cased prompt with every character that is neither a word
        character nor whitespace removed, runs of whitespace collapsed to
        one space, and leading/trailing whitespace stripped.
    """
    prompt = prompt.lower()
    # Remove punctuation first: deleting a symbol that sits between two
    # spaces would otherwise re-introduce a double space after collapsing.
    prompt = re.sub(r'[^\w\s]', '', prompt)
    prompt = re.sub(r'\s+', ' ', prompt)
    return prompt.strip()

def generate_prompt_queries(
    self,
    configuration: any,
    prompt: str
) -> any:
    """Turn a prompt into embedding and keyword queries.

    The prompt is cleaned with ``self.clean_prompt``, chunked with
    ``self.langchain_chunk_prompt``, and every chunk is converted to an
    embedding and to a keyword filter.

    Args:
        configuration: Settings forwarded to the chunking and embedding
            helpers.
        prompt: Raw prompt text.

    Returns:
        A dict with the chunk embeddings under ``'embeddings'`` and the
        keyword filters under ``'keywords'``.
    """
    normalized = self.clean_prompt(prompt=prompt)
    chunks = self.langchain_chunk_prompt(
        configuration=configuration,
        prompt=normalized
    )

    vectors = self.langchain_chunk_embeddings(
        configuration=configuration,
        chunks=chunks
    )
    filters = self.spacy_create_chunk_keywords(chunks=chunks)

    return {'embeddings': vectors, 'keywords': filters}

def calculate_keyword_score(
    self,
    keyword_query: str,
    keyword_list: any
) -> any:
    """Score how well a keyword filter matches a document's keywords.

    ``keyword_query`` is expected in the form produced by the keyword
    query builder of this file: ``keywords = "a" OR keywords = "b"``.

    Args:
        keyword_query: Filter expression holding the asked keywords.
        keyword_list: Keywords stored with the document.

    Returns:
        ``matches / sqrt(len(asked) * len(keyword_list))``, or ``0.0``
        when nothing matches or either side is empty.
    """
    if not keyword_query or not keyword_list:
        return 0.0

    # Split on the spaced separator only, so a keyword that itself contains
    # the letters "OR" is not cut in half.
    asked_keywords = keyword_query.split(' OR ')

    match = 0
    for asked_keyword in asked_keywords:
        formatted = asked_keyword.replace('keywords =', '')
        # Strip only the surrounding quotes/whitespace; inner spaces belong
        # to the keyword.
        formatted = formatted.replace('"', '').strip()
        if formatted and formatted in keyword_list:
            match += 1

    if match == 0:
        return 0.0

    return match / ((len(asked_keywords) * len(keyword_list)) ** 0.5)

def get_vector_hits(
    self,
    vector_client: any,
    configuration: any,
    embedding_queries: any
) -> any:
    """Collect vector-search hits for every collection and embedding.

    Each collection listed by ``self.qdrant_list_collections`` is searched
    once per embedding with ``configuration['top-k']`` as the limit.

    Args:
        vector_client: Client forwarded to the Qdrant helpers.
        configuration: Mapping providing ``'top-k'``.
        embedding_queries: Iterable of query embeddings.

    Returns:
        A list of dicts with ``source`` set to ``'vector'`` plus the
        ``database``, ``collection``, ``document`` and ``type`` payload
        fields and the hit ``score``.
    """
    cases = []
    for collection_name in self.qdrant_list_collections(qdrant_client=vector_client):
        for query_vector in embedding_queries:
            hits = self.qdrant_search_vectors(
                qdrant_client=vector_client,
                collection_name=collection_name,
                query_vector=query_vector,
                limit=configuration['top-k']
            )
            cases.extend(
                {
                    'source': 'vector',
                    'database': hit.payload['database'],
                    'collection': hit.payload['collection'],
                    'document': hit.payload['document'],
                    'type': hit.payload['type'],
                    'score': hit.score
                }
                for hit in hits
            )
    return cases

def get_search_hits(
    self,
    search_client: any,
    configuration: any,
    keyword_queries: any
) -> any:
    """Collect keyword-search hits for every index and keyword filter.

    Every index listed by ``self.meili_list_indexes`` is searched once per
    keyword filter (empty query text, the filter in ``options``) with
    ``configuration['top-k']`` as the limit. Each hit is scored with
    ``self.calculate_keyword_score``.

    Args:
        search_client: Client forwarded to the Meilisearch helpers.
        configuration: Mapping providing ``'top-k'``.
        keyword_queries: Iterable of keyword filter expressions.

    Returns:
        A list of dicts with ``source`` set to ``'search'`` plus the
        ``database``, ``collection``, ``document`` and ``type`` fields of
        the hit and its computed ``score``. Indexes or searches that fail
        contribute no hits.
    """
    recommended_cases = []
    collections = self.meili_list_indexes(
        meili_client=search_client
    )

    # Tolerate a None result from the listing helper instead of crashing.
    for collection in collections or []:
        for keywords in keyword_queries:
            results = self.meili_search_documents(
                meili_client=search_client,
                index_name=collection,
                query="",
                options={
                    'filter': keywords,
                    'attributesToRetrieve': ['database', 'collection', 'document', 'type', 'keywords'],
                    'limit': configuration['top-k']
                }
            )

            # A failed search yields None; skip it and keep the other hits.
            if results is None:
                continue

            for result in results['hits']:
                res_score = self.calculate_keyword_score(
                    keyword_query=keywords,
                    keyword_list=result['keywords']
                )
                recommended_cases.append({
                    'source': 'search',
                    'database': result['database'],
                    'collection': result['collection'],
                    'document': result['document'],
                    'type': result['type'],
                    'score': res_score
                })
    return recommended_cases
    
def get_vector_search_hits(
    self,
    vector_client: any,
    search_client: any,
    configuration: any,
    queries: any
) -> any:
    """Gather vector hits followed by keyword-search hits.

    Args:
        vector_client: Client used by ``self.get_vector_hits``.
        search_client: Client used by ``self.get_search_hits``.
        configuration: Settings forwarded to both hit collectors.
        queries: Mapping with the ``'embeddings'`` and ``'keywords'``
            queries.

    Returns:
        The vector hits concatenated with the search hits.
    """
    from_vectors = self.get_vector_hits(
        vector_client=vector_client,
        configuration=configuration,
        embedding_queries=queries['embeddings']
    )
    from_search = self.get_search_hits(
        search_client=search_client,
        configuration=configuration,
        keyword_queries=queries['keywords']
    )
    return from_vectors + from_search

def filter_hit_documents(
    self,
    configuration: any,
    hits: any
) -> any:
    """Keep only hits whose document was found by more than one source.

    Args:
        configuration: Unused; kept for a uniform helper signature.
        hits: List of hit dicts with at least ``'document'`` and
            ``'source'`` keys.

    Returns:
        A DataFrame with the rows of every document that appears with more
        than one distinct ``source``. Empty input yields an empty frame
        that still carries the hit columns.
    """
    if hits is None or len(hits) == 0:
        # pd.DataFrame([]) has no columns, so the groupby below would raise
        # KeyError; return a well-formed empty frame instead.
        return pd.DataFrame(
            columns=['source', 'database', 'collection', 'document', 'type', 'score']
        )

    df = pd.DataFrame(hits)
    sources_per_document = df.groupby('document')['source'].nunique()
    ids_with_both = sources_per_document[sources_per_document > 1].index
    return df[df['document'].isin(ids_with_both)]

def match_hit_documents(
    self,
    configuration: any,
    filtered_df: any
) -> any:
    """Pair vector and search hits of the same document into hybrid hits.

    A vector row and a search row match when their ``database``,
    ``collection``, ``type`` and ``document`` values are equal. The hybrid
    score is ``vector_score * alpha + search_score * (1 - alpha)``.

    Args:
        configuration: Mapping providing the ``'alpha'`` weight.
        filtered_df: DataFrame of hits with ``source``, ``database``,
            ``collection``, ``document``, ``type`` and ``score`` columns.

    Returns:
        A DataFrame with the columns ``source`` (always ``'hybrid'``),
        ``database``, ``collection``, ``document`` and ``score``; one row
        per matching vector/search pair. The columns are present even when
        nothing matches, so the result can always be sorted by ``score``.
    """
    alpha = configuration['alpha']
    result_columns = ['source', 'database', 'collection', 'document', 'score']

    if filtered_df.empty:
        return pd.DataFrame(columns=result_columns)

    match_keys = ('database', 'collection', 'type', 'document')
    # Split once; the search subset does not change between vector rows.
    vector_rows = filtered_df[filtered_df['source'] == 'vector'].to_dict('records')
    search_rows = filtered_df[filtered_df['source'] == 'search'].to_dict('records')

    matched_documents = []
    for vector_row in vector_rows:
        for search_row in search_rows:
            if all(vector_row[key] == search_row[key] for key in match_keys):
                hybrid_score = vector_row['score'] * alpha + search_row['score'] * (1 - alpha)
                matched_documents.append({
                    'source': 'hybrid',
                    'database': search_row['database'],
                    'collection': search_row['collection'],
                    'document': search_row['document'],
                    'score': hybrid_score
                })

    return pd.DataFrame(matched_documents, columns=result_columns)

def select_context_documents(
    self,
    configuration: any,
    matched_df: any
) -> any:
    """Pick the best-scoring distinct documents for the context.

    Rows are visited in descending ``score`` order; the first row of each
    document is kept until ``configuration['context-amount']`` rows have
    been collected.

    Args:
        configuration: Mapping providing ``'context-amount'``, the maximum
            number of documents to return.
        matched_df: DataFrame of hybrid hits with a ``score`` column and
            the document id in the fourth column.

    Returns:
        A list of rows (each a list of the row's values), at most
        ``context-amount`` long. An empty frame yields an empty list.
    """
    if matched_df.empty:
        return []

    max_documents = configuration['context-amount']
    sorted_df = matched_df.sort_values('score', ascending=False)

    seen_documents = set()
    context_documents = []
    for row in sorted_df.values.tolist():
        # Stop at exactly ``context-amount`` rows; the earlier ``<=`` check
        # let one extra document through.
        if len(context_documents) >= max_documents:
            break
        row_id = row[3]  # position of the 'document' column in hybrid hits
        if row_id in seen_documents:
            continue
        seen_documents.add(row_id)
        context_documents.append(row)
    return context_documents

def get_top_documents(
    self,
    hits: any,
    configuration: any
) -> any:
    """Reduce raw hits to the documents used for the context.

    Runs ``self.filter_hit_documents``, ``self.match_hit_documents`` and
    ``self.select_context_documents`` in that order.

    Args:
        hits: Collection of hit dicts from the vector and keyword searches
            (previously mis-annotated as ``str``).
        configuration: Settings forwarded to every step.

    Returns:
        The result of ``self.select_context_documents``.
    """
    filtered_df = self.filter_hit_documents(
        configuration=configuration,
        hits=hits
    )
    matched_df = self.match_hit_documents(
        configuration=configuration,
        filtered_df=filtered_df
    )
    return self.select_context_documents(
        configuration=configuration,
        matched_df=matched_df
    )

def create_context(
    self,
    mongo_client: any,
    documents: any
) -> any:
    """Concatenate the stored text of the selected documents.

    Each entry of ``documents`` is read positionally: index 1 is the
    database, index 2 the collection and index 3 the document id, which is
    wrapped in ``ObjectId`` for the ``_id`` lookup.

    Args:
        mongo_client: Client forwarded to ``self.mongo_get_document``.
        documents: Iterable of document rows as described above.

    Returns:
        The ``'data'`` fields of all documents that were found, joined
        without a separator. Documents whose lookup returns ``None`` are
        skipped.
    """
    parts = []
    for metadata in documents:
        data = self.mongo_get_document(
            mongo_client=mongo_client,
            database_name=metadata[1],
            collection_name=metadata[2],
            filter_query={
                '_id': ObjectId(metadata[3])
            }
        )
        # A missing or failed lookup comes back as None; skip it rather
        # than abort the whole context.
        if data is None:
            continue
        parts.append(data['data'])
    return ''.join(parts)

def context_pipeline(
    self,
    document_client: any,
    vector_client: any,
    search_client: any,
    configuration: any,
    prompt: any
) -> any:
    """Run the retrieval steps that turn a prompt into a context string.

    Steps, each announced with a progress ``print``: build the queries,
    collect vector and keyword hits, select the top documents, and
    assemble their stored text.

    Args:
        document_client: Client passed to ``self.create_context``.
        vector_client: Client used for the vector search.
        search_client: Client used for the keyword search.
        configuration: Settings forwarded to the steps.
        prompt: Prompt to build the context for.

    Returns:
        The result of ``self.create_context``.
    """
    print('Creating queries')
    queries = self.generate_prompt_queries(
        configuration=configuration,
        prompt=prompt
    )

    print('Getting hits')
    hits = self.get_vector_search_hits(
        vector_client=vector_client,
        search_client=search_client,
        configuration=configuration,
        queries=queries
    )

    print('Getting top documents')
    top_documents = self.get_top_documents(
        hits=hits,
        configuration=configuration
    )

    print('Creating context')
    return self.create_context(
        mongo_client=document_client,
        documents=top_documents
    )

def get_context(
    self,
    prompt: str
) -> str:
    """Build the retrieval context for a prompt.

    Creates the Mongo, Qdrant and Meilisearch clients from the connection
    settings on ``self.valves`` and runs ``self.context_pipeline`` with a
    fixed pipeline configuration.

    Args:
        prompt: Prompt to build the context for.

    Returns:
        The value returned by ``self.context_pipeline``.
    """
    print('Creating clients')
    settings = self.valves
    document_client = self.mongo_setup_client(
        username=settings.MONGO_USER,
        password=settings.MONGO_PASSWORD,
        address=settings.MONGO_ADDRESS,
        port=settings.MONGO_PORT
    )
    vector_client = self.qdrant_setup_client(
        api_key=settings.QDRANT_KEY,
        address=settings.QDRANT_ADDRESS,
        port=settings.QDRANT_PORT
    )
    search_client = self.meili_setup_client(
        host=settings.MEILI_HOST,
        api_key=settings.MEILI_KEY
    )
    print('Clients setup')

    # Fixed retrieval settings consumed by the pipeline steps.
    pipeline_configuration = dict([
        ('chunk-size', 50),
        ('chunk-overlap', 0),
        ('model-name', 'sentence-transformers/all-MiniLM-L6-v2'),
        ('top-k', 10),
        ('alpha', 0.5),
        ('context-amount', 5),
    ])

    print('Running context pipeline')
    context = self.context_pipeline(
        document_client=document_client,
        vector_client=vector_client,
        search_client=search_client,
        configuration=pipeline_configuration,
        prompt=prompt
    )
    print('Context pipeline run')

    return context