In [None]:
import openai
from openai.embeddings_utils import get_embedding, get_embeddings
import os
from dotenv import load_dotenv

# Use the PyPDF2 library to read a PDF file
from pypdf import PdfReader
from tqdm import tqdm

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
load_dotenv()

In [1]:
ENGINE = ""

## Set up Vector Database

In [None]:
import chromadb
from datetime import datetime
import hashlib
from chromadb.utils import embedding_functions

In [None]:
COLLECTION_NAME = "semantic-search"

In [None]:
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-mpnet-base-cos-v1")

In [None]:
client = chromadb.PersistentClient(path="/tmp/semantic")

In [None]:
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"},
    embedding_function=sentence_transformer_ef
    )


In [None]:
def my_hash(s):
    # Return the MD5 hash of the input string as a hexadecimal string
    return hashlib.md5(s.encode()).hexdigest()

my_hash('I love to hash it')

'ae76cc4dfd345ecaeea9b8ba0d5c3437'

In [None]:
def prepare_for_chroma(texts, engine=None):
    now = datetime.utcnow()

    if engine:
        embeddings = get_embeddings(texts, engine=ENGINE)
        return {
        'ids':[my_hash(text) for text in texts],
        'documents': [text for text in texts],
        'embeddings': [embedding for embedding in embeddings],
        'metadata': [dict(head=text[0], date_uploaded=str(now)) for text in texts]
    }

    return {
        'ids':[my_hash(text) for text in texts],
        'documents': [text for text in texts],
        'metadata': [dict(head=text[0], date_uploaded=str(now)) for text in texts]
    }

In [None]:
texts = ['hi']

In [None]:
response =  prepare_for_chroma(texts, engine=ENGINE)

In [None]:
response

{'ids': ['49f68a5c8493ec2c0bf489821c21fc3b'],
 'documents': ['hi'],
 'metadata': [{'head': 'h', 'date_uploaded': '2023-10-20 13:24:57.880216'}]}

In [None]:
def upload_texts_to_chroma(texts, collection, batch_size=None, show_progress_bar=True, engine=None):
    total_added = 0
    if not batch_size:
        batch_size = len(texts)

    _range = range(0, len(texts), batch_size)
    for i in tqdm(_range) if show_progress_bar else _range:
        batch = texts[i : i + batch_size]
        output = prepare_for_chroma(batch, engine=engine)

        if output.get('embeddings', None):
            out = collection.add(
                documents= output['documents'],
                embeddings= output['embeddings'],
                metadatas= output['metadata'],
                ids= output['ids']
                )

        else:
            out = collection.add(
                documents= output['documents'],
                metadatas= output['metadata'],
                ids= output['ids']
                )
        print(out)
        total_added += 1

        return total_added

In [None]:
upload_texts_to_chroma(texts, collection, engine=ENGINE)

  0%|          | 0/1 [00:00<?, ?it/s]


1

In [None]:
def query_from_chroma(query, collection, engine=None, top_k=3):
    if engine:
        query_embedding = get_embedding(query, engine=ENGINE)

        return collection.query(
            query_embeddings=query_embedding,
            n_results=top_k
            )
    return collection.query(
                query_texts=[query],
                n_results=top_k
            )

In [None]:
query_from_chroma('hello', collection)

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


{'ids': [['49f68a5c8493ec2c0bf489821c21fc3b']],
 'distances': [[0.07519697162713412]],
 'metadatas': [[{'date_uploaded': '2023-10-18 09:35:59.505288'}]],
 'embeddings': None,
 'documents': [['hi']]}

In [None]:
def delete_texts_from_chroma(texts, collection):
    hashes = [my_hash(text) for text in texts]

    return collection.delete(
        ids=hashes
    )

In [None]:
# delete text
delete_texts_from_chroma(texts, collection)

In [None]:
#test collection is empty
query_from_chroma('hello', collection)

## Open-Source Embedding Alternatives

In [None]:
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


### Testing out the bi encoder

In [None]:
query = "How many people live in London?"
docs = ["Around 9 Million people live in London", "London is known for its financial district"]

#Load the model
# Initializing a SentenceTransformer model with the 'multi-qa-mpnet-base-cos-v1' pre-trained model
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-cos-v1')

#Encode query and documents
query_emb = model.encode(query)
doc_emb = model.encode(docs)

#Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

#Combine docs & scores
doc_score_pairs = list(zip(docs, scores))

#Sort by decreasing score
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

#Output passages & scores
for doc, score in doc_score_pairs:
    print(score, doc)


0.8814705014228821 Around 9 Million people live in London
0.5050859451293945 London is known for its financial district


In [None]:
doc_emb.shape

(2, 768)

In [None]:
# Only keep documents of at least 50 characters split by a custom delimiter
split = list(filter(lambda x: len(x) > 50, intro_to_kube.split('\n\n\n')))

avg_length = sum([len(model.encode(t)) for t in split]) / len(split)
print(f'custom delimiter approach has {len(split)} documents with average length {avg_length:.1f} tokens')

custom delimiter approach has 30 documents with average length 768.0 tokens


In [None]:
corpus_embeddings = model.encode(split, show_progress_bar=True)

Batches: 100%|██████████| 1/1 [00:41<00:00, 41.66s/it]


In [None]:
# Normalize the embeddings to unit length
corpus_embeddings = corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform kmean clustering
clustering_model = AgglomerativeClustering(n_clusters=None, metric='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

# Print the number of embeddings in each cluster
unique_labels, counts = np.unique(cluster_assignment, return_counts=True)
for label, count in zip(unique_labels, counts):
    print(f'Cluster {label}: {count} embeddings')

Cluster 0: 8 embeddings
Cluster 1: 3 embeddings
Cluster 2: 3 embeddings
Cluster 3: 2 embeddings
Cluster 4: 2 embeddings
Cluster 5: 1 embeddings
Cluster 6: 3 embeddings
Cluster 7: 1 embeddings
Cluster 8: 4 embeddings
Cluster 9: 1 embeddings
Cluster 10: 1 embeddings
Cluster 11: 1 embeddings


In [None]:
pruned_documents = []
for _label, count in zip(unique_labels, counts):
    pruned_documents.append('\n\n'.join([text for text, label in zip(split, cluster_assignment) if label == _label]))

In [None]:
print(pruned_documents[0])

Course: Introduction to Kubernetes L1:Introduction In this course, we will explore what kubernetes is; its architecture and building blocks, how it can be run on our local system or in the cloud, different ways we can configure and protect sensitive information, and how we can let external applications access our kubernetes application. We will also learn how to deploy and manage applications and resources with kubernetes. Here is a guide on how to create a GCP/Azure account Before diving into Kubernetes, we need to know some fundamental services that make working with Kubernetes easy and understandable. Containers Containers are an application-centric method used to deliver high-performing, scalable applications on any infrastructure of your choice. Containers provide a portable, isolated way of deploying microservices without disturbance from other microservices in our application. These containers install all the other dependencies needed by the microservice to function in a virtual

In [None]:
upload_texts_to_chroma(pruned_documents, collection, batch_size=128)

  0%|          | 0/1 [00:16<?, ?it/s]

None





1

In [None]:
query = "How do I setup Kubernetes?"

results_from_chroma = query_from_chroma(query, collection, top_k=5)

In [None]:
results_from_chroma

{'ids': [['2c4f78d725df2535b7419e2bdc16082d',
   'f22fe51da433105ca64c8495cb1f2468',
   '41ac779329c19684696f020540501ff9',
   '0811ea60dbb7b265bfee21243da17ebf',
   '45c44600ec8fb03b99ff1ebe2ee6c81d']],
 'distances': [[0.3146678855570244,
   0.41500638989238825,
   0.4536414574616634,
   0.4635328474400209,
   0.4721777930495821]],
 'metadatas': [[{'date_uploaded': '2023-10-20 13:35:21.827338', 'head': 'C'},
   {'date_uploaded': '2023-10-20 13:35:21.827338', 'head': 'C'},
   {'date_uploaded': '2023-10-20 13:35:21.827338', 'head': 'C'},
   {'date_uploaded': '2023-10-20 13:35:21.827338', 'head': 'I'},
   {'date_uploaded': '2023-10-20 13:35:21.827338', 'head': 'W'}]],
 'embeddings': None,
 'documents': [['Course: Introduction to Kubernetes L1:Introduction In this course, we will explore what kubernetes is; its architecture and building blocks, how it can be run on our local system or in the cloud, different ways we can configure and protect sensitive information, and how we can let exter