In [38]:
import openai
from openai.embeddings_utils import get_embedding, get_embeddings
import os
from dotenv import load_dotenv

# Use the PyPDF2 library to read a PDF file
from pypdf import PdfReader
from tqdm import tqdm
load_dotenv()

True

In [39]:
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [18]:
ENGINE = 'text-embedding-ada-002'

In [5]:
embedded_text = get_embedding('I love to be vectorized', engine=ENGINE)

In [6]:
len(embedded_text)

1536

## Open-Source Embedding Alternatives

In [8]:
from sentence_transformers import SentenceTransformer, util

In [9]:
query = "How many people live in London?"
docs = ["Around 9 Million people live in London", "London is known for its financial district"]

#Load the model
# Initializing a SentenceTransformer model with the 'multi-qa-mpnet-base-cos-v1' pre-trained model
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-cos-v1')

#Encode query and documents
query_emb = model.encode(query)
doc_emb = model.encode(docs)

#Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

#Combine docs & scores
doc_score_pairs = list(zip(docs, scores))

#Sort by decreasing score
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

#Output passages & scores
for doc, score in doc_score_pairs:
    print(score, doc)


Downloading (…)e891a/.gitattributes: 100%|██████████| 737/737 [00:00<00:00, 5.29MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 1.30MB/s]
Downloading (…)92a80e891a/README.md: 100%|██████████| 9.19k/9.19k [00:00<00:00, 31.7MB/s]
Downloading (…)a80e891a/config.json: 100%|██████████| 571/571 [00:00<00:00, 2.08MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 833kB/s]
Downloading (…)91a/data_config.json: 100%|██████████| 25.5k/25.5k [00:00<00:00, 60.0MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:01<00:00, 275MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 350kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 1.82MB/s]
Downloading (…)e891a/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 66.5MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 2.83MB/s]
Downloading (…)891a/train_script.py: 100%|██████

0.8814705014228821 Around 9 Million people live in London
0.5050859451293945 London is known for its financial district


In [10]:
doc_emb.shape

(2, 768)

### Read PDF files

In [None]:
# Open the PDF file in read-binary mode
with open('', 'rb') as file:

    # Create a PDF reader object
    reader = PdfReader(file)

    # Initialize an empty string to hold the text
    principles_of_ds = ''

    # Loop through each page in the PDF file
    for page in tqdm(reader.pages):
       
        # Extract the text from the page
        text = page.extract_text()

        # Find the starting point of the text we want to extract
        # In this case, we are extracting text starting from the string ' ]'
        principles_of_ds += '\n\n' + text[text.find(' ]')+2:]

# Strip any leading or trailing whitespace from the resulting string
principles_of_ds = principles_of_ds.strip()

In [2]:
# Importing the tiktoken library
import tiktoken

# Initializing a tokenizer for the 'cl100k_base' model
# This tokenizer is designed to work with the 'ada-002' embedding model
tokenizer = tiktoken.get_encoding("cl100k_base")

# Using the tokenizer to encode the text 'hey there'
# The resulting output is a list of integers representing the encoded text
# This is the input format required for embedding using the 'ada-002' model
tokenizer.encode('hey there')

[36661, 1070]

In [3]:
import re
# Function to split the text into chunks of a maximum number of tokens. Inspired by OpenAI
def overlapping_chunks(text, max_tokens = 500, overlapping_factor = 5):
    '''
    max_tokens: tokens we want per chunk
    overlapping_factor: number of sentences to start each chunk with that overlaps with the previous chunk
    '''

    # Split the text using punctuation
    sentences = re.split(r'[.?!]', text)

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
    
    chunks, tokens_so_far, chunk = [], 0, []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is greater 
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            if overlapping_factor > 0:
                chunk = chunk[-overlapping_factor:]
                tokens_so_far = sum([len(tokenizer.encode(c)) for c in chunk])
            else:
                chunk = []
                tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of 
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks

In [4]:
from urllib.request import urlopen

#

# A textbook about insects
text = urlopen('https://www.gutenberg.org/cache/epub/10834/pg10834.txt').read().decode()

In [8]:
split = overlapping_chunks(text, overlapping_factor=0)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'non-overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

non-overlapping chunking approach has 17 documents with average length 476.7 tokens


In [9]:
split = overlapping_chunks(text)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

overlapping chunking approach has 24 documents with average length 477.4 tokens


## Set up Vector Database

In [6]:
import chromadb
from datetime import datetime
import hashlib

In [2]:
COLLECTION_NAME = "semantic-search"

In [3]:
client = chromadb.PersistentClient(path="/tmp/semantic")

In [9]:
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"}
    )


In [7]:
def my_hash(s):
    # Return the MD5 hash of the input string as a hexadecimal string
    return hashlib.md5(s.encode()).hexdigest()

my_hash('I love to hash it')

'ae76cc4dfd345ecaeea9b8ba0d5c3437'

In [43]:
def prepare_for_chroma(texts, engine=None):
    now = datetime.utcnow()

    if engine:
        embeddings = get_embeddings(texts, engine=ENGINE)
    
    return {
        'ids':[my_hash(text) for text in texts],
        'documents': [text for text in texts],
        'embeddings': [embedding for embedding in embeddings],
        'metadata': {'date_uploaded':str(now)}
    }
    

In [26]:
texts = ['hi']

In [29]:
response =  prepare_for_chroma(texts, engine=ENGINE)

In [None]:
response

In [45]:
def upload_texts_to_chroma(texts, collection, batch_size=None, show_progress_bar=True, engine=None):
    total_added = 0
    if not batch_size:
        batch_size = len(texts)

    _range = range(0, len(texts), batch_size)
    for i in tqdm(_range) if show_progress_bar else _range:
        batch = texts[i : i + batch_size]
        output = prepare_for_chroma(batch, engine=engine)

        out = collection.add(
            documents= output['documents'],
            embeddings= output['embeddings'],
            metadatas= output['metadata'],
            ids= output['ids']
        )
        
        total_added += 1

        return total_added

In [46]:
upload_texts_to_chroma(texts, collection, engine=ENGINE)

  0%|          | 0/1 [00:00<?, ?it/s]Add of existing embedding ID: 49f68a5c8493ec2c0bf489821c21fc3b
Insert of existing embedding ID: 49f68a5c8493ec2c0bf489821c21fc3b
  0%|          | 0/1 [00:00<?, ?it/s]


1

In [None]:
def query_from_chroma(query)