In [7]:
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.utils import embedding_functions

In [4]:
chroma_client = chromadb.Client()

### Create a collection
Collections are where you'll store your embeddings, documents, and any
additional metadata. You can create a collection with a 
name.

### Changing the distance function[​](https://docs.trychroma.com/usage-guide#changing-the-distance-function "Direct link to Changing the distance function")

`create_collection`  also takes an optional  `metadata`  argument which
can be used to customize the distance method of the embedding space 
by setting the value of  `hnsw:space`.

Valid options for `hnsw:space` are "l2", "ip" or "cosine". The
**default** is "l2". The equations for each can be found in the docs for 
Hnswlib
[here](https://github.com/nmslib/hnswlib/tree/master#python-bindings).


### Embeddings

Chroma provides lightweight wrappers around popular embedding providers,
making it easy to use them in your apps. You can set an embedding
function when you create a Chroma collection, which will be used
automatically, or you can call them directly yourself.


To get Chroma's embedding functions, import the
chromadb.utils.embedding_functions module.


By default, Chroma uses the  [Sentence
Transformers](https://www.sbert.net/)  `all-MiniLM-L6-v2`  model to
create embeddings. This embedding model can create sentence and document
embeddings that can be used for a wide variety of tasks. This embedding
function runs locally on your machine, and may require you download the
model files (this will happen automatically).


```
default_ef = embedding_functions.DefaultEmbeddingFunction()
```

#### Sentence Transformers[​](https://docs.trychroma.com/embeddings#sentence-transformers "Direct link to Sentence Transformers")

Chroma can also use any  [Sentence Transformers](https://www.sbert.net/)
model to create embeddings.


```
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
```

You can pass in an optional  `model_name`  argument, which lets you
choose which Sentence Transformers model to use. By default, Chroma uses  
`all-MiniLM-L6-v2`. You can see a list of all available models
[here](https://www.sbert.net/docs/pretrained_models.html).

Let's use for example the `all-mpnet-base-v2` from sentence transformers.

In [16]:
sentence_transformer_ef = (
    embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-mpnet-base-v2"
    )
)

Downloading (…)a8e1d/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 4.40MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 2.14MB/s]
Downloading (…)b20bca8e1d/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 35.6MB/s]
Downloading (…)0bca8e1d/config.json: 100%|██████████| 571/571 [00:00<00:00, 7.13MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 634kB/s]
Downloading (…)e1d/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 49.8MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:10<00:00, 43.6MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 715kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 1.30MB/s]
Downloading (…)a8e1d/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 6.93MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 2.51MB/s]
Downloading (…)8e1d/train_script.py: 100%|█

In [22]:
# emb_fn = "all-mpnet-base-v2" # embedding model. Default all-MiniLM-L6-v2
dis_fn = "cosine" # distance function. Default l2 

collection = chroma_client.create_collection(
    name="cosmic_chronicles",
    embedding_function=sentence_transformer_ef,
    metadata={"hnsw:space": dis_fn},
)

### Delete a collection

Delete a collection and all associated embeddings, documents, and
metadata. ⚠️ This is destructive and not reversible

In [21]:
chroma_client.delete_collection(name="cosmic_chronicles") 

### Add some text documents to the collection
Chroma will store your text, and handle tokenization, embedding, and
indexing automatically.

 Let's create a collection named "Cosmic Chronicles". This collection
 will contain short summaries of interesting astronomical 
 events and discoveries. Here's how you can add these documents to your
 collection:


In [23]:
collection.add(
    documents=[
        "Discovery of water on Mars: In 2020, scientists confirmed the existence of underground lakes on Mars.",
        "First image of a black hole: In 2019, the Event Horizon Telescope captured the first image of a black hole located in the M87 galaxy.",
        "Voyager 1 enters interstellar space: In 2012, the Voyager 1 spacecraft became the first human-made object to enter interstellar space.",
        "Detection of gravitational waves: In 2015, the LIGO observatory made the first direct observation of gravitational waves, confirming a major prediction of Albert Einstein's general theory of relativity.",
        "The Hubble Space Telescope: Launched in 1990, the Hubble Space Telescope has provided some of the most detailed images of distant galaxies, nebulae, and stars.",
        "The Kepler Mission: Launched in 2009, the Kepler space telescope has discovered more than 2,600 confirmed planets outside our solar system.",
        "The discovery of the first exoplanet: In 1995, the first exoplanet orbiting a sun-like star, 51 Pegasi b, was discovered.",
    ],
    metadatas=[
        {"source": "NASA"},
        {"source": "Event Horizon Telescope"},
        {"source": "NASA"},
        {"source": "LIGO"},
        {"source": "NASA"},
        {"source": "NASA"},
        {"source": "Observatoire de Genève, Michel Mayor, Didier Queloz"},
    ],
    ids=["doc1", "doc2", "doc3", "doc4", "doc5", "doc6", "doc7"],
)

### Query the collection
You can query the collection with a list of query texts, and Chroma will
return the n most similar results. It's that easy!

In [24]:
results = collection.query(
    query_texts=["Discovery of exoplanets"], n_results=7
)
results

{'ids': [['doc7', 'doc6', 'doc4', 'doc5', 'doc1', 'doc3', 'doc2']],
 'embeddings': None,
 'documents': [['The discovery of the first exoplanet: In 1995, the first exoplanet orbiting a sun-like star, 51 Pegasi b, was discovered.',
   'The Kepler Mission: Launched in 2009, the Kepler space telescope has discovered more than 2,600 confirmed planets outside our solar system.',
   "Detection of gravitational waves: In 2015, the LIGO observatory made the first direct observation of gravitational waves, confirming a major prediction of Albert Einstein's general theory of relativity.",
   'The Hubble Space Telescope: Launched in 1990, the Hubble Space Telescope has provided some of the most detailed images of distant galaxies, nebulae, and stars.',
   'Discovery of water on Mars: In 2020, scientists confirmed the existence of underground lakes on Mars.',
   'Voyager 1 enters interstellar space: In 2012, the Voyager 1 spacecraft became the first human-made object to enter interstellar space.',


By default data stored in Chroma is ephemeral making it easy to
prototype scripts. It's easy to make Chroma persistent so you can reuse
every collection you create and add more documents to it later. It will
load your data automatically when you start the client, and save it
automatically when you close it. 

In [25]:
collection.peek

<bound method Collection.peek of Collection(name=cosmic_chronicles)>

### Embeddings in the context of LLMs

The choice between BERT-based embeddings and GPT-based embeddings for data preparation in Large Language Models (LLMs) applications depends on the specific requirements of the task at hand. Here are some general guidelines:

**BERT-based embeddings** are typically used when the task requires understanding the context from both directions (left and right) around a word. BERT is a bidirectional model, meaning it looks at the context from both directions. This makes BERT-based embeddings very effective for tasks like:

- **Question Answering**: If you're building a system to answer questions based on a given text, BERT-based embeddings can help the model understand the context of the question and find the most relevant answer in the text.
- **Named Entity Recognition**: If you're trying to identify and categorize entities in a text, BERT-based embeddings can help the model understand the context around each entity.
- **Sentiment Analysis**: If you're analyzing the sentiment expressed in a piece of text, BERT-based embeddings can help the model understand the nuances of the sentiment.

**GPT-based embeddings** are typically used when the task involves generating text. GPT is a unidirectional model, meaning it generates a sentence from left to right. This makes GPT-based embeddings very effective for tasks like:

- **Text Generation**: If you're building a system to generate human-like text, GPT-based embeddings can help the model generate contextually relevant sentences.
- **Machine Translation**: If you're translating text from one language to another, GPT-based embeddings can help the model generate fluent translations.
- **Summarization**: If you're summarizing a longer piece of text, GPT-based embeddings can help the model generate a concise and relevant summary.

In general, the choice between BERT-based and GPT-based embeddings depends on whether the task requires understanding the context around a word (in which case BERT-based embeddings are typically used) or generating text (in which case GPT-based embeddings are typically used). However, both types of embeddings are very versatile and can be used for a wide range of tasks.

### Sentence Transformers

SentenceTransformers is a Python library that provides an easy-to-use interface for generating sentence embeddings using transformer-based models, which includes BERT and its variants.

The library is built on top of the Hugging Face's Transformers library, which provides the underlying transformer models like BERT, RoBERTa, DistilBERT, etc. SentenceTransformers adds a pooling operation on top of these models to generate a single vector for the entire input sequence (sentence).

The key innovation of SentenceTransformers is that it modifies the pretraining objective of these transformer models to optimize them for generating sentence embeddings. This is done using a siamese or triplet network structure, where the aim is to bring closer the embeddings of semantically similar sentences and separate the embeddings of semantically dissimilar sentences.

So, in summary, SentenceTransformers provides a family of models for generating sentence embeddings, and these models are based on BERT and other transformer architectures. The embeddings generated by these models are bidirectional, meaning they capture the context from both directions (left and right) around each word in a sentence.

### List of other popular embeddings


Embedding techniques are used to convert text data into numerical vectors that can be processed by machine learning algorithms. Here are some of the main embedding techniques:

1. **Word2Vec**: Developed by Google, Word2Vec is a two-layer neural network that processes text by vectorizing words. Its input is a text corpus and its output is a set of vectors, with each vector representing a word in the corpus. Word2Vec uses either the Continuous Bag of Words (CBOW) or Skip-Gram method to predict words within a certain context.

2. **GloVe (Global Vectors for Word Representation)**: Developed by Stanford, GloVe is an unsupervised learning algorithm that obtains vector representations for words. It does this by aggregating global word-word co-occurrence statistics from a corpus and then mapping these statistics into a word vector space.

3. **FastText**: Developed by Facebook's AI Research lab, FastText is an extension to Word2Vec. Instead of feeding individual words into the neural network, FastText treats each word as a bag of character n-grams. This allows it to capture the meaning of shorter words and allows it to understand suffixes and prefixes.

4. **BERT (Bidirectional Encoder Representations from Transformers)**: Developed by Google, BERT is a transformer-based machine learning technique for natural language processing. Unlike Word2Vec and GloVe, which generate a single static embedding for each word in the vocabulary, BERT generates context-dependent embeddings. This means that the word embeddings are influenced by the other words in the sentence.

5. **GPT (Generative Pretrained Transformer)**: Developed by OpenAI, GPT is another transformer-based model. While BERT is designed to handle tasks that require understanding both the left and right context of a word (bidirectional understanding), GPT is designed to handle tasks that require understanding only the right context (unidirectional understanding). GPT-3, the latest version of GPT as of my knowledge cutoff in September 2021, is particularly powerful and can generate human-like text given a certain prompt.

6. **ELMo (Embeddings from Language Models)**: Developed by Allen AI, ELMo is a deep contextualized word representation that models both complex characteristics of word use (e.g., syntax and semantics), and how these uses vary across linguistic contexts (i.e., polysemy). Unlike traditional word embeddings such as Word2Vec and GloVe, which generate a single static embedding for each word, ELMo generates embeddings dynamically based on the word context.

These are just a few examples of the many techniques available for generating word embeddings. The choice of technique depends on the specific requirements of the task at hand.

### Select a distance method

In the context of embeddings and Large Language Models (LLMs) applications, the choice of distance metric depends on the specific requirements of the task and the nature of the embeddings. Here's a brief overview of the three distance metrics you mentioned:

1. **Euclidean Distance (L2)**: This is the straight-line distance between two points in a space. It's often used when the magnitude of the embeddings is important. However, in high-dimensional spaces (like the ones typically produced by LLMs), Euclidean distance can suffer from the "curse of dimensionality," where all points appear to be roughly equidistant to one another.

2. **Cosine Similarity**: This measures the cosine of the angle between two vectors. It's often used when the orientation (or direction) of the embeddings is more important than their magnitude. Cosine similarity is particularly useful for text data, as it can capture the semantic similarity between documents (or words) regardless of their size.

3. **Inner Product (IP)**: This is the sum of the product of the corresponding entries of the two sequences of numbers. In the context of embeddings, it can be used to measure the similarity between two vectors, but unlike cosine similarity, it also takes into account the magnitude of the vectors.

In general, if you're working with text data and using embeddings to capture semantic similarity, cosine similarity is often a good choice. However, if the magnitude of the embeddings is also important (for example, if you're using embeddings to capture the importance or frequency of certain features), then you might want to consider using the inner product or Euclidean distance.

It's also worth noting that the choice of distance metric can also depend on the specific embedding technique you're using. Some techniques might produce embeddings where certain distance metrics are more appropriate than others. For example, Word2Vec embeddings are often compared using cosine similarity, while BERT embeddings might be compared using the inner product. 

As always, the best way to determine which distance metric to use is to
experiment with different options and see which one works best for your
specific task.



### Summary of embedding and distances

Here's a simplified table that outlines some common applications of Large Language Models (LLMs), along with a suggested embedding technique and distance method for each. Please note that these are general suggestions and the best choice can vary depending on the specifics of the task, the available data, and the computational resources.

| Application | Embedding Technique | Distance Method |
|-------------|---------------------|-----------------|
| Text Classification | BERT | Cosine |
| Sentiment Analysis | BERT | Cosine |
| Named Entity Recognition | BERT | Cosine |
| Question Answering | BERT | Cosine |
| Text Generation | GPT | Inner Product |
| Machine Translation | GPT | Inner Product |
| Summarization | GPT | Inner Product |
| Semantic Search | BERT or Sentence-BERT | Cosine |
| Chatbots and Conversational Agents | GPT | Inner Product |

In general, BERT and its variants (like Sentence-BERT for sentence embeddings) are often used for tasks that require understanding the context of a sentence, with cosine similarity as the distance method to capture semantic similarity. On the other hand, GPT and its variants are often used for tasks that involve generating text, with the inner product as the distance method to capture the similarity in the generated text.

In [27]:
import os
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

completion = openai.ChatCompletion.create(
  model="gpt-4",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"}
  ]
)

print(completion.choices[0].message)


{
  "content": "Hello! How can I assist you today?\n",
  "role": "assistant"
}


In [28]:
import os
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

completion = openai.ChatCompletion.create(
  model="gpt-4",
  messages=[
    {
    "role": "user",
    "content": "Question: Given the following email content, is the email spam or not spam? Email Content: Hi, just checking in about our meeting tomorrow. Answer Choices: (A) Spam, (B) Not Spam."
    },
    {
    "role": "assistant",
    "content": "Answer: (B) Not Spam"
    },
    {
    "role": "user",
    "content": "Question: Given the following email content, is the email spam or not spam? Email Content: Congratulations! You've been selected for a free vacation! Click here now! Answer Choices: (A) Spam, (B) Not Spam."
    },
    {
    "role": "assistant",
    "content": "Answer: (A) Spam"
    },
    {
    "role": "user",
    "content": "Question: Given the following email content, is the email spam or not spam? Email Content: Dear user, you have won a million dollars! Click here to claim your prize. Answer Choices: (A) Spam, (B) Not Spam."
    },
    {
    "role": "assistant",
    "content": "Answer: "
    }
  ]
)

print(completion.choices[0].message)

{
  "content": "(A) Spam",
  "role": "assistant"
}


In [29]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import openai

categories = ['rec.sport.baseball', 'rec.sport.hockey']
sports_dataset = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories=categories)

In [31]:
print(sports_dataset['data'][0])

From: dougb@comm.mot.com (Doug Bank)
Subject: Re: Info needed for Cleveland tickets
Reply-To: dougb@ecs.comm.mot.com
Organization: Motorola Land Mobile Products Sector
Distribution: usa
Nntp-Posting-Host: 145.1.146.35
Lines: 17

In article <1993Apr1.234031.4950@leland.Stanford.EDU>, bohnert@leland.Stanford.EDU (matthew bohnert) writes:

|> I'm going to be in Cleveland Thursday, April 15 to Sunday, April 18.
|> Does anybody know if the Tribe will be in town on those dates, and
|> if so, who're they playing and if tickets are available?

The tribe will be in town from April 16 to the 19th.
There are ALWAYS tickets available! (Though they are playing Toronto,
and many Toronto fans make the trip to Cleveland as it is easier to
get tickets in Cleveland than in Toronto.  Either way, I seriously
doubt they will sell out until the end of the season.)

-- 
Doug Bank                       Private Systems Division
dougb@ecs.comm.mot.com          Motorola Communications Sector
dougb@nwu.edu       

In [32]:
sports_dataset.target_names[sports_dataset['target'][0]]

'rec.sport.baseball'

In [33]:
len_all, len_baseball, len_hockey = len(sports_dataset.data), len([e for e in sports_dataset.target if e == 0]), len([e for e in sports_dataset.target if e == 1])
print(f"Total examples: {len_all}, Baseball examples: {len_baseball}, Hockey examples: {len_hockey}")

Total examples: 1197, Baseball examples: 597, Hockey examples: 600


In [34]:
import pandas as pd

labels = [sports_dataset.target_names[x].split('.')[-1] for x in sports_dataset['target']]
texts = [text.strip() for text in sports_dataset['data']]
df = pd.DataFrame(zip(texts, labels), columns = ['prompt','completion']) #[:300]
df.head()

Unnamed: 0,prompt,completion
0,From: dougb@comm.mot.com (Doug Bank)\nSubject:...,baseball
1,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...,hockey
2,From: rudy@netcom.com (Rudy Wade)\nSubject: Re...,baseball
3,From: monack@helium.gas.uug.arizona.edu (david...,hockey
4,Subject: Let it be Known\nFrom: <ISSBTL@BYUVM....,baseball


In [None]:
import os
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

completion = openai.ChatCompletion.create(
  model="gpt-4",
  messages=[
    {
    "role": "user",
    "content": "Question: Given the following email content, is the email spam or not spam? Email Content: Hi, just checking in about our meeting tomorrow. Answer Choices: (A) Spam, (B) Not Spam."
    },
    {
    "role": "assistant",
    "content": "Answer: (B) Not Spam"
    },
    {
    "role": "user",
    "content": "Question: Given the following email content, is the email spam or not spam? Email Content: Congratulations! You've been selected for a free vacation! Click here now! Answer Choices: (A) Spam, (B) Not Spam."
    },
    {
    "role": "assistant",
    "content": "Answer: (A) Spam"
    },
    {
    "role": "user",
    "content": "Question: Given the following email content, is the email spam or not spam? Email Content: Dear user, you have won a million dollars! Click here to claim your prize. Answer Choices: (A) Spam, (B) Not Spam."
    },
    {
    "role": "assistant",
    "content": "Answer: "
    }
  ]
)

print(completion.choices[0].message)