In [9]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

sentences = [
    'The new movie is awesome',
    'This recent movie is so good',
]

model = SentenceTransformer('Supabase/gte-small')
embeddings = model.encode(sentences)
print(cos_sim(embeddings[0], embeddings[1]))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/66.8M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tensor([[0.8980]])


In [10]:
# Two lists of sentences
sentences1 = [
    "The cat sits outside",
    "A man is playing guitar",
    "The new movie is awesome",
]

sentences2 = [
    "The dog plays in the garden",
    "A woman watches TV",
    "中文是这个",
]

# Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Compute cosine-similarities
cosine_scores = cos_sim(embeddings1, embeddings2)

# Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(
        sentences1[i], sentences2[i], cosine_scores[i][i]
    ))

The cat sits outside 		 The dog plays in the garden 		 Score: 0.8200
A man is playing guitar 		 A woman watches TV 		 Score: 0.7016
The new movie is awesome 		 中文是这个 		 Score: 0.7315


In [11]:
# Our sentences to encode
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog."
]

# Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

# Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-4.56867635e-01 -6.03740886e-02  2.77014356e-02 -1.49324030e-01
 -2.58537903e-02  3.99035126e-01 -1.55861266e-02  2.39103317e-01
  9.95153934e-02  1.49657920e-01 -3.45251858e-01 -4.33488786e-01
  6.84537232e-01  2.49792591e-01  3.92541915e-01  3.05619121e-01
 -2.38010854e-01  3.97296131e-01 -4.60436165e-01 -1.37540340e-01
  5.90817809e-01 -2.84304172e-01  1.05978139e-01 -5.92266202e-01
 -1.59350365e-01  4.13091660e-01 -1.64931849e-01 -7.34148696e-02
 -3.01011473e-01 -1.89854705e+00  2.36649923e-02 -5.51725984e-01
  7.99842477e-01 -4.33840714e-02 -2.60188192e-01 -1.74996063e-01
 -4.91537303e-01  4.09644276e-01 -1.80870801e-01  2.30171114e-01
  2.36194938e-01  2.71462679e-01  2.17982121e-02 -6.09191716e-01
 -2.04823956e-01 -5.56082964e-01 -6.08014047e-01  7.77903624e-05
 -8.24695081e-02 -2.05188334e-01 -7.09772184e-02 -4.21118766e-01
 -9.76334512e-02  8.62645656e-02  2.12224036e-01  1.12527385e-01
  2.59943

In [12]:
from sentence_transformers.util import semantic_search

docs = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]

docs_embeddings = model.encode(docs, convert_to_tensor=True)

query = "tell me about music"
query_embedding = model.encode(query, convert_to_tensor=True)

hits = semantic_search(query_embedding, docs_embeddings, top_k=2)
hits

for hit in hits[0]:
    print(docs[hit['corpus_id']], "(Score: %.4f)" % hit['score'])

A woman is playing violin. (Score: 0.7764)
A monkey is playing drums. (Score: 0.7570)


In [7]:
import tiktoken

def split_large_text(large_text, max_tokens):
    enc = tiktoken.get_encoding("cl100k_base")
    tokenized_text = enc.encode(large_text)

    chunks = []
    current_chunk = []
    current_length = 0

    for token in tokenized_text:
        current_chunk.append(token)
        current_length += 1

        if current_length >= max_tokens:
            chunks.append(enc.decode(current_chunk).rstrip(' .,;'))
            current_chunk = []
            current_length = 0

    if current_chunk:
        chunks.append(enc.decode(current_chunk).rstrip(' .,;'))

    return chunks

Why use tokens?

> By breaking words into smaller parts (tokens), LLMs can better handle new or unusual words by understanding their building blocks. It also helps the model grasp the nuances of language, such as different word forms and contextual meanings.

[source](https://kelvin.legal/understanding-large-language-models-words-versus-tokens/#:~:text=By%20breaking%20words%20into%20smaller,word%20forms%20and%20contextual%20meanings.)

In [13]:
import tiktoken

sent = "If we split a text by number of characters, it is not obvious how many tokens these chunks will be."


print(len(sent.split()))

enc = tiktoken.get_encoding("cl100k_base")
encoded = enc.encode(sent)

print(len(encoded))
tokens = [enc.decode_single_token_bytes(x) for x in encoded]
print(tokens)
print(len(tokens))


decoded = enc.decode(encoded)
print(len(decoded.split()))
decoded


20
22
[b'If', b' we', b' split', b' a', b' text', b' by', b' number', b' of', b' characters', b',', b' it', b' is', b' not', b' obvious', b' how', b' many', b' tokens', b' these', b' chunks', b' will', b' be', b'.']
22
20


'If we split a text by number of characters, it is not obvious how many tokens these chunks will be.'

In [1]:
import google.generativeai as genai
import time
import os

import getpass

GOOGLE_API_KEY = getpass.getpass()


··········


In [20]:

genai.configure(api_key=GOOGLE_API_KEY)
gen_model = genai.GenerativeModel('gemini-pro')

doc = """The new iPad Pro

May 7 (UPI) -- Apple held its "Let Loose" iPad event Tuesday, unveiling the new iPad Pro, iPad Air and Apple Pencil Pro in what chief executive officer Tim Cook called "the biggest day for iPad since its introduction."

The event, which streamed from Apple headquarters at Cupertino, Calif., touted iPad's past and its future as a "magical sheet of glass."

iPad Pro

One of the biggest announcements Tuesday was Apple's new iPad Pro, which integrates the tech giant's more powerful M4 chip to operate 50% faster, according to Apple. The M4 chip will also allow Apple to integrate artificial intelligence into the "impossibly thin" 11-inch and 13-inch devices.

"Meet the new iPad Pro: the thinnest product we've ever created, the most advanced display we've ever produced, with the incredible power of the M4 chip. Just imagine all the things it'll be used to create," Cook wrote Tuesday in a post on X.

The iPad Pro also offers OLED display, a broader viewing display with Ultra Retina XDR, and nano-textured glass that cuts glare.
"iPad Pro empowers a broad set of pros and is perfect for anyone who wants the ultimate iPad experience -- with its combination of the world's best displays, extraordinary performance of our latest M-series ships, and advanced accessories -- all in a portable design," John Ternus, Apple's senior vice president of Hardware Engineering, said in a statement.

Related video: Apple reveals new iPad models equipped with chips for AI-related tasks (USA TODAY)
Current Time 0:00
/
Duration 0:45
USA TODAY
Apple reveals new iPad models equipped with chips for AI-related tasks
0
View on Watch
View on Watch
"Today, we're taking it even further with the new, stunningly thin and light iPad Pro, our biggest update ever to iPad Pro," Ternus added.

The 11-inch iPad Pro will start at $1,000 for the Wi-Fi version and $1,200 for the cellular model. The 13-inch iPad Pro will start at $1,300 for the Wi-Fi model and $1,500 for the cellular device. Both devices will go on sale May 15.

iPad Air

The more affordable new iPad Air also debuted at Tuesday's Apple event and comes in two sizes. The 11-inch and 13-inch screens feature a Liquid Retina display as the device also upgrades its audio with improved stereo speakers.

The iPad Air is powered by an M2 chip and is available with more storage this time. It also features a "landscape-front camera and faster Wi-Fi. The new iPad air is more powerful and versatile than ever," said Melody Kuna, director of iPad Product Design.

The 11-inch iPad Air starts at $600, with the 13-inch iPad Air starting at $800.

Apple Pencil Pro

During its "Let Loose" event, Apple also introduced two new iPad Pro accessories Tuesday, including the Apple Pencil Pro, a new stylus that uses squeezing to bring up new palettes and rotating to control pen and brush tools to allow for "even greater precision."

The Apple Pencil Pro, which costs $130 and charges magnetically on the side of the iPad Pro, also features a location tagging app in case the stylus gets lost.

Magic Keyboard

Apple's Magic Keyboard also got an update for the iPad Pro lineup, allowing users to turn their tablets into a makeshift MacBook.

The Magic Keyboard now has an aluminum palm rest, a larger trackpad, a row of Function keys and shortcuts to adjust screen and keyboard brightness.

The 11-inch version of the Magic Keyboard costs $300 and the 13-inch version will retail for $350 when it goes on sale next week.
"""
test_splited = split_large_text(doc, 30)

docs_embeddings = model.encode(test_splited, convert_to_tensor=True)

query = "tell me about new ipad pro"
query_embedding = model.encode(query, convert_to_tensor=True)

hits = semantic_search(query_embedding, docs_embeddings, top_k=10)
needed_docs = []
for hit in hits[0]:
    # print(test_splited[hit['corpus_id']], "(Score: %.4f)" % hit['score'])
    needed_docs.append(test_splited[hit['corpus_id']])

corpus = '\n'.join(needed_docs)

response = gen_model.generate_content(query + '\n' + corpus)
print(response.text)


**New iPad Pro**

**Key Features:**

* **Ultra-thin and light:** The thinnest iPad Apple has ever created.
* **Advanced display:** Liquid Retina XDR display with ProMotion technology for smooth scrolling, HDR content, and True Tone for accurate color rendering.
* **Incredible power:** M2 chip for up to 50% faster performance than previous iPad Pros.
* **iPadOS 16:** Latest operating system with new multitasking features, redesigned apps, and more.

**Specifications:**

* **Display:**
    * 11-inch Liquid Retina XDR display
    * 13-inch Liquid Retina XDR display
* **Chip:** M2 chip
* **Storage:** 128GB, 256GB, 512GB, 1TB, 2TB
* **RAM:** 8GB (11-inch) / 16GB (13-inch)
* **Connectivity:** Wi-Fi 6E and cellular options available
* **Battery life:** Up to 10 hours on Wi-Fi, up to 9 hours on cellular
* **Dimensions:**
    * 11-inch: 9.74 x 7.02 x 0.23 inches
    * 13-inch: 11.04 x 8.46 x 0.23 inches
* **Weight:**
    * 11-inch: 1.03 pounds
    * 13-inch: 1.29 pounds

**Accessories:**

* **A