In [2]:
!pip install -q aiohttp
!pip install -q datasets
!pip install -q tqdm
!pip install -q huggingface-hub

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
# Hub dataset that is loaded, name of the dataset repo that is pushed at the end,
# and the name used to create (or look up) the Inference Endpoint.
DATASET_IN = "derek-thomas/dataset-creator-reddit-bestofredditorupdates"
DATASET_OUT = "processed-subset-bestofredditorupdates"
ENDPOINT_NAME = "embeddings-demo-endpoint"

MAX_WORKERS = 5  # Number of concurrent async workers (also scales MAX_BATCH_TOKENS); choose based on the model and hardware
ROW_COUNT = 10  # Set to None to use all rows; 10 is used here just for a demo

In [4]:
# GPU choice: cloud vendor, region and instance settings that are passed to
# create_inference_endpoint when the endpoint is created.
VENDOR = "aws"
REGION = "us-east-1"
INSTANCE_SIZE = "medium"
INSTANCE_TYPE = "g4dn.xlarge"

In [6]:
from huggingface_hub import notebook_login

# Renders the Hugging Face login widget in the notebook; the later Hub calls
# (whoami, endpoint creation, push_to_hub) presumably rely on this login — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
from huggingface_hub import whoami
from getpass import getpass

# Account details of the logged-in user; its "name" is the fallback namespace.
who = whoami()

# getpass hides what is typed. Strip stray whitespace so a blank or padded
# answer does not turn into a truthy-but-invalid namespace.
organization = getpass(prompt="What is your Hugging Face 🤗 username or organization? (with an added payment method)").strip()

# An empty answer falls back to the logged-in user's own name.
namespace = organization or who["name"]


In [12]:
from datasets import load_dataset

# Load the input dataset named by DATASET_IN.
dataset = load_dataset(DATASET_IN)

# Bare last expression: displays the "train" split's summary (features and row count).
dataset["train"]

Downloading readme:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/41.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10924 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'content', 'score', 'date_utc', 'title', 'flair', 'poster', 'permalink', 'updated', 'new'],
    num_rows: 10924
})

In [13]:
# Slice the DataFrame *before* building the list of dicts so only ROW_COUNT rows
# are converted instead of the whole split. ROW_COUNT=None still keeps every
# row, because iloc[:None] is a full slice.
documents = dataset["train"].to_pandas().iloc[:ROW_COUNT].to_dict("records")

# Show how many records were kept and what a single record looks like.
len(documents), documents[0]

(10,
 {'id': '10004zw',
  'content': '[removed]',
  'score': 1,
  'date_utc': Timestamp('2022-12-31 18:16:22'),
  'title': 'To All BORU contributors, Thank you :)',
  'flair': 'CONCLUDED',
  'poster': 'IsItAcOnSeQuEnCe',
  'permalink': '/r/BestofRedditorUpdates/comments/10004zw/to_all_boru_contributors_thank_you/',
  'updated': False,
  'new': False})

# __Inference Endpoints__

### Create Inference Endpoint

In [None]:
from huggingface_hub import create_inference_endpoint, list_inference_endpoints

# Try to create the endpoint; if creation fails (presumably because an endpoint
# with this name already exists — confirm), reuse the existing one instead.
try:
    endpoint = create_inference_endpoint(ENDPOINT_NAME,
                                         repository="jinaai/jina-embeddings-v2-base-en",
                                         revision="7302ac470bed880590f9344bfeee32ff8722d0e5",
                                         task="sentence-embeddings",
                                         framework="pytorch",
                                         accelerator="gpu",
                                         instance_size=INSTANCE_SIZE,
                                         instance_type=INSTANCE_TYPE,
                                         region=REGION,
                                         vendor=VENDOR,
                                         namespace=namespace,
                                         custom_image={"health_route": "/health",
                                                       "env": {"MAX_BATCH_TOKENS": str(MAX_WORKERS * 2048),
                                                               "MAX_CONCURRENT_REQUESTS": "512",
                                                               "MODEL_ID": "/repository"},
                                                       "url": "ghcr.io/huggingface/text-embeddings-inference:0.5.0"
                                                       },
                                         type="protected")
except Exception:
    # `except Exception` (not a bare `except:`) lets KeyboardInterrupt/SystemExit
    # through, so interrupting the cell still works.
    endpoint = next(
        (ie for ie in list_inference_endpoints(namespace=namespace) if ie.name == ENDPOINT_NAME),
        None,
    )
    if endpoint is None:
        # No endpoint with that name exists, so the creation error is the real
        # problem: re-raise it instead of masking it with an IndexError.
        raise
    print('Loaded endpoint')

In [None]:
%%time
# Blocks this cell until wait() returns (the %%time magic reports how long that
# took); presumably that is when the endpoint is ready to serve — confirm in the
# huggingface_hub docs.
endpoint.wait()

In [None]:
import numpy as np
import json

# Smoke test: send one short text to the endpoint and peek at the result.
sample_payload = {
    "inputs": 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music!',
    'truncate': True,
}
raw_reply = endpoint.client.post(json=sample_payload, task="feature-extraction")

# The reply is JSON-encoded bytes; decode it into a numpy array.
response = np.array(json.loads(raw_reply.decode()))

# Display the first 20 values of the first returned row.
response[0][:20]

In [None]:
# Build a deliberately very long input to exercise the "truncate" option.
embedding_input = "This input will get multiplied" * 10000
print(f"The length of the embedding_input is: {len(embedding_input)}")

long_payload = {"inputs": embedding_input, "truncate": True}
raw_reply = endpoint.client.post(json=long_payload, task="feature-extraction")

# Decode the JSON bytes into a numpy array and show the first 20 values of row 0.
response = np.array(json.loads(raw_reply.decode()))
response[0][:20]

# __Get Embeddings__

In [17]:
async def request(document, semaphore):
    """Embed one document through the endpoint's async client.

    Sends ``document["content"]`` with ``"truncate": True`` to the global
    ``endpoint``, stores the first row of the decoded reply under
    ``document["embedding"]`` (mutating the dict in place) and returns the
    same dict. The whole round trip runs while ``semaphore`` is held.
    """
    payload = {"inputs": document["content"], "truncate": True}

    # Hold the semaphore for the request and the decoding of its reply.
    async with semaphore:
        raw_reply = await endpoint.async_client.post(json=payload, task="feature-extraction")
        rows = np.array(json.loads(raw_reply.decode()))
        document["embedding"] = rows[0]  # first row of the decoded reply
        return document

In [18]:
from tqdm.auto import tqdm
import asyncio

async def main(documents):
    """Embed every document, running at most MAX_WORKERS requests at a time.

    One ``request`` coroutine is created per document, all sharing a single
    bounded semaphore, and a tqdm progress bar advances as each one completes.
    Results are written into the document dicts by ``request``; nothing is
    returned.
    """
    # Shared limiter that caps how many requests are in flight at once.
    limiter = asyncio.BoundedSemaphore(MAX_WORKERS)

    # One coroutine per document.
    pending = []
    for doc in documents:
        pending.append(request(doc, limiter))

    # as_completed yields awaitables in completion order, so the bar tracks
    # finished requests rather than submission order.
    progress = tqdm(asyncio.as_completed(pending), total=len(documents))
    for finished in progress:
        await finished

In [None]:
import time

start = time.perf_counter()

# Get embeddings
await main(documents)

# Make sure we got it all
count = 0
for document in documents:
    if 'embedding' in document.keys() and len(document['embedding']) == 768:
        count += 1
print(f'Embeddings = {count} documents = {len(documents)}')


# Print elapsed time
elapsed_time = time.perf_counter() - start
minutes, seconds = divmod(elapsed_time, 60)
print(f"{int(minutes)} min {seconds:.2f} sec")

# __Pause Inference Endpoint__

In [None]:
# Rebind `endpoint` to whatever pause() returns so that the status printed
# below is read from the post-pause object.
endpoint = endpoint.pause()

print(f"Endpoint Status: {endpoint.status}")

# __Push updated dataset to Hub__

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict

# Collect the embedded records into a DataFrame, then wrap it as a dataset
# with a single "train" split ready to be pushed to the Hub.
df = pd.DataFrame(documents)
train_split = Dataset.from_pandas(df)
dd = DatasetDict({'train': train_split})

In [None]:
# repo_id carries no namespace prefix here; presumably the Hub then resolves it
# under the logged-in user's account — confirm.
dd.push_to_hub(repo_id=DATASET_OUT)

In [None]:
# The URL is built from the logged-in user's name (who["name"]), not from
# `namespace`, which may hold an organization used only for the endpoint.
print(f'Dataset is at https://huggingface.co/datasets/{who["name"]}/{DATASET_OUT}')

# __Delete Endpoint__



In [None]:
# InferenceEndpoint.delete() returns None and signals failure by raising, so a
# truthiness check on its return value can never reach the failure branch.
# Catch the failure explicitly to show the manual-delete hint, then re-raise so
# the underlying error is not hidden.
try:
    endpoint.delete()
except Exception:
    print('Delete the Endpoint manually')
    raise
else:
    # Match the original rebinding: the handle no longer refers to a live endpoint.
    endpoint = None
    print('Endpoint deleted successfully')