In [1]:
#pip install annoy

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
     ---------------------------------------- 0.0/647.5 kB ? eta -:--:--
     ---------------- ----------------------- 262.1/647.5 kB ? eta -:--:--
     -------------------------------------- 647.5/647.5 kB 1.8 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: annoy
  Building wheel for annoy (pyproject.toml): started
  Building wheel for annoy (pypro

In [1]:
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import time
import torch

# Load the dataset from the relative directory
print("Loading dataset from disk...")

ds = load_dataset("wikimedia/wikipedia", "20231101.en")

print("Dataset loaded successfully!")

# Load the SentenceTransformer model
print("Loading SentenceTransformer model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded successfully!")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(device)

# Initialize Annoy index
f = model.get_sentence_embedding_dimension()
t = AnnoyIndex(f, 'euclidean')
print("Initialized Annoy index.")

start = time.time()
embeddings = ds['train']['text'][:256]
embeddings = model.encode(embeddings, batch_size=256, device=device)  # Encode the single text
# Process dataset in batches
end = time.time()
print(end-start)
print("Starting to process dataset in batches...")
for i, embedding in enumerate(embeddings):
    t.add_item(i, embedding)


print("Finished processing all batches!")

# Build and save the Annoy index
print("Building Annoy index...")
t.build(10)  # Use more trees for better accuracy
print("Annoy index built successfully!")

print("Saving Annoy index...")
t.save('highQualityAnnoyEuclidean.ann')
print("Annoy index saved successfully!")


  from .autonotebook import tqdm as notebook_tqdm


Loading dataset from disk...
Dataset loaded successfully!
Loading SentenceTransformer model...
Model loaded successfully!
cuda
Initialized Annoy index.


  attn_output = torch.nn.functional.scaled_dot_product_attention(


253.15101718902588
Starting to process dataset in batches...
Finished processing all batches!
Building Annoy index...
Annoy index built successfully!
Saving Annoy index...
Annoy index saved successfully!


In [None]:
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import time
import torch
import math
import os

# Load the dataset from the relative directory
print("Loading dataset from disk...")
ds = load_dataset("wikimedia/wikipedia", "20231101.en")
print("Dataset loaded successfully!")

# Load the SentenceTransformer model
print("Loading SentenceTransformer model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded successfully!")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"Using device: {device}")

# Initialize variables
num_parts = 10  # Number of parts to divide the dataset
f = model.get_sentence_embedding_dimension()  # Dimension of embeddings
output_dir = "annoy_indexes"  # Directory to save Annoy files
os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists

# Split the dataset
texts = ds['train']['text']
total_texts = len(texts)
part_size = math.ceil(total_texts / num_parts)

print(f"Dividing dataset into {num_parts} parts, each with approximately {part_size} texts.")

# Process each part
for part in range(num_parts):
    start_idx = part * part_size
    end_idx = min(start_idx + part_size, total_texts)
    subset_texts = texts[start_idx:end_idx]
    print(f"Processing part {part + 1}/{num_parts} with {len(subset_texts)} texts.")

    # Encode the texts
    start_time = time.time()
    embeddings = model.encode(subset_texts, batch_size=256, device=device, show_progress_bar=True)
    print(f"Part {part + 1} encoded in {time.time() - start_time:.2f} seconds.")

    # Build Annoy index for this part
    t = AnnoyIndex(f, 'euclidean')
    for i, embedding in enumerate(embeddings):
        t.add_item(i, embedding)

    print(f"Building Annoy index for part {part + 1}...")
    t.build(10)  # Use 10 trees for better accuracy
    index_file = os.path.join(output_dir, f"annoy_index_part_{part + 1}.ann")
    t.save(index_file)
    print(f"Part {part + 1} Annoy index saved to {index_file}.")

print("All parts processed and Annoy indexes created successfully!")


Loading dataset from disk...
Dataset loaded successfully!
Loading SentenceTransformer model...
Model loaded successfully!
Using device: cuda
Dividing dataset into 10 parts, each with approximately 640782 texts.
Processing part 1/10 with 640782 texts.


Batches:   7%|▋         | 163/2504 [15:24<20:28,  1.91it/s]   