In [1]:
# NOTE(review): `!` hands this line to a separate shell process, so the activation
# presumably does not carry over to the running kernel — confirm the kernel itself
# is started from dclm_env.
!source dclm_env/bin/activate

In [1]:
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
from datasets import load_from_disk



# Load the dataset from the relative directory
print("Loading dataset from disk...")
ds = load_from_disk('./wikipedia20231101en')
print("Dataset loaded successfully!")

# Load the SentenceTransformer model
print("Loading SentenceTransformer model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded successfully!")

# Initialize Annoy index with the model's embedding width and a Euclidean metric
f = model.get_sentence_embedding_dimension()
t = AnnoyIndex(f, 'euclidean')
print("Initialized Annoy index.")

# Number of texts handed to model.encode() per call. Encoding a list at a time lets
# the model batch its forward passes instead of running one pass per article.
encode_chunk_size = 1024
# How often (in items) a progress line is printed.
progress_every = 100000

# Process dataset in batches; `i` is the running Annoy item id across all splits
i = 0
print("Starting to process dataset in batches...")
for split in ds.keys():
    print(f"Processing split: {split}")
    split_ds = ds[split]
    for start in range(0, len(split_ds), encode_chunk_size):
        # Slicing a split yields a dict of columns, so ['text'] is a list of strings
        texts = split_ds[start:start + encode_chunk_size]['text']
        embeddings = model.encode(texts)

        # Add each embedding to the Annoy index under the next free id
        for embedding in embeddings:
            t.add_item(i, embedding)
            i += 1

            # Print progress every `progress_every` items
            if i % progress_every == 0:
                print(f"Processed {i} items so far.")


print("Finished processing all batches!")

# Build and save the Annoy index
print("Building Annoy index...")
t.build(10)  # Use more trees for better accuracy
print("Annoy index built successfully!")

print("Saving Annoy index...")
t.save('highQualityAnnoyEuclidean.ann')
print("Annoy index saved successfully!")


Loading dataset from disk...


Loading dataset from disk:   0%|          | 0/41 [00:00<?, ?it/s]

Dataset loaded successfully!
Loading SentenceTransformer model...
Model loaded successfully!
Initialized Annoy index.
Starting to process dataset in batches...
Processing split: train
Processed 100000 items so far.


KeyboardInterrupt: 

In [None]:
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
from datasets import load_from_disk
from multiprocessing import Pool
import numpy as np

# Inputs for this cell: the saved dataset directory and the embedding model id
dataset_dir = './wikipedia20231101en'
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

# Read the dataset back from the local directory
print("Loading dataset from disk...")
ds = load_from_disk(dataset_dir)
print("Dataset loaded successfully!")

# Instantiate the sentence-embedding model
print("Loading SentenceTransformer model...")
model = SentenceTransformer(model_name)
print("Model loaded successfully!")

# AnnoyIndex is constructed from the embedding width and the distance metric
f = model.get_sentence_embedding_dimension()
t = AnnoyIndex(f, 'euclidean')
print("Initialized Annoy index.")

def process_batch(batch):
    """
    Encode a batch of dataset examples with the module-level `model`.

    Args:
        batch: iterable of (index, example) pairs, where example['text'] is
            the string to embed.

    Returns:
        A list of (index, embedding) tuples in the same order as `batch`.
        An empty batch returns an empty list without touching the model.
    """
    pairs = list(batch)
    if not pairs:
        return []
    texts = [example['text'] for _, example in pairs]
    # One encode() call for the whole batch instead of one call per text
    embeddings = model.encode(texts)
    return [(idx, embedding) for (idx, _), embedding in zip(pairs, embeddings)]

def iter_batches(dataset_dict, size):
    """
    Yield lists of at most `size` (index, example) pairs covering every split.

    The index keeps counting across splits, so each example gets a unique id
    even when the dataset has more than one split.
    """
    pending = []
    next_id = 0
    for split in dataset_dict.keys():
        for example in dataset_dict[split]:
            pending.append((next_id, example))
            next_id += 1
            if len(pending) == size:
                yield pending
                pending = []
    if pending:
        yield pending

# Split data into batches for parallel processing
batch_size = 1000  # Adjust batch size based on available memory

# Batches come from a generator and results are indexed as they arrive, so neither
# the full list of examples nor the full list of embeddings is built up front.
print("Starting parallel processing...")
with Pool() as pool:
    # imap returns results in submission order
    for batch_results in pool.imap(process_batch, iter_batches(ds, batch_size)):
        for idx, embedding in batch_results:
            t.add_item(idx, embedding)

# Build and save the Annoy index
print("Building Annoy index...")
t.build(10)  # Use more trees for better accuracy
print("Annoy index built successfully!")

print("Saving Annoy index...")
t.save('highQualityAnnoyEuclidean.ann')
print("Annoy index saved successfully!")


Loading dataset from disk...


Loading dataset from disk:   0%|          | 0/41 [00:00<?, ?it/s]

Dataset loaded successfully!
Loading SentenceTransformer model...
Model loaded successfully!
Initialized Annoy index.


In [None]:
from multiprocessing import Pool
from tqdm import tqdm
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import os

# Load the dataset by name through datasets.load_dataset (not from a local
# save_to_disk directory, so the log message names the dataset instead of "disk")
print("Loading dataset wikimedia/wikipedia (20231101.en)...")

ds = load_dataset("wikimedia/wikipedia", "20231101.en")

print("Dataset loaded successfully!")

# Load the SentenceTransformer model
print("Loading SentenceTransformer model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded successfully!")

# Initialize Annoy index with the model's embedding width and a Euclidean metric
f = model.get_sentence_embedding_dimension()
t = AnnoyIndex(f, 'euclidean')
print("Initialized Annoy index.")

def process_batch(batch):
    """
    Encode a batch of dataset examples with the module-level `model`.

    Args:
        batch: iterable of (index, example) pairs, where example['text'] is
            the string to embed.

    Returns:
        A list of (index, embedding) tuples in the same order as `batch`.
        An empty batch returns an empty list without touching the model.
    """
    pairs = list(batch)
    if not pairs:
        return []
    texts = [example['text'] for _, example in pairs]
    # One encode() call for the whole batch instead of one call per text
    embeddings = model.encode(texts)
    return [(idx, embedding) for (idx, _), embedding in zip(pairs, embeddings)]

from multiprocessing.pool import ThreadPool

def iter_batches(dataset_dict, size):
    """
    Yield lists of at most `size` (index, example) pairs covering every split.

    The index keeps counting across splits, so each example gets a unique id
    even when the dataset has more than one split.
    """
    pending = []
    next_id = 0
    for split in dataset_dict.keys():
        for example in dataset_dict[split]:
            pending.append((next_id, example))
            next_id += 1
            if len(pending) == size:
                yield pending
                pending = []
    if pending:
        yield pending

# Split data into batches for parallel processing
batch_size = 1000  # Adjust batch size based on available memory

# The batch count is derived from the split lengths (ceiling division) so the
# progress bar has a total without materializing the batches first.
total_examples = sum(len(ds[split]) for split in ds.keys())
num_batches = (total_examples + batch_size - 1) // batch_size

# Create a single progress bar; it is advanced once per finished batch
progress_bar = tqdm(total=num_batches, desc="Processing Batches")

def update_progress(*_):
    """
    Callback to update the progress bar.
    """
    progress_bar.update()

# Use a thread pool for parallel processing and index results as they arrive,
# instead of buffering every embedding in a list first.
print("Starting parallel processing...")
try:
    with ThreadPool() as pool:
        # imap returns results in submission order
        for batch_results in pool.imap(process_batch, iter_batches(ds, batch_size)):
            for idx, embedding in batch_results:
                t.add_item(idx, embedding)
            update_progress()
finally:
    # Close the progress bar even if the run is interrupted
    progress_bar.close()

# Build and save the Annoy index
print("Building Annoy index...")
t.build(10)  # Use more trees for better accuracy
print("Annoy index built successfully!")

print("Saving Annoy index...")
t.save('highQualityAnnoyEuclidean.ann')
print("Annoy index saved successfully!")


  from .autonotebook import tqdm as notebook_tqdm


Loading dataset from disk...
Dataset loaded successfully!
Loading SentenceTransformer model...
Model loaded successfully!
Initialized Annoy index.


Processing Batches:   0%|          | 0/6408 [00:00<?, ?it/s]

Starting parallel processing...


  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [2]:
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
from datasets import load_dataset



# Load the dataset by name through datasets.load_dataset (not from a local
# save_to_disk directory, so the log message names the dataset instead of "disk")
print("Loading dataset wikimedia/wikipedia (20231101.en)...")
ds = load_dataset("wikimedia/wikipedia", "20231101.en")
print("Dataset loaded successfully!")

# Load the SentenceTransformer model
print("Loading SentenceTransformer model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded successfully!")

# Row count per split, used to sanity-check the load
print(ds.num_rows)

Loading dataset from disk...
Dataset loaded successfully!
Loading SentenceTransformer model...
Model loaded successfully!
{'train': 6407814}


In [None]:
# Initialize Annoy index with the model's embedding width and a Euclidean metric
f = model.get_sentence_embedding_dimension()
t = AnnoyIndex(f, 'euclidean')
print("Initialized Annoy index.")

# Number of articles encoded per model.encode() call; keeps only one chunk of
# texts and embeddings in memory at a time.
encode_chunk_size = 10000

# Process dataset in batches
# model.encode() takes a sentence or a list of sentences, so the 'text' column is
# extracted explicitly instead of passing the Dataset of row dicts.
train_split = ds['train']
i = 0
print("Starting to process dataset in batches...")
for start in range(0, len(train_split), encode_chunk_size):
    # Slicing a split yields a dict of columns, so ['text'] is a list of strings
    texts = train_split[start:start + encode_chunk_size]['text']
    for embedding in model.encode(texts):
        t.add_item(i, embedding)
        i += 1
        if i % 100000 == 0:
            print(f"Processed {i} items so far.")


print("Finished processing all batches!")

# Build and save the Annoy index
print("Building Annoy index...")
t.build(10)  # Use more trees for better accuracy
print("Annoy index built successfully!")

print("Saving Annoy index...")
t.save('highQualityAnnoyEuclidean.ann')
print("Annoy index saved successfully!")

Initialized Annoy index.
Starting to process dataset in batches...
