# Create Document Embeddings (Ollama Compatible)
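This notebook creates sentence-transformers embeddings for document chunks and zero-pads each vector to 3584 values, the dimension used by Ollama's deepseek model. The padding matches the vector length only; the values are still produced by the sentence-transformers model.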

In [None]:
!pip install sentence-transformers torch

In [None]:
import json
import torch
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Load the sentence-transformers model used to embed the chunks.
# all-mpnet-base-v2 produces 768-dimensional vectors, so reaching Ollama's
# 3584 dimensions relies entirely on the zero-padding helper defined below:
# the remaining 2816 positions of every vector are filled with 0.0.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

# Function to pad embeddings to match Ollama's dimension
def pad_embedding(embedding, target_size=3584):
    """Return ``embedding`` as a list of exactly ``target_size`` values.

    Args:
        embedding: One-dimensional sequence of numbers. Lists and tuples are
            copied; objects exposing ``tolist()`` (for example NumPy arrays,
            torch tensors or ``array.array``) are converted through it.
        target_size: Required output length. Defaults to 3584, the dimension
            this notebook targets for Ollama.

    Returns:
        A new list: the input truncated to ``target_size`` items when it is
        at least that long, otherwise the input followed by ``0.0`` padding.

    Raises:
        ValueError: If ``target_size`` is negative.
    """
    if target_size < 0:
        raise ValueError(f"target_size must be non-negative, got {target_size}")

    # Normalise to a plain list first: `+` on a tuple raises TypeError and on
    # a NumPy array it means element-wise addition rather than concatenation.
    values = embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)

    missing = target_size - len(values)
    if missing <= 0:
        return values[:target_size]
    return values + [0.0] * missing

In [None]:
# Upload your chunks.json file here
from google.colab import files

uploaded = files.upload()
if not uploaded:
    # A cancelled or empty upload would otherwise surface as an IndexError.
    raise ValueError("No file was uploaded; please upload your chunks JSON file.")

# Only the first uploaded file is processed.
filename = next(iter(uploaded))

# Load chunks
# NOTE(review): the embedding cell passes these items straight to
# model.encode, so the file is presumably a JSON array of text strings.
with open(filename, 'r', encoding='utf-8') as f:
    chunks = json.load(f)

In [None]:
# Embed the chunks batch by batch and collect one record per chunk
embeddings_data = []
batch_size = 32  # Larger batch size for GPU efficiency

for start in tqdm(range(0, len(chunks), batch_size)):
    texts = chunks[start:start + batch_size]
    # One encode call per batch; the result is iterated row by row below
    vectors = model.encode(list(texts), convert_to_tensor=True)

    # Each row becomes a plain Python list, padded to match Ollama's dimensions
    embeddings_data.extend(
        {
            "id": start + offset,
            "content": text,
            "embedding": pad_embedding(vector.cpu().numpy().tolist()),
        }
        for offset, (text, vector) in enumerate(zip(texts, vectors))
    )

# Save embeddings
# Build the output name from the uploaded name. A plain str.replace leaves the
# name unchanged when '_chunks.json' is absent (e.g. 'chunks.json'), which
# would overwrite the uploaded chunks file with the embeddings.
chunks_suffix = '_chunks.json'
if filename.endswith(chunks_suffix):
    output_stem = filename[:-len(chunks_suffix)]
elif filename.lower().endswith('.json'):
    output_stem = filename[:-len('.json')]
else:
    output_stem = filename
output_filename = output_stem + '_embeddings.json'

with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(embeddings_data, f)

print(f"\nSaved embeddings for {len(embeddings_data)} chunks")

# Download the embeddings file
files.download(output_filename)