In [1]:
from google.cloud import storage

def list_blobs(bucket_name, suffix=None):
    """List the contents of a GCS bucket, optionally filtered by name suffix.

    Args:
        bucket_name (str): Name of the GCS bucket.
        suffix (str or list of str, optional): Suffix(es) a blob name must end
            with to be kept (e.g. '.jsonl', '.npy'). A falsy value (None, '',
            empty list) disables filtering.

    Returns:
        list: The return type depends on whether filtering is active:
            * with a suffix    -> list of blob *names* (str) that end with
              any of the given suffixes;
            * without a suffix -> list of the blob objects themselves.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    blobs = bucket.list_blobs()

    if not suffix:
        # No filter requested: materialise the listing iterator as-is.
        return list(blobs)

    # str.endswith accepts a tuple of candidates, so normalise to one.
    suffixes = (suffix,) if isinstance(suffix, str) else tuple(suffix)
    return [blob.name for blob in blobs if blob.name.endswith(suffixes)]

In [2]:
# GCS bucket to scan. Because a suffix is passed, list_blobs returns blob
# *names* (strings), so `props` is a list of the ".npy" object names.
bucket_name = "proposition-vectors"
props = list_blobs(bucket_name,".npy")

In [5]:
# Sample entries of `props` at a fixed stride and extract the (start_id, end_id)
# pair encoded in each name.
# NOTE(review): for i == 0 the index 160*i - 1 evaluates to -1, which selects
# the LAST entry of `props` rather than the first — confirm this wrap-around is
# intended. The saved output under this cell begins at enwiki-00000000, so it
# may have been produced by an earlier revision of the index expression.
STRIDE = 160
ids = []
for i in range(8):
    # The text after the first "_" is split on "-"; the first five fields form
    # the start id and the remaining fields form the end id.
    fields = props[STRIDE * i - 1].split("_")[1].split("-")
    start_id = "-".join(fields[:5])
    # Drop the last four characters (the ".npy" extension `props` was filtered on).
    end_id = "-".join(fields[5:])[:-4]
    ids.append((start_id, end_id))

ids

[('enwiki-00000000-0000-0000-0000', 'enwiki-00005131-0001-0000-0000'),
 ('enwiki-00859626-0002-0001-0006', 'enwiki-00864894-0016-0000-0002'),
 ('enwiki-01587902-0000-0000-0002', 'enwiki-01592440-0010-0000-0000'),
 ('enwiki-02315769-0000-0000-0001', 'enwiki-02321829-0003-0000-0002'),
 ('enwiki-03013027-0007-0000-0006', 'enwiki-03017772-0006-0000-0004'),
 ('enwiki-03823541-0002-0000-0002', 'enwiki-03827874-0003-0000-0010'),
 ('enwiki-04586442-0027-0000-0002', 'enwiki-04590551-0015-0000-0004'),
 ('enwiki-05324078-0001-0000-0000', 'enwiki-05329181-0044-0002-0003')]

In [11]:
# Collect the (start_id, end_id) pair encoded in every 160th name of `props`.
# NOTE(review): the index 160*i - 1 is -1 when i == 0, so the first tuple comes
# from the last entry of `props` (visible in the output below, whose first row
# carries the highest ids) — confirm that is intended.
STRIDE = 160
ids = []
for i in range(8):
    # Everything after the first "_" is a "-"-separated sequence: five fields
    # of start id followed by the fields of the end id.
    fields = props[STRIDE * i - 1].split("_")[1].split("-")
    start_id = "-".join(fields[:5])
    # Strip the trailing four characters (the ".npy" extension).
    end_id = "-".join(fields[5:])[:-4]
    ids.append((start_id, end_id))

ids

[('enwiki-06048186-0015-0000-0006', 'enwiki-06052180-0000-0000-0002'),
 ('enwiki-00855357-0012-0000-0004', 'enwiki-00859626-0002-0001-0005'),
 ('enwiki-01583453-0039-0000-0004', 'enwiki-01587902-0000-0000-0001'),
 ('enwiki-02308005-0007-0001-0004', 'enwiki-02315769-0000-0000-0000'),
 ('enwiki-03008104-0029-0001-0002', 'enwiki-03013027-0007-0000-0005'),
 ('enwiki-03818516-0004-0000-0001', 'enwiki-03823541-0002-0000-0001'),
 ('enwiki-04582327-0016-0000-0002', 'enwiki-04586442-0027-0000-0001'),
 ('enwiki-05319613-0001-0000-0007', 'enwiki-05324078-0000-0000-0003')]

In [16]:
# Emit a small Python module, emb_list.py, that defines `emb_file_names` as a
# list literal holding every name in `props`, in sorted order.
with open('emb_list.py', 'w') as out_file:
    out_file.write('emb_file_names = [\n')
    # One indented, double-quoted, comma-terminated entry per name.
    out_file.writelines(f'    "{name}",\n' for name in sorted(props))
    out_file.write(']')

In [17]:
def kamal(n):
    """Generator that yields the integers 0, 1, ..., n - 1 in order."""
    yield from range(n)

In [18]:
# Create a generator that will yield 0..9; nothing runs until next() is called.
gen = kamal(10)

In [29]:
# Pull the next value from `gen`. Once all 10 values of kamal(10) have been
# consumed this raises StopIteration, which is the result recorded below.
next(gen)

StopIteration: 

In [1]:
import os

import chromadb
import numpy as np

In [2]:
# Connect to the Chroma server over HTTP. The endpoint can be overridden with
# the CHROMA_HOST / CHROMA_PORT environment variables so the notebook is not
# tied to a single machine; the defaults keep the previously hard-coded address.
client = chromadb.HttpClient(
    host=os.environ.get("CHROMA_HOST", "34.44.238.198"),
    port=int(os.environ.get("CHROMA_PORT", "8000")),
)

In [3]:
# Connectivity check against the server; the integer shown below is what it
# returned (presumably a nanosecond timestamp — confirm against the Chroma docs).
client.heartbeat()

1746964145266340225

In [8]:
# Show which collections currently exist on the server.
client.list_collections()

[Collection(name=props)]

In [10]:
# Get the props collection
collection = client.get_collection("props")

# Ask the server for the record count instead of downloading every record
# just to measure the length of the result.
total_items = collection.count()

# Fetch only the records that are actually displayed (ids are always returned;
# `include` limits the payload to the document text).
results = collection.get(limit=5, include=["documents"])

print(f"Total items in collection: {total_items}")
print("\nFirst 5 items:")
for item_id, document in zip(results['ids'], results['documents']):
    print(f"\nID: {item_id}")
    print(f"Document: {document[:200]}...")  # Print first 200 chars


Total items in collection: 19000

First 5 items:

ID: 0
Document: ! (Cláudia Pascoal album)
! is the debut studio album by Cláudia Pascoal....

ID: 1
Document: ! (Cláudia Pascoal album)
Cláudia Pascoal is a Portuguese singer....

ID: 2
Document: ! (Cláudia Pascoal album)
The album was released in Portugal on 27 March 2020....

ID: 3
Document: ! (Cláudia Pascoal album)
The album was released by Universal Music Portugal....

ID: 4
Document: ! (Cláudia Pascoal album)
The album peaked at number six on the Portuguese Albums Chart....


In [None]:
# DESTRUCTIVE: removes the "props" collection from the server.
# This cell has no execution count (In [None]), i.e. it was not run in the
# saved session. NOTE(review): guard or remove it so a Run All does not drop
# the collection by accident.
client.delete_collection("props")

In [13]:
# Populate a scratch collection ("kamal") with synthetic records.
collection = client.get_or_create_collection("kamal")

n_items = 100
embedding_dim = 768  # Standard embedding dimension

# Seeded generator so that re-running the cell reproduces the same vectors.
rng = np.random.default_rng(0)

# One (n_items, embedding_dim) draw of uniform [0, 1) values, converted to
# nested Python lists for the client; string ids and one document per item.
embeddings = rng.random((n_items, embedding_dim)).tolist()
ids = [str(i) for i in range(n_items)]
documents = [f"This is a dummy document number {i}" for i in range(n_items)]

# Add to collection in one batch
collection.add(
    embeddings=embeddings,
    documents=documents,
    ids=ids
)

print(f"Added {n_items} dummy items to collection")


Added 100 dummy items to collection


In [14]:
# Probe the collection with a random vector of the same dimension (768) as the
# dummy embeddings and print the closest matches.
query_embedding = np.random.rand(768).tolist()

# Ask for the five nearest records.
results = collection.query(n_results=5, query_embeddings=query_embedding)

print("Query results:")
# Only one query was issued, so its hits sit at index 0 of each result field.
matched_docs = results['documents'][0]
matched_distances = results['distances'][0]
for rank, (doc, distance) in enumerate(zip(matched_docs, matched_distances), start=1):
    print(f"{rank}. Document: {doc}")
    print(f"   Distance: {distance:.4f}\n")


Query results:
1. Document: This is a dummy document number 51
   Distance: 116.5082

2. Document: This is a dummy document number 74
   Distance: 119.2914

3. Document: This is a dummy document number 28
   Distance: 119.7850

4. Document: This is a dummy document number 9
   Distance: 120.6148

5. Document: This is a dummy document number 3
   Distance: 121.5881



In [13]:
# DESTRUCTIVE: removes the "props" collection from the server.
# NOTE(review): execution counts here are out of order (this is In [13], the
# listing after it is In [12]), so the listing below reflects the state
# *before* this deletion ran.
client.delete_collection("props")

In [12]:
# Print the collections currently present on the server.
print(client.list_collections())

[Collection(name=props-2), Collection(name=kamal), Collection(name=props)]
