In [2]:
from FlagEmbedding import BGEM3FlagModel
from tqdm.notebook import tqdm
from dotenv import load_dotenv

from m3_sentence_transformer.data_sampler import get_sample_docs_with_all_qrels

# Load environment variables via python-dotenv before anything else is set up.
load_dotenv()

# BGE-M3 embedding model in half precision (use_fp16=True). This `model` object is
# what the later cells call .encode() on for both documents and queries.
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [12]:
import weaviate
import weaviate.classes as wvc
# Alternative connection to a custom (remote, TLS + API key) deployment, kept for reference.
# NOTE(review): it reads its settings with os.getenv, but `os` is not imported in any
# visible cell -- add `import os` before enabling this line.
#client = weaviate.connect_to_custom(http_host=os.getenv("WEAVIATE_HTTP_HOST"),http_port=int(os.getenv("WEAVIATE_HTTP_PORT")), http_secure=True, grpc_host=os.getenv("WEAVIATE_GRPC_HOST"), grpc_port=int(os.getenv("WEAVIATE_GRPC_PORT")), grpc_secure=True, auth_credentials=weaviate.auth.AuthApiKey(api_key=os.getenv("WEAVIATE_API_KEY")))
# Active connection: a local Weaviate instance, using the client's default settings.
client = weaviate.connect_to_local()

In [13]:
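# Uncomment to drop the collection (and every object stored in it) so that the
# schema cell below can recreate it and the import can start from scratch.
#client.collections.delete("neuclir_1_mutli_bge_m3_small")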

In [14]:
# Create the collection if it does not exist yet; otherwise reuse the existing one.
#
# An explicit existence check replaces the former blanket `except Exception: print(e)`,
# which hid every failure (connection problems, schema errors, ...) and left `documents`
# unbound whenever the collection was already present. Real errors now surface here.
COLLECTION_NAME = "neuclir_1_mutli_bge_m3_small"

if client.collections.exists(COLLECTION_NAME):
    documents = client.collections.get(COLLECTION_NAME)
else:
    documents = client.collections.create(
        name=COLLECTION_NAME,
        # Two named vectors whose embeddings are supplied by the notebook
        # (NamedVectors.none = no server-side vectorizer). Both use an HNSW index
        # with the same cache limit. Product quantization is not enabled; if it is
        # wanted later, see the `quantizer=` argument of Configure.VectorIndex.hnsw.
        vectorizer_config=[
            wvc.config.Configure.NamedVectors.none(
                name=vector_name,
                vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
                    vector_cache_max_objects=100000,
                ),
            )
            for vector_name in ("title_dense", "text_dense")
        ],
        # Property order is kept exactly as in the original schema definition.
        properties=[
            wvc.config.Property(name="doc_id", data_type=wvc.config.DataType.UUID),
            # BLOB slots for the sparse and ColBERT representations.
            *[
                wvc.config.Property(name=blob_name, data_type=wvc.config.DataType.BLOB)
                for blob_name in (
                    "title_sparse",
                    "document_sparse",
                    "title_colbert",
                    "document_colbert",
                )
            ],
            # Plain-text document fields.
            *[
                wvc.config.Property(name=text_name, data_type=wvc.config.DataType.TEXT)
                for text_name in ("title", "text", "url")
            ],
        ],
    )
    

In [6]:
import pandas as pd

# Load the sampled documents from the project helper.
# NOTE(review): presumably a pandas DataFrame with doc_id/title/text/url columns --
# the import cell slices it by row, indexes batch["title"] and calls .itertuples();
# confirm against get_sample_docs_with_all_qrels.
docs = get_sample_docs_with_all_qrels()


# Show how many documents were loaded (displayed as the cell output).
len(docs)

177301

In [7]:
import base64
import pickle  # imported here so the helper works without relying on a later cell


def to_blob(obj):
    """Serialize ``obj`` into a base64 text string.

    The object is pickled, the pickle bytes are base64-encoded, and the encoded
    bytes are decoded to ``str`` so the value can be sent as text.

    Args:
        obj: Any picklable Python object.

    Returns:
        str: Base64 text; ``pickle.loads(base64.b64decode(blob))`` restores the object.

    Note:
        Only unpickle blobs that this notebook produced itself -- ``pickle.loads``
        on untrusted data can execute arbitrary code.
    """
    return base64.b64encode(pickle.dumps(obj)).decode("utf-8")

In [11]:
from time import sleep
import pickle  # needed by to_blob() once the sparse/ColBERT blobs below are re-enabled

# Tunables for the import loop.
CHUNK_SIZE = 10000            # rows embedded per model.encode() call / per batch context
PAUSE_BETWEEN_CHUNKS_S = 10   # short pause after every chunk
LONG_PAUSE_EVERY = 50         # take a long pause every N chunks ...
LONG_PAUSE_S = 5 * 60         # ... of five minutes, as the printed message announces

batches = [(i, i + CHUNK_SIZE) for i in range(0, len(docs), CHUNK_SIZE)]
coll = client.collections.get("neuclir_1_mutli_bge_m3_small")

outer_progress = tqdm(total=len(docs))

for i, (start, end) in enumerate(batches):
    if i % LONG_PAUSE_EVERY == 0 and i != 0:
        # The original printed this message without actually pausing.
        print("Sleeping for 5 minutes to allow indexing.")
        sleep(LONG_PAUSE_S)

    # Reset the index so row.Index below lines up with the positions in the
    # embedding array returned for this chunk.
    batch = docs[start:end].reset_index(drop=True)

    # Only dense title embeddings are computed and stored at the moment.
    # TODO: the "text_dense" vector and the *_sparse / *_colbert BLOB properties of the
    # collection are not populated by this loop. Sketch of the disabled part:
    # doc_embeddings = model.encode(batch["text"].to_list(), return_dense=True, return_sparse=True, return_colbert_vecs=False)
    # title_sparse_blobs = [to_blob(x) for x in title_embeddings["lexical_weights"]]
    # title_colbert_blobs = [to_blob(x) for x in title_embeddings["colbert_vecs"]]
    title_embeddings = model.encode(
        batch["title"].to_list(),
        return_dense=True,
        return_sparse=False,
        return_colbert_vecs=False,
    )

    # fixed_size(60, 2): same positional arguments as before (per the client docs,
    # batch size and number of concurrent requests).
    with coll.batch.fixed_size(60, 2) as b:
        for row in batch.itertuples(index=True):
            b.add_object(
                properties={
                    "doc_id": row.doc_id,
                    "title": row.title,
                    "text": row.text,
                    "url": row.url,
                    # "title_sparse": title_sparse_blobs[row.Index],
                    # "title_colbert": title_colbert_blobs[row.Index],
                },
                vector={
                    "title_dense": title_embeddings["dense_vecs"][row.Index],
                },
                uuid=row.doc_id,
            )
            outer_progress.update(1)

    # Leaving the `with` block flushes the remaining objects, so failures are only
    # complete at this point. Checking inside the context (as before) could miss
    # errors from objects that were still queued.
    failed_objects = coll.batch.failed_objects
    if failed_objects:
        print(f"Found Errors: {len(failed_objects)}")

    # Pause between chunks, now outside the batch context so it is not held open.
    sleep(PAUSE_BETWEEN_CHUNKS_S)

outer_progress.close()

        

  0%|          | 0/177301 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings:  12%|█▎        | 5/40 [00:08<00:56,  1.62s/it]


KeyboardInterrupt: 

In [16]:

# Sanity check after the import: ask Weaviate for the total number of objects
# stored in the collection and print it.
zh = client.collections.get("neuclir_1_mutli_bge_m3_small")
aggregation = zh.aggregate.over_all(total_count=True)
print(aggregation.total_count)

176474


In [15]:
# Verbose node report for the collection; the recorded output lists each shard with
# its object_count, vector_indexing_status and vector_queue_length.
client.cluster.nodes(
    collection="neuclir_1_mutli_bge_m3_small",
    output="verbose"
)

[Node(git_hash='cfdbdd0', name='node1', shards=[Shard(collection='Neuclir_1_mutli_bge_m3_small', name='3Vq0ckBvFSiX', node='node1', object_count=176474, vector_indexing_status='READY', vector_queue_length=0, compressed=False, loaded=True)], stats=Stats(object_count=176474, shard_count=1), status='HEALTHY', version='1.27.5')]

In [17]:
# MetadataQuery from the client's public namespace rather than an internal module path.
from weaviate.classes.query import MetadataQuery

# Fetch the collection handle here instead of relying on `coll` left over from the
# import cell, so this cell also works on a fresh kernel when the (slow) import is skipped.
coll = client.collections.get("neuclir_1_mutli_bge_m3_small")

# Embed the query with the same model and dense-only settings used for the titles.
query_embeddings = model.encode(
    ["What political impact do news have on teenagers?"],
    return_dense=True,
    return_sparse=False,
    return_colbert_vecs=False,
)

# Nearest-neighbour search against the "title_dense" named vector; request the
# distance of each hit in the returned metadata.
response = coll.query.near_vector(
    near_vector=query_embeddings["dense_vecs"][0],
    target_vector="title_dense",
    limit=50,
    return_metadata=MetadataQuery(distance=True),
)

# Print one distance per returned object.
for o in response.objects:
    print(o.metadata.distance)

0.3335208296775818
0.373696506023407
0.3738090991973877
0.3791655898094177
0.380637526512146
0.38228732347488403
0.38613414764404297
0.3997741937637329
0.4023386240005493
0.4060593843460083
0.41171079874038696
0.4135178327560425
0.41399049758911133
0.414242684841156
0.4175682067871094
0.42212599515914917
0.4224163889884949
0.4284636974334717
0.4299066662788391
0.4303016662597656
0.4310094118118286
0.43485647439956665
0.4354240894317627
0.43781232833862305
0.4388442039489746
0.43887484073638916
0.43953585624694824
0.44052112102508545
0.44064629077911377
0.4419466257095337
0.44270193576812744
0.4434394836425781
0.44470804929733276
0.44488435983657837
0.44635099172592163
0.4471167325973511
0.4475809335708618
0.4504570960998535
0.45121055841445923
0.4520554542541504
0.45220035314559937
0.45233649015426636
0.4526246190071106
0.45290887355804443
0.4531651735305786
0.4540814161300659
0.4555288553237915
0.4560089111328125
0.45607990026474
0.4564289450645447
