In [1]:
import ir_datasets
import pandas as pd
import numpy as np

import ir_measures
from ir_measures import *

from FlagEmbedding import BGEM3FlagModel

dataset = ir_datasets.load("neuclir/1/multi/trec-2023")


In [4]:
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

queries = pd.DataFrame(dataset.queries_iter())
qrels = pd.DataFrame(dataset.qrels_iter())

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [5]:
grouped = queries.groupby("query_id").first().reset_index()

desc = grouped["description"].tolist()

query_embeddings = model.encode(desc, return_dense=True, return_sparse=False, return_colbert_vecs=True)

query_embedding_pairs = list(zip(grouped["query_id"], grouped["description"], query_embeddings['dense_vecs'], query_embeddings['colbert_vecs']))

len(query_embedding_pairs)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


76

In [9]:
import weaviate
client = weaviate.connect_to_local()

In [66]:
coll = client.collections.get("neuclir_1_mutli_bge_m3_200k")
aggregation = coll.aggregate.over_all(total_count=True)
print(f"Total number of documents in the collection: {aggregation.total_count}")

Total number of documents in the collection: 199861


In [67]:
from weaviate.collections.classes.grpc import MetadataQuery

results_dense = []
results_hybrid = []
for (q, d, e, _) in query_embedding_pairs:
    response_dense = coll.query.near_vector(near_vector=e, target_vector="title_dense", limit=1000, return_metadata=MetadataQuery(distance=True))
    response_hybrid = coll.query.hybrid(query=d, vector=e, alpha=0.75, target_vector="title_dense", limit=1000, return_metadata=MetadataQuery(distance=True, score=True, explain_score=True))
    results_hybrid.append(response_hybrid)
    results_dense.append(response_dense)

print(f"Number of dense results: {len(results_dense)}")
print(f"Number of hybrid results: {len(results_hybrid)}")


Number of dense results: 76
Number of hybrid results: 76


In [61]:
def results_to_ir_measure_df(w_results: list, w_query_embedding_pairs: list):
    data = []
    # Loop through each query and corresponding results
    for i, (query,_, _, _) in enumerate(w_query_embedding_pairs):
        for o in w_results[i].objects:
            doc_id = str(o.uuid)
            if o.metadata.score is not None:
                score = o.metadata.score
            else:
                score = 1 - o.metadata.distance
            data.append({"query_id": query, "doc_id": doc_id, "score": score})

    # Convert to a DataFrame    
    return  pd.DataFrame(data)

In [68]:
dense = results_to_ir_measure_df(results_dense, query_embedding_pairs)
hybrid = results_to_ir_measure_df(results_hybrid, query_embedding_pairs)

dense.head()

Unnamed: 0,query_id,doc_id,score
0,200,2fb2cba8-5d76-4dc7-8c2e-80dbd2870eb8,0.714949
1,200,1b423e4f-107b-41ee-94fb-d789ae4967a4,0.713002
2,200,703b02d8-fde2-4c08-95f7-6ea0243ce9be,0.709095
3,200,bf8c49ce-28f2-4c43-b224-64ace5c8998d,0.708849
4,200,fbb08c2b-5a10-4df8-8a21-3511f87c2c6d,0.688554


In [63]:

measures = ir_measures.calc_aggregate([nDCG@20, P@5, MRR@10, P(rel=1)@5, Judged@10, MAP, RBP(rel=1), R@100, R@1000], qrels[["query_id", "doc_id", "relevance"]], dense)

measures

{Judged@10: 0.964473684210526,
 R@1000: 0.5906430399311716,
 AP: 0.18023692072790182,
 R@100: 0.21631323677104852,
 P@5: 0.505263157894737,
 RBP(rel=1): np.float64(0.4770999124168744),
 RR@10: 0.6297514619883041,
 nDCG@20: 0.33898073865480294}

In [64]:
measures = ir_measures.calc_aggregate([nDCG@20, P@5, MRR@10, P(rel=1)@5, Judged@10, MAP, RBP(rel=1), R@100, R@1000], qrels[["query_id", "doc_id", "relevance"]], hybrid)

measures

{Judged@10: 0.9618421052631578,
 R@1000: 0.5978044589138959,
 AP: 0.1855901140845485,
 R@100: 0.22200892808067949,
 P@5: 0.5026315789473684,
 RBP(rel=1): np.float64(0.46762939644736445),
 RR@10: 0.6164108187134503,
 nDCG@20: 0.3406532166488425}

### Colbert

In [11]:
coll = client.collections.get("neuclir_1_mutli_bge_m3_100k")
coll_colbert = client.collections.get("neuclir_1_mutli_bge_m3_100k_colbert")


In [12]:
from weaviate.classes.query import Filter

coll_colbert.query.fetch_objects(filters=Filter.by_property("doc_id").equal("26aafb99-9dd4-4e7e-ab93-cfe4431d1197"),include_vector=True)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('f8227203-fbba-4c9d-8c46-1071a01b0043'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'doc_id': UUID('26aafb99-9dd4-4e7e-ab93-cfe4431d1197')}, references=None, vector={'title_colbert': [-30.0, -62.0, -23.0, -83.0, -58.0, -9.0, -27.0, 13.0, -47.0, -47.0, 36.0, -21.0, -34.0, -6.0, -10.0, 29.0, 14.0, 50.0, 14.0, -46.0, -26.0, -11.0, -27.0, -44.0, 10.0, -20.0, -90.0, 22.0, 12.0, -65.0, -61.0, -4.0, -69.0, 13.0, 21.0, -32.0, -41.0, -20.0, 71.0, -10.0, -53.0, -31.0, -29.0, -34.0, -28.0, 17.0, -75.0, 26.0, -43.0, 5.0, -11.0, -80.0, 16.0, 19.0, 27.0, -14.0, 45.0, 0.0, 42.0, -58.0, -81.0, -13.0, 30.0, 37.0, -33.0, -23.0, 17.0, 47.0, -49.0, 5.0, -41.0, -32.0, -78.0, 32.0, 67.0, -32.0, -40.0, -24.0, -5.0, 34.0, 6.0, -38.0, 22.0, 9.0, 23.0, -27.0, -28.0, 23.0, -57.0, 19.0, 18.0, -56.0, -7.0, -5.0, -67.0, 14.0, 11.

In [31]:
from weaviate.collections.classes.grpc import MetadataQuery

results_dense = []
results_hybrid = []
for (q, d, e, _) in query_embedding_pairs:
    response_dense = coll.query.near_vector(near_vector=e, target_vector="title_dense", limit=1000, return_metadata=MetadataQuery(distance=True))
    response_hybrid = coll.query.hybrid(query=d, vector=e, alpha=0.75, target_vector="title_dense", limit=1000, return_metadata=MetadataQuery(distance=True, score=True, explain_score=True))
    results_hybrid.append(response_hybrid)
    results_dense.append(response_dense)

len(results_dense)

76

In [45]:
from tqdm import tqdm  # Import tqdm for the progress bar

performance = []
hybrid = True

# Wrap the outer loop with tqdm for progress tracking
for (q, d, e, colbert_vec_query) in tqdm(query_embedding_pairs, desc="Processing Queries"):
    if hybrid is True:
        response = coll.query.hybrid(
            query=d, 
            vector=e, 
            alpha=0.75, 
            target_vector="title_dense", 
            limit=1000, 
            return_metadata=MetadataQuery(distance=True, score=True, explain_score=True)
        )
    else:
        response = coll.query.near_vector(
            near_vector=e, 
            target_vector="title_dense", 
            limit=1000, 
            return_metadata=MetadataQuery(distance=True)
        )
    
    for o in response.objects:
        doc_id = str(o.properties["doc_id"])
        results_colbert = coll_colbert.query.fetch_objects(
            filters=Filter.by_property("doc_id").equal(doc_id),
            include_vector=True
        )
        
        colbert_vecs_inner = [obj.vector["title_colbert"] for obj in results_colbert.objects]
        
        if colbert_vecs_inner:
            # Convert to NumPy array for efficient computation
            colbert_vecs_inner = np.array(colbert_vecs_inner)
            
            max_scores = []
            for colbert_v in colbert_vec_query:
                # Compute max similarity using vectorized operations
                similarities = colbert_vecs_inner @ colbert_v
                max_score = np.max(similarities)
                max_scores.append(max_score)
            
            # Aggregate max scores
            total_score = np.sum(max_scores)  # Or np.sum(max_scores), depending on your needs
        else:
            total_score = o.metadata.distance
        
        performance.append((q, doc_id, total_score))

print(f"Total results processed: {len(performance)}")


Processing Queries: 100%|██████████| 76/76 [04:33<00:00,  3.59s/it]

Total results processed: 76000





In [46]:
late = pd.DataFrame.from_records(performance, columns=["query_id", "doc_id", "score"])
dense = results_to_ir_measure_df(results_dense, query_embedding_pairs)
hybrid = results_to_ir_measure_df(results_hybrid, query_embedding_pairs)

late_doc_ids = set(late["doc_id"])
dense_doc_ids = set(dense["doc_id"])
print(len(late_doc_ids))
print(len(dense_doc_ids))

44959
45550


In [35]:
dense

Unnamed: 0,query_id,doc_id,score
0,200,2fb2cba8-5d76-4dc7-8c2e-80dbd2870eb8,0.714949
1,200,1b423e4f-107b-41ee-94fb-d789ae4967a4,0.713002
2,200,703b02d8-fde2-4c08-95f7-6ea0243ce9be,0.709095
3,200,bf8c49ce-28f2-4c43-b224-64ace5c8998d,0.708849
4,200,fbb08c2b-5a10-4df8-8a21-3511f87c2c6d,0.688554
...,...,...,...
9995,209,d6542c9d-4ddd-4953-b7d6-29d80309d6c9,0.461409
9996,209,5ee110cc-8f15-44eb-9328-c93621b577c3,0.461406
9997,209,ea06db41-e2db-4f77-9058-74294bb595ec,0.461328
9998,209,49044aad-d99d-46ce-b5d7-3e0b1a6bb187,0.461258


In [42]:
late = pd.DataFrame(performance, columns=["query_id", "doc_id", "score"])

late.sort_values(by="score", ascending=False)

Unnamed: 0,query_id,doc_id,score
64030,264,e49f09b1-5851-4a80-91a4-63c42083d09a,25958.168551
64178,264,aa6cbfe6-ca8b-4409-bdb4-a3e2c621f2a8,25858.352795
64010,264,40fded2d-d3fd-4479-8795-5606de404e11,25716.607300
64001,264,737f38bd-5033-465f-a5ef-89523ec388db,25587.155612
64060,264,12a40939-afc9-457b-b420-4cef797280f4,25291.688594
...,...,...,...
11637,211,70602a3e-f42f-4811-8d98-9e226013a7b4,3334.031016
43779,243,59768d2d-275e-4c3d-a787-b1e35ed96379,3304.029958
43815,243,1a8465e9-f077-4393-9671-b863813855f5,3283.840620
43706,243,47177912-28e7-4a90-88be-0e8ee79bf5bf,3282.426569


In [43]:
# Get the unique doc_ids from qrels and late
qrels_doc_ids = set(qrels["doc_id"].unique())
dense_doc_ids = set(dense["doc_id"].unique())

print(len(qrels_doc_ids))
print(len(dense_doc_ids))
late_doc_ids = set(late["doc_id"].unique())
print(len(late_doc_ids))
common_doc_ids = qrels_doc_ids.intersection(dense_doc_ids)
print(f"Number of common doc_ids: {len(common_doc_ids)}")
# Find the intersection of both sets
common_doc_ids = qrels_doc_ids.intersection(late_doc_ids)

# Get the count of common doc_ids
common_doc_ids_count = len(common_doc_ids)

print(f"Number of common doc_ids: {common_doc_ids_count}")

76913
45550
45550
Number of common doc_ids: 42066
Number of common doc_ids: 42066


In [47]:
qrels_short = qrels
measures = ir_measures.calc_aggregate([nDCG@20, P@5, MRR@10, P(rel=1)@5, Judged@10, MAP, RBP(rel=1), R@100, R@1000], qrels_short[["query_id", "doc_id", "relevance"]], late)

measures

{Judged@10: 0.9486842105263156,
 R@1000: 0.6251350343356242,
 AP: 0.16493276296022177,
 R@100: 0.1909931088533459,
 P@5: 0.42894736842105285,
 RBP(rel=1): np.float64(0.42078295924823683),
 RR@10: 0.6311142439431915,
 nDCG@20: 0.3001407505269542}

In [39]:
qrels_short = qrels
measures = ir_measures.calc_aggregate([nDCG@20, P@5, MRR@10, P(rel=1)@5, Judged@10, MAP, RBP(rel=1), R@100, R@1000], qrels_short[["query_id", "doc_id", "relevance"]], hybrid)

measures

{Judged@10: 0.9710526315789472,
 R@1000: 0.6251350343356242,
 AP: 0.1951043393091836,
 R@100: 0.22650656503220415,
 P@5: 0.5052631578947369,
 RBP(rel=1): np.float64(0.46895989751480766),
 RR@10: 0.6182696324143692,
 nDCG@20: 0.34118858652783696}

In [48]:
qrels_short = qrels
measures = ir_measures.calc_aggregate([nDCG@20, P@5, MRR@10, P(rel=1)@5, Judged@10, MAP, RBP(rel=1), R@100, R@1000], qrels_short[["query_id", "doc_id", "relevance"]], dense)

measures

{Judged@10: 0.969736842105263,
 R@1000: 0.6245714071887468,
 AP: 0.19039028557204712,
 R@100: 0.22147558751174765,
 P@5: 0.5,
 RBP(rel=1): np.float64(0.4765843021811337),
 RR@10: 0.6321794068504595,
 nDCG@20: 0.3403516921943632}

In [265]:

from weaviate.classes.query import Filter

performance = []

for (q, res) in results_dense:
        # Initialize the results
    late_interaction_scores = []
    
    for obj in res.objects:
        results_colbert = coll_colbert.query.fetch_objects(filters=Filter.by_property("doc_id").equal(obj.properties["doc_id"]),
                                         include_vector=True)
        
        colbert_vecs_inner = []
        
        for o in results_colbert.objects:
            doc_id = str(o.properties["doc_id"])
            colbert_vecs_inner.append(o.vector["title_colbert"])
            
            total_score = 0
    
            # Process one query
            for i, (query, _, _, colbert_vecs_query) in enumerate(query_embedding_pairs[:1]):
                # Store per-query max similarities
                for colbert_vec_query in colbert_vecs_query:
                    # Compute similarity with all document token embeddings
                    # Vectors are already normalized
                    similarities = [np.dot(colbert_vec_query, colbert_vec_doc) for colbert_vec_doc in colbert_vecs_inner]
                    # Perform max operation
                    max_score = max(similarities)
                    max_scores.append(max_score)
                
            # Aggregate max scores (e.g., sum or mean)
            total_score = np.mean(max_scores)  # Alternatively, use np.mean(max_scores) for average score
            late_interaction_scores.append((obj.properties["doc_id"], total_score))
                
            performance.append((q, doc_id, total_score))
    

TypeError: cannot unpack non-iterable QueryReturn object

In [23]:
aggregation = coll_colbert.aggregate.over_all(total_count=True)
print(aggregation.total_count)

1824347
