## Loading data:

In [None]:
!pip -q install datasets

Load Datasets:

In [None]:
from datasets import load_dataset

# Fetch the English, German and Russian configurations of the same
# Global-MMLU embedding dataset, in that order.
ds_GMMLU_en, ds_GMMLU_de, ds_GMMLU_ru = (
    load_dataset("JRQi/Global-MMLU-emb", config_name)
    for config_name in ("en", "de", "ru")
)

Convert the `test` split of each dataset to a pandas DataFrame:

In [None]:
import pandas as pd

# Materialise the `test` split of every language as its own DataFrame.
df_GMMLU_en, df_GMMLU_de, df_GMMLU_ru = (
    pd.DataFrame(dataset['test'])
    for dataset in (ds_GMMLU_en, ds_GMMLU_de, ds_GMMLU_ru)
)

In [None]:
# Display the first row to see which columns the English frame provides.
df_GMMLU_en.iloc[0]

Unnamed: 0,0
id,abstract_algebra/test/0
subject,abstract_algebra
subject_category,STEM
question,Find the degree for the given field extension ...
option_a,0
option_b,4
option_c,2
option_d,6
answer,B
emb,"[0.026153564, -0.00036787987, -0.015670776, 0...."


Example questions for each subject:

In [None]:
# Distinct values of the `subject` column of the English frame.
categories_en = df_GMMLU_en['subject'].unique()

# Print one tab-separated sample per subject:
# subject, first 50 characters of the question, then the four options.
for category in categories_en:
    example = df_GMMLU_en.loc[df_GMMLU_en['subject'] == category].iloc[0]
    question_preview = f"{example['question'][:50]}..."
    columns = [example['subject'], question_preview]
    columns += [example[f'option_{letter}'] for letter in 'abcd']
    print("\t".join(f"{value}" for value in columns))

abstract_algebra	Find the degree for the given field extension Q(sq...	0	4	2	6
anatomy	A lesion causing compression of the facial nerve a...	paralysis of the facial muscles.	paralysis of the facial muscles and loss of taste.	paralysis of the facial muscles, loss of taste and lacrimation.	paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.
astronomy	What is true for a type-Ia ("type one-a") supernov...	This type occurs in binary systems.	This type occurs in young galaxies.	This type produces gamma-ray bursts.	This type produces high amounts of X-rays.
business_ethics	_______ such as bitcoin are becoming increasingly ...	Cryptocurrencies, Expensive, Secure, Financial Crime	Traditional currency, Cheap, Unsecure, Charitable giving	Cryptocurrencies, Cheap, Secure, Financial crime	Traditional currency, Expensive, Unsecure, Charitable giving
clinical_knowledge	What size of cannula would you use in a patient wh...	18 gauge.	20 gauge.	22 gauge.	24 gauge.
colleg

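Generate packed unsigned-binary (`ubinary`) embeddings — one sign bit per dimension, eight dimensions per byte. This speeds up dense passage retrieval (DPR) and avoids overloading memory: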

In [None]:
import numpy as np

def convert_to_ubinary(embedding):
    """Pack the signs of an embedding into unsigned bytes.

    Every component greater than zero becomes a 1 bit, everything else a 0 bit.
    The bits are grouped eight at a time (so the number of components must be
    a multiple of 8) and each group is packed into one ``uint8``.

    Args:
        embedding: Sequence of numeric embedding components.

    Returns:
        1-D ``numpy.ndarray`` of dtype ``uint8`` with one byte per 8 components.
    """
    sign_bits = np.greater(np.asarray(embedding), 0).astype(np.uint8)
    byte_rows = sign_bits.reshape(-1, 8)  # one row of 8 bits per output byte
    return np.packbits(byte_rows, axis=1).ravel()

In [None]:
# Add a packed `emb_ubinary` column, derived from `emb`, to each language's frame.
for frame in (df_GMMLU_en, df_GMMLU_de, df_GMMLU_ru):
    frame['emb_ubinary'] = frame['emb'].apply(convert_to_ubinary)

In [None]:
# Sanity check: show the packed embeddings that were just added to the English frame.
print(df_GMMLU_en['emb_ubinary'])

0        [158, 48, 183, 236, 211, 28, 202, 175, 95, 239...
1        [210, 52, 199, 186, 251, 27, 239, 47, 31, 70, ...
2        [238, 45, 28, 184, 169, 19, 109, 111, 55, 231,...
3        [18, 13, 120, 105, 201, 106, 91, 47, 132, 198,...
4        [62, 181, 199, 232, 249, 123, 235, 45, 127, 23...
                               ...                        
14037    [118, 123, 62, 97, 241, 92, 123, 201, 108, 224...
14038    [240, 106, 29, 108, 193, 28, 239, 217, 236, 11...
14039    [240, 56, 103, 106, 121, 120, 203, 217, 96, 20...
14040    [64, 45, 230, 110, 251, 226, 245, 217, 108, 15...
14041    [208, 123, 111, 108, 139, 159, 206, 160, 96, 8...
Name: emb_ubinary, Length: 14042, dtype: object


# DPR using binary embeddings:

In [None]:
!pip -q install faiss-cpu

In [None]:
def semantic_search(dataframe, index, lang, question_col="question", top_k=3,
                    documents=None, category_col="subject"):
    """Retrieve the best-matching documents for every row of ``dataframe``.

    Each row is searched in two phases:

    1. The packed binary query (column ``emb_ubinary``) is searched on
       ``index`` for ``min(10 * top_k, index.ntotal)`` candidates.
    2. The candidates' packed codes are read back from the index, unpacked to
       -1/+1 vectors and re-scored by dot product with the float query
       (column ``emb``). The ``top_k`` highest re-scored candidates are kept.

    Args:
        dataframe: Frame with the columns ``id``, ``answer``, ``emb``,
            ``emb_ubinary``, ``question_col`` and ``category_col``.
        index: Binary index exposing ``search``, ``reconstruct`` and ``ntotal``.
        lang: Language tag stored on every returned hit.
        question_col: Column holding the query text.
        top_k: Number of hits kept per query.
        documents: Sequence mapping an index position (``doc_id``) to its
            document. When omitted, the notebook-level ``docs`` list is used,
            which must then be aligned with ``index``.
        category_col: Column whose value is stored under ``'category'``.

    Returns:
        dict mapping each row's ``id`` to
        ``{'query', 'hits', 'category', 'answer'}``; every hit carries
        ``doc_id``, ``score_bin``, ``score_cont``, ``doc`` and ``lang``.
    """
    # Fall back to the notebook-level `docs` list so existing calls keep working.
    doc_store = docs if documents is None else documents
    num_candidates = min(10 * top_k, index.ntotal)

    results = dict()

    for i in range(len(dataframe)):
        entry = dataframe.iloc[i]

        query = entry[question_col]
        # The binary index expects a 2-D uint8 array of packed bits.
        query_embedding = np.asarray(entry['emb_ubinary'], dtype=np.uint8).reshape(1, -1)
        query_emb_float = np.asarray(entry['emb']).reshape(1, -1)

        # Phase I: over-fetch candidates with the binary query.
        hits_scores, hits_doc_ids = index.search(query_embedding, k=num_candidates)

        # Store native Python numbers so the results stay JSON-serialisable.
        hits = [
            {'doc_id': doc_id.item(), 'score_bin': score_bin.item()}
            for doc_id, score_bin in zip(hits_doc_ids[0], hits_scores[0])
        ]

        # Phase II: re-score with the float query against -1/+1 document vectors.
        binary_doc_emb = np.asarray([index.reconstruct(hit['doc_id']) for hit in hits])
        doc_signs = 2 * np.unpackbits(binary_doc_emb, axis=-1).astype("int") - 1

        scores_cont = query_emb_float[0] @ doc_signs.T
        for hit, score in zip(hits, scores_cont):
            hit['score_cont'] = float(score)

        # Best re-scored candidates first.
        hits.sort(key=lambda x: x['score_cont'], reverse=True)
        top_hits = hits[:top_k]

        for hit in top_hits:
            hit['doc'] = doc_store[hit['doc_id']]
            hit['lang'] = lang

        results[entry['id']] = {
            'query': query,
            'hits': top_hits,
            # Read the category from the row itself. The previous code used a
            # global left over from an earlier loop, so every query received
            # the same (last) subject.
            'category': entry[category_col],
            'answer': entry['answer'],
        }
    return results


In [None]:
def update_results(current_results, results, lang, top_k=3):
    """Merge the hits of one search pass into the accumulated results.

    Args:
        current_results: Mapping ``query id -> {'query', 'hits', ...}``.
        results: Accumulator keyed by language, then by query id. It is
            updated in place.
        lang: Key of the bucket in ``results`` to update.
        top_k: Number of hits kept per query after a merge.

    Returns:
        The same ``results`` object that was passed in.
    """
    bucket = results[lang]

    for key, value in current_results.items():
        if key not in bucket:
            # Store a copy with its own hit list. Storing `value` itself would
            # make every accumulator that receives the same search output
            # share one entry, so a later merge into one would silently
            # change the others.
            bucket[key] = {**value, 'hits': list(value['hits'])}
            continue

        merged = list(bucket[key]['hits'])
        # De-duplicate on object identity, not on `doc_id`. A `doc_id` is only
        # a position inside the index that was searched, so different
        # documents from other batches or languages can share the same value.
        already_present = {id(hit) for hit in merged}
        merged.extend(hit for hit in value['hits'] if id(hit) not in already_present)

        # Only keep the `top_k` hits with the highest re-scored value.
        merged.sort(key=lambda x: x['score_cont'], reverse=True)
        bucket[key]['hits'] = merged[:top_k]
    return results

In [None]:
from datasets import load_dataset
import numpy as np
import faiss

langs = ["en", "de", "ru"]
top_k = 3

num_dim = 1024      # dimension passed to the FAISS binary index
max_docs = 200000   # limit results: documents read per language
batch_size = 50000  # Process in batches

# Create FAISS index (rebuilt for every batch below)
index = faiss.IndexBinaryFlat(num_dim)

# Question sets, keyed by the language the questions are written in.
query_frames = {"en": df_GMMLU_en, "de": df_GMMLU_de, "ru": df_GMMLU_ru}

results_crosslingual = {query_lang: {} for query_lang in query_frames}
results_multilingual = {query_lang: {} for query_lang in query_frames}


def build_batch_index(embeddings):
    """Return a fresh binary index holding exactly the given packed embeddings."""
    batch_index = faiss.IndexBinaryFlat(num_dim)
    batch_index.add(np.asarray(embeddings, dtype='uint8'))
    return batch_index


def search_current_batch(batch_index, doc_lang):
    """Search every question set on ``batch_index`` and merge the hits.

    All question languages feed ``results_crosslingual``. Only the question
    set written in ``doc_lang`` also feeds ``results_multilingual``.
    """
    for query_lang, frame in query_frames.items():
        batch_results = semantic_search(frame, batch_index, doc_lang, top_k=top_k)
        if query_lang == doc_lang:
            update_results(batch_results, results_multilingual, query_lang)
        update_results(batch_results, results_crosslingual, query_lang)


for lang in langs:
    print(f"lang: {lang}")
    amount_of_batches = max_docs // batch_size
    batch_nr = 1
    print(f"Amount of batches: {amount_of_batches}")

    docs_stream = load_dataset(
        "Cohere/wikipedia-2023-11-embed-multilingual-v3-int8-binary",
        lang,
        split="train",
        streaming=True,
    )

    # `docs` stays a notebook-level list: it maps a hit's index position back
    # to its document and must always match the contents of `index`.
    docs = []
    doc_embeddings = []

    for i, doc in enumerate(docs_stream):
        if i >= max_docs:
            break

        docs.append(doc)
        doc_embeddings.append(doc['emb_ubinary'])

        # Process in batches
        if len(docs) == batch_size:
            index = build_batch_index(doc_embeddings)
            search_current_batch(index, lang)

            # Clear memory
            docs = []
            doc_embeddings = []
            print(f"Batch {batch_nr}/{amount_of_batches} processed.")
            batch_nr += 1

    # Add remaining documents. The index is rebuilt here as well, so a partial
    # batch can never leak into the first batch of the next language.
    if doc_embeddings:
        index = build_batch_index(doc_embeddings)
        search_current_batch(index, lang)
        docs = []
        doc_embeddings = []

lang: en
Amount of batches: 4


Resolving data files:   0%|          | 0/415 [00:00<?, ?it/s]

Batch 1/4 processed.
Batch 2/4 processed.
Batch 3/4 processed.
Batch 4/4 processed.
lang: de
Amount of batches: 4


Resolving data files:   0%|          | 0/208 [00:00<?, ?it/s]

Batch 1/4 processed.
Batch 2/4 processed.
Batch 3/4 processed.
Batch 4/4 processed.
lang: ru
Amount of batches: 4


Resolving data files:   0%|          | 0/138 [00:00<?, ?it/s]

Batch 1/4 processed.
Batch 2/4 processed.
Batch 3/4 processed.
Batch 4/4 processed.


In [None]:
# Preview: for each query language, the first three queries and their stored hits.
for lang, per_query in results_crosslingual.items():
    print(lang)
    for key in list(per_query)[:3]:
        record = per_query[key]
        print(key)
        print(record['query'])
        for hit in record['hits'][:3]:
            source = hit['doc']
            print(hit['doc_id'], source['title'], source['url'])

en
abstract_algebra/test/0
Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.
49550 Степень трансцендентности https://ru.wikipedia.org/wiki/%D0%A1%D1%82%D0%B5%D0%BF%D0%B5%D0%BD%D1%8C%20%D1%82%D1%80%D0%B0%D0%BD%D1%81%D1%86%D0%B5%D0%BD%D0%B4%D0%B5%D0%BD%D1%82%D0%BD%D0%BE%D1%81%D1%82%D0%B8
10788 Hofstadter sequence https://en.wikipedia.org/wiki/Hofstadter%20sequence
37105 Estrin's scheme https://en.wikipedia.org/wiki/Estrin%27s%20scheme
abstract_algebra/test/1
Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the index of <p> in S_5.
7497 15-Satz https://de.wikipedia.org/wiki/15-Satz
7610 Super-prime https://en.wikipedia.org/wiki/Super-prime
38972 Smith–Minkowski–Siegel mass formula https://en.wikipedia.org/wiki/Smith%E2%80%93Minkowski%E2%80%93Siegel%20mass%20formula
abstract_algebra/test/2
Find all zeros in the indicated finite field of the given polynomial with coefficients in that field. x^5 + 3x^3 + x^2 + 2x in Z_5
32195 Zeiger (C) https://de.wikipedia.org/wiki

In [None]:
# Preview: for each language, the first two queries with their first two hits;
# the query id is printed after the hits.
for lang, per_query in results_multilingual.items():
    print(lang)
    for key in list(per_query)[:2]:
        record = per_query[key]
        print(record['query'])
        for hit in record['hits'][:2]:
            source = hit['doc']
            print(source['title'], source['url'])
        print(key)

en
Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.
Степень трансцендентности https://ru.wikipedia.org/wiki/%D0%A1%D1%82%D0%B5%D0%BF%D0%B5%D0%BD%D1%8C%20%D1%82%D1%80%D0%B0%D0%BD%D1%81%D1%86%D0%B5%D0%BD%D0%B4%D0%B5%D0%BD%D1%82%D0%BD%D0%BE%D1%81%D1%82%D0%B8
Hofstadter sequence https://en.wikipedia.org/wiki/Hofstadter%20sequence
abstract_algebra/test/0
Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the index of <p> in S_5.
15-Satz https://de.wikipedia.org/wiki/15-Satz
Super-prime https://en.wikipedia.org/wiki/Super-prime
abstract_algebra/test/1
de
Ermitteln Sie den Grad für die gegebene Felderweiterung Q(sqrt(2), sqrt(3), sqrt(18)) über Q.
Polynom vierten Grades https://de.wikipedia.org/wiki/Polynom%20vierten%20Grades
Algebraische K-Theorie https://de.wikipedia.org/wiki/Algebraische%20K-Theorie
abstract_algebra/test/0
Es ist p = (1, 2, 5, 4)(2, 3) in S_5 . Ermitteln Sie den Index von <p> in S_5.
15-Satz https://de.wikipedia.org/wiki/15-Satz
Pentapeptide https:

Write the results to JSON files:

In [None]:
import json

def convert_numpy_int_to_python_int(obj):
    """Recursively replace NumPy values with JSON-serialisable Python builtins.

    Dicts and lists are rebuilt with their contents converted. Any NumPy scalar
    (all integer widths, floats, bools) becomes the matching Python scalar and
    NumPy arrays become nested lists. Everything else is returned unchanged.

    Args:
        obj: Arbitrarily nested structure of dicts, lists and leaf values.

    Returns:
        A structure of the same shape whose NumPy leaves are Python builtins.
    """
    if isinstance(obj, dict):
        return {k: convert_numpy_int_to_python_int(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_int_to_python_int(v) for v in obj]
    if isinstance(obj, np.ndarray):
        # tolist() already yields Python scalars at every nesting level.
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar type, not only int32/int64.
        return obj.item()
    return obj

# Replace NumPy integers with plain Python ints so json can serialise the results.
results_crosslingual = convert_numpy_int_to_python_int(results_crosslingual)
results_multilingual = convert_numpy_int_to_python_int(results_multilingual)

# One output file per result set.
for path, payload in (
    ('results_dpr_binary_crosslingual.json', results_crosslingual),
    ('results_dpr_binary_multilingual.json', results_multilingual),
):
    with open(path, 'w') as f:
        json.dump(payload, f)