In [1]:
from datasets import load_dataset
import cohere

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face Hub repo id of the dataset. It is used both as the source for
# `load_dataset` and as the `repo_id` target of `push_to_hub` later in this notebook.
DATASET = "LeoZotos/bio_full"

In [3]:
# Load the API credentials from local token files (so they never appear in the
# notebook itself) and build the Cohere client used for embedding.
cohere_api_key = hf_api_key = ""

with open("../tokens/COHERE.txt", "r") as token_file:
    cohere_api_key = token_file.read().strip()  # strip the trailing newline/whitespace

with open("../tokens/HF_TOKEN.txt", "r") as token_file:
    hf_api_key = token_file.read().strip()

co = cohere.Client(cohere_api_key)

In [4]:
# Fetch the "train" split of the dataset from the Hub, authenticating with the HF token.
data = load_dataset(
    DATASET,
    split="train",
    token=hf_api_key,
)

Generating train split: 100%|██████████| 778/778 [00:00<00:00, 16800.19 examples/s]


In [5]:
def get_embeddings(batch):
    """Embed one batch of questions with the Cohere client `co`.

    Args:
        batch: Mapping whose 'Question_With_Options' entry holds the texts to
            embed (a list of strings when called through a batched `map`).

    Returns:
        A dict with a single key, "emb", holding `response.embeddings` as
        returned by `co.embed` (presumably one vector per input text, in the
        same order; confirm against the Cohere Embed API docs).
    """
    questions = batch['Question_With_Options']
    embed_response = co.embed(
        texts=questions,
        model='embed-multilingual-v3.0',
        input_type='search_query',  # it is the query, not the document
    )
    return {"emb": embed_response.embeddings}

# Number of texts sent to Cohere per `get_embeddings` call.
# NOTE(review): 96 presumably matches Cohere's per-request text limit; confirm.
EMBED_BATCH_SIZE = 96

# `map` leaves `data` untouched and returns a new Dataset that carries the
# extra 'emb' column produced by `get_embeddings`.
data_with_embeddings = data.map(get_embeddings, batched=True, batch_size=EMBED_BATCH_SIZE)

# Print the Dataset summary to confirm the 'emb' column is present.
print(data_with_embeddings)

Map: 100%|██████████| 778/778 [00:06<00:00, 119.06 examples/s]

Dataset({
    features: ['id', 'Question', 'Answer_A', 'Answer_B', 'Answer_C', 'Answer_D', 'Answer_E', 'Answer_F', 'Answer_G', 'Answer_H', 'Answer_I', 'Answer_J', 'Answer_Key', 'Answer_Text', 'Question_With_Options', 'Correct_Answer_Rate', 'Answer_A_Rate', 'Answer_B_Rate', 'Answer_C_Rate', 'Answer_D_Rate', 'emb'],
    num_rows: 778
})





In [6]:
# Sanity check: show the embedding vector stored for the first row.
first_example = data_with_embeddings[0]
print(first_example['emb'])

[-0.0002412796, 0.001452446, 0.055267334, -0.031951904, 0.031219482, 0.081970215, 0.0075531006, -0.05645752, -0.041229248, -0.0057640076, 0.030273438, -0.024993896, 0.017333984, -0.010253906, -0.01071167, -0.007709503, 0.020751953, -0.008163452, 0.0009045601, -0.017562866, -0.015914917, 0.0038604736, 0.023239136, -0.04397583, -0.00020360947, 0.1105957, 0.018508911, -0.061523438, 0.08544922, -0.03152466, 0.0066337585, -0.058135986, 0.02935791, -0.00333786, 0.04714966, 0.01058197, 0.021087646, -0.038970947, -0.013008118, 0.050109863, -0.036071777, 0.006504059, 0.003320694, 0.016342163, -0.008659363, -0.012382507, 0.02230835, 0.0015802383, -0.024429321, 0.030593872, -0.020751953, -0.001750946, 0.030715942, -0.023071289, 0.041381836, 0.014793396, -0.029647827, 0.02407837, 0.03805542, -0.012802124, 0.029571533, 0.06921387, 0.0418396, 0.042022705, 0.010803223, 0.0024108887, 0.019973755, 0.0046043396, 0.0023555756, 0.002948761, 0.012420654, 0.023773193, 0.016494751, 0.0051612854, 0.0007162094

In [7]:
# Upload the dataset, now including the 'emb' column, to the same Hub repo it
# was loaded from. The call is kept as the cell's last expression so that the
# returned commit info is displayed.
push_options = dict(
    repo_id=DATASET,
    commit_message="Add Cohere v3 embeddings to 'emb' column",
    token=hf_api_key,
    private=True,
)
data_with_embeddings.push_to_hub(**push_options)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 34.25ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.44s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LeoZotos/bio_full/commit/2e12a72d080c54951ab8fbc6d415d267027bda0a', commit_message="Add Cohere v3 embeddings to 'emb' column", commit_description='', oid='2e12a72d080c54951ab8fbc6d415d267027bda0a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LeoZotos/bio_full', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LeoZotos/bio_full'), pr_revision=None, pr_num=None)