#### **Semantic Search using OpenAI engines**

This notebook benchmarks OpenAI's GPT semantic search endpoints on BEIR. Note that this is not the same as their embedding endpoint. Beware of the costs!

##### Setup

In [1]:
import os
# Placeholder only: put your own key here locally and never commit a real key.
# The `!curl` cells in this notebook read $OPENAI_API_KEY from this environment variable.
os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY"

In [2]:
!pip install openai==0.10.5
import openai

Collecting openai==0.10.5
  Downloading openai-0.10.5.tar.gz (157 kB)
[?25l[K     |██                              | 10 kB 21.9 MB/s eta 0:00:01[K     |████▏                           | 20 kB 12.8 MB/s eta 0:00:01[K     |██████▎                         | 30 kB 9.4 MB/s eta 0:00:01[K     |████████▍                       | 40 kB 8.3 MB/s eta 0:00:01[K     |██████████▍                     | 51 kB 5.2 MB/s eta 0:00:01[K     |████████████▌                   | 61 kB 5.3 MB/s eta 0:00:01[K     |██████████████▋                 | 71 kB 5.5 MB/s eta 0:00:01[K     |████████████████▊               | 81 kB 6.0 MB/s eta 0:00:01[K     |██████████████████▊             | 92 kB 4.8 MB/s eta 0:00:01[K     |████████████████████▉           | 102 kB 5.2 MB/s eta 0:00:01[K     |███████████████████████         | 112 kB 5.2 MB/s eta 0:00:01[K     |█████████████████████████       | 122 kB 5.2 MB/s eta 0:00:01[K     |███████████████████████████     | 133 kB 5.2 MB/s eta 0:00:01[K    

In [None]:
### List all files currently uploaded (authenticates with $OPENAI_API_KEY from the environment)
!curl https://api.openai.com/v1/files \
  -H "Authorization: Bearer $OPENAI_API_KEY"

{
  "object": "list",
  "data": [
    {
      "id": "file-Im4EQgqO6g91nYIxGMNxYUpE",
      "object": "file",
      "bytes": 247596,
      "created_at": 1620820316,
      "filename": "qa_texts.jsonl",
      "purpose": "answers",
      "status": "processed",
      "status_details": null
    },
    {
      "id": "file-vjC0Tlr6VW2gIVtP5DF5nnX0",
      "object": "file",
      "bytes": 32545,
      "created_at": 1620822168,
      "filename": "qa_texts_small.jsonl",
      "purpose": "answers",
      "status": "processed",
      "status_details": null
    },
    {
      "id": "file-OBIXdgtbe8JpHF4iVJkHNVMw",
      "object": "file",
      "bytes": 113150,
      "created_at": 1621957406,
      "filename": "qa_texts_bayer.jsonl",
      "purpose": "answers",
      "status": "processed",
      "status_details": null
    },
    {
      "id": "file-4TIYOBuUsclau6y2QZLOnioy",
      "object": "file",
      "bytes": 2701,
      "created_at": 1632223844,
      "filename": "compiled_results.csv",
      "p

##### Mock test

In [None]:
%%writefile mock.jsonl
{"text": "puppy A is happy", "metadata": "emotional state of puppy A"}
{"text": "puppy B is sad", "metadata": "emotional state of puppy B"}

Writing mock.jsonl


In [None]:
# Upload mock.jsonl with purpose "search"; the "id" in the JSON response is the
# file id that the search calls reference.
!curl https://api.openai.com/v1/files \
 -H "Authorization: Bearer $OPENAI_API_KEY" \
 -F purpose="search" \
 -F file="@mock.jsonl"

{
  "id": "file-Eepqe7o43OpTJhkqWvsmq1Az",
  "object": "file",
  "bytes": 139,
  "created_at": 1637666709,
  "filename": "mock.jsonl",
  "purpose": "search",
  "status": "uploaded",
  "status_details": null
}


In [None]:
# Query the uploaded mock file through the `ada` engine's search endpoint
!curl  https://api.openai.com/v1/engines/ada/search \
  -X POST \
  -H "Authorization: Bearer $OPENAI_API_KEY" \
  -H 'Content-Type: application/json' \
  -d '{"file": "file-Eepqe7o43OpTJhkqWvsmq1Az", "query": "happy", "search_model": "ada", "max_rerank": 5}'
# Works; the query cost < $0.01

{
  "object": "list",
  "data": [
    {
      "object": "search_result",
      "document": 0,
      "score": 584.489,
      "text": "puppy A is happy"
    }
  ],
  "model": "ada:2020-05-03"
}


In [None]:
import openai
# Same search as the curl call, issued through the Python client.
# return_metadata=True is requested so each hit also carries the "metadata"
# field that was stored alongside the text in the uploaded JSONL.
out = openai.Engine("ada").search(
    search_model="ada", 
    query="happy", 
    max_rerank=5,
    file="file-Eepqe7o43OpTJhkqWvsmq1Az",
    return_metadata=True,
)

In [None]:
# Inspect the response object: first its available attributes, then its JSON payload
print(dir(out))
print(out)

['ReprJSONEncoder', '__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_last_response', '_previous', '_retrieve_params', '_transient_values', '_unsaved_values', 'api_base', 'api_base_override', 'api_key', 'api_version', 'clear', 'construct_from', 'copy', 'engine', 'fromkeys', 'get', 'items', 'keys', 'last_response', 'openai_id', 'organization', 'pop', 'popitem', 'refresh_from', 'request', 'serialize', 'setdefault', 'to_dict', 'to_dict_recursive', 'update', 'values']
{
  "data": [
    {
      "document": 0,
      "metadata": "emotional state of puppy A",
   

In [None]:
# Clean up: remove the mock file from the account again
openai.File("file-Eepqe7o43OpTJhkqWvsmq1Az").delete()

<File file id=file-Eepqe7o43OpTJhkqWvsmq1Az at 0x7f84c7ce58f0> JSON: {
  "deleted": true,
  "id": "file-Eepqe7o43OpTJhkqWvsmq1Az",
  "object": "file"
}

##### BEIR

In [None]:
!pip install -q beir
!pip install -q jsonlines
!pip install -q transformers

[K     |████████████████████████████████| 52 kB 1.7 MB/s 
[K     |████████████████████████████████| 78 kB 6.8 MB/s 
[K     |████████████████████████████████| 8.5 MB 47.2 MB/s 
[K     |████████████████████████████████| 385 kB 52.3 MB/s 
[K     |████████████████████████████████| 4.9 MB 46.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 52.1 MB/s 
[K     |████████████████████████████████| 3.3 MB 42.0 MB/s 
[K     |████████████████████████████████| 1.2 MB 37.2 MB/s 
[K     |████████████████████████████████| 61 kB 547 kB/s 
[K     |████████████████████████████████| 895 kB 47.9 MB/s 
[K     |████████████████████████████████| 596 kB 37.0 MB/s 
[?25h  Building wheel for beir (setup.py) ... [?25l[?25hdone
  Building wheel for pytrec-eval (setup.py) ... [?25l[?25hdone
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [None]:
from beir import util, LoggingHandler
import logging
# Code to print debug information to stdout
# INFO level with a timestamp prefix; messages are routed through BEIR's LoggingHandler.
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

  from tqdm.autonotebook import tqdm


In [None]:
# For non-public datasets
!mkdir /root/.kaggle
!cp /content/kaggle.json /root/.kaggle/

#!kaggle datasets download -d trecnews
#!unzip ./trecnews.zip -d datasets

#!kaggle datasets download -d signal1m
#!unzip ./signal1m.zip -d datasets

#!kaggle datasets download -d robust04
#!unzip ./robust04.zip -d datasets

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading signal1m.zip to /content
 94% 144M/154M [00:01<00:00, 102MB/s]
100% 154M/154M [00:01<00:00, 112MB/s]
Archive:  ./signal1m.zip
  inflating: datasets/signal1m/corpus.jsonl  
  inflating: datasets/signal1m/qrels/test.tsv  
  inflating: datasets/signal1m/queries.jsonl  


In [None]:
# Download and unpack the complete cqadupstack archive once; the benchmark cell
# then selects a single sub-forum such as "cqadupstack/tex".
dataset = "cqadupstack"

url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
# Archives are stored and extracted under ./datasets relative to the working directory
out_dir = os.path.join(os.getcwd(), "datasets")
data_path = util.download_and_unzip(url, out_dir)
print("Dataset downloaded here: {}".format(data_path))

2021-12-04 07:33:11 - Downloading cqadupstack.zip ...


/content/datasets/cqadupstack.zip:   0%|          | 0.00/4.98G [00:00<?, ?iB/s]

2021-12-04 07:36:38 - Unzipping cqadupstack.zip ...
Dataset downloaded here: /content/datasets/cqadupstack


In [None]:
import jsonlines
import time

from beir.retrieval.evaluation import EvaluateRetrieval
from beir.datasets.data_loader import GenericDataLoader

from transformers import GPT2TokenizerFast

# Benchmark configuration
k_values = [1, 10, 100]  # cut-offs passed to the BEIR evaluator
comp_limit = False # Number of queries to compute; False will compute all
engine = "ada"  # used both as the engine and as the search_model of each search call
max_rerank = 100 # In BEIR 100 documents are reranked for their rerank encoder benchmark
# None -> preprocess and upload the corpus below; set an existing id to reuse an upload
file_id = None #"file-UPRov1rRCcWrnbIAWl0WKJnD" #"file-68hfaSNive2mGiT4ZOc8VY0M"
dataset = "cqadupstack/tex"
download = False  # True -> fetch the dataset archive from the BEIR server first

# Need to rerank at least max(k_values) documents, e.g. min 100 for @100
assert max_rerank >= max(k_values), "Max Rerank is too small for the sample scores to compute"

# Optionally fetch and unpack the dataset archive into ./datasets
if download:
    url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
    out_dir = os.path.join(os.getcwd(), "datasets")
    data_path = util.download_and_unzip(url, out_dir)
    print("Dataset downloaded here: {}".format(data_path))

# Load the dataset into BEIR
# (data_path is always rebuilt here, so the value returned by the download step is not used)
data_path = f"datasets/{dataset}"
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

# Only preprocess + upload when no already-uploaded file id was configured
if file_id is None:
    # https://beta.openai.com/docs/api-reference/searches/create
    # The maximum document length (in tokens) is 2034 minus the number of tokens in the query
    # OpenAI does not apply auto-truncation, hence we manually truncate to the 2034 limit
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    max_query_len = max([len(tokenizer.tokenize(q)) for q in queries.values()])
    # Number of tokens kept for a document that has to be truncated
    max_doc_len = 2034 - max_query_len - 1
    # Local JSONL that is written here and uploaded afterwards
    corpus_upload_path = f'{dataset.replace("/", "")}_corpus.jsonl'

    docs_truncated = 0
    toks_truncated = 0
    total_toks = 0
    with jsonlines.open(f'{data_path}/corpus.jsonl', 'r') as jsonl_in, jsonlines.open(corpus_upload_path, 'w') as jsonl_out:
        for obj in jsonl_in:
            # Encode once; the ids are reused below when the document must be truncated
            toks = tokenizer.encode(obj["text"], add_special_tokens=False)
            token_len = len(toks)
            # OpenAI will fail processing if any line is empty
            # For trec-news 16372 are empty 594977
            if token_len == 0:
                logging.info(f'Skipping empty id: {obj["_id"]}')
                continue
            if token_len + max_query_len > 2034:
                # Truncate to max seq length; the stored text becomes the decoded prefix
                obj["text"] = tokenizer.decode(toks[:max_doc_len])
                docs_truncated += 1
                # Count exactly the tokens that were cut off by the slice above
                toks_truncated += token_len - max_doc_len
            total_toks += token_len
            obj["metadata"] = obj["_id"] # Adapt the dictname so OpenAI returns it in their API
            jsonl_out.write(obj)

    logging.info(f"Truncated {docs_truncated} out of {len(corpus)} documents by {toks_truncated} out of {total_toks}.")

    # Upload the corpus to OpenAI; the context manager closes the handle after the request
    with open(corpus_upload_path) as corpus_fp:
        upload_resp = openai.File.create(file=corpus_fp, purpose="search")
    file_id = upload_resp["id"]

pre_proc_time = time.time()
# OpenAI takes some time to process the file - Probably Generating a fixed embedding for their stage 1 filtering [BM25 Indices possibly]
# Poll only the file that will be searched: waiting on every file of the account
# never terminates if some unrelated file is stuck in a non-"processed" state.
while True:
    status = next((f["status"] for f in openai.File.list()["data"] if f["id"] == file_id), None)
    if status == "processed":
        break
    # NOTE(review): "error" is assumed to be the terminal failure status of the Files API - confirm
    if status is None or status == "error":
        raise RuntimeError(f"File {file_id} cannot be searched (status: {status}).")
    time.sleep(20)
post_proc_time = time.time()
logging.info(f"Took {post_proc_time - pre_proc_time} to process {len(corpus)} docs.")

# Results are collected in BEIR format: {query_id: {doc_id: score}}.
# Every query starts with an empty score dict so failed queries stay present.
query_ids = list(queries.keys())
results = dict((qid, {}) for qid in query_ids)

for i, (q_id, q_val) in enumerate(queries.items(), start=1):

    try:
        out = openai.Engine(engine).search(
            file=file_id,
            query=q_val,
            search_model=engine,
            max_rerank=max_rerank,
            return_metadata=True,
        )
    except Exception as e:
        # e.g. InvalidRequestError: No similar documents were found in file with ID '...'.
        # Best effort: log whatever went wrong and leave this query without predictions.
        # Make sure the error is only "No similar documents were found" ...
        logging.info(f"Error: {e}")
        continue

    # The metadata of each hit holds the BEIR document id written during preprocessing
    scores_for_query = results[q_id]
    for pred in out["data"]:
        scores_for_query[pred["metadata"]] = pred["score"]

    # Optional early stop after `comp_limit` queries
    if comp_limit and i == comp_limit:
        break

logging.info(f"Took {time.time() - post_proc_time} to predict {len(queries)} queries.")

# Score the run with the BEIR evaluator.
#   inputs : qrels   - Dict[str, Dict[str, int]]
#            results - Dict[str, Dict[str, float]]
#            k_values - List[int]
#   returns: Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]

logging.info("Retriever evaluation for k in: {}".format(k_values))

if not comp_limit:
    ndcg, _map, recall, precision = EvaluateRetrieval().evaluate(qrels, results, k_values)
else:
    # Only a subset of the queries was run: evaluate just those that have predictions
    results_sub = {q: hits for q, hits in results.items() if hits}
    qrels_sub = {q: rels for q, rels in qrels.items() if q in results_sub}
    ndcg, _map, recall, precision = EvaluateRetrieval().evaluate(qrels_sub, results_sub, k_values)

# Persist the raw scores, one {query_id: {doc_id: score}} object per line
with jsonlines.open(f'{dataset.replace("/", "")}_{engine}_{max_rerank}.jsonl', 'w') as jsonl_out:
    for done_q_id, doc_scores in results.items():
        jsonl_out.write({done_q_id: doc_scores})

# Delete the file again
# openai.File(file_id).delete()

In [None]:
# Fallback upload: the Python client upload sometimes fails, e.g. with
#   JSONDecodeError: Expecting value: line 1 column 1 (char 0)
#   502 Bad Gateway
#   Out of RAM
# In that case upload the corpus file with curl below, copy the returned "id"
# into `file_id` of the benchmark cell above and rerun that cell.
!curl https://api.openai.com/v1/files \
 -H "Authorization: Bearer $OPENAI_API_KEY" \
 -F purpose="search" \
 -F file="@cqadupstackstats_corpus.jsonl"

{
  "id": "file-McK7Uzvv3ZXFRdwmBdQeyN1U",
  "object": "file",
  "bytes": 46070243,
  "created_at": 1638613890,
  "filename": "cqadupstackstats_corpus.jsonl",
  "purpose": "search",
  "status": "uploaded",
  "status_details": null
}


In [None]:
### IN CASE IT BREAKS DOWN ###
# Re-runs only the queries that have no predictions yet and adds them to `results`.

# Time this resume run on its own clock: `post_proc_time` is only defined when the
# main benchmark cell ran in the current session, and a NameError after the loop
# would stop the evaluation/saving code further down in this cell from running.
resume_start_time = time.time()

for i, (q_id, q_val) in enumerate(queries.items(), start=1):

    # Skip what has already been run (ids missing from a re-loaded `results` count as not run)
    if results.get(q_id):
        continue

    print("Running q: ", q_id)

    try:
        out = openai.Engine(engine).search(
              search_model=engine,
              query=q_val,
              max_rerank=max_rerank,
              return_metadata=True,
              file=file_id)
    # e.g. InvalidRequestError: No similar documents were found in file with ID 'file-VfY1kFEkornMYSUe2UMBEgDh'.Please upload more documents or adjust your query.
    except Exception as e:
        # Best effort: log the failure and leave this query without predictions
        logging.info(f"Error: {e}")
        continue

    # setdefault creates the entry when `results` did not contain this query id yet
    scores_for_query = results.setdefault(q_id, {})
    for pred in out["data"]:
        scores_for_query[pred["metadata"]] = pred["score"]

    if comp_limit and i == comp_limit:
        break

logging.info(f"Took {time.time() - resume_start_time} to predict {len(queries)} queries.")

# Score the (completed) run with the BEIR evaluator.
#   inputs : qrels   - Dict[str, Dict[str, int]]
#            results - Dict[str, Dict[str, float]]
#            k_values - List[int]
#   returns: Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]
logging.info("Retriever evaluation for k in: {}".format(k_values))

if not comp_limit:
    ndcg, _map, recall, precision = EvaluateRetrieval().evaluate(qrels, results, k_values)
else:
    # Only a subset of the queries was run: evaluate just those that have predictions
    results_sub = {q: hits for q, hits in results.items() if hits}
    qrels_sub = {q: rels for q, rels in qrels.items() if q in results_sub}
    ndcg, _map, recall, precision = EvaluateRetrieval().evaluate(qrels_sub, results_sub, k_values)

# Persist the raw scores, one {query_id: {doc_id: score}} object per line
with jsonlines.open(f'{dataset.replace("/", "")}_{engine}_{max_rerank}.jsonl', 'w') as jsonl_out:
    for done_q_id, doc_scores in results.items():
        jsonl_out.write({done_q_id: doc_scores})

# Delete the file again
#openai.File(file_id).delete()

2021-12-04 13:25:53 - Took 6242.274959802628 to predict 2906 queries.
2021-12-04 13:25:53 - Retriever evaluation for k in: [1, 10, 100]
2021-12-04 13:25:53 - 

2021-12-04 13:25:53 - NDCG@1: 0.2168
2021-12-04 13:25:53 - NDCG@10: 0.2507
2021-12-04 13:25:53 - NDCG@100: 0.2723
2021-12-04 13:25:53 - 

2021-12-04 13:25:53 - MAP@1: 0.1790
2021-12-04 13:25:53 - MAP@10: 0.2233
2021-12-04 13:25:53 - MAP@100: 0.2280
2021-12-04 13:25:53 - 

2021-12-04 13:25:53 - Recall@1: 0.1790
2021-12-04 13:25:53 - Recall@10: 0.2974
2021-12-04 13:25:53 - Recall@100: 0.3938
2021-12-04 13:25:53 - 

2021-12-04 13:25:53 - P@1: 0.2168
2021-12-04 13:25:53 - P@10: 0.0401
2021-12-04 13:25:53 - P@100: 0.0055


In [None]:
### OPTIONAL RECONTINUE BY REUPLOADING A STARTED JSONL ###
# Restores the configuration, the dataset and the previously saved `results`
# so that the "IN CASE IT BREAKS DOWN" cell can continue with the missing queries.

# `os` and `util` are used below; import them here so the cell also works in a fresh session
import os
import time

import jsonlines

from beir import util
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.datasets.data_loader import GenericDataLoader

from transformers import GPT2TokenizerFast

k_values = [1, 10, 100]
comp_limit = False # Number of queries to compute; False will compute all
engine = "ada"
max_rerank = 100 # In BEIR 100 documents are reranked for their rerank encoder benchmark
file_id = "file-5zHqi2EHE34e9eFy2pUUACoX"  # id of the corpus upload that is searched
dataset = "quora"
download = True

# Need to rerank at least max(k_values) documents, e.g. min 100 for @100
assert max_rerank >= max(k_values), "Max Rerank is too small for the sample scores to compute"

if download:
    url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
    out_dir = os.path.join(os.getcwd(), "datasets")
    data_path = util.download_and_unzip(url, out_dir)
    print("Dataset downloaded here: {}".format(data_path))


# Every line of the saved file is one {query_id: {doc_id: score}} object;
# merge them in place instead of rebuilding the whole dict for each line.
results = {}
with jsonlines.open("/content/quora_ada_100.jsonl", "r") as jsonl_in:
    for obj in jsonl_in:
        results.update(obj)


# Load the dataset into BEIR
data_path = f"datasets/{dataset}"
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

query_ids = list(queries.keys())

2021-12-04 05:51:39 - Downloading quora.zip ...


/content/datasets/quora.zip:   0%|          | 0.00/15.1M [00:00<?, ?iB/s]

2021-12-04 05:51:43 - Unzipping quora.zip ...
Dataset downloaded here: /content/datasets/quora
2021-12-04 05:51:47 - Loading Corpus...


  0%|          | 0/522931 [00:00<?, ?it/s]

2021-12-04 05:51:50 - Loaded 522931 TEST Documents.
2021-12-04 05:51:50 - Doc Example: {'text': 'What is the step by step guide to invest in share market in india?', 'title': ''}
2021-12-04 05:51:50 - Loading Queries...
2021-12-04 05:51:50 - Loaded 10000 TEST Queries.
2021-12-04 05:51:50 - Query Example: Which question should I ask on Quora?


In [None]:
### Cost Estimates ###

# NOTE: this cell rebinds the notebook-level names `max_rerank`, `dataset` and `queries`
# (`queries` is a plain count here); re-run the benchmark configuration and loading
# cells before issuing further searches.

# General
max_rerank = 100

# Listed engine prices: Davinci $0.0600, Curie $0.0060, Babbage $0.0012, Ada $0.0008
ada_cost, babbage_cost, curie_cost, davinci_cost = 0.0008, 0.0012, 0.0060, 0.0600

dataset = "trecnews"

# Per-dataset statistics: name -> (Doc Len, Query Len, Queries)
dataset_stats = {
    "scifact": (213, 12.37, 300),       # Scifact
    "nfcorpus": (232.26, 3.30, 324),    # NFCorpus
    "treccovid": (160.77, 10.60, 50),   # TREC-COVID
    "fiqa": (132.32, 10.77, 648),       # FIQA
    "touche": (292, 6.55, 49),          # TOUCHE
    "trecnews": (634.79, 11.14, 57),    # TREC-NEWS
}
# A dataset that is missing from the table leaves doc_len / query_len / queries unassigned
if dataset in dataset_stats:
    doc_len, query_len, queries = dataset_stats[dataset]

# Residual factor computed by solving for X
factor = 0.0001423252530542999

# Some precise davinci estimates based on running single samples
# Davinci (FiQA): 1004.4 USD (648*1.55)
# Davinci (NQ): 5000 USD (Likely too big anyways)
# Davinci (HotpotQA): ~10000 USD (Likely too big anyways)
# Davinci (SciFact) 2.34 USD * 300 = 702 USD
# Davinci (TREC-Covid) 1.83 USD * 50 = 91.5 USD (Was actually 100.14 USD)
# Davinci (NFCorpus) 2.16 USD * 323 = 697.68

# Ada (SCIDOCS) 20 USD
# > Davinci (SCIDOCS) 1500 USD

# > Running Top 10 reranking first instead > Will have 10% of the costs above
# SciFact 70 USD + NF 70 USD + FiQA 100 USD



# Custom formula --- Accurate within 25%; swap `ada_cost` for another engine's price as needed
doc_len * query_len * queries * max_rerank * ada_cost * factor

4.589465132590204

In [None]:
!pip install -q transformers

In [None]:
### Tokenization experiments ##
# Sanity check that encode -> decode reproduces the input text, because the
# corpus preprocessing replaces truncated documents with the decoded tokens.
# NOTE: this rebinds the notebook-level names `tokenizer`, `max_query_len`, `obj`, `toks` and `out`.
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
max_query_len = 0

# Deliberately odd sample containing misspellings, punctuation and digits
obj = {"text": "Hello there, what are you upppp to gjiejign!gensiko1! OKAY"}

# Same slice expression as in the corpus preprocessing
toks = tokenizer.encode(obj["text"], add_special_tokens=False)[:2034-max_query_len-1] # 0-indexed
out = tokenizer.decode(toks)

assert obj["text"] == out, "Decoding is lossy."

##### Recompute scores

In [None]:
# Imports + logging setup for the "Recompute scores" section
# (`util` is needed by the dataset download in the last cell).
from beir import util, LoggingHandler
import logging
# Code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

In [None]:
### Download / Upload the jsonl result files ###

!mkdir results
!mv *.jsonl results/
!mv results/*corpus.jsonl ./

In [None]:
# Files are named <dataset>_<engine>_<max_rerank>.jsonl; the recompute cells parse these names
!ls results

cqadupstackandroid_ada_100.jsonl      nfcorpus_ada_100.jsonl
cqadupstackenglish_ada_100.jsonl      nfcorpus_davinci_100.jsonl
cqadupstackgaming_ada_100.jsonl       quora_ada_100.jsonl
cqadupstackgis_ada_100.jsonl	      scidocs_ada_100.jsonl
cqadupstackmathematica_ada_100.jsonl  scifact_ada_100.jsonl
cqadupstackphysics_ada_100.jsonl      scifact_ada_10.jsonl
cqadupstackprogrammers_ada_100.jsonl  scifact_davinci_10.jsonl
cqadupstackstats_ada_100.jsonl	      signal1m_ada_100.jsonl
cqadupstacktex_ada_100.jsonl	      trec-covid_ada_100.jsonl
cqadupstackunix_ada_100.jsonl	      trec-covid_ada_10.jsonl
cqadupstackwebmasters_ada_100.jsonl   trec-covid_davinci_100.jsonl
cqadupstackwordpress_ada_100.jsonl    trec-covid_davinci_10.jsonl
fiqa_ada_100.jsonl		      webis-touche2020_ada_100.jsonl


In [None]:
!pip install -q jsonlines

In [None]:
### Cqadupstack ###

import os
import json

import jsonlines

from beir.retrieval.evaluation import EvaluateRetrieval
from beir.datasets.data_loader import GenericDataLoader

base_dir = "results"
out_path = "beir_openai_ndcgs.json"
max_k_options = [1, 3, 5, 10, 100, 1000]

# Nested as ndcgs[engine][max_rerank][dataset] -> NDCG dict; the following cell adds to it
ndcgs = {}

# Per-sub-forum metric dicts, averaged after the loop
cqa_ndcgs, cqa_maps, cqa_recalls, cqa_precisions = [], [], [], []

for dataset_name in os.listdir(base_dir):
    if "cqadupstack" not in dataset_name:
        continue
    if not dataset_name.endswith(".jsonl"):
        continue

    # File names look like <dataset>_<engine>_<max_rerank>.jsonl
    dataset, engine, max_rerank = dataset_name.split("_")
    max_rerank, _ = max_rerank.split(".")

    # "cqadupstacktex" -> "cqadupstack/tex" (the folder layout of the unpacked dataset)
    sub_name = dataset.split("cqadupstack")[-1]
    dataset = f"cqadupstack/{sub_name}"

    # Read jsonlines into single dictionary (merged in place, one object per line)
    results = {}
    with jsonlines.open(os.path.join(base_dir, dataset_name), "r") as jsonl_in:
        for obj in jsonl_in:
            # Only load when a prediction has been made - This skips around 10 examples of NFCorpus where OA errors out
            # NOTE(review): a line written by the benchmark cell is {query_id: {...}}, so
            # list(obj.values()) is [{}] (truthy) even for a query without predictions and
            # such queries are kept. Confirm the intent before tightening this check: it
            # changes the scores and the length assertion below.
            if list(obj.values()):
                results.update(obj)

    # Load the dataset into BEIR
    data_path = f"datasets/{dataset}"
    corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

    assert len(results.keys()) == len(qrels), "Results and dataset do not match"

    # Compute scores until the max rerank performed - Generally [1, 10, 100]
    # (raises ValueError if max_rerank is not one of max_k_options)
    k_values = max_k_options[:max_k_options.index(int(max_rerank))+1]

    ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, results, k_values)

    cqa_ndcgs.append(ndcg)
    cqa_maps.append(_map)
    cqa_recalls.append(recall)
    cqa_precisions.append(precision)

assert cqa_ndcgs, f"No cqadupstack result files found in {base_dir!r}"


def _mean_over_subforums(per_dataset_scores):
    """Average each metric key over all sub-forum score dicts (unweighted mean)."""
    return {
        k: sum([score[k] for score in per_dataset_scores]) / len(per_dataset_scores)
        for k in per_dataset_scores[-1]
    }


# Build the averages as new dicts so the per-sub-forum entries stay untouched
ndcg = _mean_over_subforums(cqa_ndcgs)
_map = _mean_over_subforums(cqa_maps)
recall = _mean_over_subforums(cqa_recalls)
precision = _mean_over_subforums(cqa_precisions)

logging.info("CQA Final")
logging.info(f"{ndcg}")
logging.info(f"{_map}")
logging.info(f"{recall}")
logging.info(f"{precision}")


# Store the average under the combined "cqadupstack" key.
# `engine` and `max_rerank` are those of the last file processed, so this assumes
# all cqadupstack result files share one engine / max_rerank setting.
ndcgs.setdefault(engine, {}).setdefault(max_rerank, {})["cqadupstack"] = ndcg

2021-12-04 14:00:29 - Loading Corpus...


  0%|          | 0/48605 [00:00<?, ?it/s]

2021-12-04 14:00:30 - Loaded 48605 TEST Documents.
2021-12-04 14:00:30 - Doc Example: {'text': "In a shortcode context, is there any difference here?               array(             'slideshow' => '',         ),       and               array(             'slideshow' => NULL,         ),       Is there a best practice for that?", 'title': 'What is the difference between Null vs Empty (Zero Length) string?'}
2021-12-04 14:00:30 - Loading Queries...
2021-12-04 14:00:32 - Loaded 541 TEST Queries.
2021-12-04 14:00:32 - Query Example: How to enqueue script or style in a theme's template file?
2021-12-04 14:00:32 - 

2021-12-04 14:00:32 - NDCG@1: 0.2532
2021-12-04 14:00:32 - NDCG@3: 0.2670
2021-12-04 14:00:32 - NDCG@5: 0.2758
2021-12-04 14:00:32 - NDCG@10: 0.2857
2021-12-04 14:00:32 - NDCG@100: 0.3095
2021-12-04 14:00:32 - 

2021-12-04 14:00:32 - MAP@1: 0.2314
2021-12-04 14:00:32 - MAP@3: 0.2558
2021-12-04 14:00:32 - MAP@5: 0.2609
2021-12-04 14:00:32 - MAP@10: 0.2653
2021-12-04 14:00:32 - MAP

  0%|          | 0/68184 [00:00<?, ?it/s]

2021-12-04 14:00:34 - Loaded 68184 TEST Documents.
2021-12-04 14:00:34 - Doc Example: {'text': "I am using a pgfplots stacked bar to display the aggregated energy demand of a houshold and the associated price. When the energy demand exceeds a certain threshold, than a higher price has to be paid. This is visualized by the color red and blue of the bars. The threshold is displayed by the thick red horizontal line. My problem is, that I want this red line to exceed the width of the bar, so that it's width is circa 120 percent of the width of the bar. Is there any possibility to achieve this? Thanks ![enter image description here](http://i.stack.imgur.com/3qeEi.jpg)               \\documentclass[tikz]{standalone}     \\usepackage{pgfplots}     \\pgfplotsset{compat=1.10}     \\begin{document}     \\begin{tikzpicture}     \\begin{axis}[       ymin=0,ymax=4,       samples=3,       enlarge x limits={abs=0.5},       bar width=0.6,       ybar stacked,       legend pos=south east,         every 

  0%|          | 0/40221 [00:00<?, ?it/s]

2021-12-04 14:02:24 - Loaded 40221 TEST Documents.
2021-12-04 14:02:24 - Doc Example: {'text': 'An eponym is one way to eternal (if posthumous) fame. But is there a word meaning an eponym someone would sooner not have? (One would presume that Captain Charles _Boycott_ , Mr Justice _Lynch_ , and Patrick _Hooligan_ would not appreciate their undying notoriety.)', 'title': 'Is there a word meaning "an unwanted eponym"?'}
2021-12-04 14:02:24 - Loading Queries...
2021-12-04 14:02:41 - Loaded 1570 TEST Queries.
2021-12-04 14:02:41 - Query Example: Is "a wide range of features" singular or plural?
2021-12-04 14:02:41 - 

2021-12-04 14:02:41 - NDCG@1: 0.3420
2021-12-04 14:02:41 - NDCG@3: 0.3403
2021-12-04 14:02:41 - NDCG@5: 0.3481
2021-12-04 14:02:41 - NDCG@10: 0.3545
2021-12-04 14:02:41 - NDCG@100: 0.3673
2021-12-04 14:02:41 - 

2021-12-04 14:02:41 - MAP@1: 0.2652
2021-12-04 14:02:41 - MAP@3: 0.3056
2021-12-04 14:02:41 - MAP@5: 0.3140
2021-12-04 14:02:41 - MAP@10: 0.3189
2021-12-04 14:02:41 -

  0%|          | 0/32176 [00:00<?, ?it/s]

2021-12-04 14:02:42 - Loaded 32176 TEST Documents.
2021-12-04 14:02:42 - Doc Example: {'text': "I am in the midst of writing a web application for work. Everything is from scratch. I have been a PHP programmer for about 13 years, Node.js programmer for the past 2 years, and have no shortage of experience with JavaScript. I love Node.js, and recently rebuilt the company's API in it... So, in planning this web application, the approach I'm considering is, have the Node.js API for getting data from the server, but render everything in the browser. Use AJAX for retrieving data, History API for loading pages, and a MVC-like pattern for the different components. I have read articles detailing twitters rebuild a few years ago. It was more or less a client-side JavaScript app, but a couple years after launching it, they started moving a lot of processing/rendering back to the server, claiming the app improved dramatically in terms of speed. So, my question is as the title asks, is a client-sid

  0%|          | 0/37637 [00:00<?, ?it/s]

2021-12-04 14:02:53 - Loaded 37637 TEST Documents.
2021-12-04 14:02:53 - Doc Example: {'text': "There is a satellite image it's size is 10 GB and I need to display this image using GeoServer and OpenLayers. When user select the Satellite image in the layer switcher need to display image within 10 seconds. I tried geopdf but the image quality loss isn't acceptable to customer. I want to achieve 10 seconds response time using 32 GB satellite image. Please advice me how to achieve this? Thanks in advance.", 'title': 'Satellite image display with the help of GeoServer and OpenLayers'}
2021-12-04 14:02:53 - Loading Queries...
2021-12-04 14:03:03 - Loaded 885 TEST Queries.
2021-12-04 14:03:03 - Query Example: Calculating mean upslope aspect from each cell in DEM using Python?
2021-12-04 14:03:03 - 

2021-12-04 14:03:03 - NDCG@1: 0.2599
2021-12-04 14:03:03 - NDCG@3: 0.2914
2021-12-04 14:03:03 - NDCG@5: 0.3056
2021-12-04 14:03:03 - NDCG@10: 0.3189
2021-12-04 14:03:03 - NDCG@100: 0.3412
2021-12

  0%|          | 0/16705 [00:00<?, ?it/s]

2021-12-04 14:03:04 - Loaded 16705 TEST Documents.
2021-12-04 14:03:04 - Doc Example: {'text': "I'm trying to use `Get` to load some pretty substantial packages from a custom menu in the _Mathematica_ toolbar (added via MenuSetup.tr).   The problem is, the standard 5-second evaluation timeout seems to apply to commands executed with `KernelExecute`, so only a fraction of my `Get` is evaluated before the command times out. I'm wondering whether there's an option that can be passed to `KernelExecute` (or to `Item` / `MenuItem`) that will remove that time constraint so that my command can be executed completely.", 'title': 'Time constraints on KernelExecute commands or MenuItems?'}
2021-12-04 14:03:04 - Loading Queries...
2021-12-04 14:03:13 - Loaded 804 TEST Queries.
2021-12-04 14:03:13 - Query Example: How to use Automorphisms[] on a graph?
2021-12-04 14:03:14 - 

2021-12-04 14:03:14 - NDCG@1: 0.1754
2021-12-04 14:03:14 - NDCG@3: 0.1957
2021-12-04 14:03:14 - NDCG@5: 0.2111
2021-12-04 14

  0%|          | 0/17405 [00:00<?, ?it/s]

2021-12-04 14:03:14 - Loaded 17405 TEST Documents.
2021-12-04 14:03:14 - Doc Example: {'text': 'I\'m making a website for a small hotel in php. The hotel owners want a reservation system that uses paypal. They want people to see a calendar and choose a date to make a reservation. If the day has vacancy, they want the user to request booking a room. This would then require the hotel owner to accept the purchase. I have not worked on a project that has this "request to purchase" method of buying with paypal. Is this possible? Does anyone know of an open php system that handles this?', 'title': 'Hotel Reservation Request Booking Paypal PHP'}
2021-12-04 14:03:14 - Loading Queries...
2021-12-04 14:03:15 - Loaded 506 TEST Queries.
2021-12-04 14:03:15 - Query Example: Someone else is using our Google Analytics Tracking code number. What do we do?
2021-12-04 14:03:15 - 

2021-12-04 14:03:15 - NDCG@1: 0.2826
2021-12-04 14:03:15 - NDCG@3: 0.3065
2021-12-04 14:03:15 - NDCG@5: 0.3178
2021-12-04 14

  0%|          | 0/47382 [00:00<?, ?it/s]

2021-12-04 14:03:16 - Loaded 47382 TEST Documents.
2021-12-04 14:03:16 - Loading Queries...
2021-12-04 14:03:21 - Loaded 1072 TEST Queries.
2021-12-04 14:03:21 - Query Example: Yanked USB Key During Move
2021-12-04 14:03:21 - 

2021-12-04 14:03:21 - NDCG@1: 0.2892
2021-12-04 14:03:21 - NDCG@3: 0.2942
2021-12-04 14:03:21 - NDCG@5: 0.3037
2021-12-04 14:03:21 - NDCG@10: 0.3172
2021-12-04 14:03:21 - NDCG@100: 0.3334
2021-12-04 14:03:21 - 

2021-12-04 14:03:21 - MAP@1: 0.2478
2021-12-04 14:03:21 - MAP@3: 0.2766
2021-12-04 14:03:21 - MAP@5: 0.2831
2021-12-04 14:03:21 - MAP@10: 0.2891
2021-12-04 14:03:21 - MAP@100: 0.2930
2021-12-04 14:03:21 - 

2021-12-04 14:03:21 - Recall@1: 0.2478
2021-12-04 14:03:21 - Recall@3: 0.3021
2021-12-04 14:03:21 - Recall@5: 0.3282
2021-12-04 14:03:21 - Recall@10: 0.3682
2021-12-04 14:03:21 - Recall@100: 0.4338
2021-12-04 14:03:21 - 

2021-12-04 14:03:21 - P@1: 0.2892
2021-12-04 14:03:21 - P@3: 0.1231
2021-12-04 14:03:21 - P@5: 0.0812
2021-12-04 14:03:21 - P@10: 0

  0%|          | 0/22998 [00:00<?, ?it/s]

2021-12-04 14:03:21 - Loaded 22998 TEST Documents.
2021-12-04 14:03:21 - Doc Example: {'text': "I want to send files to android tablet with a application from PC. - I can send files directly to tablet (2.3 android OS) PC see it as a external usb drive. - But i can't send files to tablet (4.2 android OS), because PC see it as a portable media player.(MTP) - How can i fix this problem ? - How can show my device as a external drive? my application that sent files written via Delphi.", 'title': 'How can show android tablet as a external storage to PC?'}
2021-12-04 14:03:21 - Loading Queries...
2021-12-04 14:03:23 - Loaded 699 TEST Queries.
2021-12-04 14:03:23 - Query Example: Android chroot ubuntu - is it possible to get ubuntu to recognise usb devices
2021-12-04 14:03:23 - 

2021-12-04 14:03:23 - NDCG@1: 0.3333
2021-12-04 14:03:23 - NDCG@3: 0.3626
2021-12-04 14:03:23 - NDCG@5: 0.3819
2021-12-04 14:03:23 - NDCG@10: 0.3964
2021-12-04 14:03:23 - NDCG@100: 0.4211
2021-12-04 14:03:23 - 

2021-

  0%|          | 0/42269 [00:00<?, ?it/s]

2021-12-04 14:03:23 - Loaded 42269 TEST Documents.
2021-12-04 14:03:23 - Doc Example: {'text': "I'm a beginner in statistics and R, sorry if this question may seem trivial. I've collected data measuring several different parameters in 40 subjects at two time-points (t1 and t2). There are 3 main parameters in which I'm interested, let's call them ParA, ParB, ParC. ParA is a score of disability. It is on an arbitrary scale (so it is an ordinal scale measure, if my understanding is correct) and values range from 0.0 to 10.0. Note that the increments in this scale are by 0.5 unit, so values like, e.g. 1.5 are possible. I have two measures, at t1 and t2, so I can describe at least three variables from ParA: ParA at t1, ParA at t2, and whether a subject progressed or not (0 or 1). Being a ratio scale measure, I think it would not make much sense to compute a difference (eg. ParA at t2 - ParA at t1), but I'm willing to accept suggestions on this matter. ParB and ParC are meausurements of two 

  0%|          | 0/38316 [00:00<?, ?it/s]

2021-12-04 14:03:27 - Loaded 38316 TEST Documents.
2021-12-04 14:03:27 - Doc Example: {'text': "Let's discuss about $SU(3)$. I understand that the most important representations (relevant to physics) are the defining and the adjoint. In the defining representation of $SU(3)$; namely $\\mathbf{3}$, the Gell-Mann matrices are used to represent the generators $$ \\left[T^{A}\\right]_{ij} = \\dfrac{1}{2}\\lambda^{A}, $$ where $T^A$ are the generators and $\\lambda^A$ the Gell-Mann matrices. In adjoint representation, on the other hand, an $\\mathbf{8}$, the generators are represented by matrices according to $$ \\left[ T_{i} \\right]_{jk} = -if_{ijk}, $$ where $f_{ijk}$ are the structure constants. My question is this, how can one represent the generators in the $\\mathbf{10}$ of $SU(3)$, which corresponds to a symmetric tensor with 3 upper or lower indices (or for that matter how to represent the $\\mathbf{6}$ with two symmetric indices). What is the general procedure to represent the gen

  0%|          | 0/45301 [00:00<?, ?it/s]

2021-12-04 14:03:33 - Loaded 45301 TEST Documents.
2021-12-04 14:03:33 - Doc Example: {'text': 'What\'s your Supreme Commander 2 build order. I don\'t just want "6 mass extractors, 2 power and a factory". List of building and units out to the second or third factory, please.', 'title': 'Supreme Commander 2 - Build Orders'}
2021-12-04 14:03:33 - Loading Queries...
2021-12-04 14:03:52 - Loaded 1595 TEST Queries.
2021-12-04 14:03:52 - Query Example: Can the trophy system protect me against bullets?
2021-12-04 14:03:52 - 

2021-12-04 14:03:52 - NDCG@1: 0.4157
2021-12-04 14:03:52 - NDCG@3: 0.4532
2021-12-04 14:03:52 - NDCG@5: 0.4693
2021-12-04 14:03:52 - NDCG@10: 0.4879
2021-12-04 14:03:52 - NDCG@100: 0.5032
2021-12-04 14:03:52 - 

2021-12-04 14:03:52 - MAP@1: 0.3659
2021-12-04 14:03:52 - MAP@3: 0.4266
2021-12-04 14:03:52 - MAP@5: 0.4384
2021-12-04 14:03:52 - MAP@10: 0.4476
2021-12-04 14:03:52 - MAP@100: 0.4520
2021-12-04 14:03:52 - 

2021-12-04 14:03:52 - Recall@1: 0.3659
2021-12-04 14:03:

In [None]:
import os
import json

import jsonlines

from beir import util
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.datasets.data_loader import GenericDataLoader

# Score every result file found in `base_dir` against the matching BEIR test
# split and write the NDCG values to `out_path`, nested as
# ndcgs[engine][max_rerank][dataset]. File names are parsed as
# "<dataset>_<engine>_<max_rerank>.jsonl".
base_dir = "results"
out_path = "beir_openai_ndcgs.json"
max_k_options = [1, 3, 5, 10, 100, 1000]

# Keep any scores already collected in this kernel session, but make sure the
# cell also runs on a fresh kernel where `ndcgs` has not been created yet.
try:
    ndcgs
except NameError:
    ndcgs = {}

for dataset_name in os.listdir(base_dir):
    if not dataset_name.endswith(".jsonl"):
        continue
    # NOTE(review): cqadupstack result files are skipped here - presumably they
    # are scored separately; confirm.
    if "cqadupstack" in dataset_name:
        continue

    dataset, engine, max_rerank = dataset_name.split("_")
    max_rerank, _ = max_rerank.split(".")

    # Read jsonlines into single dictionary
    results = {}
    with jsonlines.open(os.path.join(base_dir, dataset_name), "r") as jsonl_in:
        for obj in jsonl_in:
            # Only load when a prediction has been made - This skips around 10 examples of NFCorpus where OA errors out
            if list(obj.values()):
                # In-place merge: later lines still override earlier ones, without
                # copying the whole dictionary for every line of the file.
                results.update(obj)

    # Download the dataset on first use only
    data_path = os.path.join("datasets", dataset)
    if not os.path.exists(data_path):
        url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
        out_dir = os.path.join(os.getcwd(), "datasets")
        downloaded_path = util.download_and_unzip(url, out_dir)
        print("Dataset downloaded here: {}".format(downloaded_path))

    # Load the dataset into BEIR
    corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

    assert len(results) == len(qrels), "Results and dataset do not match"

    # Compute scores until the max rerank performed - Generally [1, 10, 100]
    # (raises ValueError if max_rerank is not one of max_k_options)
    k_values = max_k_options[:max_k_options.index(int(max_rerank)) + 1]

    ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, results, k_values)

    # Create the nested levels on demand, then store (or overwrite) this dataset's scores
    ndcgs.setdefault(engine, {}).setdefault(max_rerank, {})[dataset] = ndcg

with open(out_path, 'w') as fp:
    json.dump(ndcgs, fp)

2021-12-04 14:05:01 - Downloading trec-covid.zip ...


/content/datasets/trec-covid.zip:   0%|          | 0.00/70.5M [00:00<?, ?iB/s]

2021-12-04 14:05:10 - Unzipping trec-covid.zip ...
Dataset downloaded here: /content/datasets/trec-covid
2021-12-04 14:05:12 - Loading Corpus...


  0%|          | 0/171332 [00:00<?, ?it/s]

2021-12-04 14:05:14 - Loaded 171332 TEST Documents.
2021-12-04 14:05:14 - Doc Example: {'text': 'OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract 

  0%|          | 0/171332 [00:00<?, ?it/s]

2021-12-04 14:05:17 - Loaded 171332 TEST Documents.
2021-12-04 14:05:17 - Doc Example: {'text': 'OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract 

  0%|          | 0/522931 [00:00<?, ?it/s]

2021-12-04 14:05:25 - Loaded 522931 TEST Documents.
2021-12-04 14:05:25 - Doc Example: {'text': 'What is the step by step guide to invest in share market in india?', 'title': ''}
2021-12-04 14:05:25 - Loading Queries...
2021-12-04 14:05:26 - Loaded 10000 TEST Queries.
2021-12-04 14:05:26 - Query Example: Which question should I ask on Quora?
2021-12-04 14:05:26 - 

2021-12-04 14:05:26 - NDCG@1: 0.6918
2021-12-04 14:05:26 - NDCG@3: 0.7501
2021-12-04 14:05:26 - NDCG@5: 0.7697
2021-12-04 14:05:26 - NDCG@10: 0.7864
2021-12-04 14:05:26 - NDCG@100: 0.8002
2021-12-04 14:05:26 - 

2021-12-04 14:05:26 - MAP@1: 0.6045
2021-12-04 14:05:26 - MAP@3: 0.7085
2021-12-04 14:05:26 - MAP@5: 0.7264
2021-12-04 14:05:26 - MAP@10: 0.7374
2021-12-04 14:05:26 - MAP@100: 0.7431
2021-12-04 14:05:26 - 

2021-12-04 14:05:26 - Recall@1: 0.6045
2021-12-04 14:05:26 - Recall@3: 0.7846
2021-12-04 14:05:26 - Recall@5: 0.8365
2021-12-04 14:05:26 - Recall@10: 0.8842
2021-12-04 14:05:26 - Recall@100: 0.9345
2021-12-04 14:0

/content/datasets/fiqa.zip:   0%|          | 0.00/17.1M [00:00<?, ?iB/s]

2021-12-04 14:05:32 - Unzipping fiqa.zip ...
Dataset downloaded here: /content/datasets/fiqa
2021-12-04 14:05:33 - Loading Corpus...


  0%|          | 0/57638 [00:00<?, ?it/s]

2021-12-04 14:05:33 - Loaded 57638 TEST Documents.
2021-12-04 14:05:33 - Doc Example: {'text': "I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that. Training workers is not their job - they're building software. Perhaps educational systems in the U.S. (or their students) should worry a little about getting marketable skills in exchange for their massive investment in education, rather than getting out with thousands in student debt and then complaining that they aren't qualified to do anything.", 'title': ''}
2021-12-04 14:05:33 - Loading Queries...
2021-12-04 14:05:34 - Loaded 648 TEST Queries.
2021-12-04 14:05:34 - Query Example: How to deposit a cheque issued to an associate in my business into my business account?
2021-12-04 14:05:34 - 

2021-12-04 14:05:34 - NDCG@1: 0.3210
2021-12-04 14:05:34 - NDCG@3: 0.2963
2021-12-04 14:05:34 - NDCG@5: 0.3012
2021-12-04 14:05:34 - NDCG@10: 0.3204
2021-12-04 14:05:34 - NDCG@100: 0.3538
202

/content/datasets/scifact.zip:   0%|          | 0.00/2.69M [00:00<?, ?iB/s]

2021-12-04 14:05:37 - Unzipping scifact.zip ...
Dataset downloaded here: /content/datasets/scifact
2021-12-04 14:05:37 - Loading Corpus...


  0%|          | 0/5183 [00:00<?, ?it/s]

2021-12-04 14:05:37 - Loaded 5183 TEST Documents.
2021-12-04 14:05:37 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 vers

/content/datasets/webis-touche2020.zip:   0%|          | 0.00/217M [00:00<?, ?iB/s]

2021-12-04 14:05:48 - Unzipping webis-touche2020.zip ...
Dataset downloaded here: /content/datasets/webis-touche2020
2021-12-04 14:05:55 - Loading Corpus...


  0%|          | 0/382545 [00:00<?, ?it/s]

2021-12-04 14:06:02 - Loaded 382545 TEST Documents.
2021-12-04 14:06:02 - Doc Example: {'text': 'My opponent forfeited every round. None of my arguments were answered. I don’t like the idea of winning by default, but here we are.Tule: it’s good for students to get involved and address big issues like teen pregnancy. You need to be able to answer arguments like mine and not simply prepare for an abstinence-only type of response. You should also be aware that, in the U.S., condoms may be sold to minors in ANY state. A retailer who says it is illegal to sell you them is, frankly, wrong.', 'title': 'Contraceptive Forms for High School Students'}
2021-12-04 14:06:02 - Loading Queries...
2021-12-04 14:06:02 - Loaded 49 TEST Queries.
2021-12-04 14:06:02 - Query Example: Should teachers get tenure?
2021-12-04 14:06:02 - 

2021-12-04 14:06:02 - NDCG@1: 0.3775
2021-12-04 14:06:02 - NDCG@3: 0.3445
2021-12-04 14:06:02 - NDCG@5: 0.3356
2021-12-04 14:06:02 - NDCG@10: 0.3321
2021-12-04 14:06:02 - NDC

  0%|          | 0/2866316 [00:00<?, ?it/s]

2021-12-04 14:06:22 - Loaded 2866316 TEST Documents.
2021-12-04 14:06:22 - Doc Example: {'text': 'This Boston college professor who lives in #NH is on leave after being arrested for child pornography, endangerment:', 'title': ''}
2021-12-04 14:06:22 - Loading Queries...
2021-12-04 14:06:22 - Loaded 97 TEST Queries.
2021-12-04 14:06:22 - Query Example: VIDEO:Good Samaritans Stop Alleged Hit-and-Run Driver in Miami
2021-12-04 14:06:22 - 

2021-12-04 14:06:22 - NDCG@1: 0.4433
2021-12-04 14:06:22 - NDCG@3: 0.3839
2021-12-04 14:06:22 - NDCG@5: 0.3524
2021-12-04 14:06:22 - NDCG@10: 0.3130
2021-12-04 14:06:22 - NDCG@100: 0.3109
2021-12-04 14:06:22 - 

2021-12-04 14:06:22 - MAP@1: 0.0318
2021-12-04 14:06:22 - MAP@3: 0.0615
2021-12-04 14:06:22 - MAP@5: 0.0802
2021-12-04 14:06:22 - MAP@10: 0.1080
2021-12-04 14:06:22 - MAP@100: 0.1585
2021-12-04 14:06:22 - 

2021-12-04 14:06:22 - Recall@1: 0.0318
2021-12-04 14:06:22 - Recall@3: 0.0694
2021-12-04 14:06:22 - Recall@5: 0.1033
2021-12-04 14:06:22 - R

  0%|          | 0/5183 [00:00<?, ?it/s]

2021-12-04 14:06:22 - Loaded 5183 TEST Documents.
2021-12-04 14:06:22 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 vers

  0%|          | 0/171332 [00:00<?, ?it/s]

2021-12-04 14:06:25 - Loaded 171332 TEST Documents.
2021-12-04 14:06:25 - Doc Example: {'text': 'OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract 

/content/datasets/nfcorpus.zip:   0%|          | 0.00/2.34M [00:00<?, ?iB/s]

2021-12-04 14:06:29 - Unzipping nfcorpus.zip ...
Dataset downloaded here: /content/datasets/nfcorpus
2021-12-04 14:06:29 - Loading Corpus...


  0%|          | 0/3633 [00:00<?, ?it/s]

2021-12-04 14:06:29 - Loaded 3633 TEST Documents.
2021-12-04 14:06:29 - Doc Example: {'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years) 6,011 participants die

  0%|          | 0/5183 [00:00<?, ?it/s]

2021-12-04 14:06:29 - Loaded 5183 TEST Documents.
2021-12-04 14:06:29 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 vers

/content/datasets/scidocs.zip:   0%|          | 0.00/136M [00:00<?, ?iB/s]

2021-12-04 14:06:38 - Unzipping scidocs.zip ...
Dataset downloaded here: /content/datasets/scidocs
2021-12-04 14:06:42 - Loading Corpus...


  0%|          | 0/25657 [00:00<?, ?it/s]

2021-12-04 14:06:43 - Loaded 25657 TEST Documents.
2021-12-04 14:06:43 - Doc Example: {'text': 'An evolutionary recurrent network which automates the design of recurrent neural/fuzzy networks using a new evolutionary learning algorithm is proposed in this paper. This new evolutionary learning algorithm is based on a hybrid of genetic algorithm (GA) and particle swarm optimization (PSO), and is thus called HGAPSO. In HGAPSO, individuals in a new generation are created, not only by crossover and mutation operation as in GA, but also by PSO. The concept of elite strategy is adopted in HGAPSO, where the upper-half of the best-performing individuals in a population are regarded as elites. However, instead of being reproduced directly to the next generation, these elites are first enhanced. The group constituted by the elites is regarded as a swarm, and each elite corresponds to a particle within it. In this regard, the elites are enhanced by PSO, an operation which mimics the maturing pheno

  0%|          | 0/171332 [00:00<?, ?it/s]

2021-12-04 14:06:46 - Loaded 171332 TEST Documents.
2021-12-04 14:06:46 - Doc Example: {'text': 'OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract 

  0%|          | 0/3633 [00:00<?, ?it/s]

2021-12-04 14:06:46 - Loaded 3633 TEST Documents.
2021-12-04 14:06:46 - Doc Example: {'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years) 6,011 participants die

##### Embeddings Endpoint Experiments

Experiments on the embeddings endpoint from OpenAI.

Main findings:
- The OpenAI API fails on certain token IDs (e.g. the input `[[126]]` raises an `APIError`), so sending pre-tokenized token IDs instead of text is not a reliable solution
- After I sent too many requests, OpenAI effectively revoked my API access by throttling my rate limit (requests now fail with a `RateLimitError`)

In [None]:
# Replaces the openai==0.10.5 pin installed in Setup with the latest release.
# NOTE(review): presumably needed for the embeddings calls below - confirm, and pin an exact version for reproducibility.
!pip install --upgrade openai

In [3]:
# transformers provides the GPT2TokenizerFast loaded in the next cell.
!pip install -q transformers

[K     |████████████████████████████████| 3.4 MB 5.0 MB/s 
[K     |████████████████████████████████| 67 kB 4.6 MB/s 
[K     |████████████████████████████████| 596 kB 52.1 MB/s 
[K     |████████████████████████████████| 895 kB 54.8 MB/s 
[K     |████████████████████████████████| 3.3 MB 31.5 MB/s 
[?25h

In [4]:
from transformers import GPT2TokenizerFast
# Load the pretrained "gpt2" tokenizer; the cells below use it to turn text into token ids.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [5]:
import openai

In [6]:
# Request embeddings for the same sample sentence twice, sent as GPT-2 token
# ids rather than raw text, and keep the vector returned for the first input.
sample_token_ids = tokenizer.encode("Sample document text goes here")
similarity_engine = openai.Engine(id="babbage-similarity")
response = similarity_engine.embeddings(input=[sample_token_ids, sample_token_ids])
embeddings = response["data"][0]["embedding"]

RateLimitError: ignored

In [None]:
# Compare the ids encode() returns for the sample sentence with and without add_special_tokens=False.
tokenizer.encode("Sample document text goes here") == tokenizer.encode("Sample document text goes here", add_special_tokens=False)

True

In [None]:
TXT = """
;Ð’Ð¾Ð·Ð½Ð¸ÐºÑˆÐ°Ñ\x8f Ð½Ð° Ñ€ÑƒÐ±ÐµÐ¶Ðµ 2019-2020 Ð³Ð³ Ñ\x81Ð½Ð°Ñ‡Ð°Ð»Ð° Ð² Ð³ Ð£Ñ…Ð°Ð½ÑŒ (ÐºÐ¸Ñ‚Ð°Ð¹Ñ\x81ÐºÐ¾Ð¹ Ð¿Ñ€Ð¾Ð²Ð¸Ð½Ñ†Ð¸Ð¸ Ð¥ÑƒÐ±Ñ\x8dÐ¹) Ð¸ Ñ€Ð°Ñ\x81Ð¿Ñ€Ð¾Ñ\x81Ñ‚Ñ€Ð°Ð½Ð¸Ð²ÑˆÐ°Ñ\x8fÑ\x81Ñ\x8f Ð·Ð°Ñ‚ÐµÐ¼ Ð²Ð¾ Ð¼Ð½Ð¾Ð³Ð¸Ðµ Ñ\x81Ñ‚Ñ€Ð°Ð½Ñ‹ Ð¼Ð¸Ñ€Ð° Ñ\x8dÐ¿Ð¸Ð´ÐµÐ¼Ð¸Ñ\x8f COVID-19 (Coronavirus disease 2019), Ñ\x8dÑ‚Ð¸Ð¾Ð»Ð¾Ð³Ð¸Ñ‡ÐµÑ\x81ÐºÐ¸ Ñ\x81Ð²Ñ\x8fÐ·Ð°Ð½Ð½Ð°Ñ\x8f Ñ\x81 Ð²Ð¸Ñ€ÑƒÑ\x81Ð¾Ð¼ Ñ‚Ñ\x8fÐ¶Ñ‘Ð»Ð¾Ð³Ð¾ Ð¾Ñ\x81Ñ‚Ñ€Ð¾Ð³Ð¾ Ñ€ÐµÑ\x81Ð¿Ð¸Ñ€Ð°Ñ‚Ð¾Ñ€Ð½Ð¾Ð³Ð¾ Ñ\x81Ð¸Ð½Ð´Ñ€Ð¾Ð¼Ð° 2â€‘Ð³Ð¾ Ñ‚Ð¸Ð¿Ð° (SARS-CoV-2 â€“ Severe acute respiratory syndrome 2), Ð¿Ð¾Ñ€Ð¾Ð´Ð¸Ð»Ð° Ð½Ð¾Ð²ÑƒÑŽ Ð²Ð¾Ð»Ð½Ñƒ Ð¸Ð½Ñ‚ÐµÑ€ÐµÑ\x81Ð° Ðº ÐºÐ¾Ñ€Ð¾Ð½Ð°Ð²Ð¸Ñ€ÑƒÑ\x81Ð°Ð¼ ÐŸÐµÑ€Ð²Ñ‹Ðµ ÐºÐ¾Ñ€Ð¾Ð½Ð°Ð²Ð¸Ñ€ÑƒÑ\x81Ñ‹ â€“ Ð¿Ñ€ÐµÐ´Ñ\x81Ñ‚Ð°Ð²Ð¸Ñ‚ÐµÐ»Ð¸ Ñ\x81ÐµÐ¼ÐµÐ¹Ñ\x81Ñ‚Ð²Ð° Coronaviridae Ð¸Ð· Ð¾Ñ‚Ñ€Ñ\x8fÐ´Ð° Nidovirales â€“ Ð±Ñ‹Ð»Ð¸ Ð¾Ñ‚ÐºÑ€Ñ‹Ñ‚Ñ‹ ÐµÑ‰Ñ‘ Ð² Ð¿ÐµÑ€Ð²Ð¾Ð¹ Ð¿Ð¾Ð»Ð¾Ð²Ð¸Ð½Ðµ Ð¿Ñ€Ð¾ÑˆÐ»Ð¾Ð³Ð¾ Ð²ÐµÐºÐ° ÐŸÐµÑ€Ð²Ñ‹Ð¹ ÐºÐ¾Ñ€Ð¾Ð½Ð°Ð²Ð¸Ñ€ÑƒÑ\x81 Ñ‡ÐµÐ»Ð¾Ð²ÐµÐºÐ°, HCoV-B814, Ð±Ñ‹Ð» Ð¸Ð·Ð¾Ð»Ð¸Ñ€Ð¾Ð²Ð°Ð½ Ð² 1965 Ð³ Ð¸ Ðº Ð½Ð°Ñ\x81Ñ‚Ð¾Ñ\x8fÑ‰ÐµÐ¼Ñƒ Ð²Ñ€ÐµÐ¼ÐµÐ½ Ð½Ðµ Ñ\x81Ð¾Ñ…Ñ€Ð°Ð½Ð¸Ð»Ñ\x81Ñ\x8f Ð² Ð²Ð¸Ñ€ÑƒÑ\x81Ð¾Ð»Ð¾Ð³Ð¸Ñ‡ÐµÑ\x81ÐºÐ¸Ñ… ÐºÐ¾Ð»Ð»ÐµÐºÑ†Ð¸Ñ\x8fÑ… Ð—Ð° Ð¿Ñ€Ð¾ÑˆÐµÐ´ÑˆÐµÐµ Ð²Ñ€ÐµÐ¼Ñ\x8f Ð¿Ñ€Ð¾Ð¸Ð·Ð¾ÑˆÐ»Ð¾ Ð¼Ð½Ð¾Ð³Ð¾ÐºÑ€Ð°Ñ‚Ð½Ð¾Ðµ Ð½Ð°Ñ\x81Ð»Ð¾ÐµÐ½Ð¸Ðµ ÑƒÑ\x81Ñ‚Ð°Ñ€ÐµÐ²ÑˆÐ¸Ñ… Ð½Ð°Ð·Ð²Ð°Ð½Ð¸Ð¹ Ðš Ð½Ð°Ñ‡Ð°Ð»Ñƒ XXI Ð²ÐµÐºÐ° ÐºÐ¾Ñ€Ð¾Ð½Ð°Ð²Ð¸Ñ€ÑƒÑ\x81Ñ‹ Ð¿Ñ€ÐµÐ´Ñ\x81Ñ‚Ð°Ð²Ð»Ñ\x8fÐ»Ð¸ Ñ\x81ÐµÑ€ÑŒÑ‘Ð·Ð½ÑƒÑŽ Ð²ÐµÑ‚ÐµÑ€Ð¸Ð½Ð°Ñ€Ð½ÑƒÑŽ Ð¿Ñ€Ð¾Ð±Ð»ÐµÐ¼Ñƒ, Ð¾Ð´Ð½Ð°ÐºÐ¾ Ñ\x81Ñ‡Ð¸Ñ‚Ð°Ð»Ð¾Ñ\x81ÑŒ, Ñ‡Ñ‚Ð¾ Ñ\x8dÐ¿Ð¸Ð´ÐµÐ¼Ð¸Ñ‡ÐµÑ\x81ÐºÐ¸Ðµ ÐºÐ¾Ñ€Ð¾Ð½Ð°Ð²Ð¸Ñ€ÑƒÑ\x81Ñ‹ Ð½Ðµ Ð¾Ñ‚Ð½Ð¾Ñ\x81Ñ\x8fÑ‚Ñ\x81Ñ\x8f Ðº Ñ‡Ð¸Ñ\x81Ð»Ñƒ Ð¾Ñ\x81Ð¾Ð±Ð¾ Ð¾Ð¿Ð°Ñ\x81Ð½Ñ‹Ñ… Ð\x9dÐ°ÑƒÑ‡Ð½Ð¾Ð¼Ñƒ Ñ\x81Ð¾Ð¾Ð±Ñ‰ÐµÑ\x81Ñ‚Ð²Ñƒ Ð¿Ñ€Ð¸ÑˆÐ»Ð¾Ñ\x81ÑŒ Ð¿ÐµÑ€ÐµÑ\x81Ð¼Ð°Ñ‚Ñ€Ð¸Ð²Ð°Ñ‚ÑŒ Ñ\x8dÑ‚Ð¸ Ð¿Ñ€ÐµÐ´Ñ\x81Ñ‚Ð°Ð²Ð»ÐµÐ½Ð¸Ñ\x8f Ñ\x81Ð½Ð°Ñ‡Ð°Ð»Ð° Ð² 2002 Ð³ , ÐºÐ¾Ð³Ð´Ð° Ð²Ð¸Ñ€ÑƒÑ\x81 Ñ‚Ñ\x8fÐ¶Ñ‘Ð»Ð¾Ð³Ð¾ Ð¾Ñ\x81Ñ‚Ñ€Ð¾Ð³Ð¾ Ñ€ÐµÑ\x81Ð¿Ð¸Ñ€Ð°Ñ‚Ð¾Ñ€Ð½Ð¾Ð³Ð¾ Ñ\x81Ð¸Ð½Ð´Ñ€Ð¾Ð¼Ð° (SARS-CoV â€“ Severe acute respiratory syndrome-related coronavirus) 
Ð¿Ñ€Ð¾Ð½Ð¸Ðº Ð² Ð¿Ð¾Ð¿ÑƒÐ»Ñ\x8fÑ†Ð¸ÑŽ Ð»ÑŽÐ´ÐµÐ¹ Ð¸Ð· Ð¿Ð¾Ð¿ÑƒÐ»Ñ\x8fÑ†Ð¸Ð¸ Ð»ÐµÑ‚ÑƒÑ‡Ð¸Ñ… Ð¼Ñ‹ÑˆÐµÐ¹ Ð² Ð®Ð³Ð¾-Ð’Ð¾Ñ\x81Ñ‚Ð¾Ñ‡Ð½Ð¾Ð¹ Ð\x90Ð·Ð¸Ð¸, Ð° Ð¿Ð¾Ñ‚Ð¾Ð¼ Ð² 2012 Ð³ â€“ ÐºÐ¾Ð³Ð´Ð° Ð±Ñ‹Ð»Ð¸ Ð¾Ñ‚ÐºÑ€Ñ‹Ñ‚Ñ‹ Ð¿Ñ€Ð¸Ñ€Ð¾Ð´Ð½Ñ‹Ðµ Ð¾Ñ‡Ð°Ð³Ð¸ Ð²Ð¸Ñ€ÑƒÑ\x81Ð° Ð‘Ð»Ð¸Ð¶Ð½ÐµÐ²Ð¾Ñ\x81Ñ‚Ð¾Ñ‡Ð½Ð¾Ð³Ð¾ Ñ€ÐµÑ\x81Ð¿Ð¸Ñ€Ð°Ñ‚Ð¾Ñ€Ð½Ð¾Ð³Ð¾ Ñ\x81Ð¸Ð½Ð´Ñ€Ð¾Ð¼Ð° (MERS-CoV â€“ Middle East respiratory syndrome-related coronavirus) Ð½Ð° Ñ‚ÐµÑ€Ñ€Ð¸Ñ‚Ð¾Ñ€Ð¸Ð¸ Ð\x90Ñ€Ð°Ð²Ð¸Ð¹Ñ\x81ÐºÐ¾Ð³Ð¾ Ð¿Ð¾Ð»ÑƒÐ¾Ñ\x81Ñ‚Ñ€Ð¾Ð²Ð° Ð’ Ñ€ÐµÐ·ÑƒÐ»ÑŒÑ‚Ð°Ñ‚Ðµ Ð°ÐºÑ‚Ð¸Ð²Ð¸Ð·Ð°Ñ†Ð¸Ð¸ Ð¸Ð½Ñ‚ÐµÑ€ÐµÑ\x81Ð° Ðº ÐºÐ¾Ñ€Ð¾Ð½Ð°Ð²Ð¸Ñ€ÑƒÑ Ð°Ð¼, Ð² Ð¿ÐµÑ€Ð²Ñ‹Ðµ Ð´Ð²Ð° Ð´ÐµÑ\x81Ñ\x8fÑ‚Ð¸Ð»ÐµÑ‚Ð¸Ñ\x8f XXI Ð²ÐµÐºÐ° Ð±Ñ‹Ð»Ð¾ Ð¾Ñ‚ÐºÑ€Ñ‹Ñ‚Ð¾ Ð±Ð¾Ð»ÑŒÑˆÐ¾Ðµ ÐºÐ¾Ð»Ð¸Ñ‡ÐµÑ\x81Ñ‚Ð²Ð¾ Ð½Ð¾Ð²Ñ‹Ñ… Ð¿Ñ€ÐµÐ´Ñ\x81Ñ‚Ð°Ð²Ð¸Ñ‚ÐµÐ»ÐµÐ¹ Coronaviridae, Ñ‡Ñ‚Ð¾ Ð¿Ð¾Ñ‚Ñ€ÐµÐ±Ð¾Ð²Ð°Ð»Ð¾ Ð½ÐµÑ\x81ÐºÐ¾Ð»ÑŒÐºÐ¸Ñ… Ñ€ÐµÐ²Ð¸Ð·Ð¸Ð¹ Ñ‚Ð°ÐºÑ\x81Ð¾Ð½Ð¾Ð¼Ð¸Ñ‡ÐµÑ\x81ÐºÐ¾Ð¹ Ñ\x81Ñ‚Ñ€ÑƒÐºÑ‚ÑƒÑ€Ñ‹ Ñ\x8dÑ‚Ð¾Ð³Ð¾ Ñ\x81ÐµÐ¼ÐµÐ¹Ñ\x81Ñ‚Ð²Ð° Ð\x9dÐ°Ñ\x81Ñ‚Ð¾Ñ\x8fÑ‰Ð¸Ð¹ Ð¾Ð±Ð·Ð¾Ñ€ Ð¿Ð¾Ñ\x81Ð²Ñ\x8fÑ‰Ñ‘Ð½ Ð¸Ñ\x81Ñ‚Ð¾Ñ€Ð¸Ð¸ Ð¸Ð·ÑƒÑ‡ÐµÐ½Ð¸Ñ\x8f ÐºÐ¾Ñ€Ð¾Ð½Ð°Ð²Ð¸Ñ€ÑƒÑ\x81Ð¾Ð² Ð¸ Ñ\x81Ð¸Ñ\x81Ñ‚ÐµÐ¼Ðµ Ð¸Ñ… Ñ\x81Ð¾Ð²Ñ€ÐµÐ¼ÐµÐ½Ð½Ð¾Ð¹ ÐºÐ»Ð°Ñ\x81Ñ\x81Ð¸Ñ„Ð¸ÐºÐ°Ñ†Ð¸Ð¸, ÐºÐ¾Ñ‚Ð¾Ñ€Ð°Ñ\x8f Ñ\x81Ð»Ð¾Ð¶Ð¸Ð»Ð°Ñ\x81ÑŒ Ð½Ð° Ð½Ð°Ñ‡Ð°Ð»Ð¾ 2020 Ð³ Ð² Ñ\x81Ð¾Ð¾Ñ‚Ð²ÐµÑ‚Ñ\x81Ñ‚Ð²Ð¸Ðµ Ñ\x81 Ð¿Ð¾Ñ\x81Ð»ÐµÐ´Ð½Ð¸Ð¼Ð¸ Ñ€ÐµÐºÐ¾Ð¼ÐµÐ½Ð´Ð°Ñ†Ð¸Ñ\x8fÐ¼Ð¸ ÐœÐµÐ¶Ð´ÑƒÐ½Ð°Ñ€Ð¾Ð´Ð½Ð¾Ð³Ð¾ ÐšÐ¾Ð¼Ð¸Ñ‚ÐµÑ‚Ð° Ð¿Ð¾ Ñ‚Ð°ÐºÑ\x81Ð¾Ð½Ð¾Ð¼Ð¸Ð¸ Ð²Ð¸Ñ€ÑƒÑ\x81Ð¾Ð²'
"""

# Minimal reproduction of the token-id failure listed under "Main findings":
# the request carries a single input made of just token id 126.
response = openai.Engine(id="babbage-similarity").embeddings(
    input=[[126]]
)
embeddings = response['data'][0]['embedding']

APIError: ignored

In [None]:
# Round-trip check: does decoding the encoded TXT reproduce TXT exactly?
tokenizer.decode(tokenizer.encode(TXT)) == TXT

False