In [1]:
from opensearchpy import OpenSearch

In [3]:
#Initialize connection to opensearch
host = 'localhost'
port = 9200
auth = ('admin', 'admin') 

client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_auth = auth,
    use_ssl = True,
    verify_certs = False,
    timeout=100
)
#check status
print(client.info())



{'name': 'opensearch-node1', 'cluster_name': 'opensearch-cluster', 'cluster_uuid': 'nWznQVwDRy-Rrj0Ugyk__Q', 'version': {'distribution': 'opensearch', 'number': '2.11.1', 'build_type': 'tar', 'build_hash': '6b1986e964d440be9137eba1413015c31c5a7752', 'build_date': '2023-11-29T21:43:10.135035992Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.10.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'The OpenSearch Project: https://opensearch.org/'}


In [4]:
#create VectorDB index:

index_name = "wmt_financial_e5_512"
index_body = {
    "settings": {
    "index": {
      "knn": True,
      "knn.algo_param.ef_search": 100
    }
  },
  "mappings": {
    "properties": {
        "vector": {
          "type": "knn_vector",
          "dimension": 1024,     #Thats the output dimension of the e5 model
          "method": {
            "name": "hnsw",
            "space_type": "cosinesimil",
            "engine": "nmslib",
            "parameters": {
              "ef_construction": 512,
              "m": 24
            }
          }
        }
    }
  }
}

response = client.indices.create(index_name, body = index_body)
print(response)



{'acknowledged': True, 'shards_acknowledged': True, 'index': 'wmt_financial_e5_512'}


In [5]:
import csv

loaded_list_of_lists = []

# Read the CSV file and load it into a list of lists
with open('../data/embeddings/embeddings_3.csv', 'r') as file:
    reader = csv.reader(file)
    loaded_list_of_lists = [row for row in reader]

In [6]:
for chunk in loaded_list_of_lists:
    chunk[0] = eval(chunk[0])

In [31]:
print(loaded_list_of_lists[0])

[[-0.00016050208068918437, -0.0225758608430624, 0.011367831379175186, -0.021035796031355858, -0.04042990878224373, 0.024891795590519905, -0.06107403710484505, -0.04026772454380989, -0.03388090431690216, 0.012928882613778114, 0.031399328261613846, -0.017383959144353867, 0.04096107929944992, 0.031691115349531174, 0.03828968480229378, 0.025142256170511246, -0.015011095441877842, -0.012760070152580738, 0.0613536611199379, -0.05611129105091095, 0.039570827037096024, -0.045836418867111206, -0.03392574563622475, 0.003746081842109561, 0.0009654692839831114, -0.03735732659697533, -0.004992614034563303, -0.01590718887746334, -0.03253298997879028, 0.012206889688968658, -0.020696306601166725, -0.008061053231358528, 0.04077144339680672, -0.002242524642497301, 0.03209373727440834, 0.013896296732127666, -0.053449537605047226, -0.052630726248025894, 0.03315321356058121, 0.019637294113636017, -0.02626550942659378, 0.04514243081212044, -0.025280628353357315, 0.05744270980358124, 0.02571248821914196, -0.

In [7]:
def divide_list(input_list, n):
    chunk_size = len(input_list) // n
    remainder = len(input_list) % n

    start = 0
    result = []

    for i in range(n):
        end = start + chunk_size + (1 if i < remainder else 0)
        result.append(input_list[start:end])
        start = end

    return result


In [8]:
result = divide_list(loaded_list_of_lists, 20)

In [9]:
print(len(result[10]))

63


In [10]:
print(len(result[0]))
from tqdm import tqdm
data_for_bulk_insert = []

for batch in tqdm(result):
    for chunk in batch:
        data_for_bulk_insert.append({"index": {"_index": index_name, "_id": chunk[3]}})
        data_for_bulk_insert.append({"vector" : chunk[0], "text" : chunk[1], "year": chunk[2]})
    response = client.bulk(data_for_bulk_insert)
    data_for_bulk_insert = []

64


100%|██████████| 20/20 [00:06<00:00,  3.03it/s]
