In [1]:
import os
import json
import uuid

from dotenv import dotenv_values
import weaviate
import weaviate.classes as wvc

In [2]:
config = dotenv_values("../.env")

In [3]:
# function that reads json file
def read_json(file):
    with open(file, "r") as f:
        data = json.load(f)
    return data

In [4]:
weaviate_api_key = config["AUTHENTICATION_APIKEY_ALLOWED_KEYS"].split(",")[0]

In [5]:
client = weaviate.connect_to_custom(
    http_host="localhost",
    http_port=8080,
    http_secure=False,
    grpc_host="localhost",
    grpc_port=50051,
    grpc_secure=False,
    auth_credentials=weaviate.auth.AuthApiKey(
        weaviate_api_key
    ),  # Set this environment variable
)

In [6]:
client.collections.delete("Subject")

In [7]:
subjects = client.collections.create(
    "Subject",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
        distance_metric=wvc.config.VectorDistances.COSINE  # select prefered distance metric
    ),
)

In [8]:
with open("subject_data/subject_code_to_name.json") as f:
    subject_code_to_name = json.load(f)

with open("subject_data/subject_to_degrees.json") as f:
    subject_to_degrees = json.load(f)

with open("subject_data/subject_to_majors.json") as f:
    subject_to_majors = json.load(f)

In [9]:
subject_weaviate_objects = []

for uni in os.listdir("./subject_data/embeddings"):
    if not os.path.isdir(f"./subject_data/embeddings/{uni}"):
        continue

    for subject_filename in os.listdir(f"./subject_data/embeddings/{uni}"):
        if not subject_filename.endswith(".json"):
            continue

        subject_code = subject_filename.split(".")[0]
        subject_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{uni}-{subject_code}"))

        subject_embedding = read_json(
            f"./subject_data/embeddings/{uni}/{subject_filename}"
        )

        subject_weaviate_objects.append(
            wvc.data.DataObject(
                uuid=subject_id,
                properties={
                    "subjectCode": subject_code,
                    "name": subject_code_to_name.get(subject_code, subject_code),
                    "university": uni.upper(),
                    "degrees": subject_to_degrees[subject_code],
                    "majors": subject_to_majors[subject_code],
                },
                vector=subject_embedding,
            )
        )

In [10]:
subjects.data.insert_many(subject_weaviate_objects)

BatchObjectReturn(all_responses=[UUID('7afdd291-1e8c-56d1-9b43-f33ccfdd7a69'), UUID('a10d0335-af61-5879-a19f-2fa2171d6599'), UUID('9f165be4-2e25-59cb-b7c6-369e226b4eed'), UUID('2abec3ee-7035-5c45-829f-47bd633089dd'), UUID('cbb83c81-72e5-54b5-8af3-28f49127d295'), UUID('4e3696e5-fdba-5a27-b329-d17a83efe503'), UUID('c4e3f7be-a41c-54e2-8769-9973b4efa962'), UUID('a3264daa-2946-5c4e-9d02-87ef8b545dea'), UUID('788e2104-3625-5155-a83f-e7ba3ea11df6'), UUID('8b36ac41-a9ef-52fa-8489-ae7e244e53e3'), UUID('5d869bb1-8fea-5575-afe8-7ae25c4ccaae'), UUID('7eeb0b29-8f5e-53b2-b2b5-10ae118a8dc4'), UUID('61b62eaa-d0b8-5b39-a230-38e46c7a0e8b'), UUID('e3d9187f-0416-5d2c-a70e-a6a59dc604b8'), UUID('ca441d4a-8513-5175-a727-97416f1bfcfd'), UUID('b296b848-e609-575a-be4b-dbe29e19ad92'), UUID('fd3f3283-ffdc-5e41-9420-f041c8957dfb'), UUID('35a61263-f8cd-5a32-ab77-8bba76be08a4'), UUID('4f378200-975a-5705-84d7-2686956883b9'), UUID('221ef27a-00f8-594e-bc84-5947a1f822d7'), UUID('9d90a970-7743-5112-a332-0c88361e1f31'), U

In [12]:
print(
    subjects.query.fetch_object_by_id(
        "7afdd291-1e8c-56d1-9b43-f33ccfdd7a69", include_vector=True
    ).vector
)

{'default': [0.030688438564538956, -0.0013925213133916259, 0.02559700980782509, -0.017014268785715103, 0.005993809085339308, -0.0746903270483017, -0.09490004181861877, 0.021492144092917442, -0.042022913694381714, -0.01241435669362545, -0.008256797678768635, 0.06625650823116302, -0.0055818078108131886, -0.09669041633605957, -0.02875162661075592, 0.02735813334584236, 0.009058274328708649, -0.061619535088539124, -0.028259504586458206, 0.0017427615821361542, -0.01703542470932007, 0.006391908973455429, -0.05140922963619232, 0.051324158906936646, -0.06361517310142517, -0.052826885133981705, -0.011759343557059765, -0.02733600325882435, -0.02576264925301075, 0.03314208984375, 0.04999025538563728, -0.02504749968647957, 0.019766950979828835, -0.031993523240089417, 0.009822833351790905, -0.0007116939523257315, -0.008562606759369373, -0.041862450540065765, 0.023994864895939827, -0.009662508033216, 0.046809758991003036, 0.04205669090151787, 0.029405293986201286, -0.0007488352130167186, 0.0256856754

In [51]:
client.close()