In [None]:
import csv
import os

def _read_id2name(csv_path):
    """Load a code table into a ``{code: lower-cased name}`` dict.

    The first row is treated as a header and skipped. Rows are parsed with
    ``csv.reader`` so that a quoted name containing commas is kept whole
    instead of being cut at its first comma. Rows with fewer than two
    columns (for example blank lines) are skipped rather than raising.

    Args:
        csv_path: Path of the CSV file; column 0 is the code, column 1 the name.

    Returns:
        dict mapping each code to its lower-cased name.
    """
    id2name = {}
    with open(csv_path, 'r', newline='', encoding='utf-8', errors='replace') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) < 2:
                continue
            id2name[row[0].strip()] = row[1].strip().lower()
    return id2name


ccscm_id2name = _read_id2name('../../resources/CCSCM.csv')
ccsproc_id2name = _read_id2name('../../resources/CCSPROC.csv')

# ATC table: keep only the rows whose 'level' column is the string '3.0'.
atc3_id2name = {}
with open("../../resources/ATC.csv", newline='', encoding='utf-8', errors='replace') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if row['level'] == '3.0':
            atc3_id2name[row['code']] = row['name'].lower()

In [None]:
# Output directory for experiment artifacts; created if it does not exist yet.
path_0 = "/data/pj20/exp_data"
os.makedirs(path_0, exist_ok=True)

# Start with empty code -> embedding maps for the three code systems.
ccscm_id2emb, ccsproc_id2emb, atc3_id2emb = {}, {}, {}

In [None]:
from get_emb import embedding_retriever
import numpy as np
from tqdm import tqdm
import pickle
from concurrent.futures import ThreadPoolExecutor, as_completed

# Thread-pool size used by build_emb_map, i.e. the maximum number of
# embedding_retriever calls that may be in flight at the same time.
MAX_WORKERS = 100  # adjust depending on how hard you want to push the API

In [4]:
def build_emb_map(id2name):
    """Fetch an embedding for every name in ``id2name`` using a thread pool.

    One ``embedding_retriever(term=name)`` call is submitted per entry, with at
    most ``MAX_WORKERS`` threads. Results are collected in completion order
    while a tqdm bar reports progress. An exception raised by any call
    propagates out of this function.

    Args:
        id2name: dict mapping a code to the text that should be embedded.

    Returns:
        dict mapping each code to whatever ``embedding_retriever`` returned
        for its name.
    """
    pending = {}
    results = {}

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Remember which code each future belongs to.
        for code, text in list(id2name.items()):
            pending[pool.submit(embedding_retriever, term=text)] = code

        for done in tqdm(as_completed(pending), total=len(pending)):
            results[pending[done]] = done.result()

    return results


# Retrieve embeddings for each code system in turn (requests within one
# system run concurrently inside build_emb_map).
ccscm_id2emb = build_emb_map(ccscm_id2name)
ccsproc_id2emb = build_emb_map(ccsproc_id2name)
atc3_id2emb = build_emb_map(atc3_id2name)

# Persist every map as its own pickle file under path_0.
emb_outputs = (
    (f"{path_0}/ccscm_id2emb.pkl", ccscm_id2emb),
    (f"{path_0}/ccsproc_id2emb.pkl", ccsproc_id2emb),
    (f"{path_0}/atc3_id2emb.pkl", atc3_id2emb),
)
for out_path, emb_map in emb_outputs:
    with open(out_path, "wb") as f:
        pickle.dump(emb_map, f)


100%|██████████| 285/285 [00:21<00:00, 13.23it/s]
100%|██████████| 231/231 [00:14<00:00, 15.70it/s]
100%|██████████| 269/269 [00:15<00:00, 17.24it/s]


In [5]:
def cosine_similarity(u, v):
    """Return the cosine of the angle between vectors ``u`` and ``v``.

    Computed as ``dot(u, v)`` divided by the product of the two Euclidean
    norms. No special handling is applied when either norm is zero.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    inner = np.dot(u, v)
    return inner / norm_product

In [6]:
import json

# path_1: directory holding the cluster files; path_1_: directory holding the graph files.
path_1 = "/data/pj20/exp_data/ccscm_ccsproc"
path_1_ = "../../graphs/cond_proc/CCSCM_CCSPROC"

ent2id_file = f"{path_1_}/ent2id.json"
ent_emb_file = f"{path_1_}/entity_embedding.pkl"
map_cluster_file = f"{path_1}/clusters_th015.json"
# The path gets its own name so that `map_cluster_inv` only ever refers to the
# loaded mapping. Reusing one name for both left a plain string behind whenever
# an earlier load in this cell failed.
map_cluster_inv_file = f"{path_1}/clusters_inv_th015.json"

with open(ent2id_file, "r") as f:
    ent2id = json.load(f)

# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(ent_emb_file, "rb") as f:
    ent_emb = pickle.load(f)

with open(map_cluster_file, "r") as f:
    map_cluster = json.load(f)

with open(map_cluster_inv_file, "r") as f:
    map_cluster_inv = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: '../../graphs/cond_proc/CCSCM_CCSPROC/ent2id.json'

In [None]:
ccscm_id2clus = {}
ccsproc_id2clus = {}

# Both code systems are matched against the same entity embeddings with the
# same nearest-neighbour search, so one loop handles them instead of a
# copy-pasted block per system.
for id2emb, id2clus in (
    (ccscm_id2emb, ccscm_id2clus),
    (ccsproc_id2emb, ccsproc_id2clus),
):
    for key in tqdm(id2emb.keys()):
        emb = np.array(id2emb[key])
        # Start below any real similarity value. Starting at 0 left max_id as
        # None whenever every similarity was <= 0, which then surfaced as a
        # misleading KeyError on "None".
        max_sim = -np.inf
        max_id = None
        for i in range(ent_emb.shape[0]):
            sim = cosine_similarity(emb, ent_emb[i])
            # NaN similarities compare False here and are therefore skipped.
            if sim > max_sim:
                max_sim = sim
                max_id = i

        if max_id is None:
            raise ValueError(f"no comparable entity embedding found for code {key!r}")

        # map_cluster_inv is keyed by the entity index rendered as a string.
        id2clus[key] = map_cluster_inv[str(max_id)]

with open(f"{path_1}/ccscm_id2clus.json", "w") as f:
    json.dump(ccscm_id2clus, f)

with open(f"{path_1}/ccsproc_id2clus.json", "w") as f:
    json.dump(ccsproc_id2clus, f)


In [None]:
# path_2: directory holding the cluster files; path_2_: directory holding the graph files.
path_2 = "/data/pj20/exp_data/ccscm_ccsproc_atc3"
path_2_ = "../../graphs/cond_proc_drug/CCSCM_CCSPROC_ATC3"

ent2id_file = f"{path_2_}/ent2id.json"
ent_emb_file = f"{path_2_}/entity_embedding.pkl"
map_cluster_file = f"{path_2}/clusters_th015.json"
# The path gets its own name so that `map_cluster_inv` only ever refers to the
# loaded mapping. Reusing one name for both left a plain string behind whenever
# an earlier load in this cell failed.
map_cluster_inv_file = f"{path_2}/clusters_inv_th015.json"

with open(ent2id_file, "r") as f:
    ent2id = json.load(f)

# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(ent_emb_file, "rb") as f:
    ent_emb = pickle.load(f)

with open(map_cluster_file, "r") as f:
    map_cluster = json.load(f)

with open(map_cluster_inv_file, "r") as f:
    map_cluster_inv = json.load(f)

In [None]:
ccscm_id2clus = {}
ccsproc_id2clus = {}
atc3_id2clus = {}

# All three code systems are matched against the same entity embeddings with
# the same nearest-neighbour search, so one loop handles them instead of a
# copy-pasted block per system.
for id2emb, id2clus in (
    (ccscm_id2emb, ccscm_id2clus),
    (ccsproc_id2emb, ccsproc_id2clus),
    (atc3_id2emb, atc3_id2clus),
):
    for key in tqdm(id2emb.keys()):
        emb = np.array(id2emb[key])
        # Start below any real similarity value. Starting at 0 left max_id as
        # None whenever every similarity was <= 0, which then surfaced as a
        # misleading KeyError on "None".
        max_sim = -np.inf
        max_id = None
        for i in range(ent_emb.shape[0]):
            sim = cosine_similarity(emb, ent_emb[i])
            # NaN similarities compare False here and are therefore skipped.
            if sim > max_sim:
                max_sim = sim
                max_id = i

        if max_id is None:
            raise ValueError(f"no comparable entity embedding found for code {key!r}")

        # map_cluster_inv is keyed by the entity index rendered as a string.
        id2clus[key] = map_cluster_inv[str(max_id)]

with open(f"{path_2}/ccscm_id2clus.json", "w") as f:
    json.dump(ccscm_id2clus, f)

with open(f"{path_2}/ccsproc_id2clus.json", "w") as f:
    json.dump(ccsproc_id2clus, f)

with open(f"{path_2}/atc3_id2clus.json", "w") as f:
    json.dump(atc3_id2clus, f)