In [16]:
import numpy as np
import math
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score
import tempfile
import pickle
import math
from collections import OrderedDict

In [17]:
def length_to_distance(length: float) -> float:
    return (2 - 2*math.cos(length))**(1/2)

def distance_to_length(distance: float) -> float:
    return math.acos((2-distance**2)/2)

def score_to_distance(score: float) -> float:
    return similarity_to_distance(score_to_similarity(score))

def distance_to_similarity(distance: float) -> float:
    return (2 - distance ** 2) / 2

In [18]:
# load the 2D ids and embeddings
# create profiles dictionary
profile_ids = list([f"P{i+1}" for i in range(16)])
profile_embeddings = list([[1, 0], [3**(1/2)/2, 1/2], [1/2**(1/2), 1/2**(1/2)], [1/2, 3**(1/2)/2], 
                      [0, 1], [-1/2, 3**(1/2)/2], [-1/2**(1/2), 1/2**(1/2)], [-3**(1/2)/2, 1/2],
                      [-1, 0], [-3**(1/2)/2, -1/2], [-1/2**(1/2), -1/2**(1/2)], [-1/2, -3**(1/2)/2], 
                      [0, -1], [1/2, -3**(1/2)/2], [1/2**(1/2), -1/2**(1/2)], [3**(1/2)/2, -1/2]
                     ])

profiles = {"profile_id": profile_ids, "profile_embedding": profile_embeddings}
print(f"profiles: ")
print(f"{profiles}")
print()

profiles: 
{'profile_id': ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16'], 'profile_embedding': [[1, 0], [0.8660254037844386, 0.5], [0.7071067811865475, 0.7071067811865475], [0.5, 0.8660254037844386], [0, 1], [-0.5, 0.8660254037844386], [-0.7071067811865475, 0.7071067811865475], [-0.8660254037844386, 0.5], [-1, 0], [-0.8660254037844386, -0.5], [-0.7071067811865475, -0.7071067811865475], [-0.5, -0.8660254037844386], [0, -1], [0.5, -0.8660254037844386], [0.7071067811865475, -0.7071067811865475], [0.8660254037844386, -0.5]]}



In [19]:
# create cohorts for QII, QIII, QIV
cohort_ids = [f"C{i+1}" for i in range(3)]
cohort_embeddings = [[1/2**(1/2), 1/2**(1/2)], [-1/2**(1/2), 1/2**(1/2)], [-1/2**(1/2), -1/2**(1/2)]]
cohorts = {"cohort_id": cohort_ids, "cohort_embedding": cohort_embeddings}
print(f"cohorts: ")
print(f"{cohorts}")
print()

cohorts: 
{'cohort_id': ['C1', 'C2', 'C3'], 'cohort_embedding': [[0.7071067811865475, 0.7071067811865475], [-0.7071067811865475, 0.7071067811865475], [-0.7071067811865475, -0.7071067811865475]]}



In [20]:
# create queries: for this example query embeddings = [cos(theta), sin(theta)] for theta = pi/6, 5pi/6, pi/3
query_ids = [f"Q{i+1}" for i in range(3)]
query_embeddings = [[1, 0], [-3**(1/2)/2, 1/2], [1/2, -3**(1/2)/2]]
queries = {"query_id": query_ids, "query_embedding": query_embeddings}
print(f"queries:  ")
print(f"{queries}")
print()

queries:  
{'query_id': ['Q1', 'Q2', 'Q3'], 'query_embedding': [[1, 0], [-0.8660254037844386, 0.5], [0.5, -0.8660254037844386]]}



In [21]:
# set the cohort radius
# for theta = pi/2 -> cohort_radius = sqrt(2), for theta = pi/4 -> cohort_radius = sqrt(2 - sqrt(2)), for theta = pi/3 -> cohort_radius = 1
cohort_radius = 1
query_radius = 1

In [22]:
# cohort assignment algorithm
# tag profiles with cohorts in range, tag cohorts with profiles in range
profile_cohort_ids = []
cohort_profile_ids = {cohort_id: [] for cohort_id in cohort_ids}  # Dictionary to track profiles for each cohort

def multiple_cohort_assignment(cohorts, profiles, cohort_radius):
    cohort_ids = np.array(cohorts["cohort_id"]) 
    cohort_embeddings = np.array(cohorts["cohort_embedding"])
    profile_ids = np.array(profiles["profile_id"]) 
    profile_embeddings = np.array(profiles["profile_embedding"])
    print(f"Assigning {len(profile_ids)} profiles to {len(cohort_ids)} cohorts...")

    knn = NearestNeighbors(algorithm='brute', metric="euclidean", n_jobs=-1)
    knn.fit(cohort_embeddings)
    cohort_distances, cohort_indices = knn.kneighbors(profile_embeddings, n_neighbors=3, return_distance=True)
    
    # Properly initialize cohort_profile_ids dictionary
    cohort_profile_ids = {cohort_id: [] for cohort_id in cohort_ids}
    profile_cohort_ids = []
    
    for profile_idx, (distances, indices) in enumerate(zip(cohort_distances, cohort_indices)):
        assigned_cohort_ids = []
        for distance, cohort_idx in zip(distances, indices):
            if distance <= cohort_radius:
                cohort_id = cohort_ids[cohort_idx]
                assigned_cohort_ids.append(cohort_id)
                cohort_profile_ids[cohort_id].append(profile_ids[profile_idx])  # Track profiles in respective cohorts
        profile_cohort_ids.append(assigned_cohort_ids)

    tagged_profiles = [
        {"profile_id": pid, "profile_embedding": pembed, "cohort_ids": cid}
        for pid, pembed, cid in zip(profile_ids, profile_embeddings, profile_cohort_ids)
    ]

    tagged_cohorts = [
        {"cohort_id": cid, "cohort_embedding": cembed, "profile_ids": cohort_profile_ids[cid]}
        for cid, cembed in zip(cohort_ids, cohort_embeddings)
    ]

    return tagged_profiles, tagged_cohorts

In [23]:
tagged_profiles, tagged_cohorts = multiple_cohort_assignment(cohorts, profiles, cohort_radius)

print(f"tagged_profiles: {tagged_profiles}")
print()
print(f"tagged_cohorts: {tagged_cohorts}")
print()

Assigning 16 profiles to 3 cohorts...
tagged_profiles: [{'profile_id': 'P1', 'profile_embedding': array([1., 0.]), 'cohort_ids': ['C1']}, {'profile_id': 'P2', 'profile_embedding': array([0.8660254, 0.5      ]), 'cohort_ids': ['C1']}, {'profile_id': 'P3', 'profile_embedding': array([0.70710678, 0.70710678]), 'cohort_ids': ['C1']}, {'profile_id': 'P4', 'profile_embedding': array([0.5      , 0.8660254]), 'cohort_ids': ['C1']}, {'profile_id': 'P5', 'profile_embedding': array([0., 1.]), 'cohort_ids': ['C1', 'C2']}, {'profile_id': 'P6', 'profile_embedding': array([-0.5      ,  0.8660254]), 'cohort_ids': ['C2']}, {'profile_id': 'P7', 'profile_embedding': array([-0.70710678,  0.70710678]), 'cohort_ids': ['C2']}, {'profile_id': 'P8', 'profile_embedding': array([-0.8660254,  0.5      ]), 'cohort_ids': ['C2']}, {'profile_id': 'P9', 'profile_embedding': array([-1.,  0.]), 'cohort_ids': ['C3', 'C2']}, {'profile_id': 'P10', 'profile_embedding': array([-0.8660254, -0.5      ]), 'cohort_ids': ['C3']},

In [24]:
# cohort querying algorithms
# brute force for precision and recall measures
def brute_force_search(queries, tagged_profiles, min_cosine_similarity):
    query_embeddings = np.array(queries["query_embedding"])
    profile_embeddings = np.stack([profile["profile_embedding"] for profile in tagged_profiles])    
    cosine_similarities = cosine_similarity(query_embeddings, profile_embeddings)    
    return [len(profile_embeddings)] * len(query_embeddings), (cosine_similarities >= min_cosine_similarity).astype(int)


def cohort_search(queries, tagged_cohorts, query_radius, cohort_radius):
    cohort_ids = [cohort["cohort_id"] for cohort in tagged_cohorts]
    cohort_embeddings = np.stack([cohort["cohort_embedding"] for cohort in tagged_cohorts]) if tagged_cohorts else np.array([])
    query_ids = queries["query_id"]
    query_embeddings = np.array(queries["query_embedding"])
    
    print(f"Querying cohorts...")
    if cohort_embeddings.size > 0:
        cohort_knn = NearestNeighbors(algorithm='brute', metric="euclidean", n_jobs=-1)
        cohort_knn.fit(cohort_embeddings)
        all_cohort_indices = cohort_knn.radius_neighbors(
            query_embeddings,
            radius=length_to_distance(distance_to_length(query_radius) + distance_to_length(cohort_radius)),
            return_distance=False
        )
    else:
        all_cohort_indices = [[] for _ in range(len(query_embeddings))]
    
    print(f"There are {[len(cohort_indices) for cohort_indices in all_cohort_indices]} cohorts matched to each query, respectively.")
    
    return cohort_ids, all_cohort_indices


def collect_profiles_to_query(tagged_profiles, cohort_ids, all_cohort_indices, query_count):
    profiles_to_query = {query_idx: {"profile_id": [], "profile_embedding": []} for query_idx in range(query_count)}

    for query_idx in range(query_count):
        cohort_matched_profiles = OrderedDict()  # Preserves insertion order
        
        for cohort_idx in all_cohort_indices[query_idx]:
            cohort_id = cohort_ids[cohort_idx]
            inner_profiles = [(profile["profile_id"], profile["profile_embedding"]) 
                              for profile in tagged_profiles if cohort_id in profile["cohort_ids"]]
            
            for profile_id, embedding in inner_profiles:
                if profile_id not in cohort_matched_profiles:
                    cohort_matched_profiles[profile_id] = embedding
        
        # Extend the lists while maintaining order
        profiles_to_query[query_idx]["profile_id"].extend(cohort_matched_profiles.keys())
        profiles_to_query[query_idx]["profile_embedding"].extend(cohort_matched_profiles.values())

    # Retrieve uncohorted profiles
    uncohorted_profiles = [(profile["profile_id"], profile["profile_embedding"]) 
                           for profile in tagged_profiles if not profile["cohort_ids"]]

    for query_idx in range(query_count):
        for profile_id, embedding in uncohorted_profiles:
            profiles_to_query[query_idx]["profile_id"].append(profile_id)
            profiles_to_query[query_idx]["profile_embedding"].append(embedding)

    inner_profile_calculation_counts = [len(profiles_to_query[q_idx]['profile_id']) for q_idx in range(query_count)]

    print(f"Profiles collected per query: {inner_profile_calculation_counts}")
    print(f"profiles_to_query: ")
    print(f"{profiles_to_query}")

    return profiles_to_query


def profile_search(queries, tagged_profiles, cohort_ids, all_cohort_indices, query_radius):
    query_ids = queries["query_id"]
    query_embeddings = np.array(queries["query_embedding"])
    
    print(f"Querying profiles...")

    # Collect profiles to query
    profiles_to_query = collect_profiles_to_query(tagged_profiles, cohort_ids, all_cohort_indices, len(query_ids))
    
    # Count of profiles in each cohort per query
    cohort_calculation_counts = [len(cohort_ids) + len(profiles_to_query[q_idx]["profile_id"]) for q_idx in range(len(query_ids))]
    print(f"Cohort calculation counts per query: {cohort_calculation_counts}")
    
    # Initialize the list to hold profile ids in range for each query
    in_range_profile_ids = [[] for _ in range(len(query_ids))]
    
    # Query processing: find profiles in range for each query
    for query_idx in range(len(query_ids)):
        profiles = profiles_to_query[query_idx]
        print(f"profiles_to_query for query_idx {query_idx}: ")
        print(f"{profiles}")
        embeddings = np.array(profiles["profile_embedding"]) if profiles["profile_embedding"] else np.array([])

        if embeddings.size > 0:
            # Perform the nearest neighbor search
            profile_knn = NearestNeighbors(algorithm="brute", metric="euclidean", n_jobs=-1)
            profile_knn.fit(embeddings)
            all_inner_profile_indices = profile_knn.radius_neighbors([query_embeddings[query_idx]], radius=query_radius, return_distance=False)[0]
            print()
            print(f"all_inner_profile_indices for query_idx {query_idx}: ")
            print(all_inner_profile_indices)
            print()
            in_range_profile_ids[query_idx] = [profiles["profile_id"][i] for i in all_inner_profile_indices]
    
    print(f"in_range_profile_ids: ")
    print(in_range_profile_ids)
    print()
    # Map profile_id to index for fast lookup
    profile_id_to_index = {profile["profile_id"]: idx for idx, profile in enumerate(tagged_profiles)}
    
    # Initialize the result array (query x profiles)
    cohort_query_results = np.zeros((len(query_ids), len(tagged_profiles)), dtype=int)

    # For each query, create a set of in-range profile IDs for fast lookup
    for query_idx in range(len(query_ids)):
        matched_profile_ids = in_range_profile_ids[query_idx]
        for profile_idx, profile in enumerate(tagged_profiles):
            # Check if the profile_id is in the matched_profile_ids
            if profile["profile_id"] in matched_profile_ids:
                cohort_query_results[query_idx, profile_idx] = 1
            else:
                cohort_query_results[query_idx, profile_idx] = 0

    # Print out statistics
    print(f"Total Queries Processed: {len(in_range_profile_ids)}, Expected: {len(query_ids)}")
    print(f"Profiles matched per query: {[len(in_range_profile_ids[q_idx]) for q_idx in range(len(query_ids))]}")
    print(f"cohort_query_results shape: {cohort_query_results.shape}")
    
    return cohort_calculation_counts, cohort_query_results

In [25]:
brute_force_calculation_counts, brute_force_query_results = brute_force_search(queries, tagged_profiles, distance_to_similarity(query_radius))

print(f"brute_force_calculation_counts: {brute_force_calculation_counts}")
print()
print(f"brute_force_query_results: ")
print(f"{brute_force_query_results}")

brute_force_calculation_counts: [16, 16, 16]

brute_force_query_results: 
[[1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1]
 [0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1]]


In [26]:
cohort_ids, all_cohort_indices = cohort_search(queries, tagged_cohorts, query_radius, cohort_radius)

print(f"cohort_ids: {cohort_ids}")
print()
print(f"all_cohort_indices: {all_cohort_indices}")

Querying cohorts...
There are [1, 3, 2] cohorts matched to each query, respectively.
cohort_ids: ['C1', 'C2', 'C3']

all_cohort_indices: [array([0]) array([0, 1, 2]) array([0, 2])]


In [27]:
cohort_calculation_counts, cohort_query_results = profile_search(queries, tagged_profiles, cohort_ids, all_cohort_indices, query_radius)

print(f"cohort_calculation_counts: {cohort_calculation_counts}")
print()
print(f"cohort_query_results: ")
print(f"{cohort_query_results}")

Querying profiles...
Profiles collected per query: [8, 16, 13]
profiles_to_query: 
{0: {'profile_id': ['P1', 'P2', 'P3', 'P4', 'P5', 'P14', 'P15', 'P16'], 'profile_embedding': [array([1., 0.]), array([0.8660254, 0.5      ]), array([0.70710678, 0.70710678]), array([0.5      , 0.8660254]), array([0., 1.]), array([ 0.5      , -0.8660254]), array([ 0.70710678, -0.70710678]), array([ 0.8660254, -0.5      ])]}, 1: {'profile_id': ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16'], 'profile_embedding': [array([1., 0.]), array([0.8660254, 0.5      ]), array([0.70710678, 0.70710678]), array([0.5      , 0.8660254]), array([0., 1.]), array([-0.5      ,  0.8660254]), array([-0.70710678,  0.70710678]), array([-0.8660254,  0.5      ]), array([-1.,  0.]), array([-0.8660254, -0.5      ]), array([-0.70710678, -0.70710678]), array([-0.5      , -0.8660254]), array([ 0., -1.]), array([ 0.5      , -0.8660254]), array([ 0.70710678, -0.70710678]), array([ 0

In [28]:
def score_results(y_trues: list[list[int]], y_preds: list[list[int]]):
    y_trues_counts = [len(y_trues[query_idx]) for query_idx in range(len(y_trues))]
    y_preds_counts = [len(y_preds[query_idx]) for query_idx in range(len(y_preds))]
    print(f"Counts of y_trues by query: {y_trues_counts}.")
    print(f"Counts of y_preds by query: {y_preds_counts}.")
    precisions = [precision_score(y_trues[query_idx], y_preds[query_idx]) for query_idx in range(len(y_trues))]
    recalls = [recall_score(y_trues[query_idx], y_preds[query_idx]) for query_idx in range(len(y_trues))]
    return precisions, recalls

In [29]:
precisions, recalls = score_results(brute_force_query_results, cohort_query_results)

print(f"precisions: {precisions}")
print(f"recalls: {recalls}")

Counts of y_trues by query: [16, 16, 16].
Counts of y_preds by query: [16, 16, 16].
precisions: [1.0, 1.0, 1.0]
recalls: [1.0, 1.0, 1.0]


In [15]:
final_results = [{"query_id": query_ids[query_idx],
                      "calculation_count": cohort_calculation_counts[query_idx],
                      "precision": precisions[query_idx],
                      "recall": recalls[query_idx]}
                     for query_idx in range(len(query_ids))]

print(final_results)

[{'query_id': 'Q1', 'calculation_count': 11, 'precision': 1.0, 'recall': 1.0}, {'query_id': 'Q2', 'calculation_count': 19, 'precision': 0.8333333333333334, 'recall': 1.0}, {'query_id': 'Q3', 'calculation_count': 16, 'precision': 0.8333333333333334, 'recall': 1.0}]


In [9]:
# Extract profiles that are assigned to no cohorts
uncohorted_profile_ids = [profile["profile_id"] for profile in tagged_profiles if len(profile["cohort_ids"]) == 0]
uncohorted_profile_embeddings = [profile["profile_embedding"] for profile in tagged_profiles if len(profile["cohort_ids"]) == 0]
uncohorted_profiles = [{
    "profile_id": uncohorted_profile_ids,
    "profile_embedding": uncohorted_profile_embeddings
} for query_idx in range(len(query_ids))]

print(uncohorted_profiles)

[{'profile_id': ['P14', 'P15', 'P16'], 'profile_embedding': [[0.5, -0.8660254037844386], [0.7071067811865475, -0.7071067811865475], [0.8660254037844386, -0.5]]}, {'profile_id': ['P14', 'P15', 'P16'], 'profile_embedding': [[0.5, -0.8660254037844386], [0.7071067811865475, -0.7071067811865475], [0.8660254037844386, -0.5]]}, {'profile_id': ['P14', 'P15', 'P16'], 'profile_embedding': [[0.5, -0.8660254037844386], [0.7071067811865475, -0.7071067811865475], [0.8660254037844386, -0.5]]}]


In [10]:
# retrieve the inner profile embeddings for matched cohorts
inner_profile_candidates = []
cohort_calculation_counts = []
for query_idx, cohort_indices in enumerate(all_cohort_indices):
    cohort_matched_profile_ids = []
    cohort_matched_profile_embeddings = []
    # matched_cohort_ids = []
    
    for cohort_idx in cohort_indices:
        cohort_id = cohort_ids[cohort_idx]
        # matched_cohort_ids.append(cohort_id)
        inner_profile_ids = [profile["profile_id"] for profile in tagged_profiles if cohort_id in profile["cohort_ids"]]
        inner_profile_embeddings = [profile["profile_embedding"] for profile in tagged_profiles if profile["profile_id"] in inner_profile_ids]
        
        cohort_matched_profile_ids.extend(profile_id for profile_id in inner_profile_ids if profile_id not in cohort_matched_profile_ids)
        cohort_matched_profile_embeddings.extend(profile_embedding for profile_embedding in inner_profile_embeddings if profile_embedding not in cohort_matched_profile_embeddings)
        
        profiles_in_cohort = {
            "profile_id": cohort_matched_profile_ids,
            "profile_embedding": cohort_matched_profile_embeddings
        }
   
    inner_profile_candidates.append(profiles_in_cohort)
    cohort_calculation_counts.append(len(cohort_ids) + len(profiles_in_cohort["profile_id"]) + len(uncohorted_profiles))

print(inner_profile_candidates)
print()
print(cohort_calculation_counts)

[{'profile_id': ['P1', 'P2', 'P3', 'P4', 'P5'], 'profile_embedding': [[1, 0], [0.8660254037844386, 0.5], [0.7071067811865475, 0.7071067811865475], [0.5, 0.8660254037844386], [0, 1]]}, {'profile_id': ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13'], 'profile_embedding': [[1, 0], [0.8660254037844386, 0.5], [0.7071067811865475, 0.7071067811865475], [0.5, 0.8660254037844386], [0, 1], [-0.5, 0.8660254037844386], [-0.7071067811865475, 0.7071067811865475], [-0.8660254037844386, 0.5], [-1, 0], [-0.8660254037844386, -0.5], [-0.7071067811865475, -0.7071067811865475], [-0.5, -0.8660254037844386], [0, -1]]}, {'profile_id': ['P1', 'P2', 'P3', 'P4', 'P5', 'P9', 'P10', 'P11', 'P12', 'P13'], 'profile_embedding': [[1, 0], [0.8660254037844386, 0.5], [0.7071067811865475, 0.7071067811865475], [0.5, 0.8660254037844386], [0, 1], [-1, 0], [-0.8660254037844386, -0.5], [-0.7071067811865475, -0.7071067811865475], [-0.5, -0.8660254037844386], [0, -1]]}]

[11, 19, 16]


In [11]:
profiles_to_query = []
for query_idx in range(len(query_ids)):
    candidate_profiles = {
        key: uncohorted_profiles[query_idx][key] + inner_profile_candidates[query_idx][key]
        for key in uncohorted_profiles[query_idx]
    }
    profiles_to_query.append(candidate_profiles)
print(profiles_to_query)

[{'profile_id': ['P14', 'P15', 'P16', 'P1', 'P2', 'P3', 'P4', 'P5'], 'profile_embedding': [[0.5, -0.8660254037844386], [0.7071067811865475, -0.7071067811865475], [0.8660254037844386, -0.5], [1, 0], [0.8660254037844386, 0.5], [0.7071067811865475, 0.7071067811865475], [0.5, 0.8660254037844386], [0, 1]]}, {'profile_id': ['P14', 'P15', 'P16', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13'], 'profile_embedding': [[0.5, -0.8660254037844386], [0.7071067811865475, -0.7071067811865475], [0.8660254037844386, -0.5], [1, 0], [0.8660254037844386, 0.5], [0.7071067811865475, 0.7071067811865475], [0.5, 0.8660254037844386], [0, 1], [-0.5, 0.8660254037844386], [-0.7071067811865475, 0.7071067811865475], [-0.8660254037844386, 0.5], [-1, 0], [-0.8660254037844386, -0.5], [-0.7071067811865475, -0.7071067811865475], [-0.5, -0.8660254037844386], [0, -1]]}, {'profile_id': ['P14', 'P15', 'P16', 'P1', 'P2', 'P3', 'P4', 'P5', 'P9', 'P10', 'P11', 'P12', 'P13'], 'profile_embedding':

In [12]:
in_range_profile_ids = []
for query_idx, profiles in enumerate(profiles_to_query):
    embeddings = profiles["profile_embedding"]  # Extract embeddings properly

    if len(embeddings) > 0:
        profile_knn = NearestNeighbors(algorithm="brute", metric="euclidean", n_jobs=-1)
        profile_knn.fit(embeddings)

        # Find profiles within query_radius
        all_inner_profile_indices = profile_knn.radius_neighbors(
            [query_embeddings[query_idx]], radius=query_radius, return_distance=False
        )[0]  # Extract first list

        # Retrieve profile IDs
        matched_profile_ids = [profiles["profile_id"][i] for i in all_inner_profile_indices]
        in_range_profile_ids.append(matched_profile_ids)
    else:
        in_range_profile_ids.append([])

print(in_range_profile_ids)

[['P14', 'P15', 'P16', 'P1', 'P2', 'P3', 'P4'], ['P5', 'P6', 'P7', 'P8', 'P9', 'P10'], ['P14', 'P15', 'P16', 'P1', 'P12', 'P13']]


In [13]:
cohort_query_results = np.array([
    [
        1 if profile["profile_id"] in in_range_profile_ids[q_idx] else 0
        for profile in tagged_profiles
    ]
    for q_idx in range(len(query_ids))
])

print(cohort_query_results)

[[1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1]
 [0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1]]


In [14]:
from sklearn.metrics.pairwise import cosine_similarity
def brute_force_search(query_embeddings, profile_embeddings, min_cosine_similarity):
    cosine_similarities = cosine_similarity(query_embeddings, profile_embeddings)
    print(cosine_similarities)
    return [len(profile_embeddings)] * len(query_embeddings), (cosine_similarities >= min_cosine_similarity).astype(int)

brute_force_calculation_counts, brute_force_query_results = brute_force_search(query_embeddings, profile_embeddings, 0.5)
print(brute_force_query_results)

[[ 1.00000000e+00  8.66025404e-01  7.07106781e-01  5.00000000e-01
   0.00000000e+00 -5.00000000e-01 -7.07106781e-01 -8.66025404e-01
  -1.00000000e+00 -8.66025404e-01 -7.07106781e-01 -5.00000000e-01
   0.00000000e+00  5.00000000e-01  7.07106781e-01  8.66025404e-01]
 [-8.66025404e-01 -5.00000000e-01 -2.58819045e-01 -1.48741681e-17
   5.00000000e-01  8.66025404e-01  9.65925826e-01  1.00000000e+00
   8.66025404e-01  5.00000000e-01  2.58819045e-01  1.48741681e-17
  -5.00000000e-01 -8.66025404e-01 -9.65925826e-01 -1.00000000e+00]
 [ 5.00000000e-01  1.48741681e-17 -2.58819045e-01 -5.00000000e-01
  -8.66025404e-01 -1.00000000e+00 -9.65925826e-01 -8.66025404e-01
  -5.00000000e-01 -1.48741681e-17  2.58819045e-01  5.00000000e-01
   8.66025404e-01  1.00000000e+00  9.65925826e-01  8.66025404e-01]]
[[1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1]
 [0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1]]


In [15]:
math.cos(math.pi/3)

0.5000000000000001

In [16]:
from sklearn.metrics import precision_score, recall_score
def score_results(y_trues: list[list[int]], y_preds: list[list[int]]):
    y_trues_counts = [len(y_trues[query_idx]) for query_idx in range(len(y_trues))]
    y_preds_counts = [len(y_preds[query_idx]) for query_idx in range(len(y_preds))]
    print(f"Counts of y_trues by query: {y_trues_counts}.")
    print(f"Counts of y_preds by query: {y_preds_counts}.")
    precisions = [precision_score(y_trues[query_idx], y_preds[query_idx]) for query_idx in range(len(y_trues))]
    recalls = [recall_score(y_trues[query_idx], y_preds[query_idx]) for query_idx in range(len(y_trues))]
    return precisions, recalls

In [17]:
precisions, recalls = score_results(brute_force_query_results, cohort_query_results)
final_results = [{"query_id": query_ids[query_idx],
                  "calculation_count": cohort_calculation_counts[query_idx],
                  "precision": precisions[query_idx],
                  "recall": recalls[query_idx]}
                 for query_idx in range(len(queries))]

Counts of y_trues by query: [16, 16, 16].
Counts of y_preds by query: [16, 16, 16].


In [18]:
print(final_results)

[{'query_id': 'Q1', 'calculation_count': 11, 'precision': 1.0, 'recall': 1.0}, {'query_id': 'Q2', 'calculation_count': 19, 'precision': 1.0, 'recall': 1.0}]


In [19]:
# integrating the script into cohorts pipeline
import sys
import json
from load_embeddings import LoadEmbeddings as loader
from cohort_utils import get_settings, score_to_length, score_to_similarity, score_to_distance, score_to_cosine_distance, read_objects_from_file, write_objects_to_file, write_objects_to_csv, score_results
from cohort_assignment_algorithms_v6 import multiple_cohort_assignment
from cohort_querying_algorithms_v6 import brute_force_search, multiple_cohort_querying
from cohorts_profiles_distributions import cohorts_and_profiles_distributions
from sklearn.preprocessing import normalize
import numpy as np
import matplotlib.pyplot as plt

import os
print(os.getcwd())

/home/jupyter/cohort/scripts


In [22]:
import json
import numpy as np
from sklearn.preprocessing import normalize

class LoadEmbeddings:
    def __init__(self, file_path, embedding_key):
        self.file_path = file_path
        self.embedding_key = embedding_key

    def _read_objects_from_file(self):
        """Generator to yield JSON objects from a file line-by-line."""
        with open(self.file_path, 'r') as file:
            for line in file:
                try:
                    yield json.loads(line.strip())  # Load each line as a dictionary
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")

    def load_cohorts(self):
        """Load cohort embeddings into a structured dictionary."""
        cohorts = {
            "cohort_id": [],
            "cohort_embedding": []
        }

        for obj in self._read_objects_from_file():
            cohorts["cohort_id"].append(obj.get("id"))
            cohorts["cohort_embedding"].append(obj.get(self.embedding_key, []))

        cohorts["cohort_embedding"] = normalize(cohorts["cohort_embedding"], axis=1)
        
        print(f"Loaded {len(cohorts['cohort_id'])} cohorts.")
        
        return cohorts
    
    def load_profiles(self):
        """Load profile embeddings into a structured dictionary."""
        profiles = {
            "profile_id": [],
            "profile_embedding": []
        }

        for obj in self._read_objects_from_file():
            profiles["profile_id"].append(obj.get("id"))
            profiles["profile_embedding"].append(obj.get(self.embedding_key, []))

        profiles["profile_embedding"] = normalize(profiles["profile_embedding"], axis=1)
        
        print(f"Loaded {len(profiles['profile_id'])} profiles.") 
        
        return profiles
    
    def load_queries(self):
        """Load query embeddings into a structured dictionary."""
        queries = {
            "query_id": [],
            "query_embedding": []
        }

        for obj in self._read_objects_from_file():
            queries["query_id"].append(obj.get("id"))
            queries["query_embedding"].append(obj.get("embedding", []))

        queries["query_embedding"] = normalize(queries["query_embedding"], axis=1)
        
        print(f"Loaded {len(queries['query_id'])} queries.") 
        
        return queries

In [23]:
os.chdir("/home/jupyter")
configs = get_settings("/home/jupyter/cohort/examples/aipp_cohort_config.example.json")
client = configs["client"]
version = configs["version"]
embedding_key = configs["embedding_key"]


print("Loading embeddings...")
cohorts_loader = LoadEmbeddings(configs["keyword_embeddings_file_path"], configs["embedding_key"])
cohorts = cohorts_loader.load_cohorts()


profiles_loader = LoadEmbeddings(configs["profile_embeddings_file_path"], configs["embedding_key"])
profiles = profiles_loader.load_profiles()
print(f"There are {len(profiles)} profiles.")

queries_loader = LoadEmbeddings(configs["query_embeddings_file_path"], configs["embedding_key"])
queries = queries_loader.load_queries()
print(f"There are {len(queries)} queries.")

# keywords = read_objects_from_file(configs["keyword_embeddings_file_path"])
# cohorts = [loader(keyword, embedding_key).cohorts_dict() for keyword in keywords]
# print(f"Loaded {len(cohorts)} cohorts.")

# profiles_data = read_objects_from_file(configs["profile_embeddings_file_path"])
# profiles = [loader(profile, embedding_key).profiles_dict() for profile in profiles_data]
# print(f"Loaded {len(profiles)} profiles.")

# queries_data = read_objects_from_file(configs["query_embeddings_file_path"])
# queries = [loader(query, embedding_key).queries_dict() for query in queries_data]
# print(f"Loaded {len(queries)} queries.")

Loading embeddings...
Loaded 1966 cohorts.
Loaded 915506 profiles.
There are 2 profiles.
Loaded 10 queries.
There are 2 queries.


In [26]:
profiles["profile_id"][0:5]

['[', "'", 'K', 'Q', 'f']

In [31]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from cohort_utils import distance_to_length, length_to_distance
import tempfile
import pickle

def brute_force_search(queries, tagged_profiles, min_cosine_similarity):
    query_embeddings = np.array(queries["query_embedding"])
    profile_embeddings = np.stack([np.array(tagged_profiles["profile_embedding"])])    
    cosine_similarities = cosine_similarity(query_embeddings, profile_embeddings)
    
    similarity_matches = (cosine_similarities > min_cosine_similarity).astype(int)
    match_counts = similarity_matches.sum(axis=1)  # Sum over columns (profiles) for each query
    print(f"Profiles matched per query: {match_counts.tolist()}")  # Convert to list for readable output

    return [len(profile_embeddings)] * len(query_embeddings), (cosine_similarities > min_cosine_similarity).astype(int)

In [32]:
brute_force_calculation_counts, brute_force_query_results = brute_force_search(queries, profiles,
                                                                                   score_to_similarity(configs["min_query_score"]))

ValueError: Found array with dim 3. check_pairwise_arrays expected <= 2.