In [1]:
import numpy as np
import random

In [None]:
def distancePS(centerSet: np.ndarray, i: int, complete: np.ndarray) -> float:
    """
    Computes how far a point lies from a set of centers.

    The distance to a set is the smallest pairwise distance between the point
    and any member of the set, looked up in the distance matrix.

    Parameters:
        centerSet: Indexes of the confirmed centers
        i: Index of the point being measured
        complete: Complete graph adjacency matrix containing distances between all pairs of points

    Returns:
        The smallest value of complete[center][i] over all centers, or
        float("inf") when centerSet is empty
    """
    closest = float("inf")
    for member in centerSet:
        candidate = complete[member][i]
        # Strict comparison: an equal distance never replaces the current minimum.
        closest = candidate if candidate < closest else closest

    return closest

In [None]:
def GMM(points_index: np.ndarray, k: int, complete: np.ndarray, initial: np.ndarray) -> np.ndarray:
    """
    Returns indexes of up to k centers after running the greedy GMM Algorithm.

    Every new center is the candidate whose distance to the closest element of
    ``initial`` plus the centers selected so far is largest (first candidate
    wins on ties).

    Parameters:
        points_index: The indexes of the candidate points
        k: A decimal integer, the number of centers to select; k <= 0 selects nothing
        complete: Complete graph adjacency matrix containing distances between all pairs of points
        initial: An initial set of element indexes the new centers are kept away from.
            These are not part of the result. When it is empty, the first center
            is drawn at random from points_index.

    Returns:
        centers: A numpy array with the selected center indexes, in selection order.
            It has k entries unless no remaining candidate lies at a positive
            distance from everything already chosen, in which case the
            selection stops early and the array is shorter. An empty result
            has an integer dtype.
    """
    candidates = list(points_index)
    seeds = list(initial)

    # Nothing to select: do not draw a random center when none was requested.
    if k <= 0 or not candidates:
        return np.empty(0, dtype=int)

    # nearest[pos] is the distance from candidates[pos] to the closest element
    # among the seeds and the centers picked so far. Keeping it up to date
    # avoids rescanning every center for every candidate on each round.
    nearest = [float("inf")] * len(candidates)

    def _absorb(chosen):
        """Lower the running nearest-distance of every candidate using `chosen`."""
        for pos, point in enumerate(candidates):
            distance = complete[chosen][point]
            if distance < nearest[pos]:
                nearest[pos] = distance

    for seed in seeds:
        _absorb(seed)

    centers = []
    if not seeds:
        # No reference set: start from a random candidate.
        first = random.choice(candidates)
        centers.append(first)
        _absorb(first)

    while len(centers) < k:
        best_pos = None
        best_distance = 0
        for pos, distance in enumerate(nearest):
            if distance > best_distance:
                best_distance = distance
                best_pos = pos
        if best_pos is None:
            # Every candidate coincides with something already chosen; stop
            # instead of recording a placeholder for a center that does not exist.
            break
        centers.append(candidates[best_pos])
        _absorb(candidates[best_pos])

    if not centers:
        return np.empty(0, dtype=int)
    return np.array(centers)

In [None]:
def FairSwap(complete: np.ndarray, set_1: np.ndarray, set_2: np.ndarray, k_1: int, k_2: int) -> tuple[np.ndarray, np.ndarray]:
    """
    Performs max-min fair diversification under partition matroid constraint when m = 2.
    From the paper Diverse Data Selection under Fairness Constraints

    GMM is first run over all points for k_1 + k_2 centers, ignoring the two
    sets. The set that ended up with fewer centers than requested (the
    under-satisfied set) then receives extra centers via a second GMM run, and
    for each extra center the closest center of the other set is dropped.

    Parameters:
        complete: Complete graph adjacency matrix containing distances between all pairs of points
        set_1: First set of points
        set_2: Second set of points
        k_1: Number of points to select from set_1
        k_2: Number of points to select from set_2

    Returns:
        final_centers_u: Selected indexes from the under-satisfied set, i.e. the set with
            the larger shortfall k_i - (centers received); set_2 on a tie
        final_centers_o: Selected indexes from the other set
    """
    amount = complete.shape[0]
    complete_array = np.arange(amount)
    centers = GMM(complete_array, k_1 + k_2, complete, np.empty(0))
    centers_1 = np.intersect1d(centers, set_1)
    centers_2 = np.intersect1d(centers, set_2)

    # Shortfall per set: positive means the set still needs that many centers.
    number_1 = k_1 - len(centers_1)
    number_2 = k_2 - len(centers_2)

    # The under-satisfied set is the one with the LARGER shortfall.
    if number_1 > number_2:
        centers_u, centers_o, set_u, number_u = centers_1, centers_2, set_1, number_1
    else:
        centers_u, centers_o, set_u, number_u = centers_2, centers_1, set_2, number_2

    # Both quotas are already met: nothing to add and nothing to swap out.
    if number_u <= 0:
        return centers_u, centers_o

    # Extra centers for the under-satisfied set, kept far from the ones it already has.
    points_e = GMM(set_u, number_u, complete, centers_u)

    # For every extra center, mark the closest center of the other set for removal.
    # NOTE: several extra centers may share the same closest center, in which case
    # fewer than len(points_e) centers are removed.
    points_r = []
    if len(centers_o) > 0:
        for point in points_e:
            # min() keeps the first of equally close centers.
            points_r.append(min(centers_o, key=lambda o: complete[point][o]))

    final_centers_u = np.union1d(centers_u, points_e)
    final_centers_o = np.setdiff1d(centers_o, np.array(points_r, dtype=centers_o.dtype))

    return final_centers_u, final_centers_o
    

In [None]:
def FairFlow(complete: np.ndarray, sets: tuple, gamma: float):
    """
    Performs max-min fair diversification under partition matroid constraint when m > 2.
    From the paper Diverse Data Selection under Fairness Constraints

    Parameters:
        complete: Complete graph adjacency matrix containing distances between all pairs of points
        set_s: The points including critical regions and k
        gamma: A guess of the optimum fair diversity
        
    Returns:
    
    """
    # 获取集合数量m和每个集合对应的k值
    m = len(sets)
    ks = [s[1] for s in sets]
    universes = [s[0] for s in sets]
    
    # 步骤1-3: 对每个集合使用GMM算法
    Y = []
    for i in range(m):
        y_i = GMM(universes[i], ks[i], complete, np.array([]))
        Y.append(y_i)
        
    # 步骤4: 计算Z_i
    d1 = m / (3*m-1) * gamma
    Z = []
    for i in range(m):
        z_i = []
        for y in Y[i]:
            valid = True
            for z in z_i:
                if complete[y][z] < d1:
                    valid = False
                    break
            if valid:
                z_i.append(y)
        Z.append(np.array(z_i))
    
    # 步骤5-6: 构建无向图G_Z并获取连通分量
    d2 = 2/(3*m-1) * gamma
    nodes = np.concatenate(Z)
    edges = []
    for i in range(len(nodes)):
        for j in range(i+1, len(nodes)):
            if complete[nodes[i]][nodes[j]] <= d2:
                edges.append((nodes[i], nodes[j]))
                
    # 使用并查集找到连通分量
    parent = {node: node for node in nodes}
    def find(x):
        if parent[x] != x:
            parent[x] = find(parent[x])
        return parent[x]
    
    def union(x, y):
        parent[find(x)] = find(y)
        
    for edge in edges:
        union(edge[0], edge[1])
        
    components = {}
    for node in nodes:
        root = find(node)
        if root not in components:
            components[root] = []
        components[root].append(node)
    
    C = list(components.values())
    
    # 步骤7-8: 构建有向图并计算最大流
    from collections import defaultdict
    
    def max_flow(graph, source, sink):
        def bfs(graph, source, sink, parent):
            visited = set()
            queue = [source]
            visited.add(source)
            while queue:
                u = queue.pop(0)
                for v, cap in graph[u].items():
                    if v not in visited and cap > 0:
                        queue.append(v)
                        visited.add(v)
                        parent[v] = u
            return sink in visited
            
        flow = 0
        parent = {}
        while bfs(graph, source, sink, parent):
            path_flow = float("inf")
            s = sink
            while s != source:
                path_flow = min(path_flow, graph[parent[s]][s])
                s = parent[s]
            flow += path_flow
            v = sink
            while v != source:
                u = parent[v]
                graph[u][v] -= path_flow
                graph[v][u] += path_flow
                v = parent[v]
        return flow
    
    # 构建网络流图
    graph = defaultdict(lambda: defaultdict(int))
    source = 'source'
    sink = 'sink'
    
    # 添加从源点到u_i的边
    for i in range(m):
        graph[source][f'u_{i}'] = ks[i]
    
    # 添加从u_i到v_j的边
    for i in range(m):
        for j, comp in enumerate(C):
    
