In [1]:
import pickle
import numpy as np
import torch


In [3]:
# Load the pickled graph matrix. Judging by the file name this is presumably a
# position-similarity matrix (TODO confirm); the cells below use it as a 2-D
# torch tensor (row-wise `.topk`, `data > 0` masking).
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open("./codebook_pos_sim_graph.pkl", 'rb') as f:
    data = pickle.load(f)

In [4]:
data.topk(2, dim=-1)

torch.return_types.topk(
values=tensor([[0.0028, 0.0024],
        [0.0054, 0.0052],
        [0.0050, 0.0049],
        ...,
        [0.0006, 0.0006],
        [0.0043, 0.0038],
        [0.0034, 0.0033]], dtype=torch.float64),
indices=tensor([[1904, 3565],
        [1796, 3409],
        [3300,  686],
        ...,
        [ 656,  677],
        [2787, 3658],
        [ 273,  932]]))

In [None]:
# Reciprocal transform: every strictly positive entry x of `data` becomes
# 1 / (x + epsilon); all other entries keep the 0 they get from zeros_like.
# Because only x > 0 is selected the division is never by zero; epsilon just
# keeps the reciprocal bounded for very small x.
# `transformed` is a NumPy array while `data` and the boolean mask are torch
# tensors, so this cell relies on NumPy/PyTorch implicit conversion.
epsilon = 1e-8
transformed = np.zeros_like(data)
nonzero_indices = data > 0
transformed[nonzero_indices] = 1 / (data[nonzero_indices] + epsilon)

In [6]:
torch.from_numpy(transformed).topk(2, dim=-1)

torch.return_types.topk(
values=tensor([[ 410.0954,  359.7313],
        [ 192.1242,  184.2651],
        [ 203.3474,  199.7520],
        ...,
        [1768.2491, 1758.3579],
        [ 262.4926,  232.6010],
        [ 300.0035,  294.7760]], dtype=torch.float64),
indices=tensor([[3565, 1904],
        [3409, 1796],
        [ 686, 3300],
        ...,
        [ 677,  656],
        [3658, 2787],
        [ 932,  273]]))

In [None]:
# Exponential-decay transform: overwrite the same positive-entry positions with
# exp(-alpha * x), replacing (in place) the reciprocal values written by the
# previous transform cell. Entries outside the mask keep their initial 0.
# Depends on `transformed` and `nonzero_indices` defined in that earlier cell.
alpha = 5
transformed[nonzero_indices] = np.exp(-alpha * data[nonzero_indices])

In [8]:
# Wrap the NumPy buffer as a torch tensor (memory is shared, nothing is copied).
# torch.as_tensor is used rather than torch.from_numpy so that re-running this
# cell when `transformed` is already a tensor is a no-op instead of a TypeError.
transformed = torch.as_tensor(transformed)

In [9]:
transformed.topk(2, dim=-1)

torch.return_types.topk(
values=tensor([[0.9879, 0.9862],
        [0.9743, 0.9732],
        [0.9757, 0.9753],
        ...,
        [0.9972, 0.9972],
        [0.9811, 0.9787],
        [0.9835, 0.9832]], dtype=torch.float64),
indices=tensor([[3565, 1904],
        [3409, 1796],
        [ 686, 3300],
        ...,
        [ 677,  656],
        [3658, 2787],
        [ 932,  273]]))

In [None]:
import torch
from torch_geometric.utils import dense_to_sparse, add_self_loops, degree

def convert_and_normalize(data, weighted_degree=False):
    """Convert a dense adjacency matrix into a self-looped, degree-scaled edge list.

    Steps:
      1. ``dense_to_sparse`` extracts the non-zero entries of ``data`` as
         ``edge_index`` / ``edge_weight``.
      2. ``add_self_loops`` appends one ``(i, i)`` edge per node; the weight of
         the new edges is PyG's default fill value.
      3. Every weight ``w_ij`` is rescaled to ``w_ij / sqrt(deg_i * deg_j)``,
         where ``deg`` is computed from the source (row) index for both ends.

    Args:
        data: Dense adjacency tensor, expected to be square; ``data.size(0)``
            is taken as the number of nodes.
        weighted_degree: How ``deg`` is computed.

            * ``False`` (default, the historical behaviour): ``deg_i`` is the
              *number* of edges leaving node ``i``, self-loop included. Edge
              weights are ignored, so the rescaled weights are not bounded
              when the raw weights are large (e.g. co-occurrence counts).
            * ``True``: ``deg_i`` is the *sum of the edge weights* leaving
              node ``i`` (row sum of the self-looped adjacency), which is the
              degree used by the usual GCN symmetric normalisation.

    Returns:
        Tuple ``(edge_index, norm_edge_weight)``: ``edge_index`` includes the
        added self-loops, and ``norm_edge_weight`` holds one rescaled weight
        per edge in the same order.
    """
    num_nodes = data.size(0)

    # Dense matrix -> COO edge list, then one self-loop per node.
    edge_index, edge_weight = dense_to_sparse(data)
    edge_index, edge_weight = add_self_loops(edge_index, edge_weight, num_nodes=num_nodes)

    row, col = edge_index
    if weighted_degree:
        # Weighted out-degree: accumulate each edge's weight onto its source node.
        deg = torch.zeros(num_nodes, dtype=edge_weight.dtype, device=edge_weight.device)
        deg.scatter_add_(0, row, edge_weight)
    else:
        # Unweighted out-degree: number of edges per source node.
        deg = degree(row, num_nodes=num_nodes, dtype=edge_weight.dtype)

    # deg^(-1/2); a zero degree yields +inf, which is mapped to 0 so that the
    # affected edges get weight 0 instead of inf.
    deg_inv_sqrt = deg.pow(-0.5)
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0

    # D^(-1/2) * A * D^(-1/2), evaluated per edge.
    norm_edge_weight = deg_inv_sqrt[row] * edge_weight * deg_inv_sqrt[col]

    return edge_index, norm_edge_weight

edge_index, norm_edge_weight = convert_and_normalize(transformed)



In [12]:
edge_index, norm_edge_weight

(tensor([[   0,    0,    1,  ..., 4093, 4094, 4095],
         [1904, 3565, 1796,  ..., 4093, 4094, 4095]]),
 tensor([0.3287, 0.3293, 0.3244,  ..., 0.3333, 0.3333, 0.3333],
        dtype=torch.float64))

In [13]:
# Persist the normalised graph as a two-element list:
# [edge_index, norm_edge_weight], as returned by convert_and_normalize(transformed).
with open("./codebook_replace_graph_norm_self.pkl", 'wb') as f:
    pickle.dump([edge_index, norm_edge_weight], f)

In [14]:
# Load the co-occurrence matrix; the next cell displays it as a float NumPy array.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open("./codebook_co_occurrence_graph.pkl", 'rb') as f:
    codebook_co_occurrence_graph = pickle.load(f)

In [15]:
codebook_co_occurrence_graph

array([[4., 0., 1., ..., 4., 0., 1.],
       [0., 0., 1., ..., 1., 0., 0.],
       [1., 1., 0., ..., 2., 1., 1.],
       ...,
       [4., 1., 2., ..., 0., 2., 0.],
       [0., 0., 1., ..., 2., 0., 2.],
       [1., 0., 1., ..., 0., 2., 2.]])

In [None]:
def filter_co_matrix_by_row_proportion(co_matrix, topk_ratio=0.1):
    """Keep only the strongest off-diagonal entries of each row.

    For every row ``i`` the diagonal entry is ignored, the remaining non-zero
    entries are collected, and the ``k = max(1, int(n_nonzero * topk_ratio))``
    largest of them are copied into the result; every other position is 0.
    Rows without any non-zero off-diagonal entry stay all-zero.

    Args:
        co_matrix: 2-D tensor. Element ``[i, i]`` is addressed for every row
            ``i``, so it needs at least as many columns as rows. The tensor is
            NOT modified (the previous implementation zeroed its diagonal in
            place by writing through a row view).
        topk_ratio: Fraction of each row's non-zero entries to keep. At least
            one entry is kept per non-empty row; ratios that would select more
            entries than exist are clamped to the number available.

    Returns:
        Dense tensor with the same shape, dtype and device as ``co_matrix``.
    """
    filtered = torch.zeros_like(co_matrix)

    for i in range(co_matrix.size(0)):
        # Clone first: ``co_matrix[i]`` is a view, so assigning into it would
        # silently change the caller's matrix.
        row = co_matrix[i].clone()
        row[i] = 0  # ignore the diagonal entry

        nonzero = row.nonzero(as_tuple=True)[0]
        n_nonzero = nonzero.numel()
        if n_nonzero == 0:
            continue

        # At least one entry per row, but never more than topk can return.
        k = min(n_nonzero, max(1, int(n_nonzero * topk_ratio)))
        topk_vals, topk_pos = torch.topk(row[nonzero], k)

        # topk positions index the compacted non-zero list; map back to columns.
        filtered[i, nonzero[topk_pos]] = topk_vals

    return filtered


In [None]:
# torch.tensor(...) copies the NumPy array and .float() casts it to float32.
# NOTE(review): indices up to 4095 appear in the filtered result shown below, so
# the matrix looks like [4096, 4096] rather than the previously noted
# [1024, 1024] -- confirm.
co_matrix = torch.tensor(codebook_co_occurrence_graph).float()
# Keep, per row, the top 1% of the non-zero off-diagonal entries (at least one).
filtered_matrix = filter_co_matrix_by_row_proportion(co_matrix, topk_ratio=0.01)


In [18]:
filtered_matrix

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [19]:
filtered_matrix.nonzero()

tensor([[   0,   38],
        [   0,  281],
        [   0,  301],
        ...,
        [4095, 3850],
        [4095, 3951],
        [4095, 3992]])

In [20]:
edge_index, edge_weight = convert_and_normalize(filtered_matrix)

In [21]:
# Persist the normalised co-occurrence graph as [edge_index, edge_weight].
# NOTE(review): the "_20" suffix is not explained by anything visible in this
# notebook (topk_ratio is 0.01 above) -- confirm what it refers to.
with open("./codebook_co_occurrence_norm_self_20.pkl", 'wb') as f:
    pickle.dump([edge_index, edge_weight], f)