In [4]:
import json
import pickle
import struct
from pathlib import Path

import numpy as np
import torch
from tqdm import tqdm

In [5]:
# NOTE: machine-specific absolute path; adjust it when running elsewhere.
data_dir = Path('/home/lucasheng/Tencent/data/TencentGR_1k')

# pickle can execute arbitrary code while loading, so only open trusted files.
with (data_dir / 'indexer.pkl').open('rb') as indexer_file:
    indexer = pickle.load(indexer_file)

In [6]:
# Peek at the first ten (key, value) pairs stored under indexer['i'].
[*indexer['i'].items()][:10]

[(20002650278, 1),
 (20002695270, 2),
 (20005426854, 3),
 (20004710150, 4),
 (20004532262, 5),
 (20001868390, 6),
 (20003001030, 7),
 (20004865798, 8),
 (20005081734, 9),
 (20002194022, 10)]

In [7]:
# Invert indexer['i'] so that a value can be mapped back to its key.
indexer_i_rev = dict(zip(indexer['i'].values(), indexer['i'].keys()))
# Spot-check a single reverse lookup.
indexer_i_rev[28515]

20001088917

In [11]:
# Peek at the first ten (key, value) pairs stored under indexer['u'].
[*indexer['u'].items()][:10]

[('user_01060923', 1),
 ('user_00300038', 2),
 ('user_00122945', 3),
 ('user_00311455', 4),
 ('user_00427257', 5),
 ('user_00302056', 6),
 ('user_00768541', 7),
 ('user_01021061', 8),
 ('user_01070082', 9),
 ('user_01143180', 10)]

In [14]:
# List every key stored under indexer['f'].
[*indexer['f'].keys()]

['122',
 '102',
 '121',
 '118',
 '111',
 '119',
 '103',
 '120',
 '117',
 '115',
 '107',
 '114',
 '101',
 '108',
 '112',
 '109',
 '110',
 '116',
 '106',
 '105',
 '104',
 '100']

In [15]:
# Keep seq.jsonl open in binary mode: a later cell seeks into this handle,
# so it is intentionally not wrapped in a `with` block.
data_file = (data_dir / "seq.jsonl").open('rb')

# Load the pickled offsets used to seek into seq.jsonl
# (presumably one byte offset per record -- TODO confirm).
with (data_dir / 'seq_offsets.pkl').open('rb') as offsets_file:
    seq_offsets = pickle.load(offsets_file)


In [None]:
# Read a single record from seq.jsonl by seeking straight to its stored offset.
# There is no `self` in a notebook cell, so use the module-level `seq_offsets`
# loaded in the previous cell.
# NOTE: `uid` must be assigned before running this cell; it has to be a valid
# index/key of `seq_offsets`.
data_file.seek(seq_offsets[uid])
# One JSON document per line, so a single readline() yields the whole record.
line = data_file.readline()
data = json.loads(line)

In [8]:
import numpy as np

def embed_emb_to_3d(emb, mask):
    """
    Scatter the rows of a 2-D embedding matrix into a 3-D array at the
    positions selected by a mask.

    Args:
        emb: embedding matrix of shape [N, D].
        mask: mask matrix of shape [N, M]; entries equal to 1 mark the
            slots that should receive the embedding of their row.

    Returns:
        Array of shape [N, M, D] with the dtype of ``emb``. Wherever
        ``mask[n, m] == 1`` the slot ``[n, m]`` holds a copy of ``emb[n]``;
        every other slot is all zeros.
    """
    num_rows, emb_dim = emb.shape
    num_slots = mask.shape[1]

    # Start from an all-zero [N, M, D] array.
    out = np.zeros((num_rows, num_slots, emb_dim), dtype=emb.dtype)

    # Coordinates of every selected slot: two 1-D arrays of length K.
    row_idx, col_idx = np.nonzero(mask == 1)

    # Paired integer indexing: out[row_idx, col_idx] and emb[row_idx] are both
    # [K, D], so each selected slot receives the embedding of its own row.
    out[row_idx, col_idx, :] = emb[row_idx, :]

    return out

# Example usage
if __name__ == "__main__":
    # Inputs: emb is [N, D] and mask is [N, M].
    N, M, D = 3, 5, 2  # 3 rows, 5 slots per row, embedding vectors of length 2
    emb = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])  # one embedding per row

    # Select exactly one slot per row: (0, 2), (1, 0) and (2, 3).
    mask = np.zeros((N, M), dtype=int)
    mask[[0, 1, 2], [2, 0, 3]] = 1

    # Scatter the embeddings into the selected slots.
    result = embed_emb_to_3d(emb, mask)

    print("emb矩阵:\n", emb)
    print("\nmask矩阵:\n", mask)
    print("\n三维结果矩阵形状:", result.shape)
    print("三维结果矩阵:\n", result)

emb矩阵:
 [[0.1 0.2]
 [0.3 0.4]
 [0.5 0.6]]

mask矩阵:
 [[0 0 1 0 0]
 [1 0 0 0 0]
 [0 0 0 1 0]]

三维结果矩阵形状: (3, 5, 2)
三维结果矩阵:
 [[[0.  0. ]
  [0.  0. ]
  [0.1 0.2]
  [0.  0. ]
  [0.  0. ]]

 [[0.3 0.4]
  [0.  0. ]
  [0.  0. ]
  [0.  0. ]
  [0.  0. ]]

 [[0.  0. ]
  [0.  0. ]
  [0.  0. ]
  [0.5 0.6]
  [0.  0. ]]]
