In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import lil_matrix

ldgm_blocks = {}

df = pd.read_csv("/Volumes/data_files/LDGM/example/height/ldgms.GRCh38/ldgm_EUR_rsid.tsv", sep="\t")

first_blocks = sorted(df['LD_block'].unique(), key=int)[-20:]


In [2]:

for block_id in first_blocks:
    blk_name = f"blk_{block_id}"
    blk_df = df[df['LD_block'] == block_id].copy()

    # 1. 获取所有出现过的 node，并重编号（如原编号是 [3, 7, 9]，我们变成 [0, 1, 2]）
    node_ids = sorted(blk_df['LD_node'].unique())
    node_index = {node_id: i for i, node_id in enumerate(node_ids)}  # 映射 old_id -> new_id
    num_nodes = len(node_ids)

    node_to_snp = dict(zip(blk_df['LD_node'], blk_df['SNP']))
    snplist = [node_to_snp[n] for n in node_ids]

    # 2. 初始化 LD 矩阵
    ld_matrix = lil_matrix((num_nodes, num_nodes), dtype=np.float32)

    for _, row in blk_df.iterrows():
        if row['LD_neighbors'] == '.' or pd.isna(row['LD_neighbors']):
            node = node_index[int(row['LD_node'])]
            ld_matrix[node, node] = float(row['LD_diagonal'])
            continue
            
        node = node_index[int(row['LD_node'])]
        neighbors = list(map(int, row['LD_neighbors'].split(',')))
        weights = list(map(float, row['LD_weights'].split(',')))

        for n, w in zip(neighbors, weights):
            if n in node_index:  # 只处理在当前 block 中的节点
                ni = node_index[n]
                ld_matrix[node, ni] = w
                ld_matrix[ni, node] = w

        ld_matrix[node, node] = float(row['LD_diagonal'])

    chrom = blk_df["CHROM"].iloc[0]

    ldgm_blocks[blk_name] = {
        "ldblk": ld_matrix.tocsr(),
        "snplist": snplist,
        "node_map": node_index,
        "CHROM": chrom
    }

blk_1341 4089
{5: 0, 6: 1, 12: 2, 15: 3, 18: 4, 25: 5, 30: 6, 32: 7, 33: 8, 34: 9, 35: 10, 39: 11, 40: 12, 41: 13, 45: 14, 46: 15, 54: 16, 56: 17, 59: 18, 62: 19, 70: 20, 80: 21, 87: 22, 88: 23, 91: 24, 93: 25, 96: 26, 97: 27, 99: 28, 102: 29, 105: 30, 106: 31, 107: 32, 118: 33, 119: 34, 122: 35, 123: 36, 124: 37, 126: 38, 127: 39, 128: 40, 136: 41, 139: 42, 141: 43, 142: 44, 146: 45, 148: 46, 150: 47, 153: 48, 154: 49, 157: 50, 158: 51, 164: 52, 166: 53, 167: 54, 182: 55, 185: 56, 186: 57, 187: 58, 188: 59, 190: 60, 197: 61, 203: 62, 212: 63, 213: 64, 214: 65, 215: 66, 218: 67, 219: 68, 222: 69, 224: 70, 226: 71, 231: 72, 234: 73, 236: 74, 239: 75, 240: 76, 241: 77, 250: 78, 251: 79, 256: 80, 259: 81, 260: 82, 261: 83, 263: 84, 266: 85, 269: 86, 272: 87, 274: 88, 275: 89, 277: 90, 282: 91, 283: 92, 284: 93, 287: 94, 289: 95, 291: 96, 293: 97, 296: 98, 298: 99, 301: 100, 302: 101, 303: 102, 304: 103, 306: 104, 307: 105, 313: 106, 317: 107, 319: 108, 323: 109, 324: 110, 326: 111, 327: 1

In [4]:
ldgm_blocks

{'blk_1341': {'ldblk': <4089x4089 sparse matrix of type '<class 'numpy.float32'>'
  	with 57003 stored elements in Compressed Sparse Row format>,
  'snplist': ['rs1380822515',
   'rs1199020204',
   'rs1244560803',
   'rs1257002619',
   'rs1248504347',
   'rs1323985801',
   'rs1289765736',
   'rs1281174538',
   'rs1474730334',
   'rs1293960976',
   'rs1208212677',
   'rs1405060183',
   'rs1347649563',
   'rs1436460593',
   'rs1283833409',
   'rs1286184399',
   'rs1211681973',
   'rs1261088061',
   'rs1352008724',
   'rs1222512095',
   'rs1256139941',
   'rs1268019079',
   'rs1327531680',
   'rs1285098535',
   'rs1388209720',
   'rs1270270185',
   'rs1445473902',
   'rs1307452366',
   'rs1238440681',
   'rs1228006397',
   'rs1320562603',
   'rs1298325067',
   'rs1315976391',
   'rs1252223090',
   'rs1389509686',
   'rs1203629237',
   'rs1350315262',
   'rs1327694528',
   'rs1402522649',
   'rs1321154748',
   'rs1268316517',
   'rs1276217440',
   'rs1217882422',
   'rs1241819910',
   'rs1

In [6]:
print(ldgm_blocks["blk_1341"]["ldblk"].shape, len(ldgm_blocks["blk_1341"]["snplist"]))

(4089, 4089) 4089


In [27]:
import pandas as pd

ldblk = ldgm_blocks["blk_0"]["ldblk"]
snplist = ldgm_blocks["blk_0"]["snplist"]

# 转为 dense matrix（注意：如果矩阵太大可能很慢）
ld_matrix_dense = ldblk.todense()

# 转为 DataFrame，并添加行列标签为 SNP ID
df_ld = pd.DataFrame(ld_matrix_dense, index=snplist, columns=snplist)

In [28]:
df_ld

Unnamed: 0,rs62636367,rs62636368,rs3891260,rs199745162,rs201833382,rs199740902,rs867691030,rs200978805,rs369606208,rs754322362,...,rs2045333,rs80273827,rs12127835,rs12124890,rs7552943,rs72629424,rs2377039,rs3001794,rs11581885,rs12122321
rs62636367,1.553790,-0.319217,0.0,0.00000,0.00000,0.0000,-0.466229,0.0,0.0,0.0,...,0.000000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
rs62636368,-0.319217,1.730140,0.0,0.00000,0.00000,0.0000,-0.914626,0.0,0.0,0.0,...,0.000000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
rs3891260,0.000000,0.000000,1.0,0.00000,0.00000,0.0000,0.000000,0.0,0.0,0.0,...,0.000000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
rs199745162,0.000000,0.000000,0.0,3.26079,0.00000,-1.6973,0.000000,0.0,0.0,0.0,...,0.000000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
rs201833382,0.000000,0.000000,0.0,0.00000,1.14963,0.0000,0.000000,0.0,0.0,0.0,...,0.000000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rs72629424,0.000000,0.000000,0.0,0.00000,0.00000,0.0000,0.000000,0.0,0.0,0.0,...,0.000000,0.00000,0.00000,0.00000,0.000000,1.559130,-0.199933,0.000000,0.000000,0.000000
rs2377039,0.000000,0.000000,0.0,0.00000,0.00000,0.0000,0.000000,0.0,0.0,0.0,...,3.611450,0.00000,0.00000,0.00000,0.000000,-0.199933,14.438400,10.850700,-4.574580,0.504967
rs3001794,0.000000,0.000000,0.0,0.00000,0.00000,0.0000,0.000000,0.0,0.0,0.0,...,0.000000,2.40653,-1.09939,-4.45594,0.000000,0.000000,10.850700,17.287100,-4.757650,0.464853
rs11581885,0.000000,0.000000,0.0,0.00000,0.00000,0.0000,0.000000,0.0,0.0,0.0,...,7.794350,0.00000,-3.07405,0.00000,-0.719083,0.000000,-4.574580,-4.757650,16.392099,4.048100
