In [1]:
import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.cluster import KMeans
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Glob pattern matching the per-project folders that hold the feature files;
# change this to your own dataset location.
npy_directory = '/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Conch/TCGA-*/'

# One list of file paths per feature variant, selected by file-name suffix.
feature1_files = glob.glob(os.path.join(npy_directory, "*_0_1024.npy"))
feature2_files = glob.glob(os.path.join(npy_directory, "*_1_512.npy"))
feature3_files = glob.glob(os.path.join(npy_directory, "*_1_1024.npy"))

# Wrap each path list in a single-column frame so the lists can be joined later.
df_fea1 = pd.DataFrame(feature1_files, columns=['fea1_file_path'])
df_fea2 = pd.DataFrame(feature2_files, columns=['fea2_file_path'])
df_fea3 = pd.DataFrame(feature3_files, columns=['fea3_file_path'])

# Clinical table: keep the unique (case, project) pairs under shorter column names.
patients_file = '/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/TCGA_Clinical.tsv'
patients_list = (
    pd.read_csv(patients_file, sep='\t')[['case_submitter_id', 'project_id']]
    .drop_duplicates()
    .rename(columns={'case_submitter_id': 'patient_id', 'project_id': 'project'})
)

In [3]:
# Slide ID = file name up to its first '.' (e.g. 'TCGA-FK-A4UB-01Z-00-DX1').
df_fea1['slide_id'] = df_fea1['fea1_file_path'].apply(lambda x: os.path.basename(x).split('.')[0])
df_fea2['slide_id'] = df_fea2['fea2_file_path'].apply(lambda x: os.path.basename(x).split('.')[0])
df_fea3['slide_id'] = df_fea3['fea3_file_path'].apply(lambda x: os.path.basename(x).split('.')[0])

# Keep a single file per slide for each feature variant.
df_fea1 = df_fea1.drop_duplicates(subset='slide_id', keep='first').reset_index(drop=True)
df_fea2 = df_fea2.drop_duplicates(subset='slide_id', keep='first').reset_index(drop=True)
df_fea3 = df_fea3.drop_duplicates(subset='slide_id', keep='first').reset_index(drop=True)

# Inner joins: only slides that have all three feature variants survive.
df_fea = pd.merge(df_fea1, df_fea2, on='slide_id', how='inner')
df_fea = pd.merge(df_fea, df_fea3, on='slide_id', how='inner')

# The first 12 characters of the slide ID are the patient ID (e.g. 'TCGA-FK-A4UB').
df_fea['patient_id'] = df_fea['slide_id'].apply(lambda x: x[:12])

# Keep only slides whose patient appears in the clinical table. A membership
# filter is used rather than an inner merge: the merge would emit one row per
# (patient_id, project) pair, so a patient listed under several projects would
# duplicate its slides, and the merged-in project column is replaced below anyway.
df = df_fea[df_fea['patient_id'].isin(patients_list['patient_id'])].reset_index(drop=True)

# Project label = name of the folder that directly contains the feature file.
df['project'] = df['fea3_file_path'].map(lambda x: x.split('/')[-2])

In [5]:
# Preview the first rows of the merged slide table.
df.head()

Unnamed: 0,fea1_file_path,slide_id,fea2_file_path,fea3_file_path,patient_id,project
0,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,TCGA-FK-A4UB-01Z-00-DX1,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,TCGA-FK-A4UB,TCGA-THCA
1,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,TCGA-EM-A22O-01Z-00-DX1,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,TCGA-EM-A22O,TCGA-THCA
2,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,TCGA-FK-A3SE-01Z-00-DX1,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,TCGA-FK-A3SE,TCGA-THCA
3,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,TCGA-DE-A4MD-01Z-00-DX1,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,TCGA-DE-A4MD,TCGA-THCA
4,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,TCGA-L6-A4EP-01Z-00-DX1,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Con...,TCGA-L6-A4EP,TCGA-THCA


In [6]:
# Select one representative row per k-means cluster.
def get_cluster_centers_indices(data, n_clusters):
    """Return the indices of the rows of ``data`` closest to each k-means centroid.

    Args:
        data: 2-D NumPy array with one sample per row.
        n_clusters: Number of clusters to fit.

    Returns:
        A list of row indices into ``data``. When ``data`` has fewer rows than
        ``n_clusters`` no clustering is done and every row index is returned.
        Otherwise the list holds one index per non-empty cluster, in
        cluster-label order.
    """
    # Fewer rows than requested clusters: keep every row instead of clustering.
    if len(data) < n_clusters:
        return list(range(len(data)))

    # Fixed random_state keeps the clustering reproducible between runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(data)
    labels = kmeans.labels_

    cluster_centers_indices = []
    for i in range(n_clusters):
        cluster_indices = np.where(labels == i)[0]
        if cluster_indices.size == 0:
            # A label can end up with no members (e.g. fewer distinct rows than
            # n_clusters); np.argmin would raise on the empty array, so skip it.
            continue
        distances = np.linalg.norm(data[cluster_indices] - kmeans.cluster_centers_[i], axis=1)
        cluster_centers_indices.append(cluster_indices[np.argmin(distances)])

    return cluster_centers_indices

# Load one slide's three feature files and reduce each to its representative rows.
def _load_feature_matrix(path):
    """Load the object stored under 'feature' in an .npy file as a NumPy array."""
    # SECURITY: allow_pickle=True unpickles arbitrary objects and can execute
    # code from the file; only load feature files from a trusted source.
    content = np.load(path, allow_pickle=True)
    # Presumably a 0-d object array wrapping a dict whose 'feature' entry is a
    # torch tensor; [()] unwraps the array -- TODO confirm against the extractor.
    return content[()]['feature'].cpu().numpy()


def _representative_rows(feature, n_clusters, position, row_number):
    """Return the rows of ``feature`` chosen by get_cluster_centers_indices.

    Best effort: any error is printed and an empty array is returned instead.
    """
    try:
        indices = get_cluster_centers_indices(feature, n_clusters[position])
        return np.array([feature[index, :] for index in indices])
    except Exception as e:
        print(f"Error processing feature{position + 1} for row {row_number}: {e}")
        return np.array([])


def process_row(i, df, n_clusters):
    """Compute the cluster-representative features for the slide at row ``i``.

    Args:
        i: Positional row index into ``df``.
        df: Frame with 'fea1_file_path', 'fea2_file_path', 'fea3_file_path'
            and 'slide_id' columns.
        n_clusters: Sequence with the cluster count for feature 1, 2 and 3.

    Returns:
        A ``(features, slide_id)`` tuple where ``features`` is a dict with keys
        'f1024', 'f2048' and 'f4096' (feature 1, 2 and 3 respectively), each a
        NumPy array of the selected rows, or an empty array if clustering
        that feature failed.

    Raises:
        Whatever ``np.load`` or the tensor conversion raises; only the
        clustering step is best effort.
    """
    row = df.iloc[i]

    # Load all three files first so a loading problem surfaces before any clustering.
    path_columns = ('fea1_file_path', 'fea2_file_path', 'fea3_file_path')
    features = [_load_feature_matrix(row[column]) for column in path_columns]

    f1_cc_fea, f2_cc_fea, f3_cc_fea = (
        _representative_rows(feature, n_clusters, position, i)
        for position, feature in enumerate(features)
    )

    slide_id = row['slide_id']

    return {'f1024': f1_cc_fea,
            'f2048': f2_cc_fea,
            'f4096': f3_cc_fea}, slide_id

def process_data_multithreaded(df, n_clusters, save_dir, max_threads=4):
    """Run ``process_row`` on every row of ``df`` in a thread pool and save each result.

    Each result dict is written with ``np.save`` to ``<save_dir>/<slide_id>.npy``.

    Args:
        df: Frame passed through to ``process_row``; one task is submitted per row.
        n_clusters: Cluster counts passed through to ``process_row``.
        save_dir: Output directory; created if it does not exist.
        max_threads: Maximum number of worker threads.

    Returns:
        None. Rows whose processing raised are reported on stdout and skipped.
    """
    os.makedirs(save_dir, exist_ok=True)

    failed_rows = []
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        # Remember which row each future belongs to so failures can be reported.
        future_to_row = {
            executor.submit(process_row, i, df, n_clusters): i
            for i in range(df.shape[0])
        }

        for future in tqdm(as_completed(future_to_row), total=len(future_to_row)):
            try:
                feature_cc, slide_id = future.result()
            except Exception as e:
                # Letting this propagate would stop the saving loop while the
                # executor still runs every remaining task on exit and throws
                # their results away; record the failure and keep going.
                row = future_to_row[future]
                print(f"Error processing row {row}: {e}")
                failed_rows.append(row)
                continue
            np.save(os.path.join(save_dir, f'{slide_id}.npy'), feature_cc)

    if failed_rows:
        print(f"{len(failed_rows)} of {len(future_to_row)} rows failed: {sorted(failed_rows)}")

In [10]:
# Cluster counts for feature 1, 2 and 3 respectively (see process_row).
n_clusters = [32, 16, 8]

# NOTE(review): [15:] skips the first 15 projects in order of first appearance
# in df -- presumably to resume an interrupted run. That order depends on how
# df was built, so confirm the slice before re-running.
for project in df['project'].unique()[15:]:
    print(project)
    df_sub = df[df['project'] == project]
    print(df_sub.shape)
    save_dir = os.path.join('/bask/homes/a/asiw9691/PathVLM/WSI_Dataset/Conch_CC', project)

    # exist_ok avoids the check-then-create race of exists() followed by makedirs().
    os.makedirs(save_dir, exist_ok=True)

    # process_data_multithreaded writes its results to save_dir and returns
    # nothing, so processed_data is always None; the name is kept defined only
    # in case a later cell refers to it.
    processed_data = process_data_multithreaded(df_sub, n_clusters, save_dir, max_threads=10)

TCGA-LGG
(843, 6)


100%|██████████| 843/843 [02:33<00:00,  5.50it/s]


TCGA-COAD
(442, 6)


100%|██████████| 442/442 [01:28<00:00,  5.00it/s]


TCGA-Rest
(1720, 6)


100%|██████████| 1720/1720 [05:37<00:00,  5.10it/s]


In [9]:
# Projects from position 15 onward, in order of first appearance in df --
# the same slice the processing loop iterates over.
df['project'].unique()[15:]

array(['TCGA-LGG', 'TCGA-COAD', 'TCGA-Rest'], dtype=object)

In [12]:
from datasets import load_dataset, concatenate_datasets, load_from_disk, logging

# Where downloaded datasets are cached, and which split to load.
data_cache_dir = "/bask/projects/p/phwq4930-renal-canc/Zeyu/PathVLM/.cache"
split_text = "train"

# Dataset identifiers, written as one comma-separated string and split into a list.
dataset_name = (
    "CNX-PathLLM/Pathcap,CNX-PathLLM/PubMedPath,CNX-PathLLM/TwitterPath,CNX-PathLLM/CleanedTextData"
).split(',')

In [13]:
# Load the split named by split_text from the first dataset in dataset_name,
# caching under data_cache_dir.
dataset = load_dataset(dataset_name[0], split=split_text, cache_dir=data_cache_dir)
# Note: this prints the whole list of dataset names, not only the one just loaded.
print(dataset_name)


In [14]:
# Display the loaded dataset's summary (features and row count).
dataset

Dataset({
    features: ['__key__', '__url__', 'jpg', 'txt'],
    num_rows: 223169
})