In [1]:
import anndata as ad
import os
import numpy as np
from loguru import logger
from tqdm.notebook import tqdm
import random
import torch
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import sys
from scipy.sparse import csr_matrix

In [4]:

# Generate "dropout" copies of every AnnData file in the benchmark datasets.
#
# For each file, a fixed fraction of the entries with 0 < X <= max_count is
# selected at random (without replacement) and set to 0. The selected positions
# are recorded as a boolean mask in adata.layers['dropped'], and the modified
# object is written under <dropout_root>/<dataset>/<percent>/<file name>.

# --- Configuration ---------------------------------------------------------
bench_data_dir = '/home/s2022310812/bench_data'  # paste the bench_data path here
dropout_root = '/home/s2022310812/dropout_data'  # root directory for the dropped-out copies
datasets = [
    "IDC",
    "COAD",
    "READ",
    "LYMPH_IDC",
]
drop_fraction = 0.3  # fraction of the eligible entries to zero out
max_count = 5        # an entry is eligible when 0 < X <= max_count
seed = 0             # seed for the random selection, so reruns give the same files

# Output sub-directory label, derived from the two parameters above so that it
# can no longer drift from the values actually used (gives '30%-5' here).
percent = f"{round(drop_fraction * 100)}%-{max_count}"

# --- Shared state ----------------------------------------------------------
# The device does not depend on the dataset, so it is chosen once.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# One CPU generator for the whole run; file names are sorted below so the
# sequence of random draws does not depend on os.listdir() ordering.
rng = torch.Generator().manual_seed(seed)

for dataset in datasets:
    directory_path = os.path.join(bench_data_dir, dataset)
    adata_dir = os.path.join(directory_path, 'adata')
    dropout_path = os.path.join(dropout_root, dataset, percent)
    adata_names = sorted(
        f for f in os.listdir(adata_dir) if os.path.isfile(os.path.join(adata_dir, f))
    )

    # Iterate over each file in the adata directory
    for adata_name in adata_names:
        adata = ad.read_h5ad(os.path.join(adata_dir, adata_name))

        # Remember the dtype of X so it can be restored before saving.
        original_dtype = adata.X.dtype

        # Densify X for processing. Checking for `toarray` covers every sparse
        # format that provides it (CSR, CSC, COO, ...), not only csr_matrix.
        if hasattr(adata.X, 'toarray'):
            data = adata.X.toarray()
        else:
            data = np.asarray(adata.X)

        # Step 1: work on an int32 copy.
        # NOTE(review): this truncates non-integer values; assumes X holds raw
        # counts - confirm for every dataset.
        data_int32 = data.astype(np.int32)
        data_tensor = torch.tensor(data_int32, device=device, dtype=torch.int32)

        # Step 2: eligible entries are those with 0 < X <= max_count.
        mask = (data_tensor > 0) & (data_tensor <= max_count)

        # Step 3: (row, col) coordinates of the eligible entries, shape (N, 2).
        indices_to_drop = torch.nonzero(mask, as_tuple=False)

        # Step 4: number of entries to drop.
        num_candidates = indices_to_drop.shape[0]
        num_to_drop = int(num_candidates * drop_fraction)

        # Step 5: sample WITHOUT replacement. A random permutation yields
        # num_to_drop distinct coordinates (randint could repeat a coordinate,
        # dropping fewer entries than reported) and also works when there are
        # no eligible entries at all.
        chosen = torch.randperm(num_candidates, generator=rng)[:num_to_drop]
        random_indices = indices_to_drop[chosen.to(indices_to_drop.device)]
        rows, cols = random_indices[:, 0], random_indices[:, 1]

        # Step 6: zero the selected entries in the main data.
        data_tensor[rows, cols] = 0

        # Step 7: boolean mask of the dropped positions, filled in one
        # vectorised assignment instead of a per-element Python loop.
        dropped_layer = np.zeros(data_int32.shape, dtype=bool)
        dropped_layer[rows.cpu().numpy(), cols.cpu().numpy()] = True

        # Step 8: back to NumPy, restore the original dtype and store X as CSR
        # (X is written back as CSR regardless of the input format, as before).
        data_int32 = data_tensor.cpu().numpy()
        adata.X = csr_matrix(data_int32.astype(original_dtype))

        # Store the boolean mask indicating which values were dropped.
        adata.layers['dropped'] = dropped_layer

        # Step 9: make sure the output directory exists, then save.
        os.makedirs(dropout_path, exist_ok=True)
        save_path = os.path.join(dropout_path, adata_name)
        adata.write(save_path)

        print(f"Dropped {num_to_drop} values with 0 < X <= {max_count} and saved to: {save_path}")


Dropped 162926 values with 0 < X <= 5 and saved to: /home/s2022310812/dropout_data/IDC/30%-5/NCBI785.h5ad
Dropped 118962 values with 0 < X <= 5 and saved to: /home/s2022310812/dropout_data/IDC/30%-5/NCBI783.h5ad
Dropped 307415 values with 0 < X <= 5 and saved to: /home/s2022310812/dropout_data/IDC/30%-5/TENX95.h5ad
Dropped 708676 values with 0 < X <= 5 and saved to: /home/s2022310812/dropout_data/IDC/30%-5/TENX99.h5ad
Dropped 216086 values with 0 < X <= 5 and saved to: /home/s2022310812/dropout_data/COAD/30%-5/TENX149.h5ad
Dropped 200057 values with 0 < X <= 5 and saved to: /home/s2022310812/dropout_data/COAD/30%-5/TENX147.h5ad
Dropped 222730 values with 0 < X <= 5 and saved to: /home/s2022310812/dropout_data/COAD/30%-5/TENX148.h5ad
Dropped 294286 values with 0 < X <= 5 and saved to: /home/s2022310812/dropout_data/COAD/30%-5/TENX111.h5ad
Dropped 1896465 values with 0 < X <= 5 and saved to: /home/s2022310812/dropout_data/READ/30%-5/ZEN40.h5ad
Dropped 2559624 values with 0 < X <= 5 and s