In [1]:
import sys
import os 
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import numpy as np
sys.path.append("/home/local/kyeonghunjeong_920205/nipa_bu/COVID19/3.analysis/9.MIL/scAMIL_cell/scMILD")
from src.utils import *
from src.dataset import InstanceDataset
from src.model import *
from sklearn.model_selection import train_test_split
import scanpy as sc
from scipy import sparse
import modin.pandas as pd


In [2]:
# Name of the dataset sub-directory under ../../data that this notebook works on.
dir_path = "NS"
# Root folder of that dataset; the trailing slash is part of the value.
base_path = "../../data/{}/".format(dir_path)
# 'AE' sub-folder of the dataset root. base_path already ends with '/',
# so the resulting string contains a double slash.
target_dir = "{}/AE/".format(base_path)


In [3]:
# Index of the CUDA device to use when a GPU is visible to PyTorch.
device_num = 0

# Fall back to the CPU when CUDA is not available.
if torch.cuda.is_available():
    device = torch.device(f'cuda:{device_num}')
else:
    device = torch.device('cpu')

print("INFO: Using device: {}".format(device))


INFO: Using device: cuda:0


In [4]:

def _minmax_scale(values):
    """Linearly rescale a NumPy array so its minimum maps to 0 and its maximum to 1.

    An empty array is returned unchanged, and an array whose values are all
    equal maps to zeros instead of triggering a division by zero.
    """
    if values.size == 0:
        return values
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        return np.zeros_like(values)
    return (values - lowest) / spread


def save_cell_scores(saved_model_path, exp, test_dataset, label_encoder, device, suffix=None):
    """Score every test instance with the saved teacher model and write the result to a CSV file.

    Parameters
    ----------
    saved_model_path : str
        Directory containing ``model_teacher_exp{exp}.pt`` and
        ``model_encoder_exp{exp}.pt``.
    exp : int
        Experiment index; used in the checkpoint and output file names.
    test_dataset
        Dataset handed to ``update_instance_labels_with_bag_labels``. The object
        returned by that helper must expose ``data``, ``instance_labels`` and
        ``bag_labels`` tensors.
    label_encoder
        Object whose ``inverse_transform`` maps instance labels to cell-type names.
    device : torch.device
        Device the checkpoints are mapped to and the forward pass runs on.
    suffix : str, optional
        When given, the file is named ``cell_score_{exp}_{suffix}.csv`` instead
        of ``cell_score_{exp}.csv``.

    Returns
    -------
    int
        Always 0.

    Notes
    -----
    The CSV is written to the current working directory with the columns
    ``feature_*``, ``cell_type``, ``cell_score``, ``bag_labels``,
    ``instance_labels`` and ``cell_score_minmax``.
    """
    instance_test_dataset = update_instance_labels_with_bag_labels(test_dataset, device)

    # NOTE: torch.load unpickles complete model objects, so only point this at
    # trusted result directories. The student checkpoint is not loaded because
    # nothing below uses it.
    model_teacher = torch.load(f'{saved_model_path}/model_teacher_exp{exp}.pt', map_location=device)
    model_encoder = torch.load(f'{saved_model_path}/model_encoder_exp{exp}.pt', map_location=device)
    model_encoder.eval()
    model_teacher.eval()

    with torch.no_grad():
        inputs = instance_test_dataset.data.clone().detach().float().to(device)
        # Only the first `input_dims` encoder outputs are passed to the teacher.
        features = model_encoder(inputs)[:, :model_teacher.input_dims]
        cell_score_teacher = model_teacher.attention_module(features).squeeze(0)

    features_np = features.cpu().detach().numpy()
    cell_score_teacher_np = cell_score_teacher.cpu().detach().numpy()
    # Convert the label tensors once and reuse the arrays below.
    instance_labels_np = instance_test_dataset.instance_labels.cpu().detach().numpy()
    bag_labels_np = instance_test_dataset.bag_labels.cpu().detach().numpy()

    df = pd.DataFrame(features_np, columns=[f'feature_{i}' for i in range(features_np.shape[1])])
    df['cell_type'] = label_encoder.inverse_transform(instance_labels_np)
    df['cell_score'] = cell_score_teacher_np
    df['bag_labels'] = bag_labels_np
    df['instance_labels'] = instance_labels_np
    df['cell_score_minmax'] = _minmax_scale(df['cell_score'].values)

    file_name = f'cell_score_{exp}.csv' if suffix is None else f'cell_score_{exp}_{suffix}.csv'
    df.to_csv(file_name, index=False)

    return 0


In [5]:

# Result directories whose saved models are scored below. The commented-out
# entries are alternative runs that are currently disabled.
saved_model_paths = [
        # '/home/local/kyeonghunjeong_920205/nipa_bu/COVID19/3.analysis/9.MIL/scAMIL_cell/scMILD/results/NS_model_ae_ed128_md16_lr0.0001_100_0.3_1_10__0607_433_op_gmm_device4_only_using_loss_switch',
        # '/home/local/kyeonghunjeong_920205/nipa_bu/COVID19/3.analysis/9.MIL/scAMIL_cell/scMILD/results/NS_baseline_model_ae_ed128_md16_lr0.0001_100_0.3_1_10__0613',
        '/home/local/kyeonghunjeong_920205/nipa_bu/COVID19/3.analysis/9.MIL/scAMIL_cell/scMILD/results/NS_not_op_model_ae_ed128_md16_lr0.0001_100_0.3_1_10__0613'
    ]

for saved_model_path in saved_model_paths:
    # The CSV suffix depends only on the directory name, so it is picked once per run.
    # suffix = 'baseline' if 'baseline' in saved_model_path else None
    if 'not_op' in saved_model_path:
        suffix = 'not_op'
    else:
        suffix = None

    for exp in range(1, 9):
        print(f'Experiment {exp}')
        # Only the test split and the label encoder are needed for scoring.
        _, _, test_dataset, label_encoder = load_dataset_and_preprocessors(base_path, exp, device)
        save_cell_scores(saved_model_path, exp, test_dataset, label_encoder, device, suffix=suffix)
        # Ask PyTorch to release cached GPU memory before the next experiment.
        torch.cuda.empty_cache()

Experiment 1



    import ray
    ray.init()

2024-06-13 11:18:42,784	INFO worker.py:1724 -- Started a local Ray instance.


Experiment 2




Experiment 3




Experiment 4




Experiment 5




Experiment 6




Experiment 7




Experiment 8




*** SIGTERM received at time=1718279657 on cpu 10 ***
PC: @     0x7f10460a068e  (unknown)  epoll_wait
    @     0x7f10462e1420  (unknown)  (unknown)
[2024-06-13 11:54:17,787 E 837338 837338] logging.cc:361: *** SIGTERM received at time=1718279657 on cpu 10 ***
[2024-06-13 11:54:17,787 E 837338 837338] logging.cc:361: PC: @     0x7f10460a068e  (unknown)  epoll_wait
[2024-06-13 11:54:17,787 E 837338 837338] logging.cc:361:     @     0x7f10462e1420  (unknown)  (unknown)


: 

In [19]:

def _read_lines(path):
    """Return the newline-separated entries of a text file, closing the handle afterwards."""
    with open(path) as handle:
        return handle.read().strip().split("\n")


# Raw count matrix plus the gene and barcode names stored next to it.
dat = sparse.load_npz(os.path.join(base_path, "RawCounts.npz"))
genes = _read_lines(os.path.join(base_path, "genes.txt"))
barcodes = _read_lines(os.path.join(base_path, "barcodes.txt"))
# The first row of both tab-separated tables is dropped
# (presumably a non-data descriptor row; TODO confirm against the files).
meta = pd.read_csv(os.path.join(base_path, "20210701_NasalSwab_MetaData.txt"), sep="\t").drop(axis=0,index=0).reset_index(drop=True)

cell_types = pd.read_csv(os.path.join(base_path, "20210220_NasalSwab_UMAP.txt"), sep="\t").drop(axis=0,index=0).reset_index(drop=True)["Category"]
ct_id = sorted(set(cell_types))
mapping_ct = {c:idx for idx, c in enumerate(ct_id)}

# NOTE(review): X, y and ct are not referenced by any cell visible in this
# notebook; kept in case other cells rely on them.
X = []
y = []
ct = []

adata = sc.AnnData(dat.astype(np.float32), obs=barcodes, var=genes)

print(adata.shape)
barcodes = adata.obs[0].tolist()

# Align the metadata rows with the order of the cells in adata.
# Chaining avoids an in-place set_index on a filtered slice of `meta`.
meta_subset = (
    meta[meta['NAME'].isin(barcodes)]
    .set_index('NAME')
    .reindex(adata.obs[0])
)

adata.obs['ind_cov'] = meta_subset['donor_id'].values
adata.obs['ct_cov'] = meta_subset['Coarse_Cell_Annotations'].values
adata.obs['disease_cov'] = meta_subset['disease__ontology_label'].values

# Subsetting an AnnData returns a view; copy it so the in-place gene filter
# below operates on a real object rather than implicitly converting the view.
adata = adata[adata.obs['disease_cov'].isin(['normal', 'COVID-19'])].copy()
print(adata.shape)

sc.pp.filter_genes(adata, min_cells=5)
print("Preprocessing Complete!")
print(adata.shape)
mapping = {'normal': 0, 'COVID-19': 1}
adata.obs['disease_numeric'] = adata.obs['disease_cov'].map(mapping)
adata.obs['sample_id_numeric'], _ = pd.factorize(adata.obs['ind_cov'])
# One row per distinct (disease label, sample id) pair; used later for splitting.
sample_labels = adata.obs[['disease_numeric', 'sample_id_numeric']].drop_duplicates()




(32588, 32871)
(26947, 32871)




Preprocessing Complete!
(26947, 27765)


In [20]:

# Result directories used by the cells below; the directory names end in
# '_reported' and '_baseline' respectively.
saved_model_path, saved_baseline_path = (
    '../../results/model_NS_ae_ed128_md64_lr0.0001_500_0.3_3_45_reported',
    '../../results/model_NS_ae_ed128_md64_lr0.0001_500_0.3_3_45_baseline',
)


In [23]:
# Characters removed from or substituted in obs column names before export.
# One translate() pass gives the same result as the chain of str.replace calls.
_OBS_COLUMN_CHAR_MAP = str.maketrans({'(': None, ')': None, '/': None, '=': '.', ' ': '_', '-': '_'})

# Sample-level train/val/test fractions; only the test fraction is used here.
split_ratio = [0.5, 0.25, 0.25]

for exp in range(1,9):
    print(f'Experiment {exp}')
    _, _, test_dataset, label_encoder = load_dataset_and_preprocessors(base_path, exp, device)

    # Reuse the notebook's scoring helper instead of a second copy of its body.
    # It maps the checkpoints onto `device` when loading, so this cell also
    # works when the models were saved from a different device, and it writes
    # the same cell_score_{exp}.csv file.
    save_cell_scores(saved_model_path, exp, test_dataset, label_encoder, device)

    # Select the test samples for this experiment, seeded by `exp`
    # (presumably mirrors the split made inside load_dataset_and_preprocessors; TODO confirm).
    _, test_set = train_test_split(sample_labels, test_size=split_ratio[2], random_state=exp, stratify=sample_labels['disease_numeric'])

    # Indexing an AnnData returns a view; copy before renaming columns in place.
    test_data = adata[adata.obs['sample_id_numeric'].isin(test_set['sample_id_numeric'])].copy()

    test_data.obs.rename(columns={0: 'cell.names'}, inplace=True)
    test_data.var.rename(columns={0: 'gene.names'}, inplace=True)
    test_data.obs.columns = [sub.translate(_OBS_COLUMN_CHAR_MAP) for sub in test_data.obs.columns]

    test_data.obs.index = test_data.obs['cell.names']
    test_data.var.index = test_data.var['gene.names']
    test_data.write(filename=f"anndata_{exp}.h5ad")
    test_data.obs.to_csv(f"obs_{exp}.csv")



Experiment 1




Experiment 2




Experiment 3




Experiment 4




Experiment 5




Experiment 6




Experiment 7




Experiment 8




In [None]:
# NOTE(review): this cell has no execution count. Running it re-points
# saved_baseline_path at the '_reported' directory, replacing the '_baseline'
# value assigned earlier in the notebook. Confirm this is intended.
saved_baseline_path = (
    '../../results/model_NS_ae_ed128_md64_lr0.0001_500_0.3_3_45_reported'
)

In [24]:
# Show which model directory the current session points at.
print(saved_model_path)

# Per-character clean-up applied to every obs column name:
# '(', ')' and '/' are removed, '=' becomes '.', and ' ' and '-' become '_'.
_whole_obs_char_map = str.maketrans({'(': None, ')': None, '/': None, '=': '.', ' ': '_', '-': '_'})

# Give the unnamed first columns explicit names before cleaning and indexing.
adata.obs.rename(columns={0: 'cell.names'}, inplace=True)
adata.var.rename(columns={0: 'gene.names'}, inplace=True)
adata.obs.columns = [column.translate(_whole_obs_char_map) for column in adata.obs.columns]

# Index cells and genes by their names, then write the full dataset and its obs table.
adata.obs.index = adata.obs['cell.names']
adata.var.index = adata.var['gene.names']
adata.write(filename="whole_anndata.h5ad")
adata.obs.to_csv("whole_obs.csv")

../../results/model_NS_ae_ed128_md64_lr0.0001_500_0.3_3_45_reported
