## LS_Lab train

In [4]:
import copy
import logging
import os
import random
import sys

import anndata
import anndata as ad
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from scipy.io import mmread, mmwrite
from scipy.sparse import coo_matrix, csc_matrix, csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from torch.utils.data.dataset import Dataset
from tqdm import tqdm

In [5]:
# Plotting configuration for the whole notebook.
# Scanpy defaults: 6x6 figures drawn without an axes frame.
sc.set_figure_params(figsize=(6, 6), frameon=False)
# Seaborn's default theme is applied after scanpy's parameters, so keep this call order.
sns.set_theme()
# Pass facecolor "w" (white) to the inline backend when it prints figures.
%config InlineBackend.print_figure_kwargs={"facecolor": "w"}
# Use the "retina" format for inline figures.
%config InlineBackend.figure_format="retina"

In [3]:
# Method metadata; `functionality_name` is stored as `method_id` in the `uns`
# of the prediction AnnData objects built in the cross-validation cell.
meta = dict(functionality_name='lslab')

In [12]:
# Input/output locations, relative to this notebook.
input_path = '../'
save_path = './Results/'
# Directory holding the train/test label matrices and the cell/peak split file.
train_data_dir = f'{input_path}/1_XChrom/0_model_train/train_data'

# RNA AnnData for all cells: the input modality (mod1) of the gex2atac task.
rna4intra_raw = sc.read_h5ad(f'{input_path}/0_data/processed_data/m_brain_paired_rna_raw.h5ad')
input_train_mod2 = sc.read_h5ad(f'{train_data_dir}/ad_trainval.h5ad')  ## training data(label)
input_test_mod2 = sc.read_h5ad(f'{train_data_dir}/ad_crosscell.h5ad')  ## test data(label)

# Cell and peak identifiers defining the train/val vs. cross-cell test split.
with h5py.File(f'{train_data_dir}/splits.h5', 'r') as hf:
    trainval_cellid = hf['trainval_cell'][:]
    test_cellid = hf['test_cell'][:]
    trainval_peakid = hf['trainval_peak'][:]

# Slicing an AnnData returns a view of `rna4intra_raw`. Copy explicitly before
# renaming variables so we never mutate a view (which would otherwise be
# turned into a copy implicitly) and the parent object stays untouched.
input_train_mod1 = rna4intra_raw[trainval_cellid, :].copy()  ## training data(input)
final_input_test_mod1 = rna4intra_raw[test_cellid, :].copy()  ## test data(input)
input_train_mod1.var_names_make_unique()
final_input_test_mod1.var_names_make_unique()

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


### The following files are required for calculating ns&ls

In [9]:
# Processed RNA AnnData for all cells (original note: it provides the cell type);
# `rna` keeps only the rows of the held-out test cells.
rna4intra = sc.read_h5ad(f'{input_path}/0_data/processed_data/m_brain_paired_rna.h5ad')
rna = rna4intra[test_cellid, :]
print(*(mat.shape for mat in (rna, rna4intra)))

(487, 2214) (4878, 2214)


In [10]:
# ATAC AnnData for all cells, restricted to the train/val peak set
# (original note: all data raw_ATAC).
atac4intra_raw = sc.read_h5ad(f'{input_path}/0_data/processed_data/m_brain_paired_atac.h5ad')
atac4intra = atac4intra_raw[:, trainval_peakid]
print(*(mat.shape for mat in (atac4intra_raw, atac4intra)))

(4878, 40313) (4878, 36282)


In [13]:
def _annotate_modality(adata, feature_type, batch='batch1'):
    """Return a copy of `adata` carrying the metadata fields set in this notebook.

    Parameters
    ----------
    adata : AnnData
        Matrix to annotate; it is copied, never modified in place.
    feature_type : str
        Value stored for every variable in ``var['feature_types']``.
    batch : str
        Value stored for every cell in ``obs['batch']`` (the data has a single batch).

    Returns
    -------
    AnnData
        The annotated copy; its ``uns`` is replaced by the dataset id/organism dict.
    """
    annotated = adata.copy()
    annotated.var['feature_types'] = pd.Categorical([feature_type] * annotated.n_vars)
    annotated.obs['batch'] = pd.Categorical([batch] * annotated.n_obs)
    annotated.uns = {'dataset_id': 'dataset1', 'organism': 'mouse'}
    return annotated


# Modality 1 is the RNA (gene expression) input, so it is labelled 'GEX';
# it was previously mislabelled 'ATAC', a copy-paste of the modality 2 lines.
input_train_mod1 = _annotate_modality(input_train_mod1, 'GEX')
input_train_mod1.var_names_make_unique()

# Modality 2 is the ATAC label matrix.
input_train_mod2 = _annotate_modality(input_train_mod2, 'ATAC')

In [15]:
# Keep only the genes found in both the training and the held-out test RNA
# matrices, so both are indexed by the same gene list.
common_genes = input_train_mod1.var_names.intersection(final_input_test_mod1.var_names)
input_train_mod1, final_input_test_mod1 = (
    mod1[:, common_genes] for mod1 in (input_train_mod1, final_input_test_mod1)
)
print(*(mod1.shape for mod1 in (input_train_mod1, final_input_test_mod1)))

(4391, 10721) (487, 10721)


In [16]:
dataset_id = "gex2atac"
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Encode the batch label of every training cell as an integer; `y` is the
# stratification target for the cross-validation split.
# The labels are sorted before numbering so the label -> code mapping is the
# same on every run (iterating a bare set of strings has no stable order).
batch_labels = input_train_mod1.obs["batch"]
batch_dict = {batch: code for code, batch in enumerate(sorted(set(batch_labels)))}
# Iterate over the values directly instead of positional `Series[i]` lookups,
# which pandas deprecates for non-integer indexes.
y = [int(batch_dict[batch]) for batch in batch_labels]
print('y length:',len(y),y[:3])

# `skf.split` only needs X for its number of rows; the obs table is sufficient.
X = input_train_mod1.obs
batches = np.array(y)

# Full training matrices that the cross-validation folds are sliced from.
inp_train_mod1 = input_train_mod1.copy()
inp_train_mod2 = input_train_mod2.copy()
print('X',X[:2])

y length: 4391 [0, 0, 0]
X                     n_genes   batch
AAACAGCCAACCGCCA-1     4382  batch1
AAACAGCCAAGGTCGA-1     2814  batch1


In [17]:
# Hyper-parameters of the SVD + KNN pipeline.
N_SVD_COMPONENTS = 50
N_NEIGHBORS = 25
SVD_SEED = 0  # TruncatedSVD is randomized by default; seed it for reproducible predictions


def _strip_group_suffix(obs_name):
    """Drop the '-<group>' suffix that `ad.concat(index_unique='-')` appends to a cell name."""
    return obs_name.rsplit('-', 1)[0]


if "gex2atac" in dataset_id:
    n_folds = skf.get_n_splits()
    out_knn = 0   # running sum over folds of the predictions for the held-out test cells
    out_knn2 = 0  # running sum over folds of the predictions for all cells

    # skf.split() yields (training indices, validation indices) pairs of cells;
    # one KNN model is fitted per fold and the fold predictions are averaged.
    for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
        print(fold)
        # Fold-local names: do NOT reuse `input_train_mod1` / `input_train_mod2`,
        # which hold the full training data used by the cells above.
        fold_train_mod1 = inp_train_mod1[train_index, :]
        fold_train_mod2 = inp_train_mod2[train_index, :]
        # The validation cells only take part in the joint SVD below; their
        # labels are not scored in this notebook.
        fold_val_mod1 = inp_train_mod1[val_index, :]

        # Stack train / val / test RNA so a single SVD embeds all cells in the
        # same space; the 'group' column records where each row came from and
        # '-<group>' is appended to every cell name.
        input_mod1 = ad.concat(
            {"train": fold_train_mod1, "val": fold_val_mod1, "test": final_input_test_mod1},
            axis=0,
            join="outer",
            label="group",
            fill_value=0,
            index_unique="-",
        )

        logging.info('Performing dimensionality reduction on modality 1 values...')
        embedder_mod1 = TruncatedSVD(n_components=N_SVD_COMPONENTS, random_state=SVD_SEED)
        mod1_pca = embedder_mod1.fit_transform(input_mod1.X)

        logging.info('Performing dimensionality reduction on modality 2 values...')
        embedder_mod2 = TruncatedSVD(n_components=N_SVD_COMPONENTS, random_state=SVD_SEED)
        mod2_pca = embedder_mod2.fit_transform(fold_train_mod2.X)

        group = input_mod1.obs['group']
        X_train = mod1_pca[(group == 'train').to_numpy()]
        X_test = mod1_pca[(group == 'test').to_numpy()]
        X_all = mod1_pca
        y_train = mod2_pca

        logging.info('Running KNN regression...')
        reg = KNeighborsRegressor(n_neighbors=N_NEIGHBORS, metric='minkowski')
        reg.fit(X_train, y_train)

        # Predict in the reduced modality 2 space, then project back to peaks.
        out_knn += reg.predict(X_test) @ embedder_mod2.components_

        # Same for every cell (train, val and test); restore the original cell
        # names and put the rows in the order of `atac4intra` before summing.
        fold_all = pd.DataFrame(
            reg.predict(X_all) @ embedder_mod2.components_,
            index=[_strip_group_suffix(name) for name in input_mod1.obs_names],
        )
        out_knn2 += fold_all.reindex(atac4intra.obs_names).to_numpy()

    # Average over the folds once, after the loop, and build the outputs once.
    y_pred = csc_matrix(out_knn / n_folds)
    y_all = csc_matrix(out_knn2 / n_folds)

    # Predictions for all cells, aligned to `atac4intra` (all-data denoising).
    adata_all = ad.AnnData(
        X=y_all,
        obs=atac4intra.obs,
        var=inp_train_mod2.var,
        uns={
            'dataset_id': dataset_id,
            'method_id': meta["functionality_name"],
        },
    )
    # Predictions for the held-out test cells (cross-cell prediction).
    adata = ad.AnnData(
        X=y_pred,
        obs=final_input_test_mod1.obs,
        var=inp_train_mod2.var,
        uns={
            'dataset_id': dataset_id,
            'method_id': meta["functionality_name"],
        },
    )
    logging.info('Storing annotated data...')

0
1
2
3
4
5
6
7
8
9


In [18]:
# Create the output directory first so the writes do not fail on a fresh
# checkout where it is missing.
os.makedirs(save_path, exist_ok=True)
adata.write_h5ad(f'{save_path}/lslab_pred.h5ad')  ## cross-cell prediction
adata_all.write_h5ad(f'{save_path}/lslab_pred_all.h5ad')  ## all data denoise

In [19]:
# Sanity check (order-insensitive): the all-cell prediction has exactly the
# cells and the peaks of `atac4intra`.
(
    set(adata_all.obs_names) == set(atac4intra.obs_names),
    set(adata_all.var_names) == set(atac4intra.var_names),
)

(True, True)

In [20]:
# Sanity check (order-insensitive): the cross-cell prediction has exactly the
# cells and the peaks of the test label matrix.
(
    set(adata.obs_names) == set(input_test_mod2.obs_names),
    set(adata.var_names) == set(input_test_mod2.var_names),
)

(True, True)