## LS_Lab train

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import logging
import copy
import sys
from tqdm import tqdm
import random
import h5py
from scipy.sparse import csr_matrix, coo_matrix, csc_matrix
from scipy.io import mmread, mmwrite
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data.dataset import Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Method metadata; 'functionality_name' is stored as the method id of the prediction.
meta = dict(functionality_name='lslab')

In [4]:
# Root of the project tree, and the folder predictions are written to.
input_path = '../'
save_path = './Results/'

# All four preprocessed AnnData files live in the same directory.
processed_dir = f'{input_path}/0_preprocess/processed_data'

# Training data: modality 1 is read from the RNA file, modality 2 from the ATAC file.
input_train_mod1 = sc.read_h5ad(f'{processed_dir}/train_rna.h5ad')
input_train_mod2 = sc.read_h5ad(f'{processed_dir}/train_atac.h5ad')

# Test data for the same two modalities.
final_input_test_mod1 = sc.read_h5ad(f'{processed_dir}/test_rna.h5ad')
input_test_mod2 = sc.read_h5ad(f'{processed_dir}/test_atac.h5ad')

In [5]:
# Attach the metadata fields (feature type, batch, dataset info) to both training
# modalities. Modality 1 is loaded from the RNA file, so its features are labelled
# 'GEX'; modality 2 is loaded from the ATAC file and is labelled 'ATAC'.
for modality_adata, feature_type in (
    (input_train_mod1, 'GEX'),
    (input_train_mod2, 'ATAC'),
):
    modality_adata.var['feature_types'] = pd.Categorical(
        len(modality_adata.var_names) * [feature_type]
    )
    # Every training cell is assigned to the same single batch.
    modality_adata.obs['batch'] = pd.Categorical(len(modality_adata.obs) * ['batch1'])
    # NOTE: this replaces any existing .uns content of the loaded object.
    modality_adata.uns = {'dataset_id': 's1d1', 'organism': 'human'}

In [6]:
# Keep only the genes present in both the training and the test RNA matrices,
# so that the two share an identical, identically ordered feature axis.
common_genes = input_train_mod1.var_names.intersection(
    final_input_test_mod1.var_names
)
input_train_mod1, final_input_test_mod1 = (
    input_train_mod1[:, common_genes],
    final_input_test_mod1[:, common_genes],
)
print(input_train_mod1.shape, final_input_test_mod1.shape)

(6224, 12278) (4220, 12278)


In [7]:
# Cross-validation setup: 10 stratified folds, stratified on the batch label.
dataset_id = "gex2atac"
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Encode each cell's batch label as an integer for stratified sampling.
# The label set is sorted so the label -> integer mapping is the same on every
# run (plain set iteration order is not guaranteed for strings).
batch_labels = input_train_mod1.obs["batch"]
batch_dict = {batch: i for i, batch in enumerate(sorted(set(batch_labels)))}
# Iterate over the values directly rather than using obs["batch"][i]: integer
# keys on a string-indexed Series rely on deprecated positional fallback.
y = [int(batch_dict[batch]) for batch in batch_labels]
print('y length:',len(y),y[:3])

# Only the number of rows of X matters to StratifiedKFold.split; the obs table is used.
X = input_train_mod1.obs
batches = np.array(y)

# Full-size copies of the training data; the fold loop slices these.
inp_train_mod1 = input_train_mod1.copy()
inp_train_mod2 = input_train_mod2.copy()
print('X',X[:2])

y length: 6224 [0, 0, 0]
X                              cell_type   batch             Samplename  n_genes
TAGTTGTCACCCTCAC-1-s1d1  Naive CD20+ B  batch1  site1_donor1_multiome      994
CTATGGCCATAACGGG-1-s1d1     CD14+ Mono  batch1  site1_donor1_multiome     1057


In [11]:
# Predict modality 2 for the external test set with a KNN regressor in SVD space,
# once per cross-validation fold, and average the per-fold predictions.
#
# Produces `adata`: an AnnData whose X is the fold-averaged prediction (sparse CSC),
# with obs taken from the test modality-1 data and var from the training modality-2 data.
fold = 0
out1, out2 = 0, 0  # not used by the gex2atac branch; kept in case other cells read them
n_folds = skf.get_n_splits()  # divisor for the average; avoids a hard-coded fold count
if "gex2atac" in dataset_id:
    out_knn = 0
    for train_index, test_index in skf.split(X, y):
        print(fold)
        fold += 1
        # Fold-local views. These deliberately do NOT reuse the names
        # input_train_mod1 / input_train_mod2, so the full training objects
        # loaded earlier are not replaced by a last-fold subset.
        input_test_mod1 = inp_train_mod1[test_index, :]
        true_test_mod2 = inp_train_mod2[test_index, :]  # held-out targets; not used below
        fold_train_mod1 = inp_train_mod1[train_index, :]
        fold_train_mod2 = inp_train_mod2[train_index, :]

        # Stack train / validation / external test rows so that a single SVD
        # embeds all of them; the 'group' column records each row's origin.
        input_mod1 = anndata.concat(
            {"train": fold_train_mod1, "val": input_test_mod1, "test": final_input_test_mod1},
            axis=0,
            join="outer",
            label="group",
            fill_value=0,
            index_unique="-",
        )

        logging.info('Performing dimensionality reduction on modality 1 values...')
        # random_state is fixed so the randomized SVD is reproducible across runs.
        embedder_mod1 = TruncatedSVD(n_components=50, random_state=0)
        mod1_pca = embedder_mod1.fit_transform(input_mod1.X)

        logging.info('Performing dimensionality reduction on modality 2 values...')
        embedder_mod2 = TruncatedSVD(n_components=50, random_state=0)
        mod2_pca = embedder_mod2.fit_transform(fold_train_mod2.X)

        group = input_mod1.obs['group']
        X_train = mod1_pca[(group == 'train').to_numpy()]
        X_test = mod1_pca[(group == 'test').to_numpy()]
        y_train = mod2_pca

        logging.info('Running KNN regression...')
        reg = KNeighborsRegressor(n_neighbors=25, metric='minkowski')
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
        # Project the predicted embedding back to the modality-2 feature space.
        y_pred = y_pred @ embedder_mod2.components_
        out_knn += y_pred

    # Average once, after all folds have contributed, then build the result object.
    y_pred = csc_matrix(out_knn / n_folds)
    adata = anndata.AnnData(
        X=y_pred,
        obs=final_input_test_mod1.obs,
        var=inp_train_mod2.var,
        uns={
            'dataset_id': dataset_id,
            'method_id': meta["functionality_name"],
        },
    )
    logging.info('Storing annotated data...')

0
ERROR! Session/line number was not unique in database. History logging moved to new session 3019
1
2
3
4
5
6
7
8
9


In [12]:
# Persist the cross-sample prediction held in `adata` under the results folder.
prediction_file = f'{save_path}/lslab_pred.h5ad'
adata.write_h5ad(prediction_file)