In [None]:
import anndata
import numpy as np
import pandas as pd
import os
import sys
from tqdm import tqdm
import json

sys.path.append('/code/decima/src/decima/')
import preprocess

## Paths

In [None]:
save_dir = '/gstore/data/resbioai/grelu/decima/20240823/processed_pseudobulks/'

In [None]:
sc_file = os.path.join(save_dir, 'scimilarity_processed.h5ad')
br_file = os.path.join(save_dir, 'brain_processed.h5ad')
sk_file = os.path.join(save_dir, 'skin_processed.h5ad')
ret_file = os.path.join(save_dir, 'retina_processed.h5ad')

## Load

In [None]:
%%time
sc = anndata.read_h5ad(sc_file)
br = anndata.read_h5ad(br_file)
sk = anndata.read_h5ad(sk_file)
ret = anndata.read_h5ad(ret_file)

In [None]:
%%time
# NOTE(review): `resources` is not imported in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel unless the name is defined
# elsewhere. Confirm the intended module and add it to the import cell.

# Transcript-level records from the GRCh38-2020-A GTF, merged by
# preprocess.merge_transcripts into the `genes20` table.
gtf = resources.load_gtf(
    file='/gstore/data/resbioai/grelu/decima/refdata-gex-GRCh38-2020-A/genes/genes.gtf',
    feature="transcript")

genes20 = preprocess.merge_transcripts(gtf)

# Same procedure for the GRCh38-2024-A GTF; `gtf` is reused and overwritten.
gtf = resources.load_gtf(
    file='/gstore/data/resbioai/grelu/decima/refdata-gex-GRCh38-2024-A/genes/genes.gtf',
    feature="transcript")

genes24 = preprocess.merge_transcripts(gtf)

In [None]:
# Keep only the 2024-A entries whose index label is absent from the 2020-A table.
already_in_2020 = genes24.index.isin(genes20.index)
genes24 = genes24.loc[~already_in_2020]
print(len(genes20), len(genes24))

## Process scimilarity data

### Match gene names to cellranger

In [None]:
sc.var = sc.var.merge(genes20, left_index=True, right_index=True, how="left")
sc.var.head(2)

In [None]:
preprocess.match_cellranger_2024(sc, genes24=genes24)

### Match remaining gene names to NCBI

In [None]:
sc.var['symbol'] = None

In [None]:
unm = sc.var.index[sc.var.chrom.isna()].tolist()
len(unm)

In [None]:
%%time
# Shell out to the `datasets` command-line tool with every unmatched name joined
# by spaces in a single invocation; IPython captures the command's output lines
# into `ncbi`.
ncbi = !datasets summary gene symbol {" ".join(unm)} --report gene

In [None]:
# Parse the captured CLI output into a table (see preprocess.load_ncbi_string).
ncbi = preprocess.load_ncbi_string(ncbi)

# Keep records whose gene_id is missing or is not already used in sc.var.
# The previous `ncbi.gene_id is None` compared the Series object itself to None,
# which is always False, so the `or` silently reduced to the `~isin` mask alone.
# The element-wise form below implements the condition as it was written.
ncbi = ncbi[ncbi.gene_id.isna() | ~ncbi.gene_id.isin(sc.var.gene_id)]

# Row count, plus the highest repeat count of any gene_id and any symbol.
print(len(ncbi), ncbi['gene_id'].value_counts().max(), ncbi.symbol.value_counts().max())

In [None]:
preprocess.match_ncbi(sc, ncbi)

In [None]:
sc.var.chrom.isna().sum(), sc.var.gene_id.value_counts().max()

## Process skin atlas data

### Match gene names to cellranger

In [None]:
sk.var = sk.var.merge(genes20, left_index=True, right_index=True, how="left")
display(sk.var.head(2))
print(sk.var.chrom.isna().sum())

In [None]:
preprocess.match_cellranger_2024(sk, genes24=genes24)

In [None]:
sk.var['symbol'] = None

In [None]:
preprocess.match_ref_ad(sk, sc)

In [None]:
# Print both diagnostics: genes still lacking a chromosome, and the highest
# repeat count of any gene_id. The closing parenthesis used to sit after the
# first value, so only that value was printed and the cell result was the
# tuple (None, <count>).
print(sk.var.chrom.isna().sum(), sk.var.gene_id.value_counts().max())

### Match remaining gene names to NCBI

In [None]:
# Index labels of skin genes that still have no chromosome assigned.
unm = sk.var.index[sk.var['chrom'].isna()].tolist()
len(unm)

In [None]:
unm_arrs = np.array_split(unm, 50)
df = []

for unm in tqdm(unm_arrs):
    ncbi = !datasets summary gene symbol {" ".join(list(unm))} --report gene
    try:
        curr_df = preprocess.load_ncbi_string(ncbi)
        df.append(curr_df)
    except:
        print(ncbi)

ncbi = pd.concat(df)

In [None]:
# Keep records whose gene_id is missing or is not already used in sk.var.
# The previous `ncbi.gene_id is None` compared the Series object itself to None,
# which is always False, so the `or` silently reduced to the `~isin` mask alone.
ncbi = ncbi[ncbi.gene_id.isna() | ~ncbi.gene_id.isin(sk.var.gene_id)]

# Row count, plus the highest repeat count of any gene_id and any symbol.
print(len(ncbi), ncbi['gene_id'].value_counts().max(), ncbi.symbol.value_counts().max())

In [None]:
ncbi = ncbi[ncbi.gene_id.isin(
    ncbi.gene_id.value_counts()[ncbi.gene_id.value_counts()==1].index
)]

In [None]:
print(len(ncbi), ncbi['gene_id'].value_counts().max(), ncbi.symbol.value_counts().max())

In [None]:
# Apply the filtered NCBI records to sk, then show
# (genes still lacking a chromosome, highest repeat count of any gene_id).
preprocess.match_ncbi(sk, ncbi)
sk.var['chrom'].isna().sum(), sk.var['gene_id'].value_counts().max()

## Process retina data

### Match gene names to cellranger

In [None]:
ret.var = ret.var.merge(genes20, left_index=True, right_index=True, how="left")
display(ret.var.head(2))
print(ret.var.chrom.isna().sum())

In [None]:
preprocess.match_cellranger_2024(ret, genes24=genes24)

In [None]:
ret.var['symbol'] = None

In [None]:
preprocess.match_ref_ad(ret, sc)

In [None]:
preprocess.match_ref_ad(ret, sk)

### Match remaining gene names to NCBI

In [None]:
# Index labels of retina genes that still have no chromosome assigned.
unm = ret.var.index[ret.var['chrom'].isna()].tolist()
len(unm)

In [None]:
unm_arrs = np.array_split(unm, 100)
df = []

for unm in tqdm(unm_arrs):
    ncbi = !datasets summary gene symbol {" ".join(list(unm))} --report gene
    try:
        curr_df = preprocess.load_ncbi_string(ncbi)
        df.append(curr_df)
    except:
        print(ncbi)

ncbi = pd.concat(df)

In [None]:
# Keep records whose gene_id is missing or is not already used in ret.var.
# The previous `ncbi.gene_id is None` compared the Series object itself to None,
# which is always False, so the `or` silently reduced to the `~isin` mask alone.
ncbi = ncbi[ncbi.gene_id.isna() | ~ncbi.gene_id.isin(ret.var.gene_id)]

# Row count, plus the highest repeat count of any gene_id and any symbol.
print(len(ncbi), ncbi['gene_id'].value_counts().max(), ncbi.symbol.value_counts().max())

In [None]:
ncbi = ncbi[ncbi.symbol!='EFCAB3P1']

In [None]:
print(len(ncbi), ncbi['gene_id'].value_counts().max(), ncbi.symbol.value_counts().max())

In [None]:
# Apply the filtered NCBI records to ret, then show
# (genes still lacking a chromosome, highest repeat count of any gene_id).
preprocess.match_ncbi(ret, ncbi)
ret.var['chrom'].isna().sum(), ret.var['gene_id'].value_counts().max()

## Process brain data

### Match gene names to cellranger

In [None]:
br.var = br.var.merge(genes20, left_index=True, right_index=True, how="left")
print(br.var.chrom.isna().sum())

In [None]:
preprocess.match_cellranger_2024(br, genes24=genes24)

In [None]:
br.var['symbol'] = None

In [None]:
preprocess.match_ref_ad(br, sc)

In [None]:
preprocess.match_ref_ad(br, sk)

In [None]:
preprocess.match_ref_ad(br, ret)

In [None]:
print(len(br), br.var['gene_id'].value_counts().max(), br.var.symbol.value_counts().max())

## Drop unannotated genes from all datasets

In [None]:
print(sc.shape)
sc = sc[:, ~sc.var.chrom.isna()]
print(sc.shape)

In [None]:
print(sk.shape)
sk = sk[:, ~sk.var.chrom.isna()]
print(sk.shape)

In [None]:
print(ret.shape)
ret = ret[:, ~ret.var.chrom.isna()]
print(ret.shape)

In [None]:
print(br.shape)
br = br[:, ~br.var.chrom.isna()]
print(br.shape)

## Combine all datasets

In [None]:
sc.var = sc.var.reset_index(names='gene_name').set_index('gene_id')
sk.var = sk.var.reset_index(names='gene_name').set_index('gene_id')
ret.var = ret.var.reset_index(names='gene_name').set_index('gene_id')
br.var = br.var.reset_index(names='gene_name').set_index('gene_id')

In [None]:
sc.var.index = sc.var.index.astype(str)
sc.var_names = sc.var.index.astype(str)

sk.var.index = sk.var.index.astype(str)
sk.var_names = sk.var.index.astype(str)

ret.var.index = ret.var.index.astype(str)
ret.var_names = ret.var.index.astype(str)

br.var.index = br.var.index.astype(str)
br.var_names = br.var.index.astype(str)

In [None]:
# Genes present in all four datasets.
# Previously this was `list(set(...))`, whose ordering depends on string hashing
# and can therefore change from one kernel session to the next, changing the
# gene order of the saved matrix. The list now follows the order of
# sc.var_names (each gene once), so reruns produce the same ordering.
shared_with_others = set(sk.var_names).intersection(ret.var_names, br.var_names)
common_genes = [
    gene for gene in dict.fromkeys(sc.var_names) if gene in shared_with_others
]

len(common_genes)

In [None]:
%%time
sc_common = sc[:, common_genes].copy()
sk_common = sk[:, common_genes].copy()
ret_common = ret[:, common_genes].copy()
br_common = br[:, common_genes].copy()

In [None]:
# Store the 'start' and 'end' columns of sc_common.var as integers.
for coord in ('start', 'end'):
    sc_common.var[coord] = sc_common.var[coord].astype(int)

In [None]:
# Stack the four datasets; the 'dataset' column records which source each row
# came from.
parts = [sc_common, sk_common, ret_common, br_common]
part_labels = ['scimilarity', 'skin_atlas', 'retina_atlas', 'brain_atlas']
ad_inner = anndata.concat(
    parts,
    join='inner',
    merge='same',
    label='dataset',
    keys=part_labels,
)

## Format the combined pseudobulk matrix

### Combine .var

In [None]:
np.all(ad_inner.var.index == sc_common.var.index)

In [None]:
ad_inner.var = sc_common.var.copy().drop(columns='symbol')

In [None]:
# For each gene, record the names used by the skin, retina and brain datasets
# that differ from the name in ad_inner.var, as a comma-separated string
# (None when there are no differing names).
for gene_id in tqdm(ad_inner.var.index):
    primary_name = ad_inner.var.loc[gene_id, 'gene_name']
    candidates = (
        sk.var.loc[gene_id, 'gene_name'],
        ret.var.loc[gene_id, 'gene_name'],
        br.var.loc[gene_id, 'gene_name'],
    )
    aliases = []
    for candidate in candidates:
        # Skip names already collected and the primary name itself.
        if candidate in aliases:
            continue
        if candidate != primary_name:
            aliases.append(candidate)
    ad_inner.var.loc[gene_id, 'other_names'] = ",".join(aliases) if len(aliases) > 0 else None

In [None]:
# Replacement labels for the 'gene_type' column, passed to
# preprocess.change_values.
gene_type_renames = {
    'PROTEIN_CODING': 'protein_coding',
    'ncRNA': 'lncRNA',
    'PSEUDO': 'pseudogene',
}
ad_inner.var = preprocess.change_values(
    ad_inner.var, col="gene_type", value_dict=gene_type_renames
)

### Combine .obs

In [None]:
# Largest number of datasets that any single study appears in.
# NOTE(review): the previous expression called value_counts() on the already
# de-duplicated (study, dataset) pairs, which is 1 for every pair by
# construction, so it could never flag anything. Counting per study is
# presumably the intended check (expected result: 1); confirm.
ad_inner.obs[['study', 'dataset']].drop_duplicates()['study'].value_counts().max()

In [None]:
all_obs = pd.concat([
    sc_common.obs,
    sk_common.obs,
    ret_common.obs,
    br_common.obs
])

In [None]:
np.all(all_obs.index == ad_inner.obs.index)

In [None]:
all_obs.loc[all_obs.tissue=="head of femur", "organ"] = "bone"

In [None]:
all_obs['dataset'] = ad_inner.obs.dataset.tolist()

In [None]:
ad_inner.obs = all_obs

## Save

In [None]:
# Write the combined object next to the input files, under save_dir.
out_file = os.path.join(save_dir, "combined_inner.h5ad")
ad_inner.write_h5ad(out_file)