## Concatenation of all datasets into one matrix

We are finally concatenating the full matrix for all datasets. This is to increase the amount of genes we are testing.

__author__ = "Ciro Ramírez-Suástegui"

__copyright__ = "Copyright 2022-09-14, Helmholtz Zentrum Muenchen"

__license__ = "GPL"

__version__ = "0.0.9"

__email__ = "ciro.suastegui@helmholtz-muenchen.de, ksuasteguic@gmail.com"

__status__ = "Prototype"

#### Structure

* [Global variables and paths](#bullet1)
* [Loading data](#bullet2)
* [Pre-processing](#bullet3)
* [Main](#bullet4)
* [Conclusions](#bullet5)
* [Save](#bullet6)

### Environment setup

In [1]:
%load_ext autoreload
%autoreload 2
import importlib

spam_spec = importlib.util.find_spec("lab_black")
if spam_spec is not None:
    %load_ext lab_black

In [2]:
# basic modules
import warnings, os, re
import time, sys
import gc
from datetime import datetime
from collections import Counter
from collections import defaultdict
import itertools

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

In [3]:
# in-house/developing modules
# tools modules
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad

In [4]:
def list_duplicates(seq):
    tally = defaultdict(list)
    for i, item in enumerate(seq):
        tally[item].append(i)
    return ((key, locs) for key, locs in tally.items() if len(locs) > 1)

In [5]:
sc.logging.print_versions()

-----
anndata     0.8.0
scanpy      1.9.1
-----
7b32b9a39ad70713acde__mypyc NA
PIL                         9.2.0
autoreload                  NA
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
black                       22.6.0
blib2to3                    NA
cffi                        1.15.1
click                       8.1.3
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
defusedxml                  0.7.1
entrypoints                 0.4
h5py                        3.7.0
igraph                      0.9.11
ipykernel                   6.15.2
ipython_genutils            0.2.0
jedi                        0.18.2
joblib                      1.1.0
jupyter_server              1.23.4
kiwisolver                  1.4.4
lab_black                   NA
leidenalg                   0.8.10
llvmlite                    0.39.0
matplotlib    

In [6]:
print("Environment:", re.sub(".os.py", "", os.__file__))
parentpath = lambda _path, n: os.sep.join(_path.split(os.sep)[:-n])
print("Working at:", re.sub(parentpath(os.getcwd(), 2), "", os.getcwd()))

Environment: /home/icb/ciro.suastegui/miniconda3/envs/HLCA_basic/lib/python3.7
Working at: /notebooks/3_atlas_extension


### Global variables and paths <a class="anchor" id="bullet1"></a>

In [7]:
dset0_dir = "../../data/HLCA_extended/extension_datasets/ready/full"
feats_dir = "../../data/HLCA_extended/extension_datasets/features/"
feats_inp = (
    "../../data/HLCA_extended/extension_datasets/features/all_update_digested.pk"
)
adata_o_out = "../../data/HLCA_extended/HLCA_extended/HLCA_extended_full.h5ad"
embed_inp = "../../data/HLCA_extended/HLCA_extended/HLCA_extended_scarches_emb.h5ad"  # we need the metadata

In [8]:
adata_dict = {
    "HLCA": ["../../data/HLCA_core_h5ads/HLCA_v2.h5ad"],
    "Kaminski_2020": [f"{dset0_dir}/adams.h5ad"],
    "Meyer_2021": [f"{dset0_dir}/meyer_2021.h5ad"],
    "Meyer_Nikolic_2022": [f"{dset0_dir}/meyer_nikolic_unpubl.h5ad"],
    "Barbry_unpubl": [f"{dset0_dir}/barbry.h5ad"],
    "Regev_2021": [
        f"{dset0_dir}/delorey_cryo.h5ad",
        f"{dset0_dir}/delorey_fresh.h5ad",
        f"{dset0_dir}/delorey_nuclei.h5ad",
    ],
    "Thienpont_2018": [f"{dset0_dir}/lambrechts.h5ad"],
    "Budinger_2020": [f"{dset0_dir}/bharat.h5ad"],
    "Banovich_Kropski_2020": [f"{dset0_dir}/haberman.h5ad"],
    "Sheppard_2020": [f"{dset0_dir}/tsukui.h5ad"],
    "Wunderink_2021": [f"{dset0_dir}/grant_cryo.h5ad", f"{dset0_dir}/grant_fresh.h5ad"],
    "Lambrechts_2021": [
        f"{dset0_dir}/wouters.h5ad"  # , f"{dset0_dir}/wouters_labs.h5ad"
    ],
    "Zhang_2021": [f"{dset0_dir}/liao.h5ad"],
    "Duong_lungMAP_unpubl": [f"{dset0_dir}/duong.h5ad"],
    "Janssen_2020": [f"{dset0_dir}/mould.h5ad"],  # duplicated cells
    "Sun_2020": [
        f"{dset0_dir}/wang_sub_batch1.h5ad",
        f"{dset0_dir}/wang_sub_batch2.h5ad",
        f"{dset0_dir}/wang_sub_batch3.h5ad",
        f"{dset0_dir}/wang_sub_batch4.h5ad",
    ],
    "Gomperts_2021": [
        f"{dset0_dir}/carraro_ucla.h5ad",
        f"{dset0_dir}/carraro_cff.h5ad",
        f"{dset0_dir}/carraro_csmc.h5ad",
    ],
    "Eils_2020": [f"{dset0_dir}/lukassen.h5ad"],
    "Schiller_2020": [f"{dset0_dir}/mayr.h5ad"],
    "Misharin_Budinger_2018": [f"{dset0_dir}/reyfman_disease.h5ad"],
    "Shalek_2018": [f"{dset0_dir}/ordovasmontanes.h5ad"],
    "Schiller_2021": [f"{dset0_dir}/schiller_discovair.h5ad"],
    "Peer_Massague_2020": [f"{dset0_dir}/laughney.h5ad"],
    "Lafyatis_2019": [f"{dset0_dir}/valenzi.h5ad"],
    "Tata_unpubl": [f"{dset0_dir}/tata_unpubl.h5ad"],
    "Xu_2020": [f"{dset0_dir}/guo.h5ad"],
    "Sims_2019": [f"{dset0_dir}/szabo.h5ad"],
    "Schultze_unpubl": [f"{dset0_dir}/schultze_unpubl.h5ad"],
}

In [9]:
adata_split = {"HLCA": {"study": []}}

### Loading data <a class="anchor" id="bullet2"></a>

In [10]:
feature_conversion = pd.read_pickle(feats_inp)

In [11]:
feature_conversion.keys()

dict_keys(['Kaminski_2020', 'Meyer_2021', 'Barbry_unpubl', 'Regev_2021', 'Thienpont_2018', 'Budinger_2020', 'Banovich_Kropski_2020', 'Sheppard_2020', 'Wunderink_2021', 'Lambrechts_2021', 'Zhang_2021', 'Duong_lungMAP_unpubl', 'Janssen_2020', 'Sun_2020', 'Gomperts_2021', 'Eils_2020', 'Schiller_2020', 'Misharin_Budinger_2018', 'Shalek_2018', 'Schiller_2021', 'Peer_Massague_2020', 'Lafyatis_2019', 'Tata_unpubl', 'Xu_2020', 'Sims_2019', 'Schultze_unpubl', 'Meyer_Nikolic_2022', 'Nawijn_2021_HLCA', 'Barbry_Leroy_2020_HLCA', 'Meyer_2019_HLCA', 'Banovich_Kropski_2020_HLCA', 'Seibold_2020_HLCA', 'Jain_Misharin_2021_HLCA', 'Teichmann_Meyer_2019_HLCA', 'Misharin_2021_HLCA', 'Lafyatis_Rojas_2019_HLCA', 'Krasnow_2020_HLCA', 'Misharin_Budinger_2018_HLCA'])

In [12]:
feature_conversion["Kaminski_2020"]

Unnamed: 0,original,ensembl_id,new,repeat
0,TSPAN6,ENSG00000000003.15,TSPAN6,False
1,TNMD,ENSG00000000005.6,TNMD,False
2,DPM1,ENSG00000000419.14,DPM1,False
3,SCYL3,ENSG00000000457.14,SCYL3,False
4,C1orf112,ENSG00000000460.17,C1ORF112,False
...,...,...,...,...
45942,ENSG00000227029,ENSG00000227029.1,ENSG00000227029.1,False
45943,RN7SL782P,ENSG00000239708.3,RN7SL782P,False
45944,ENSG00000274532,ENSG00000274532.1,ENSG00000274532.1,False
45945,ENSG00000277705,ENSG00000277705.1,ENSG00000277705.1,False


In [13]:
%%time
adatas = dict()
for ds in adata_dict.keys():
    ds_files = adata_dict[ds]
    if all([os.path.isfile(i) for i in ds_files]):
        print(f"\033[92m*******************\033[0m {ds}", end=" ")
    else:
        print(f"\033[1m\033[91m*******************\033[0m {ds}", end=" ")
    if len(ds_files) == 1:
        print(f"\033[1m\033[97mS\033[0m", end="")
        adatas[ds] = sc.read(ds_files[0])
    else:
        print(f"\033[1m\033[94mM\033[0m", end="")
        ds_adatas = dict()
        for ds_f in ds_files:
            print(".", end="")
            ds_adatas[ds_f] = sc.read(ds_f)
        adatas[ds] = sc.AnnData.concatenate(
            *ds_adatas.values(), join="outer",
            batch_key=None,
            # batch_categories=[re.sub(".*/|.h5ad", "", i) for i in ds_files],
            index_unique=None # "_"
        )
        del ds_adatas
    print("")
    del adatas[ds].uns
    del adatas[ds].obsm
    del adatas[ds].varm
    del adatas[ds].obsp
    print(adatas[ds].shape)
    if len(adatas[ds].layers) > 0:
        temp = list(adatas[ds].layers.keys())
        print("Layers found:", ", ".join(temp), f"- using {temp[0]}")
        adatas[ds].X = adatas[ds].layers[temp[0]].copy()
        del adatas[ds].layers
    if ds in adata_split.keys():
        print(f"\033[1m\033[93mFor splitting\033[0m")
        ds_column = list(adata_split[ds].keys())[0]
        for ds_i in list(set(adatas[ds].obs[ds_column])):
            print(f" {ds_i}", end=" ")
            ds_new = ds_i + "_" + ds
            if not ds_new in adata_split[ds][ds_column]:
                adata_split[ds][ds_column].append(ds_new)
            adatas[ds_new] = adatas[ds][adatas[ds].obs[ds_column] == ds_i].copy()
            print(f"{adatas[ds_new].shape}")
        del adatas[ds]

[92m*******************[0m HLCA [1m[97mS[0m
(584944, 28527)
Layers found: counts - using counts
[1m[93mFor splitting[0m
 Lafyatis_Rojas_2019 (24181, 28527)
 Banovich_Kropski_2020 (121894, 28527)
 Nawijn_2021 (70402, 28527)
 Seibold_2020 (33593, 28527)
 Misharin_2021 (64843, 28527)
 Teichmann_Meyer_2019 (12231, 28527)
 Jain_Misharin_2021 (45557, 28527)
 Meyer_2019 (35554, 28527)
 Barbry_Leroy_2020 (74487, 28527)
 Krasnow_2020 (60982, 28527)
 Misharin_Budinger_2018 (41220, 28527)
[92m*******************[0m Kaminski_2020 [1m[97mS[0m
(307650, 45947)
[92m*******************[0m Meyer_2021 [1m[97mS[0m
(129340, 20922)
[92m*******************[0m Meyer_Nikolic_2022 [1m[97mS[0m
(236977, 33582)
[92m*******************[0m Barbry_unpubl [1m[97mS[0m
(100211, 16859)
[92m*******************[0m Regev_2021 [1m[94mM[0m...
(96060, 30983)
[92m*******************[0m Thienpont_2018 [1m[97mS[0m
(93575, 27958)
[92m*******************[0m Budinger_2020 [1m[97mS[0m
(91980

In [14]:
embed = sc.read(embed_inp, backed="r")

In [15]:
embed

AnnData object with n_obs × n_vars = 2382658 × 30 backed at '../../data/HLCA_extended/HLCA_extended/HLCA_extended_scarches_emb.h5ad'
    obs: 'sample', 'original_celltype_ann', 'study_long', 'study', 'last_author_PI', 'subject_ID', 'subject_ID_as_published', 'pre_or_postnatal', 'age_in_years', 'age_range', 'sex', 'smoking_status', 'smoking_history', 'BMI', 'known_lung_disease', 'condition', 'subject_type', 'cause_of_death', 'sample_type', 'anatomical_region_coarse', 'anatomical_region_detailed', 'tissue_dissociation_protocol', 'cells_or_nuclei', 'single_cell_platform', "3'_or_5'", 'enrichment', 'sequencing_platform', 'reference_genome_coarse', 'ensembl_release_reference_genome', 'cell_ranger_version', 'disease_status', 'fresh_or_frozen', 'cultured', 'cell_viability_%', 'comments', 'Processing_site', 'dataset', 'anatomical_region_level_1', 'anatomical_region_level_2', 'anatomical_region_level_3', 'anatomical_region_highest_res', 'age', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_l

In [16]:
embed_obs = embed.obs.copy()
del embed
gc.collect()

3164

### Pre-processing <a class="anchor" id="bullet3"></a>

In [17]:
for ds in adatas.keys():
    ds_vars = adatas[ds].var_names.tolist()
    temp = set(feature_conversion[ds]["original"]) - set(ds_vars)
    tvar = set(ds_vars) - set(feature_conversion[ds]["original"])
    if (len(temp) == 0) and (len(tvar) == 0):
        print(f"\033[92m*******************\033[0m {ds}", end="")
        if ds_vars == feature_conversion[ds]["original"].tolist():
            print(" - same order")
        else:
            print(" - \033[0;31morder off\033[0m")
    else:
        print(f"\033[1m\033[91m*******************\033[0m {ds}")
        for i in temp:
            print(
                adatas[ds].shape[1],
                len(feature_conversion[ds]["original"]),
                len(feature_conversion[ds]["new"]),
            )
            for j, nam in enumerate(feature_conversion[ds]["original"]):
                if i == feature_conversion[ds]["original"][j]:
                    oo = feature_conversion[ds]["original"][j]
                    nn = feature_conversion[ds]["new"][j]
                    print(f"Matrix: '{ds_vars[j]}', Old: '{oo}', Harmonised: '{nn}'")

[92m*******************[0m Lafyatis_Rojas_2019_HLCA - same order
[92m*******************[0m Banovich_Kropski_2020_HLCA - same order
[92m*******************[0m Nawijn_2021_HLCA - same order
[92m*******************[0m Seibold_2020_HLCA - same order
[92m*******************[0m Misharin_2021_HLCA - same order
[92m*******************[0m Teichmann_Meyer_2019_HLCA - same order
[92m*******************[0m Jain_Misharin_2021_HLCA - same order
[92m*******************[0m Meyer_2019_HLCA - same order
[92m*******************[0m Barbry_Leroy_2020_HLCA - same order
[92m*******************[0m Krasnow_2020_HLCA - same order
[92m*******************[0m Misharin_Budinger_2018_HLCA - same order
[92m*******************[0m Kaminski_2020 - same order
[92m*******************[0m Meyer_2021 - same order
[92m*******************[0m Meyer_Nikolic_2022 - same order
[92m*******************[0m Barbry_unpubl - same order
[92m*******************[0m Regev_2021 - same order
[92m*************

Mannually selected some

In [18]:
check_after = [
    "ENSG00000122674.12_CCZ1",
    "ENSG00000114374.13_USP9Y",
    "ENSG00000099977.15_DDT",
]

Getting dataset sufixes

In [19]:
df = pd.crosstab(embed_obs["study"], embed_obs["core_or_extension"])
df

core_or_extension,core,extension
study,Unnamed: 1_level_1,Unnamed: 2_level_1
Banovich_Kropski_2020,121894,82692
Barbry_Leroy_2020,74487,0
Barbry_unpubl,0,100211
Budinger_2020,0,91980
Duong_lungMAP_unpubl,0,53904
Eils_2020,0,39778
Gomperts_2021,0,40709
Jain_Misharin_2021,45557,0
Janssen_2020,0,49384
Kaminski_2020,0,307650


In [20]:
for ds_merged in adata_split.keys():
    print(f"\033[1m\033[93mChecking completeness:\033[0m {ds_merged}")
    ds_column = list(adata_split[ds_merged].keys())[0]
    temp = [re.sub(f"_{ds_merged}$", "", i) for i in adata_split[ds_merged][ds_column]]
    temp.extend(list(adatas.keys()))
    adata_presence = [i in temp for i in df.index]
    print(f"All in collected matrices? {all(adata_presence)}")
    if not all(adata_presence):
        print("Missing:")
        [i for i in df.index if not i in temp]

[1m[93mChecking completeness:[0m HLCA
All in collected matrices? True


Inspect for "dataset" from core and extension that share the same name.

In [21]:
embed_obs["study_coe"] = (
    embed_obs["study"].astype(str) + "-" + embed_obs["core_or_extension"].astype(str)
)

In [22]:
df = pd.crosstab(embed_obs["study_coe"], embed_obs["dataset"])
df["study"] = [re.sub("-.*", "", i) for i in df.index.tolist()]
df["core_or_extension"] = [re.sub(".*-", "", i) for i in df.index.tolist()]
df = df.melt(id_vars=["study", "core_or_extension"])
pd.set_option("display.max_rows", df.shape[0])
df = df.loc[df["value"].astype(int) > 0, :]
df

Unnamed: 0,study,core_or_extension,dataset,value
0,Banovich_Kropski_2020,core,Banovich_Kropski_2020,121894
1,Banovich_Kropski_2020,extension,Banovich_Kropski_2020,82692
40,Barbry_Leroy_2020,core,Barbry_Leroy_2020,74487
79,Barbry_unpubl,extension,Barbry_unpubl,100211
118,Budinger_2020,extension,Budinger_2020,91980
157,Duong_lungMAP_unpubl,extension,Duong_lungMAP_unpubl,53904
196,Eils_2020,extension,Eils_2020,39778
235,Gomperts_2021,extension,Gomperts2021_UCLA,19239
273,Gomperts_2021,extension,Gomperts_2021_CFF,6995
311,Gomperts_2021,extension,Gomperts_2021_CSMC,14475


Separating datasets present in both core and extension

In [23]:
df["temp"] = df["study"] + df["dataset"]
dataset_repeated = df.loc[df["temp"].duplicated(), "study"].tolist()
del df["temp"]
df.loc[df["study"].isin(dataset_repeated), :]

Unnamed: 0,study,core_or_extension,dataset,value
0,Banovich_Kropski_2020,core,Banovich_Kropski_2020,121894
1,Banovich_Kropski_2020,extension,Banovich_Kropski_2020,82692
893,Misharin_Budinger_2018,core,Misharin_Budinger_2018,41220
894,Misharin_Budinger_2018,extension,Misharin_Budinger_2018,37097


In [24]:
%%time
temp = []
for i in embed_obs.index:
    ds_name = embed_obs.loc[i, "dataset"]
    if ds_name in dataset_repeated:
        temp.append(ds_name + "_" + embed_obs.loc[i, "core_or_extension"])
    else:
        temp.append(ds_name)
embed_obs["dataset_temp"] = temp

CPU times: user 28.2 s, sys: 20.4 ms, total: 28.2 s
Wall time: 28.3 s


In [25]:
core_or_extension_column = [
    i for i, v in enumerate(embed_obs.columns) if v == "core_or_extension"
]
for ds in set(embed_obs["dataset_temp"].tolist()):
    df = embed_obs.loc[embed_obs["dataset_temp"].isin([ds]), :]
    temp = df.iloc[:, core_or_extension_column[0]].value_counts()
    temp = temp[temp > 0].index.tolist()
    if len(temp) > 1:
        print(f"\033[91m*******************\033[0m {ds}")
    else:
        print(f"\033[1m\033[92m*******************\033[0m {ds}")
    print(f"Study: {df.iloc[0, 3]} ({temp})")
    print(f"Barcode: '{df.index[0]}'")

[1m[92m*******************[0m Zhang_2021
Study: Zhang_2021 (['extension'])
Barcode: 'AAACCCACAGCTACAT_3_liao'
[1m[92m*******************[0m Jain_Misharin_2021_10Xv2
Study: Jain_Misharin_2021 (['core'])
Barcode: 'GTAGTCACATGCCTAA-1-4'
[1m[92m*******************[0m Wunderink_2021_fresh
Study: Wunderink_2021 (['extension'])
Barcode: 'AAACCTGAGAAACCGC-0_grant_fresh'
[1m[92m*******************[0m Nawijn_2021
Study: Nawijn_2021 (['core'])
Barcode: 'CAGCCACAGTCACTA_GRO-03_biopsy'
[1m[92m*******************[0m Meyer_2021_5prime
Study: Meyer_2021 (['extension'])
Barcode: 'AAACCTGAGACTAGGC-5841STDY7991475-0_meyer_unpubl'
[1m[92m*******************[0m Gomperts2021_UCLA
Study: Gomperts_2021 (['extension'])
Barcode: 'BG9_2_AAAACGAGAGTT_carraro_ucla'
[1m[92m*******************[0m Peer_Massague_2020
Study: Peer_Massague_2020 (['extension'])
Barcode: '0_laughney'
[1m[92m*******************[0m Sheppard_2020
Study: Sheppard_2020 (['extension'])
Barcode: 'TCGTACCCATGATCCA_SCD1_tsu

In [26]:
adata_dict_conversion = {
    "Meyer_2019": ["", "Meyer_2019"],
    "Janssen_2020": ["sub_mould", "Janssen_2020"],
    "Zhang_2021": ["liao", "Zhang_2021"],
    "Barbry_Leroy_2020": ["", "Barbry_Leroy_2020"],
    "Peer_Massague_2020": ["laughney", "Peer_Massague_2020"],
    "Jain_Misharin_2021_10Xv1": ["", "Jain_Misharin_2021"],
    "Eils_2020": ["lukassen", "Eils_2020"],
    "Sheppard_2020": ["tsukui", "Sheppard_2020"],
    "Shalek_2018": ["ordovasmontanes", "Shalek_2018"],
    "Misharin_Budinger_2018_extension": [
        "reyfman_disease",
        "Misharin_Budinger_2018",
    ],  # see above the one shown doesn't have 'reyfman_disease', I suspect it's the core vs extension difference
    "Misharin_Budinger_2018_core": [
        "",
        "Misharin_Budinger_2018",
    ],
    "Gomperts_2021_CFF": ["carraro_cff", "Gomperts_2021"],
    "Kaminski_2020": ["adams", "Kaminski_2020"],
    "Schiller_2021": ["schiller_discovair", "Schiller_2021"],
    "Gomperts_2021_CSMC": ["carraro_csmc", "Gomperts_2021"],
    "Lambrechts_2021": ["wouters", "Lambrechts_2021"],
    "Meyer_2021_3prime": ["meyer_unpubl", "Meyer_2021"],
    "Barbry_unpubl": ["barbry", "Barbry_unpubl"],
    "Sun_2020_batch3": ["sub_wang_sub_batch3", "Sun_2020"],
    "Regev_2021_Fresh": ["delorey_fresh", "Regev_2021"],
    "Thienpont_2018_10Xv2": ["lambrechts", "Thienpont_2018"],
    "Lafyatis_Rojas_2019_10Xv1": ["", "Lafyatis_Rojas_2019"],
    "Budinger_2020": ["bharat", "Budinger_2020"],
    "Krasnow_2020": ["", "Krasnow_2020"],
    "Sun_2020_batch4": ["sub_wang_sub_batch4", "Sun_2020"],
    "Meyer_2021_5prime": ["meyer_unpubl", "Meyer_2021"],
    "Schiller_2020": ["mayr", "Schiller_2020"],
    "Tata_unpubl": ["tata_unpubl", "Tata_unpubl"],
    "Gomperts2021_UCLA": ["carraro_ucla", "Gomperts_2021"],
    "Xu_2020_LAM1_3": ["sub_guo", "Xu_2020"],
    "Banovich_Kropski_2020_extension": [
        "haberman",
        "Banovich_Kropski_2020",
    ],  # see above the one shown doesn't have 'haberman', I suspect it's the core vs extension difference
    "Banovich_Kropski_2020_core": [
        "",
        "Banovich_Kropski_2020",
    ],
    "Seibold_2020_10Xv3": ["", "Seibold_2020"],
    "Regev_2021_Cryo": ["delorey_cryo", "Regev_2021"],
    "Wunderink_2021_fresh": ["grant_fresh", "Wunderink_2021"],
    "Sun_2020_batch1": ["sub_wang_sub_batch1", "Sun_2020"],
    "MeyerNikolic_unpubl_UCL": ["meyer_nikolic_unpubl", "Meyer_Nikolic_2022"],
    "Regev_2021_Nuclei": ["delorey_nuclei", "Regev_2021"],
    "Schultze_unpubl": ["schultze_unpubl", "Schultze_unpubl"],
    "Teichmann_Meyer_2019": ["", "Teichmann_Meyer_2019"],
    "Seibold_2020_10Xv2": ["", "Seibold_2020"],
    "Sun_2020_batch2": ["sub_wang_sub_batch2", "Sun_2020"],
    "Nawijn_2021": ["", "Nawijn_2021"],  # biopsy
    "Lafyatis_Rojas_2019_10Xv2": ["", "Lafyatis_Rojas_2019"],
    "Sims_2019": ["szabo", "Sims_2019"],
    "Misharin_2021": ["", "Misharin_2021"],
    "Jain_Misharin_2021_10Xv2": ["", "Jain_Misharin_2021"],
    "Lafyatis_2019": ["valenzi", "Lafyatis_2019"],
    "Thienpont_2018_10Xv1": ["lambrechts", "Thienpont_2018"],
    "Wunderink_2021_cryo": ["grant_cryo", "Wunderink_2021"],
    "Duong_lungMAP_unpubl": ["duong", "Duong_lungMAP_unpubl"],
}

We will will replace the sufixes for the study to be able to match the metadatas. We are using dataset.

This approach is forcing the old barcodes to match the result of the new concatenation. I suspect it would've been easier to add the sufixes found in the embedding to the appropiate dataset. The good thing about not doing that is that we end up with a "standardised" barcode.

In [27]:
gc.collect()

3826

In [28]:
cellnames_old = list()
cellnames_new = list()
for ds in set(embed_obs["dataset_temp"]):
    if ds in adata_dict_conversion.keys():
        print(f"\033[1m\033[92m*******************\033[0m {ds}")
        sufix_change = adata_dict_conversion[ds]  # inferred sufix
    else:
        print(f"\033[1m\033[91m*******************\033[0m {ds}")
        sufix_change = ["", ds]
    print(sufix_change)
    cellname_old = embed_obs.loc[embed_obs["dataset_temp"].isin([ds]), :].index.tolist()
    if sufix_change[0] == "re":  # modify barcode and add sufix
        cellname_new = [
            re.sub(sufix_change[1], sufix_change[2], i) + "_" + sufix_change[3]
            for i in cellname_old
        ]
    elif sufix_change[0] != "":  # replace old sufix
        cellname_new = [
            re.sub(
                "_" + sufix_change[0] + "|-" + sufix_change[0], "_" + sufix_change[1], i
            )
            for i in cellname_old
        ]
    else:  # add new sufix
        cellname_new = [i + "_" + sufix_change[1] for i in cellname_old]
    print(cellname_old[:3])
    print(cellname_new[:3])
    cellnames_old.extend(cellname_old)
    cellnames_new.extend(cellname_new)

[1m[92m*******************[0m Zhang_2021
['liao', 'Zhang_2021']
['AAACCCACAGCTACAT_3_liao', 'AAACCCATCCACGGGT_3_liao', 'AAACGAACAAACAGGC_3_liao']
['AAACCCACAGCTACAT_3_Zhang_2021', 'AAACCCATCCACGGGT_3_Zhang_2021', 'AAACGAACAAACAGGC_3_Zhang_2021']
[1m[92m*******************[0m Jain_Misharin_2021_10Xv2
['', 'Jain_Misharin_2021']
['GTAGTCACATGCCTAA-1-4', 'AAGGTTCTCGTAGATC-1-6', 'TAAGAGACAGCATACT-1-6']
['GTAGTCACATGCCTAA-1-4_Jain_Misharin_2021', 'AAGGTTCTCGTAGATC-1-6_Jain_Misharin_2021', 'TAAGAGACAGCATACT-1-6_Jain_Misharin_2021']
[1m[92m*******************[0m Wunderink_2021_fresh
['grant_fresh', 'Wunderink_2021']
['AAACCTGAGAAACCGC-0_grant_fresh', 'AAACCTGAGCGTTTAC-0_grant_fresh', 'AAACCTGCAACTGCGC-0_grant_fresh']
['AAACCTGAGAAACCGC-0_Wunderink_2021', 'AAACCTGAGCGTTTAC-0_Wunderink_2021', 'AAACCTGCAACTGCGC-0_Wunderink_2021']
[1m[92m*******************[0m Nawijn_2021
['', 'Nawijn_2021']
['CAGCCACAGTCACTA_GRO-03_biopsy', 'TCAGGTAGTGGTGTA_GRO-10_biopsy', 'AGTCCGGTCTCCCTA_GRO-08_biop

Check if repeated indexes are introduced

In [29]:
temp = [k for k, v in Counter(cellnames_new).items() if v > 1]
temp
temp = [k for k, v in Counter(cellnames_old).items() if v > 1]
temp

[]

In [30]:
print(len(cellnames_new), embed_obs.shape[0])

2382658 2382658


In [31]:
df_cellnames = pd.DataFrame(index=cellnames_old)
df_cellnames["old"] = cellnames_old
df_cellnames["new"] = cellnames_new
df_cellnames["study"] = embed_obs.loc[df_cellnames["old"], "study"]

In [32]:
df_cellnames

Unnamed: 0,old,new,study
AAACCCACAGCTACAT_3_liao,AAACCCACAGCTACAT_3_liao,AAACCCACAGCTACAT_3_Zhang_2021,Zhang_2021
AAACCCATCCACGGGT_3_liao,AAACCCATCCACGGGT_3_liao,AAACCCATCCACGGGT_3_Zhang_2021,Zhang_2021
AAACGAACAAACAGGC_3_liao,AAACGAACAAACAGGC_3_liao,AAACGAACAAACAGGC_3_Zhang_2021,Zhang_2021
AAACGAAGTCGCACAC_3_liao,AAACGAAGTCGCACAC_3_liao,AAACGAAGTCGCACAC_3_Zhang_2021,Zhang_2021
AAACGAAGTGTAGTGG_3_liao,AAACGAAGTGTAGTGG_3_liao,AAACGAAGTGTAGTGG_3_Zhang_2021,Zhang_2021
...,...,...,...
TTTGTCAGTACCATCA-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCAGTACCATCA-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCAGTACCATCA-1-WTDAtest7732270-0_Meyer_2021,Meyer_2021
TTTGTCAGTAGCCTCG-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCAGTAGCCTCG-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCAGTAGCCTCG-1-WTDAtest7732270-0_Meyer_2021,Meyer_2021
TTTGTCAGTTAAGACA-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCAGTTAAGACA-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCAGTTAAGACA-1-WTDAtest7732270-0_Meyer_2021,Meyer_2021
TTTGTCATCACCATAG-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCATCACCATAG-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCATCACCATAG-1-WTDAtest7732270-0_Meyer_2021,Meyer_2021


### Main <a class="anchor" id="bullet4"></a>

We can now replace the genes.

In [33]:
gc.collect()

1823

In [34]:
check_before_after = [
    ("SPP1", "SPP1"),  # ENSG00000118785.15_
    ("TREM2", "TREM2"),  # ENSG00000095970.17_
    ("FABP4", "FABP4"),  # ENSG00000170323.9_
]

In [35]:
%%time
query_adatas = dict()
ds_list = np.array(list(adatas.keys()))
for ds in ds_list:
    print(f"\033[1;94m*******************\033[0m {ds} {np.where(ds_list == ds)[0][0]}/{len(ds_list)}")
    print(f"\033[0;33mMatrix:\033[0m {adatas[ds].shape} ", end="")
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    feats_df = feature_conversion[ds].copy().set_index("original", drop=False)
    if not ds in query_adatas.keys():
        features_rept = feats_df.loc[feats_df["repeat"], "original"].tolist()
        x_raw = pd.DataFrame(
            adatas[ds][:,features_rept].X.A.T,
            columns=adatas[ds].obs_names,
            index=features_rept,
        )
        x_raw["gene_new"] = feats_df.loc[feats_df["repeat"], "new"].tolist()
        x_raw_summed = x_raw.groupby("gene_new").sum()
        print(f"\033[1;33mRepeated features\033[0m {x_raw.shape[0]} => {x_raw_summed.shape[0]}")
        if ds == "Wunderink_2021" and x_raw_summed.shape[1] < 500:
            print(f"-------- \033[1m\033[91mBefore\033[0m")
            check_previ = x_raw.loc[x_raw["gene_new"].isin(check_after), :].index
            temp = x_raw.loc[
                check_previ,
            ]
            ds_cells = (temp != 0).any().index.tolist()
            ds_cells.append("sum")
            ds_cells = ds_cells[-5:]
            x_raw["sum"] = x_raw.sum(axis=1)
            print(x_raw.loc[check_previ, ds_cells])
            ds_cells.remove("gene_new")
            x_raw_summed["sum"] = x_raw_summed.sum(axis=1)
            print(f"-------- \033[1m\033[92mAfter\033[0m")
            print(x_raw_summed.loc[check_after, ds_cells])
        else:
            del x_raw
        query_adatas[ds] = ad.AnnData(X=x_raw_summed.T, obs=adatas[ds].obs)
        # Let's temporarily take the old names, so it's easy to replace them
        # when we have the full matrix, since the concatenation will have the old names
        temp = (
            feats_df
            .loc[feats_df["new"].isin(query_adatas[ds].var_names), :]
            .drop_duplicates(subset=["new"])
            .set_index("new")
        )
        query_adatas[ds].var = pd.DataFrame(index=temp.loc[query_adatas[ds].var_names,"original"].tolist())
        query_adatas[ds] = query_adatas[ds].T # we need to concatenate it var-wise
        features_uniq = (
            feats_df.loc[~feats_df["repeat"], "original"].tolist()
        )
        query_adatas[ds] = query_adatas[ds].concatenate(adatas[ds][:,features_uniq].copy().T, index_unique=None)
        query_adatas[ds] = query_adatas[ds].T # put vars back to the columns
        # Finishing
        cellname = adatas[ds].obs_names.tolist()
        query_adatas[ds] = query_adatas[ds][cellname]
        query_adatas[ds].obs = pd.DataFrame(index=cellname)
        query_adatas[ds].obs["cellname"] = cellname
        query_adatas[ds].obs["study"] = ds
        if query_adatas[ds].shape[0] < 100000:
            temp = [i for i, j in check_before_after if i in adatas[ds].var_names]
            if len(temp) > 0:
                print(temp)
                print(adatas[ds][:, temp].X.sum(axis=0))
            temp = [i for i, j in check_before_after if i in query_adatas[ds].var_names]
            if len(temp) > 0:
                print(temp)
                print(query_adatas[ds][:, temp].X.sum(axis=0))
        # Taking the new names
        query_adatas[ds].var = pd.DataFrame(
            index=feats_df.loc[query_adatas[ds].var_names, "new"].tolist()
        )
        if query_adatas[ds].shape[0] < 100000:
            temp = [j for i, j in check_before_after if j in query_adatas[ds].var_names]
            if len(temp) > 0:
                print(temp)
                print(query_adatas[ds][:, temp].X.sum(axis=0))
        gc.collect()
    print(f"\033[0;33mFinal\033[0m {query_adatas[ds].shape}")

[1;94m*******************[0m Lafyatis_Rojas_2019_HLCA 0/38
[0;33mMatrix:[0m (24181, 28527) 2023-02-07 21:49:03
[1;33mRepeated features[0m 778 => 359
['SPP1', 'TREM2', 'FABP4']
[[ 17730.  14560. 337551.]]
['SPP1', 'TREM2', 'FABP4']
[[ 17730.  14560. 337551.]]
['SPP1', 'TREM2', 'FABP4']
[[ 17730.  14560. 337551.]]
[0;33mFinal[0m (24181, 28108)
[1;94m*******************[0m Banovich_Kropski_2020_HLCA 1/38
[0;33mMatrix:[0m (121894, 28527) 2023-02-07 21:49:07
[1;33mRepeated features[0m 778 => 359
[0;33mFinal[0m (121894, 28108)
[1;94m*******************[0m Nawijn_2021_HLCA 2/38
[0;33mMatrix:[0m (70402, 28527) 2023-02-07 21:49:20
[1;33mRepeated features[0m 778 => 359
['SPP1', 'TREM2', 'FABP4']
[[3818. 1902. 7741.]]
['SPP1', 'TREM2', 'FABP4']
[[3818. 1902. 7741.]]
['SPP1', 'TREM2', 'FABP4']
[[3818. 1902. 7741.]]
[0;33mFinal[0m (70402, 28108)
[1;94m*******************[0m Seibold_2020_HLCA 3/38
[0;33mMatrix:[0m (33593, 28527) 2023-02-07 21:49:31
[1;33mRepeated feature

In [36]:
ephi = set(
    list(
        itertools.chain.from_iterable(
            [feature_conversion[ds]["original"] for ds in feature_conversion.keys()]
        )
    )
)
temp = set(
    list(
        itertools.chain.from_iterable(
            [query_adatas[ds].var_names for ds in query_adatas.keys()]
        )
    )
)
tvar = set(
    list(
        itertools.chain.from_iterable(
            [feature_conversion[ds]["new"] for ds in feature_conversion.keys()]
        )
    )
)
print(f"Before harmonization: total genes: {len(ephi)}")
print(f"After harmonization: total theoretical genes: {len(tvar)}")
print(f"After harmonization: total object genes: {len(temp)}")

Before harmonization: total genes: 94538
After harmonization: total theoretical genes: 59574
After harmonization: total object genes: 59574


In [37]:
if temp != tvar:
    for ds in feature_conversion.keys():
        var_diff = list(
            set(feature_conversion[ds]["new"]) - set(query_adatas[ds].var_names)
        )
        if len(var_diff) > 0:
            print(f"{ds} ({len(var_diff)}):\n", var_diff[:3], "\n", var_diff[-3:])

In [38]:
del adatas

In [39]:
gc.collect()

36101

Finally the concatenation!

In [40]:
%%time
adata_o = sc.AnnData.concatenate(
    *query_adatas.values(),
    join="outer",
    batch_key=None,
    batch_categories=list(query_adatas.keys()),
    index_unique="_"
)

CPU times: user 1min 15s, sys: 18.8 s, total: 1min 33s
Wall time: 1min 34s


In [41]:
adata_o

AnnData object with n_obs × n_vars = 2500001 × 59574
    obs: 'cellname', 'study'

Now checking overlap of obs with the embedding object so we can add the matadata.

In [42]:
temp = "_HLCA$"  # |carraro_cff_|carraro_csmc_|carraro_ucla_|sub_wang_sub_batch3_|sub_wang_sub_batch4_|sub_wang_sub_batch1_|sub_wang_sub_batch2_|delorey_fresh_|delorey_cryo_|delorey_nuclei_|grant_fresh_|grant_cryo_"
adata_o.obs_names = [re.sub(temp, "", i) for i in adata_o.obs_names]

In [43]:
print("Total merged:", len(adata_o.obs_names))
print("Total in Embedding:", embed_obs.shape[0])
print(
    "Concatenation overlap with old Embedding names:",
    len(adata_o.obs_names.intersection(df_cellnames["old"])),
)
print(
    "Concatenation overlap with new names:",
    len(adata_o.obs_names.intersection(df_cellnames["new"])),
)

Total merged: 2500001
Total in Embedding: 2382658
Concatenation overlap with old Embedding names: 0
Concatenation overlap with new names: 2382658


<b style='color:red;'>We want to recover all cells in the embedding</b>

Check missmatching barcodes

In [44]:
temp = list(set(df_cellnames["new"]) - set(adata_o.obs_names))
df_cellnames_missed = df_cellnames.loc[df_cellnames.new.isin(temp), :]
cellnames_missed = df_cellnames_missed["study"].value_counts()
cellnames_missed = cellnames_missed[cellnames_missed > 0]
cellnames_missed

Series([], Name: study, dtype: int64)

In [45]:
df_cellnames["study"].value_counts()[cellnames_missed.index]

Series([], Name: study, dtype: int64)

In [46]:
print("Total:")
df = embed_obs.loc[embed_obs["study"].isin(cellnames_missed.index), :]
pd.crosstab(df["dataset_temp"], df["study"])
print("Missing:")
df = embed_obs.loc[df_cellnames_missed.index, :]
pd.crosstab(df["dataset_temp"], df["study"])

Total:
Missing:


study
dataset_temp


In [47]:
adata_o.obs["study"].value_counts()[cellnames_missed.index]

Series([], Name: study, dtype: int64)

In [48]:
temp = cellnames_missed.index
show_examples = len(temp) == 0
if show_examples:
    temp = list(set(df_cellnames.study))[:3]
for ds in temp:
    print(f"\033[1m\033[94m*******************\033[0m {ds}")
    ds_clean = re.sub("_HLCA$", "", ds)
    df = df_cellnames.loc[df_cellnames.study.isin([ds_clean]), :]
    df = df.loc[
        ~df["new"].isin(adata_o.obs_names),
    ].iloc[:3, :]
    if not show_examples:
        try:
            flag = "(old)"
            temp = embed_obs.loc[df["old"], :]
        except KeyError:
            flag = "(isin)"
            temp = embed_obs.loc[embed_obs.study.isin([ds]), :]
        print(f"Embedding {flag}")
        temp
    if df.shape[0] > 0:
        print("Conversion")
        df
    df_adata_o = adata_o.obs.loc[adata_o.obs.study.isin([ds]), :]
    # df_adata_o = df_adata_o.loc[
    #     list(set(df_adata_o.index) - set(df_cellnames["new"])),
    # ]
    try:
        flag = "(old)"
        temp = df_adata_o.loc[df["old"], :]
    except KeyError:
        try:
            flag = "(new)"
            temp = df_adata_o.loc[df["new"], :]
        except KeyError:
            flag = "(isin)"
            temp = df_adata_o.iloc[:3, :]
    if temp.shape[0] > 0:
        print(f"Concatenated {flag}")
        temp

[1m[94m*******************[0m Zhang_2021
[1m[94m*******************[0m Lafyatis_Rojas_2019
[1m[94m*******************[0m Banovich_Kropski_2020


In [49]:
if not show_examples:
    bcode = "AAACCTGCAAAGGTGC"
    matching = [s for s in adata_o.obs_names if bcode in s]
    adata_o.obs.loc[matching, :]
    matching = [s for s in df_cellnames.index if bcode in s]
    df_cellnames.loc[matching, :]

Renaming barcodes and adding metadata

In [50]:
adata_o.obs

Unnamed: 0,cellname,study
ACCCACTGAACTGC-SC14_Lafyatis_Rojas_2019,ACCCACTGAACTGC-SC14,Lafyatis_Rojas_2019_HLCA
GACCAATGTGCTCTTC-SC156_Lafyatis_Rojas_2019,GACCAATGTGCTCTTC-SC156,Lafyatis_Rojas_2019_HLCA
CTGATCCTCGTGGGAA-SC45_Lafyatis_Rojas_2019,CTGATCCTCGTGGGAA-SC45,Lafyatis_Rojas_2019_HLCA
GCCAAATCAAAGGCGT-SC56_Lafyatis_Rojas_2019,GCCAAATCAAAGGCGT-SC56,Lafyatis_Rojas_2019_HLCA
TACGCCACGATGAA-SC14_Lafyatis_Rojas_2019,TACGCCACGATGAA-SC14,Lafyatis_Rojas_2019_HLCA
...,...,...
Pool_426_TTTGTCTAAGCA_Schultze_unpubl,Pool_426_TTTGTCTAAGCA,Schultze_unpubl
Pool_426_TTTTCGCGGTCA_Schultze_unpubl,Pool_426_TTTTCGCGGTCA,Schultze_unpubl
Pool_426_TTTTCTACGCCT_Schultze_unpubl,Pool_426_TTTTCTACGCCT,Schultze_unpubl
Pool_426_TTTTCTAGCCCC_Schultze_unpubl,Pool_426_TTTTCTAGCCCC,Schultze_unpubl


In [51]:
df_cellnames_order = df_cellnames.loc[
    [i in adata_o.obs_names for i in df_cellnames["new"]], :
]

In [52]:
df_cellnames_order

Unnamed: 0,old,new,study
AAACCCACAGCTACAT_3_liao,AAACCCACAGCTACAT_3_liao,AAACCCACAGCTACAT_3_Zhang_2021,Zhang_2021
AAACCCATCCACGGGT_3_liao,AAACCCATCCACGGGT_3_liao,AAACCCATCCACGGGT_3_Zhang_2021,Zhang_2021
AAACGAACAAACAGGC_3_liao,AAACGAACAAACAGGC_3_liao,AAACGAACAAACAGGC_3_Zhang_2021,Zhang_2021
AAACGAAGTCGCACAC_3_liao,AAACGAAGTCGCACAC_3_liao,AAACGAAGTCGCACAC_3_Zhang_2021,Zhang_2021
AAACGAAGTGTAGTGG_3_liao,AAACGAAGTGTAGTGG_3_liao,AAACGAAGTGTAGTGG_3_Zhang_2021,Zhang_2021
...,...,...,...
TTTGTCAGTACCATCA-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCAGTACCATCA-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCAGTACCATCA-1-WTDAtest7732270-0_Meyer_2021,Meyer_2021
TTTGTCAGTAGCCTCG-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCAGTAGCCTCG-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCAGTAGCCTCG-1-WTDAtest7732270-0_Meyer_2021,Meyer_2021
TTTGTCAGTTAAGACA-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCAGTTAAGACA-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCAGTTAAGACA-1-WTDAtest7732270-0_Meyer_2021,Meyer_2021
TTTGTCATCACCATAG-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCATCACCATAG-1-WTDAtest7732270-0_meyer_unpubl,TTTGTCATCACCATAG-1-WTDAtest7732270-0_Meyer_2021,Meyer_2021


In [53]:
adata_o = adata_o[df_cellnames_order["new"], :]

In [54]:
del query_adatas
gc.collect()

4847

In [55]:
adata_o_obs = adata_o.obs.copy()
adata_o_obs

Unnamed: 0,cellname,study
AAACCCACAGCTACAT_3_Zhang_2021,AAACCCACAGCTACAT_3,Zhang_2021
AAACCCATCCACGGGT_3_Zhang_2021,AAACCCATCCACGGGT_3,Zhang_2021
AAACGAACAAACAGGC_3_Zhang_2021,AAACGAACAAACAGGC_3,Zhang_2021
AAACGAAGTCGCACAC_3_Zhang_2021,AAACGAAGTCGCACAC_3,Zhang_2021
AAACGAAGTGTAGTGG_3_Zhang_2021,AAACGAAGTGTAGTGG_3,Zhang_2021
...,...,...
TTTGTCAGTACCATCA-1-WTDAtest7732270-0_Meyer_2021,TTTGTCAGTACCATCA-1-WTDAtest7732270-0,Meyer_2021
TTTGTCAGTAGCCTCG-1-WTDAtest7732270-0_Meyer_2021,TTTGTCAGTAGCCTCG-1-WTDAtest7732270-0,Meyer_2021
TTTGTCAGTTAAGACA-1-WTDAtest7732270-0_Meyer_2021,TTTGTCAGTTAAGACA-1-WTDAtest7732270-0,Meyer_2021
TTTGTCATCACCATAG-1-WTDAtest7732270-0_Meyer_2021,TTTGTCATCACCATAG-1-WTDAtest7732270-0,Meyer_2021


In [56]:
%%time
adata_o.obs = embed_obs.loc[df_cellnames_order["old"], :].copy()

CPU times: user 34.3 s, sys: 22.6 s, total: 56.9 s
Wall time: 57.1 s


Some datasets have the same name even though one is in the core and the other in the extension. Might be a problem but essentially they can belong to the same dataset, just with a different purpose.

In [57]:
df = adata_o.obs.study.value_counts().reset_index()
temp = df_cellnames.study.value_counts()[[re.sub("_HLCA", "", i) for i in df["index"]]]
df["index_all"] = temp.index
df["study_all"] = temp.tolist()
df.sort_values(by=["study", "index"], ascending=False)

Unnamed: 0,index,study,index_all,study_all
0,Kaminski_2020,307650,Kaminski_2020,307650
1,Banovich_Kropski_2020,204586,Banovich_Kropski_2020,204586
2,Meyer_2021,129340,Meyer_2021,129340
3,Meyer_Nikolic_2022,119634,Meyer_Nikolic_2022,119634
4,Barbry_unpubl,100211,Barbry_unpubl,100211
5,Regev_2021,96060,Regev_2021,96060
6,Thienpont_2018,93575,Thienpont_2018,93575
7,Budinger_2020,91980,Budinger_2020,91980
8,Sheppard_2020,80020,Sheppard_2020,80020
9,Misharin_Budinger_2018,78317,Misharin_Budinger_2018,78317


We are adding a pair of column with "harmonised" cell names and the cell names we have so far.

In [58]:
adata_o.obs["cellname_original"] = adata_o_obs.loc[
    df_cellnames_order["new"], "cellname"
].tolist()
# adata_o.obs["cellname"] = df_cellnames_order["old"].tolist()
# adata_o.obs["cellname_harmonised"] = df_cellnames_order["new"].tolist()

### Conclusions <a class="anchor" id="bullet5"></a>

In [59]:
adata_o

AnnData object with n_obs × n_vars = 2382658 × 59574
    obs: 'sample', 'original_celltype_ann', 'study_long', 'study', 'last_author_PI', 'subject_ID', 'subject_ID_as_published', 'pre_or_postnatal', 'age_in_years', 'age_range', 'sex', 'smoking_status', 'smoking_history', 'BMI', 'known_lung_disease', 'condition', 'subject_type', 'cause_of_death', 'sample_type', 'anatomical_region_coarse', 'anatomical_region_detailed', 'tissue_dissociation_protocol', 'cells_or_nuclei', 'single_cell_platform', "3'_or_5'", 'enrichment', 'sequencing_platform', 'reference_genome_coarse', 'ensembl_release_reference_genome', 'cell_ranger_version', 'disease_status', 'fresh_or_frozen', 'cultured', 'cell_viability_%', 'comments', 'Processing_site', 'dataset', 'anatomical_region_level_1', 'anatomical_region_level_2', 'anatomical_region_level_3', 'anatomical_region_highest_res', 'age', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'ann_highest_res', 'ann_new', 'n_genes', 'total_counts',

In [60]:
del adata_o.obs["study_coe"]
del adata_o.obs["dataset_temp"]

### Save <a class="anchor" id="bullet6"></a>

In [61]:
%%time
adata_o.write(filename=adata_o_out)

CPU times: user 36 s, sys: 56.9 s, total: 1min 32s
Wall time: 1min 39s


Done.