In [1]:
import os
import pathlib

import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import decoupler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Let pandas print up to 300 rows before truncating a displayed frame/series.
pd.set_option("display.max_rows", 300)

In [3]:
# Ask the inline matplotlib backend to emit figures in the "retina" format
# (higher-resolution output intended for HiDPI displays).
%config InlineBackend.figure_format = "retina"

In [4]:
# Firefox workaround (Nick needs this): inject a CSS rule that switches the
# `.jp-OutputArea-output` elements to `display: flex` for this notebook.
from IPython.display import display, HTML
display(HTML("<style>.jp-OutputArea-output {display:flex}</style>"))

In [5]:
def sanitize_name(name):
    return name.replace(' ', '_').replace('*', '').replace(';', '_and').replace('/', '_')

In [6]:
# Base directory (relative to the notebook's working directory) from which the
# input objects are read and under which the pseudobulk outputs are written.
DATA = pathlib.Path('../../data/31_bal-object/')

In [7]:
# Annotated object: its `obs` table supplies the cell-type labels and the
# per-sample metadata used for pseudobulking below.
annotated_object_path = DATA / "03_bal-object/03_bal-object.h5ad"
ds = sc.read_h5ad(annotated_object_path)

In [8]:
# Distinct cell-type labels, in order of first appearance.
ds.obs["cell_type"].unique().tolist()

['CD4 naive T cells',
 'DC2',
 'TRAM-2',
 'CD4 TEM cells',
 'TRAM-3',
 'TRAM-4',
 'Tregs',
 'MoAM-3 mature',
 'CD8 TRM cells',
 'TRAM-1',
 'TRAM-5 MT1G',
 'TRAM-6 activated',
 'CCR7+ DC',
 'MoAM-4 profibrotic',
 'Proliferating macrophages',
 'B cells',
 'Ciliated cells',
 'Monocytes',
 'Secretory cells',
 'CD8 TEM cells',
 'gd/NKT cells',
 'Activated monocytes',
 'DC1',
 'pDC',
 'Perivascular macrophages',
 'MoAM-1',
 'Mast cells',
 'Proliferating T cells']

In [9]:
# Object whose `.X` matrix is summed per sample to build the pseudobulk tables.
raw_object_path = DATA / "raw_object.h5ad"
raw_object = sc.read_h5ad(raw_object_path)

In [10]:
# Rebuild the cell names as "<External Sample ID>_<barcode>", after stripping
# any trailing "-<digits>" group(s) from the original name. Presumably this is
# what makes them comparable to `ds.obs.index` in the `isin` lookups further
# down -- TODO confirm against how `ds` was built.
#
# The pattern is a raw string so that `\d` reaches the regex engine verbatim
# instead of being treated as an invalid Python string escape (which emits a
# DeprecationWarning / SyntaxWarning on recent Python versions).
#
# NOTE: this cell is not idempotent -- running it twice prefixes the sample ID
# twice. Re-load `raw_object` before re-running.
raw_object.obs_names = (
    raw_object.obs["External Sample ID"].astype(str)
    + "_"
    + raw_object.obs_names.str.replace(r"(-\d+)+$", "", regex=True)
)

In [11]:
# Root folder for the per-cell-type pseudobulk tables written below.
OUT_DIR = DATA.joinpath('pseudobulk-gsva')

In [12]:
# Build one pseudobulk table per cell type: for every sample that contributes
# at least CUTOFF cells of that type, sum `raw_object.X` over those cells.
# Three files are written per cell type under OUT_DIR/<slug>/data/:
#   <slug>.txt          genes x samples, summed values
#   <slug>-n_cells.txt  genes x samples, number of cells with a value > 0
#   <slug>-meta.csv     sample metadata plus the number of cells aggregated
CUTOFF = 50  # minimum number of cells a sample needs in order to be kept
raw_sample_ids = raw_object.obs["External Sample ID"]
for ct in ds.obs.cell_type.unique():
    ct_slug = sanitize_name(ct)
    cells = ds.obs.index[ds.obs.cell_type.eq(ct)]
    patient_status = ds.obs.loc[
        cells,
        ["External Sample ID", "Status", "Sex", "Study", 'Chemistry']
    ].drop_duplicates()
    samples = patient_status["External Sample ID"].unique()

    # Membership of each raw cell in this cell type does not depend on the
    # sample, so compute it once instead of once per sample.
    in_cell_type = raw_object.obs_names.isin(cells)

    sample_values = []  # gene sum for sample in this cell type
    sample_ncells = []  # number of cells expressing gene for sample in this cell type
    filtered_samples = []  # samples that pass cutoff filter
    n_cells = []  # number of cells aggregated for each kept sample
    for s in samples:
        # cells of this cell type in this particular sample (plain boolean array)
        s_cells = in_cell_type & raw_sample_ids.eq(s).to_numpy()
        n_selected = s_cells.sum()
        if n_selected >= CUTOFF:
            counts = raw_object.X[s_cells, :]  # slice once, reuse for both tables
            # np.asarray(...).ravel() flattens both the np.matrix returned by
            # sparse sums and the ndarray returned by dense ones.
            sample_values.append(np.asarray(counts.sum(axis=0)).ravel())
            sample_ncells.append(np.asarray((counts > 0).sum(axis=0)).ravel())
            filtered_samples.append(s)
            n_cells.append(n_selected)

    sample_values = pd.DataFrame(sample_values, index=filtered_samples, columns=raw_object.var_names).T
    sample_ncells = pd.DataFrame(sample_ncells, index=filtered_samples, columns=raw_object.var_names).T

    os.makedirs(f'{OUT_DIR}/{ct_slug}/data', exist_ok=True)

    fname = f"{OUT_DIR}/{ct_slug}/data/{ct_slug}.txt"
    sample_values.to_csv(fname, sep="\t")

    fname = f"{OUT_DIR}/{ct_slug}/data/{ct_slug}-n_cells.txt"
    sample_ncells.to_csv(fname, sep="\t")

    fname = f"{OUT_DIR}/{ct_slug}/data/{ct_slug}-meta.csv"
    # .copy() so that adding a column does not write into a slice of the
    # previous frame (avoids SettingWithCopyWarning).
    patient_status = patient_status.loc[
        patient_status["External Sample ID"].isin(filtered_samples), :
    ].copy()
    # Attach the cell counts by sample ID rather than by position, so the
    # column stays correct even if a sample has more than one metadata row.
    n_cells_by_sample = dict(zip(filtered_samples, n_cells))
    patient_status["n_cells"] = [
        n_cells_by_sample[sample_id]
        for sample_id in patient_status["External Sample ID"]
    ]
    patient_status.to_csv(fname)
    print(f"{ct} done, {len(filtered_samples)}/{len(samples)}")

CD4 naive T cells done, 15/23
DC2 done, 16/23
TRAM-2 done, 22/23
CD4 TEM cells done, 20/23
TRAM-3 done, 23/23
TRAM-4 done, 23/23
Tregs done, 12/22
MoAM-3 mature done, 23/23
CD8 TRM cells done, 18/23
TRAM-1 done, 23/23
TRAM-5 MT1G done, 23/23
TRAM-6 activated done, 22/23
CCR7+ DC done, 6/23
MoAM-4 profibrotic done, 14/22
Proliferating macrophages done, 22/23
B cells done, 11/23
Ciliated cells done, 9/17
Monocytes done, 14/22
Secretory cells done, 6/18
CD8 TEM cells done, 7/23
gd/NKT cells done, 8/22
Activated monocytes done, 2/22
DC1 done, 10/23
pDC done, 1/20
Perivascular macrophages done, 15/22
MoAM-1 done, 14/22
Mast cells done, 1/22
Proliferating T cells done, 0/23


In [13]:
# "Global" pseudobulk: same aggregation as the per-cell-type loop, but over
# every cell present in `ds`, regardless of cell type. Uses CUTOFF from the
# previous cell and writes to OUT_DIR/global/data/.
ct_slug = 'global'
cells = ds.obs.index
patient_status = ds.obs.loc[
    cells,
    ["External Sample ID", "Status", "Sex", "Study", 'Chemistry']
].drop_duplicates()
samples = patient_status["External Sample ID"].unique()

# Which raw cells are present in `ds` is the same for every sample, so compute
# it once up front.
in_dataset = raw_object.obs_names.isin(cells)
raw_sample_ids = raw_object.obs["External Sample ID"]

sample_values = []  # gene sum for sample
sample_ncells = []  # number of cells expressing gene for sample
filtered_samples = []  # samples that pass cutoff filter
n_cells = []  # number of cells aggregated for each kept sample
for s in samples:
    # cells in particular sample (plain boolean array)
    s_cells = in_dataset & raw_sample_ids.eq(s).to_numpy()
    n_selected = s_cells.sum()
    if n_selected >= CUTOFF:
        counts = raw_object.X[s_cells, :]  # slice once, reuse for both tables
        # np.asarray(...).ravel() flattens both the np.matrix returned by
        # sparse sums and the ndarray returned by dense ones.
        sample_values.append(np.asarray(counts.sum(axis=0)).ravel())
        sample_ncells.append(np.asarray((counts > 0).sum(axis=0)).ravel())
        filtered_samples.append(s)
        n_cells.append(n_selected)

sample_values = pd.DataFrame(sample_values, index=filtered_samples, columns=raw_object.var_names).T
sample_ncells = pd.DataFrame(sample_ncells, index=filtered_samples, columns=raw_object.var_names).T

os.makedirs(f'{OUT_DIR}/{ct_slug}/data', exist_ok=True)

fname = f"{OUT_DIR}/{ct_slug}/data/{ct_slug}.txt"
sample_values.to_csv(fname, sep="\t")

fname = f"{OUT_DIR}/{ct_slug}/data/{ct_slug}-n_cells.txt"
sample_ncells.to_csv(fname, sep="\t")

fname = f"{OUT_DIR}/{ct_slug}/data/{ct_slug}-meta.csv"
# .copy() so that adding a column does not write into a slice of the previous
# frame (avoids SettingWithCopyWarning).
patient_status = patient_status.loc[
    patient_status["External Sample ID"].isin(filtered_samples), :
].copy()
# Attach the cell counts by sample ID rather than by position, so the column
# stays correct even if a sample has more than one metadata row.
n_cells_by_sample = dict(zip(filtered_samples, n_cells))
patient_status["n_cells"] = [
    n_cells_by_sample[sample_id]
    for sample_id in patient_status["External Sample ID"]
]
patient_status.to_csv(fname)
print(f"{ct_slug} done, {len(filtered_samples)}/{len(samples)}")

global done, 23/23
