In [1]:
import sys
# Put ../lib first on the module search path
# (presumably where the local `sc_utils` helper imported below lives — confirm).
sys.path.insert(0, "../lib")

In [2]:
import math
import os
import pathlib
import re

import scanpy as sc
import numpy as np
import pandas as pd
import sc_utils
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns", None)

In [4]:
# Render inline matplotlib figures in the high-resolution "retina" format.
%config InlineBackend.figure_format = "retina"

In [5]:
# Base directory of this notebook's inputs and outputs; the DATA-relative
# paths in the cells below are resolved against it.
DATA = pathlib.Path('../../data/31_bal-object/')

In [6]:
# Load the annotated object that is cleaned up and exported by this notebook.
adata = sc.read_h5ad(
    DATA.joinpath('03_bal-object/03_bal-object.h5ad')
)

In [10]:
# Load the object whose subset later replaces `adata.raw`.
# Resolved against DATA (same directory as before) so the input location is
# defined in exactly one place instead of being hard-coded a second time.
raw_object = sc.read_h5ad(DATA / 'raw_object.h5ad')

In [11]:
# Harmonise cell-type labels; labels not listed in the mapping are kept as-is.
cell_type_renames = {
    'MoAM-3 mature': 'MoAM-2',
    'MoAM-4 profibrotic': 'MoAM-3 profibrotic',
    'gd/NKT cells': 'NK + γδT cells',
    'Perivascular macrophages': 'Interstitial macrophages',
}
adata.obs['cell_type'] = (
    adata.obs['cell_type']
    .astype(str)
    .replace(cell_type_renames)
    .astype('category')
)

In [12]:
# Relabel the disease status and fix the category order to Control, SSc-ILD.
# Note: pd.Categorical turns any value outside `categories` into a missing value.
status_labels = {'control': 'Control', 'SSc': 'SSc-ILD'}
relabelled_status = adata.obs['Status'].astype(str).replace(status_labels)
adata.obs['Status'] = pd.Categorical(
    relabelled_status,
    categories=['Control', 'SSc-ILD'],
)

In [13]:
# Sample ID mapping table; the cells below use its 'External Sample ID' and
# 'Publication ID' columns.
paper_ids = pd.read_csv('../SSc_BAL_GEO_ID_Mapping - ID mapping.csv')

In [14]:
# Look up each cell's publication ID from its external sample ID and store it
# as a categorical obs column. An ID missing from the mapping raises KeyError.
publication_id_by_sample = paper_ids.set_index('External Sample ID')['Publication ID']
adata_sample_ids = adata.obs['External Sample ID'].values
adata.obs['paper_id'] = pd.Categorical(
    publication_id_by_sample[adata_sample_ids].values
)

In [15]:
# Same publication-ID lookup as for `adata`, applied to the cells of `raw_object`.
publication_id_by_sample = paper_ids.set_index('External Sample ID')['Publication ID']
raw_sample_ids = raw_object.obs['External Sample ID'].values
raw_object.obs['paper_id'] = pd.Categorical(
    publication_id_by_sample[raw_sample_ids].values
)

In [16]:
# Rebuild cell names as "<publication ID>_<name part before the first '-'>".
raw_name_prefix = raw_object.obs['paper_id'].astype(str)
raw_name_stem = raw_object.obs_names.str.split('-').str[0]
raw_object.obs_names = raw_name_prefix + '_' + raw_name_stem

In [18]:
# Per-cell metadata columns that are removed from the exported object.
obsolete_obs_columns = [
    'Sample',
    'Sample ID',
    'External Sample ID',
    'Patient',
    'Smoking status',
    'Race',
    'BMI',
    'Type',
    'Tissue location',
    'Additional data',
    'Fastq',
    'Genome build',
    'Directory',
    'Doublet threshold',
    'batch',
    'n_counts',
    'pct_counts_in_top_20_genes',
    'total_counts_mito',
    'total_counts_ribo',
    '_scvi_batch',
    '_scvi_labels',
    'leiden_scVI',
]
# Dropped in place; the old 'Sample' column goes away here so that 'paper_id'
# can take over that name in the rename cell.
adata.obs.drop(columns=obsolete_obs_columns, inplace=True)

In [19]:
# Rename the remaining metadata columns to their export names (old -> new).
obs_column_renames = {
    'Study': 'Cohort',
    'Status': 'Condition',
    'paper_id': 'Sample',
}
adata.obs.rename(columns=obs_column_renames, inplace=True)

In [20]:
# Keep exactly these obs columns, in this order; a missing column raises KeyError.
exported_obs_columns = [
    'Cohort',
    'Sample',
    'Condition',
    'Sex',
    'Age',
    'Protocol',
    'Chemistry',
    'n_genes_by_counts',
    'total_counts',
    'pct_counts_in_top_10_genes',
    'pct_counts_mito',
    'pct_counts_ribo',
    'cell_type',
]
adata.obs = adata.obs[exported_obs_columns]

In [21]:
# Rebuild cell names as "<Sample>_<last '_'-separated part of the old name>".
sample_prefix = adata.obs['Sample'].astype(str)
name_tail = adata.obs_names.str.split('_').str[-1]
adata.obs_names = sample_prefix + '_' + name_tail

In [22]:
# Give the 'this' cohort its public name; other cohort labels are unchanged.
cohort_labels = {'this': 'Northwestern/Yale'}
adata.obs['Cohort'] = (
    adata.obs['Cohort']
    .astype(str)
    .replace(cohort_labels)
    .astype('category')
)

In [23]:
# Capitalise the sex labels. `.str.capitalize()` does not return a categorical
# column, so cast back to category here; this keeps `Sex` consistent with the
# other relabelled obs columns instead of leaving the conversion to a later
# implicit anndata step.
adata.obs.Sex = adata.obs.Sex.str.capitalize().astype('category')

In [24]:
# Shorten the protocol labels; labels not listed in the mapping are kept as-is.
protocol_labels = {
    'Wash, no sorting': 'no FACS',
    'FACSorted': 'FACS',
}
adata.obs['Protocol'] = (
    adata.obs['Protocol']
    .astype(str)
    .replace(protocol_labels)
    .astype('category')
)

In [25]:
# Map every integer age 0..99 (as a string key) to the label of its 5-year bin,
# e.g. '37' -> '35–39'. `age - age % 5` is the lower edge of the bin.
age_bins = {
    str(age): f'{age - age % 5}–{age - age % 5 + 4}'
    for age in range(100)
}

In [26]:
# Replace exact ages by their 5-year bin label and store the result as a category.
# NOTE(review): the lookup keys are integer strings ('45'); a float-typed Age
# column would stringify as '45.0' and pass through unbinned — confirm the dtype.
age_as_text = adata.obs['Age'].astype(str)
adata.obs['Age'] = age_as_text.replace(age_bins).astype('category')

In [30]:
# Create the export directory (and any missing parents); no error if it already exists.
(DATA / '03x_bal-export').mkdir(parents=True, exist_ok=True)

First, compute markers on the log1p-processed `.raw`.

In [31]:
# Rank the top 200 genes per cell type with a t-test.
sc.tl.rank_genes_groups(
    adata,
    groupby="cell_type",
    method="t-test",
    n_genes=200,
)

... storing 'Sex' as categorical


In [32]:
# Build the marker table for the "cell_type" grouping with the project helper.
# NOTE(review): presumably reads the rank_genes_groups result stored on `adata`
# and returns a DataFrame with "cluster" and "avg_logFC" columns (used by the
# sort in the next cell) — confirm in sc_utils.
markers = sc_utils.get_markers(adata, "cell_type")

In [33]:
# Order markers by cluster (ascending), then by avg_logFC (descending) within each cluster.
markers = markers.sort_values(
    by=["cluster", "avg_logFC"],
    ascending=[True, False],
)

In [34]:
# Save the sorted marker table next to the exported object.
markers.to_csv(
    DATA.joinpath('03x_bal-export/03x_bal-export-markers.csv')
)

Now subset `raw_object` to the same cells and store it in `.raw`.

In [35]:
# Materialise the current `.raw` as a standalone AnnData; its obs_names and obs
# are used in the next cell to subset and annotate `raw_object`.
current_raw = adata.raw.to_adata()

In [36]:
# Pick the exported cells out of `raw_object` by name (same order as the
# current `.raw`), then carry over the per-cell metadata.
exported_cells = current_raw.obs_names
new_raw = raw_object[exported_cells, :].copy()
new_raw.obs = current_raw.obs.copy()

In [44]:
# Replace `.raw` with the subset taken from `raw_object`.
adata.raw = new_raw

In [45]:
# Write the cleaned object, with its replaced `.raw`, to the export directory.
adata.write_h5ad(
    DATA.joinpath('03x_bal-export/03x_bal-export.h5ad')
)