# Dataset Preprocessing

### Library loading

In [1]:
import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import yaml
import os
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the
# libraries imported above are hidden too — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Logging level for scanpy: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3

# Print the versions of the main packages into the cell output.
sc.logging.print_header()

# Default figure appearance for the plots produced in this session.
sc.settings.set_figure_params(
    dpi=30,
    facecolor='white',
    figsize=(10, 10),
)

scanpy==1.8.0 anndata==0.8.0 umap==0.4.6 numpy==1.22.2 scipy==1.6.2 pandas==1.2.3 scikit-learn==0.24.1 statsmodels==0.13.5 python-igraph==0.9.1 louvain==0.7.0 leidenalg==0.8.3


### Configure paths

In [3]:
# Folder holding the input AnnData files, the curated HVG list and the object
# written at the end of this notebook.
outdir = "../data/output"

# The per-line annotations sit under the "lines" key of the YAML resource.
# NOTE(review): yaml.FullLoader is kept unchanged; if the file contains only plain
# mappings and strings, yaml.safe_load would be the safer loader — confirm.
with open("../data/resources/iPSC_lines_map.yaml", 'r') as f:
    resource_map = yaml.load(f, Loader=yaml.FullLoader)

iPSC_lines_map = resource_map["lines"]


## Merge raw datasets

In [4]:
# Load one AnnData object per sequencing dataset from <outdir>/adatas.
# Early time point
DownD50 = sc.read_h5ad(f"{outdir}/adatas/DownD50_raw.h5ad")
UpD50 = sc.read_h5ad(f"{outdir}/adatas/UpD50_raw.h5ad")
# Mid time point (the upstream sample comes as two files)
DownD100 = sc.read_h5ad(f"{outdir}/adatas/DownD100_raw.h5ad")
UpD100_1 = sc.read_h5ad(f"{outdir}/adatas/UpD100_1_raw.h5ad")
UpD100_2 = sc.read_h5ad(f"{outdir}/adatas/UpD100_2_raw.h5ad")
# Late time point
DownD250 = sc.read_h5ad(f"{outdir}/adatas/DownD250_raw.h5ad")
UpD300 = sc.read_h5ad(f"{outdir}/adatas/UpD300_raw.h5ad")



In [5]:
# Step 1: merge the datasets of each time point. join="inner" keeps only the
# variables present in every input of that merge.
D50 = ad.concat(
    [DownD50, UpD50],
    join="inner",
)
D50.obs_names_make_unique()

D100 = ad.concat(
    [DownD100, UpD100_1, UpD100_2],
    join="inner",
)
D100.obs_names_make_unique()

D250 = ad.concat(
    [DownD250, UpD300],
    join="inner",
)
D250.obs_names_make_unique()

# Step 2: stack the three time points (late, mid, early). join="outer" keeps
# the union of the variables across them.
Multiplexing = ad.concat(
    [D250, D100, D50],
    join="outer",
)

In [6]:
# Make sure the merged cell index contains no duplicate names.
Multiplexing.obs_names_make_unique()

# Show the source dataset recorded for each cell. (A bare `Multiplexing`
# expression used to precede this line; only the last expression of a cell is
# rendered, so it had no effect and was removed.)
Multiplexing.obs["dataset"]

AAACCTGAGAGACTAT-1_DownD250    DownD250
AAACCTGCATGGTTGT-1_DownD250    DownD250
AAACCTGTCAGTTAGC-1_DownD250    DownD250
AAACGGGCAGATCGGA-1_DownD250    DownD250
AAACGGGCAGCTATTG-1_DownD250    DownD250
                                 ...   
TTTGTCATCGCCTGTT-1_UpD50          UpD50
TTTGTCATCGCGTAGC-1_UpD50          UpD50
TTTGTCATCGGTGTTA-1_UpD50          UpD50
TTTGTCATCGTTTATC-1_UpD50          UpD50
TTTGTCATCTCTTGAT-1_UpD50          UpD50
Name: dataset, Length: 34249, dtype: object

In [7]:
# Label every cell with the differentiation stage of its source dataset.
# Cells whose dataset is not listed below are left without a stage.
datasets_by_stage = {
    "early": ["DownD50", "UpD50"],
    "mid": ["DownD100", "UpD100_1", "UpD100_2"],
    "late": ["DownD250", "UpD300"],
}

for stage_label, stage_members in datasets_by_stage.items():
    in_stage = Multiplexing.obs.dataset.isin(stage_members)
    Multiplexing.obs.loc[in_stage, "stage"] = stage_label

In [8]:
# Label every cell as "upstream" or "downstream" according to its source dataset.
# Cells whose dataset is not listed below are left without a type.
datasets_by_type = {
    "upstream": ["UpD50", "UpD100_1", "UpD100_2", "UpD300"],
    "downstream": ["DownD100", "DownD50", "DownD250"],
}

for type_label, type_members in datasets_by_type.items():
    of_type = Multiplexing.obs.dataset.isin(type_members)
    Multiplexing.obs.loc[of_type, "type"] = type_label


In [9]:
# Combined "<line name>_<stage>" label for each cell. cellID_newName is read
# here, so it must already be present in the loaded objects.
line_names = Multiplexing.obs["cellID_newName"].astype("str")
stage_names = Multiplexing.obs["stage"].astype("str")
Multiplexing.obs["id_stage"] = line_names + "_" + stage_names

### Configure colors

In [10]:
# ---- Cell-line identity: names and colors ----------------------------------
# Lookups derived from the per-line entries of iPSC_lines_map:
#   cellID_colors          old line name -> color
#   cellID_newName_colors  new line name -> color
#   cellID_newNames        old line name -> new line name
cellID_colors = {}
cellID_newName_colors = {}
cellID_newNames = {}

for line_info in iPSC_lines_map.values():
    cellID_colors[line_info["oldName"]] = line_info["color"]
    cellID_newName_colors[line_info["newName"]] = line_info["color"]
    cellID_newNames[line_info["oldName"]] = line_info["newName"]

Multiplexing.obs["cellID"] = Multiplexing.obs["cellID"].astype("category")
# Overwrite cellID_newName with the names mapped from cellID.
Multiplexing.obs["cellID_newName"] = Multiplexing.obs["cellID"].replace(cellID_newNames, inplace=False).astype("category")

# Palettes are stored under "<column>_colors": one color per category, in
# category order.
Multiplexing.uns["cellID_colors"] = [cellID_colors[name] for name in Multiplexing.obs["cellID"].cat.categories]
Multiplexing.uns["cellID_newName_colors"] = [cellID_newName_colors[name] for name in Multiplexing.obs["cellID_newName"].cat.categories]


# ---- Stage / type / dataset palettes ---------------------------------------
# Parse the YAML resource once and take every section from the parsed result
# (it was previously re-opened and re-parsed for each section).
# NOTE(review): yaml.FullLoader is kept unchanged; if the file contains only plain
# mappings and strings, yaml.safe_load would be the safer loader — confirm.
with open("../data/resources/iPSC_lines_map.yaml", 'r') as f:
    annotation_map = yaml.load(f, Loader=yaml.FullLoader)

stage_map = annotation_map["stage"]
type_map = annotation_map["type"]
cellID_newName_type_map = annotation_map["cellID_newName_type"]
dataset_map = annotation_map["dataset"]

# Each section maps a label to an entry carrying a "color" field.
stage_colors = {label: entry["color"] for label, entry in stage_map.items()}
type_colors = {label: entry["color"] for label, entry in type_map.items()}
cellID_newName_type_colors = {label: entry["color"] for label, entry in cellID_newName_type_map.items()}
dataset_colors = {label: entry["color"] for label, entry in dataset_map.items()}


# ---- Ordered categoricals --------------------------------------------------
# Fix the category order of each annotation column.
# NOTE: set_categories turns any value missing from the listed categories into NaN.
category_orders = {
    "type": ["upstream","downstream"],
    "stage": ["early","mid","late"],
    "dataset": ["UpD50","DownD50","UpD100_1","UpD100_2","DownD100","UpD300","DownD250"],
}
for column, ordered_values in category_orders.items():
    Multiplexing.obs[column] = Multiplexing.obs[column].astype("category").cat.set_categories(ordered_values, ordered=True)

# Combined "<line name>_<type>" label, built after "type" has been fixed above.
Multiplexing.obs["cellID_newName_type"] = Multiplexing.obs["cellID_newName"].astype("str")+"_"+Multiplexing.obs["type"].astype("str")
Multiplexing.obs["cellID_newName_type"] = Multiplexing.obs["cellID_newName_type"].astype("category").cat.set_categories(["CTL04E_upstream","CTL04E_downstream","CTL02A_upstream","CTL02A_downstream","CTL08A_upstream","CTL08A_downstream","CTL01_upstream","CTL01_downstream"], ordered=True)


# ---- Store the palettes ----------------------------------------------------
# Same key order as before: type, stage, dataset, cellID_newName_type.
palettes = (
    ("type", type_colors),
    ("stage", stage_colors),
    ("dataset", dataset_colors),
    ("cellID_newName_type", cellID_newName_type_colors),
)
for column, palette in palettes:
    Multiplexing.uns[column + "_colors"] = [palette[category] for category in Multiplexing.obs[column].cat.categories]




In [11]:
# Disabled: cell and gene filtering is not applied in this notebook.
#sc.pp.filter_cells(Multiplexing, min_genes=200)
#sc.pp.filter_genes(Multiplexing, min_cells=3)
#Multiplexing

In [12]:
# Disabled: total-count normalization is not applied in this notebook.
#sc.pp.normalize_total(Multiplexing)

In [13]:
# Disabled: the log1p transform is not applied in this notebook.
#sc.pp.log1p(Multiplexing)

In [14]:
# Read the curated HVG list: a tab-separated file whose "HVG" column holds the gene names.
hvg_table = pd.read_csv(f"{outdir}/HVG_list_intersection_Curated.txt", sep="\t")
HVGs = hvg_table["HVG"]


In [15]:
# Flag the genes of the merged object that appear in the curated HVG list.
is_curated_hvg = Multiplexing.var_names.isin(HVGs)
Multiplexing.var["highly_variable"] = is_curated_hvg

# Number of genes flagged.
Multiplexing.var["highly_variable"].sum()

3499

In [16]:
# Remove everything stored in .obsm before the object is saved.
del Multiplexing.obsm
# Display the object summary to check what is left.
Multiplexing

AnnData object with n_obs × n_vars = 34249 × 33538
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'stage', 'type', 'id_stage', 'cellID_newName_type'
    var: 'highly_variable'
    uns: 'cellID_colors', 'cellID_newName_colors', 'type_colors', 'stage_colors', 'dataset_colors', 'cellID_newName_type_colors'

In [17]:
# Save the merged, annotated object next to the input files in <outdir>/adatas.
Multiplexing.write(f"{outdir}/adatas/MultiplexingPreprocessing_unscaled.h5ad")

In [18]:
# Show the summary of the object after it has been written to disk.
Multiplexing

AnnData object with n_obs × n_vars = 34249 × 33538
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'stage', 'type', 'id_stage', 'cellID_newName_type'
    var: 'highly_variable'
    uns: 'cellID_colors', 'cellID_newName_colors', 'type_colors', 'stage_colors', 'dataset_colors', 'cellID_newName_type_colors'