# GEX data preprocessing

## 1) Preliminaries

In [1]:
import scanpy as sc
import pandas as pd
import scipy
import os

In [2]:
# Set scanpy's logging verbosity to level 4 for the rest of the notebook.
sc.settings.verbosity = 4

In [3]:
# Locations used by this notebook, relative to the working directory.
data_folder = '../data'  # input folder the .h5ad dataset is read from
result_folder = '../Result/PseudoBulk'  # output folder for the pseudobulk tables
graph_folder = '../Graph'  # not referenced in the cells shown here

In [4]:
# Create the output folder together with any missing parent ('../Result' may
# not exist yet, which a plain os.mkdir cannot handle); exist_ok=True makes
# re-running the cell a no-op instead of an error.
os.makedirs(result_folder, exist_ok=True)

## 2) Read dataset

In [5]:
# Load the annotated GEX dataset from the data folder; the bare `adata`
# expression on the last line makes the notebook display its summary.
dataset_name = "complete-gex-dataset-annotated.h5ad"
adata = sc.read_h5ad(os.path.join(data_folder, dataset_name))
adata

AnnData object with n_obs × n_vars = 143428 × 15935
    obs: 'n_genes', 'sample_tag', 'patient', 'time', 'lab_id', 'DOB', 'sex', 'ht', 'VES', 'PCR', 'FIBRINOGENO', 'ANTITROMB', 'sCD40L', 'EGF', 'Eotaxin', 'FGF-2', 'FLT-3L', 'Fractalkine', 'G-CSF', 'GM-CSF', 'GROalpha', 'IFNalpha2', 'IFNgamma', 'IL-1alpha', 'IL-1beta', 'IL-1RA', 'IL-2', 'IL-3', 'IL-4', 'IL-5', 'IL-6', 'IL-7', 'IL-8', 'IL-9', 'IL-10', 'IL-12p40', 'IL-12p70', 'IL-13', 'IL-15', 'IL-17A', 'IL-17E/IL-25', 'IL-17F', 'IL-18', 'IL-22', 'IL-27', 'IP-10', 'MCP-1', 'MCP-3', 'M-CSF', 'MDC', 'MIG', 'MIP-1alpha', 'MIP-1beta', 'PDGF-AA', 'PDGF-AB/BB', 'TGFalpha', 'TNFalpha', 'TNFbeta', 'VEGF-A', 'IgA', 'IgG', 'days_from_symptoms_to_admission', 'severity', 'age', 'alcohol', 'smoke', 'fever', 'cough', 'pneumo', 'oxygen', 'comorbities', 'hypertension', 'batch_id', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'total_counts_ab', 'pct_counts_ab', 'leiden', 'leiden_abseq_harmony

In [6]:
# Turn the content stored under `adata.raw` into a standalone AnnData object
# (presumably the unprocessed counts -- confirm against the upstream notebook);
# the bare expression displays its summary.
adata_raw = adata.raw.to_adata()
adata_raw

AnnData object with n_obs × n_vars = 143428 × 15935
    obs: 'n_genes', 'sample_tag', 'patient', 'time', 'lab_id', 'DOB', 'sex', 'ht', 'VES', 'PCR', 'FIBRINOGENO', 'ANTITROMB', 'sCD40L', 'EGF', 'Eotaxin', 'FGF-2', 'FLT-3L', 'Fractalkine', 'G-CSF', 'GM-CSF', 'GROalpha', 'IFNalpha2', 'IFNgamma', 'IL-1alpha', 'IL-1beta', 'IL-1RA', 'IL-2', 'IL-3', 'IL-4', 'IL-5', 'IL-6', 'IL-7', 'IL-8', 'IL-9', 'IL-10', 'IL-12p40', 'IL-12p70', 'IL-13', 'IL-15', 'IL-17A', 'IL-17E/IL-25', 'IL-17F', 'IL-18', 'IL-22', 'IL-27', 'IP-10', 'MCP-1', 'MCP-3', 'M-CSF', 'MDC', 'MIG', 'MIP-1alpha', 'MIP-1beta', 'PDGF-AA', 'PDGF-AB/BB', 'TGFalpha', 'TNFalpha', 'TNFbeta', 'VEGF-A', 'IgA', 'IgG', 'days_from_symptoms_to_admission', 'severity', 'age', 'alcohol', 'smoke', 'fever', 'cough', 'pneumo', 'oxygen', 'comorbities', 'hypertension', 'batch_id', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'total_counts_ab', 'pct_counts_ab', 'leiden', 'leiden_abseq_harmony

In [7]:
# Build a combined "<patient>_<time>" key for every cell; it is the sample
# identifier used for the pseudobulk grouping below.
adata.obs['PatientTime'] = (
    adata.obs['patient'].astype('str')
    + '_'
    + adata.obs['time'].astype('str')
)

# Same key on the raw object so both can be grouped identically.
adata_raw.obs['PatientTime'] = (
    adata_raw.obs['patient'].astype('str')
    + '_'
    + adata_raw.obs['time'].astype('str')
)

## 3) Pseudobulk data generation

In [8]:
def adata_pseudobulk(adata):
    """Collapse all cells of an AnnData-like object into one pseudobulk profile.

    The expression matrix ``adata.X`` (observations in rows, variables in
    columns) is summed over the rows, giving one aggregated value per
    variable.

    Parameters
    ----------
    adata
        Object exposing ``X`` (dense array or scipy sparse matrix) and
        ``var_names``. It is left unmodified.

    Returns
    -------
    pandas.Series
        Column sums of ``adata.X``, indexed by ``adata.var_names``.
    """
    import numpy as np

    # Sum on the stored matrix directly: a sparse matrix is never densified
    # and the caller's ``adata.X`` is not overwritten as a side effect.
    # ``sum(axis=0)`` yields a 1 x n matrix for sparse input and a 1-D array
    # for dense input; asarray + ravel normalises both to a flat array.
    column_sums = np.asarray(adata.X.sum(axis=0)).ravel()

    return pd.Series(column_sums, index=adata.var_names.tolist())

In [9]:
# Pseudobulk of `adata`: one column of summed expression per
# (PatientTime, cell family) pair in `total`, and one metadata row per pair
# in `obs`.
meta_columns = ['PatientTime', 'cell_families', 'patient', 'time', 'numberOfCell', 'severity']
profiles = {}
meta_rows = []

for sample in adata.obs['PatientTime'].unique():
    # The sample mask does not depend on the cluster, so compute it once.
    in_sample = adata.obs['PatientTime'] == sample

    for cluster in adata.obs['cell_families'].cat.categories:

        dummy = adata[in_sample & (adata.obs['cell_families'] == cluster)].copy()

        # A sample may hold no cell of a given family; there is nothing to
        # aggregate and no metadata row to take, so skip the pair in both
        # tables and keep them aligned.
        if dummy.n_obs == 0:
            continue

        profiles['{}_{}'.format(sample, cluster)] = adata_pseudobulk(dummy)

        # Metadata of the group: taken from its first cell (the row keeps
        # that cell's label) plus the number of cells that were aggregated.
        dfmeta = dummy.obs.iloc[[0]].copy()
        dfmeta['numberOfCell'] = dummy.n_obs
        meta_rows.append(dfmeta.loc[:, meta_columns])

# Assemble each table in a single step: DataFrame.append no longer exists in
# pandas >= 2.0, and inserting columns one at a time fragments the frame.
total = pd.DataFrame(profiles)
obs = pd.concat(meta_rows) if meta_rows else pd.DataFrame()

total.to_csv('{}/total.csv'.format(result_folder), sep='\t')
obs.to_csv('{}/obs.csv'.format(result_folder), sep='\t')

In [10]:
# Pseudobulk of `adata_raw`: one column of summed expression per
# (PatientTime, cell family) pair in `total`, and one metadata row per pair
# in `obs`.
meta_columns = ['PatientTime', 'cell_families', 'patient', 'time', 'numberOfCell', 'severity']
profiles = {}
meta_rows = []

# Grouping keys are read from the same object that is subset below.
for sample in adata_raw.obs['PatientTime'].unique():
    # The sample mask does not depend on the cluster, so compute it once.
    in_sample = adata_raw.obs['PatientTime'] == sample

    for cluster in adata_raw.obs['cell_families'].cat.categories:

        dummy = adata_raw[in_sample & (adata_raw.obs['cell_families'] == cluster)].copy()

        # A sample may hold no cell of a given family; there is nothing to
        # aggregate and no metadata row to take, so skip the pair in both
        # tables and keep them aligned.
        if dummy.n_obs == 0:
            continue

        profiles['{}_{}'.format(sample, cluster)] = adata_pseudobulk(dummy)

        # Metadata of the group: taken from its first cell (the row keeps
        # that cell's label) plus the number of cells that were aggregated.
        dfmeta = dummy.obs.iloc[[0]].copy()
        dfmeta['numberOfCell'] = dummy.n_obs
        meta_rows.append(dfmeta.loc[:, meta_columns])

# Assemble each table in a single step: DataFrame.append no longer exists in
# pandas >= 2.0, and inserting columns one at a time fragments the frame.
total = pd.DataFrame(profiles)
obs = pd.concat(meta_rows) if meta_rows else pd.DataFrame()

total.to_csv('{}/totalRaw.csv'.format(result_folder), sep='\t')
obs.to_csv('{}/obsRaw.csv'.format(result_folder), sep='\t')