# Covac Analysis

In [1]:
import numpy as np
import pandas as pd


In [2]:
from scanpy import read_h5ad
import os

# Directory that holds the input data. Defaults to the original local path;
# set the COVAC_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
DATA_DIR = os.environ.get("COVAC_DATA_DIR", "/home/julian/Uni/MasterThesis/data")

# Read the AnnData object stored in COVAC_POSTQC.h5ad inside DATA_DIR.
corona_ann = read_h5ad(os.path.join(DATA_DIR, "COVAC_POSTQC.h5ad"))

## Exploration

How big is the data? What metadata exists?

In [3]:
import sys

# Assumes the counts are stored as float32, i.e. 4 bytes per value.
# TODO confirm against the dtype of corona_ann.X.
n_bytes_per_count = 4
bytes_per_gb = 1_000_000_000  # decimal gigabyte, as in the original calculation

# Size of the AnnData object as reported by sys.getsizeof.
object_gb = sys.getsizeof(corona_ann) / bytes_per_gb

# Size a matrix with one n_bytes_per_count-sized value per (obs, var) pair would take.
n_obs, n_vars = corona_ann.shape
raw_matrix_gb = n_obs * n_vars * n_bytes_per_count / bytes_per_gb

# "%.2f" prints two decimals; the previous "%f.2" printed six decimals
# followed by a literal ".2".
print("%.2f GB object" % object_gb)
print("%.2f GB raw matrix" % raw_matrix_gb)
corona_ann

5.438285.2 GB object
42.404290.2 GB raw matrix


AnnData object with n_obs × n_vars = 491473 × 21570
    obs: 'patientID', 'sampleID', 'cohort', 'timepoint', 'log1p_total_counts', 'log1p_n_genes_by_counts', 'pct_counts_mito', 'pct_counts_ribo', 'pct_counts_hb'

How many patients?

In [4]:
# Summary (count / unique / top / freq) of the per-cell patient identifier.
corona_ann.obs["patientID"].describe()

count     491473
unique        14
top           P7
freq       67682
Name: patientID, dtype: object

What do the cohorts refer to?

In [5]:
# Summary (count / unique / top / freq) of the per-cell cohort label.
corona_ann.obs["cohort"].describe()

count     491473
unique         2
top            P
freq      269365
Name: cohort, dtype: object

What do the time labels stand for?

In [6]:
# Show the per-cell timepoint column, including its list of categories.
corona_ann.obs["timepoint"]

new_index
AAACCCACAAGCCCAC-59     t3
AAACCCATCATCCTGC-59     t3
AAACCCATCTTGTTAC-59     t3
AAACGAAAGACAACTA-59     t3
AAACGAACAAGTGGGT-59     t3
                      ... 
TTTGTTGTCGGCCTTT-42    d28
TTTGTTGTCGGTAGAG-42    d28
TTTGTTGTCGTGGACC-42    d28
TTTGTTGTCTATCGGA-42    d28
TTTGTTGTCTGTTGGA-42    d28
Name: timepoint, Length: 491473, dtype: category
Categories (8, object): ['M6', 'd1', 'd28', 'd56', 't1', 't2', 't3', 't4']

## Preprocessing

Subset the data to the cells whose timepoint label is `d1`, `d28`, or `d56`.

In [7]:
# Boolean mask over cells: True where the timepoint is d1, d28 or d56.
is_selected_timepoint = corona_ann.obs["timepoint"].isin(["d1", "d28", "d56"])

# Take a copy of the selected cells (all genes) before dropping the full object.
corona_subs = corona_ann[is_selected_timepoint, :].copy()

# The full data set is not used by the following cells; remove the reference.
del corona_ann

In [8]:
import gc

# Trigger a garbage collection pass now that corona_ann has been deleted.
# The return value (number of unreachable objects found) is shown as the cell output.
gc.collect()

66

In [None]:
# Disabled cell: would add an "ordinal_label" column derived from the timepoint column.
# NOTE(review): corona_ann is deleted in an earlier cell, so re-enabling this as
# written would raise a NameError; it presumably should target corona_subs -- confirm.
#from psupertime.preprocessing import transform_labels
## create ordinal labels
#corona_ann.obs["ordinal_label"] = transform_labels(corona_ann.obs["timepoint"])
#corona_ann

In [None]:
from psupertime.preprocessing import Preprocessing

# Preprocessing configuration: select_genes="all", scaling enabled,
# log transform and smoothing disabled.
pp = Preprocessing(
    select_genes="all",
    scale=True,
    log=False,
    smooth=False,
)

# Fit and apply the preprocessing to the timepoint subset.
# NOTE(review): inplace=True presumably modifies corona_subs directly -- confirm
# against the psupertime documentation.
pp.fit_transform(corona_subs, inplace=True)