## Denoising scRNA-seq data using DCA (Deep Count Autoencoder)

### Generate AnnData object

In [1]:
import warnings

# Register the filter before the third-party imports below: a filter installed
# after them cannot hide FutureWarnings that are raised while they are imported.
warnings.simplefilter(action="ignore", category=FutureWarning)

import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc
import pandas as pd
import anndata
from dca.api import dca

  from pandas.core.index import RangeIndex
  _config = yaml.load(open(_config_path))


In [2]:
# Load the cleaned inputs. The expression matrix has already been filtered
# upstream to keep only genes whose total expression across all cells is > 1.
# The other two files hold the per-cell and per-gene metadata tables.
# All three files use their first column as the row index.
data, cellinfo, geneinfo = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "output/Competition_forWGCNA.csv",
        "output/DCA_cellInfo.csv",
        "output/DCA_geneInfo.csv",
    )
)

In [3]:
# Flip the matrix so that the rows of `data` become columns and vice versa;
# the AnnData object built later takes this transposed table as its X.
data_T = data.T

In [4]:
# Sanity check after the transpose: report the dimensions, then leave the
# preview as the cell's last expression so Jupyter renders it as a table
# instead of a wrapped plain-text dump.
print(data_T.shape)
data_T.head()

                    MIR1302-2HG  AL627309.1  AL627309.3  AL627309.5  \
TAGTGCAGTTCAAAGA_2            0           0           0           0   
ATTGTTCCACCTATCC_2            0           0           0           0   
AGGTCATTCATGAGTC_2            0           0           0           0   
AAGACTCTCCTAAGTG_2            0           0           0           0   
ACACTGAGTGTTGAGG_2            0           0           0           0   

                    AP006222.2  AL732372.1  AC114498.2  AL669831.2  LINC01409  \
TAGTGCAGTTCAAAGA_2           0           0           0           0          0   
ATTGTTCCACCTATCC_2           0           0           0           0          0   
AGGTCATTCATGAGTC_2           0           0           0           0          0   
AAGACTCTCCTAAGTG_2           0           0           0           0          0   
ACACTGAGTGTTGAGG_2           0           0           0           0          0   

                    FAM87B  ...  MT-CYB  AC136352.4  AC011043.1  AL592183.1  \
TAGTGCA

In [5]:
# The obs/var labels are overwritten below from cellinfo["X"] and
# geneinfo["Gene"]. That is only correct if those columns list the cells and
# genes in exactly the same order as the expression matrix, so check this
# explicitly instead of silently attaching the wrong label to a row or column.
if list(data_T.index) != list(cellinfo["X"]):
    raise ValueError(
        "cellinfo['X'] does not match the row (cell) order of the expression matrix"
    )
if list(data_T.columns) != list(geneinfo["Gene"]):
    raise ValueError(
        "geneinfo['Gene'] does not match the column (gene) order of the expression matrix"
    )

# Build the annotated matrix: rows of data_T become observations, columns become variables.
adata = sc.AnnData(data_T, obs=cellinfo, var=geneinfo)
adata.obs_names = cellinfo["X"]
adata.var_names = geneinfo["Gene"]

# Drop genes with fewer than one count in total (filters `adata` in place).
sc.pp.filter_genes(adata, min_counts=1)
print(adata)

AnnData object with n_obs × n_vars = 10410 × 28683 
    obs: 'SampleName', 'cellName', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'initialCell', 'initialFluor', 'initialTransplant', 'Paradigm', 'initialGroup', 'Capture', 'initialSampleName', 'EGFP_assign', 'mCherry_assign', 'zero_assign', 'EGFP', 'mCherry', 'Cell', 'Fluorophore', 'Transplant', 'Group', 'RNA_snn_res.0.2', 'seurat_clusters', 'leiden_clusters', 'CellType', 'S.Score', 'G2M.Score', 'Phase', 'otherGroup', 'newLabel', 'ngeneson', 'X'
    var: 'Gene', 'n_counts'


### Denoise data

In [6]:
%%time
adata_ae=adata.copy()
dca(adata_ae, threads = 1, mode="denoise")

dca: Successfully preprocessed 28683 genes and 10410 cells.




2022-12-14 09:33:20.247400: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-14 09:33:21.321269: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:196] None of the MLIR optimization passes are enabled (registered 0 passes)
2022-12-14 09:33:21.347312: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2700120000 Hz


dca: Calculating reconstructions...
CPU times: user 49min 23s, sys: 47.5 s, total: 50min 10s
Wall time: 51min 10s


In [7]:
# Apply the same preprocessing (per-cell normalization, log1p, PCA) to both
# the original and the denoised object.
# NOTE(review): each step modifies the object it is given, so re-running this
# cell without rebuilding `adata` / `adata_ae` would transform the data again.
for ann in (adata, adata_ae):
    sc.pp.normalize_per_cell(ann)
    sc.pp.log1p(ann)
    sc.pp.pca(ann)

In [8]:
# Re-inspect the original object after preprocessing; compare with the summary
# printed when it was built to see which fields the preprocessing cell added.
print(adata)

AnnData object with n_obs × n_vars = 10410 × 28683 
    obs: 'SampleName', 'cellName', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'initialCell', 'initialFluor', 'initialTransplant', 'Paradigm', 'initialGroup', 'Capture', 'initialSampleName', 'EGFP_assign', 'mCherry_assign', 'zero_assign', 'EGFP', 'mCherry', 'Cell', 'Fluorophore', 'Transplant', 'Group', 'RNA_snn_res.0.2', 'seurat_clusters', 'leiden_clusters', 'CellType', 'S.Score', 'G2M.Score', 'Phase', 'otherGroup', 'newLabel', 'ngeneson', 'X', 'n_counts'
    var: 'Gene', 'n_counts'
    uns: 'pca'
    obsm: 'X_pca'
    varm: 'PCs'


In [13]:
# Turn the denoised object's matrix into a labelled table with one row per
# variable (gene) and one column per observation (cell).
# NOTE(review): this reads adata_ae.X as it stands now, i.e. after any
# preprocessing that earlier cells applied to adata_ae — confirm that is intended.
df_ae = pd.DataFrame(
    adata_ae.X.T,
    index=adata_ae.var_names,
    columns=adata_ae.obs_names,
)

In [14]:
# Preview the first rows of the table (rows: var_names, columns: obs_names)
# before it is written to disk.
df_ae.head()

X,TAGTGCAGTTCAAAGA_2,ATTGTTCCACCTATCC_2,AGGTCATTCATGAGTC_2,AAGACTCTCCTAAGTG_2,ACACTGAGTGTTGAGG_2,TAATTCCAGATAGCTA_2,AAACGCTCACCCTAGG_2,CCGATGGGTAACATGA_2,ATTGGGTTCGCCACTT_2,ACCACAAAGACTTCGT_2,...,CCATCACGTGATTGGG_14,CTGAGCGGTGATGAAT_14,TAACGACAGTGTACCT_14,AAGCGAGGTCTCAGGC_14,GGGTGAAGTATTCTCT_14,GACTCTCAGGGTCTTT_14,GAGGGTATCCTCAGAA_14,TTTCACATCACGAACT_14,CTTGATTAGAGAGGGC_14,ATAGACCCAGAACTAA_14
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MIR1302-2HG,0.000624,0.000368,0.000424,0.000234,0.000528,0.000313,0.002611,0.000276,0.001413,0.000587,...,0.001751,0.000678,0.000673,0.000459,0.000788,0.000979,0.000804,0.00091,0.00122,0.002423
AL627309.1,0.002232,0.001378,0.001294,0.000576,0.000957,0.000365,0.015789,0.000562,0.007424,0.000765,...,0.009037,0.002327,0.002396,0.000779,0.002484,0.00304,0.001614,0.00393,0.005092,0.015032
AL627309.3,0.000496,0.000333,0.000348,0.000176,0.000407,0.000218,0.00306,0.000188,0.001515,0.000374,...,0.001943,0.000563,0.000566,0.00032,0.000739,0.000715,0.000694,0.000859,0.001077,0.002988
AL627309.5,0.005625,0.009217,0.003767,0.002041,0.003323,0.001002,0.050502,0.00127,0.037291,0.003946,...,0.011921,0.006742,0.006889,0.004011,0.012304,0.011661,0.005081,0.016676,0.012863,0.019926
AP006222.2,0.000598,0.000404,0.000426,0.000183,0.000447,0.000197,0.003142,0.000247,0.001745,0.000418,...,0.00233,0.000721,0.000749,0.000311,0.00091,0.00076,0.000805,0.001073,0.001385,0.003124


In [15]:
# Write the table to CSV, including its row and column labels.
# NOTE(review): adata_ae went through normalize_per_cell / log1p in the
# preprocessing loop earlier in this notebook, so these look like log-normalized
# values rather than raw DCA output — confirm this is what downstream analysis expects.
df_ae.to_csv("output/Denoised_Competition_matrix.csv")

## END OF FILE