In [1]:
import os
import scipy
import numpy as np
import pandas as pd
import math
import sys
import multivelo as mv
import scanpy as sc
import scvelo as scv
import matplotlib.pyplot as plt
import requests
# from dtw import *

## Load the data

In [5]:
celltypes =  pd.read_csv("/mnt/data0/halo/A594/GSM3271040_RNA_sciCAR_A549_cell.txt.gz")
celltypes

Unnamed: 0,sample,cell_name,experiment,treatment_time
0,sci-RNA-A-001.CGCCAGGCAT,293T,coassay,
1,sci-RNA-A-001.AAGTACGTTA,A549,coassay,3.0
2,sci-RNA-A-001.GCCATCAACT,3T3,coassay,
3,sci-RNA-A-001.TCTCTCATCC,A549,coassay,0.0
4,sci-RNA-A-001.TCCGCCGGTC,A549,coassay,3.0
...,...,...,...,...
6088,sci-RNA-E-096.CGAATCTCCT,A549,coassay,3.0
6089,sci-RNA-E-096.ATATGCCATC,A549,coassay,3.0
6090,sci-RNA-E-096.TTGCAGCATT,A549,coassay,1.0
6091,sci-RNA-E-096.ACTCTACTGG,A549,coassay,1.0


In [6]:
genes =  pd.read_csv("/mnt/data0/halo/A594/GSM3271040_RNA_sciCAR_A549_gene.txt.gz")
genes

Unnamed: 0,gene_id,gene_type,gene_short_name
0,ENSG00000223972.4,pseudogene,DDX11L1
1,ENSG00000227232.4,pseudogene,WASH7P
2,ENSG00000243485.2,lincRNA,MIR1302-11
3,ENSG00000237613.2,lincRNA,FAM138A
4,ENSG00000268020.2,pseudogene,OR4G4P
...,...,...,...
113148,ENSMUSG00000064368.1,protein_coding,mt-Nd6
113149,ENSMUSG00000064369.1,Mt_tRNA,mt-Te
113150,ENSMUSG00000064370.1,protein_coding,mt-Cytb
113151,ENSMUSG00000064371.1,Mt_tRNA,mt-Tt


In [9]:
from scipy.sparse import coo_matrix
genecount =np.genfromtxt("/mnt/data0/halo/A594/GSM3271040_RNA_sciCAR_A549_gene_count.txt", skip_header=1)
rows = genecount[:, 0].astype(int)
cols = genecount[:, 1].astype(int)
vals = genecount[:, 2]

# Create the COO sparse matrix
sparse_mat = coo_matrix((vals, (rows, cols)))

sparse_mat

<113154x6094 sparse matrix of type '<class 'numpy.float64'>'
	with 9251102 stored elements in COOrdinate format>

In [10]:
sparse_mat = sparse_mat.tocsr().T
sparse_mat = sparse_mat[1:, 1:]
sparse_mat.shape

(6093, 113153)

In [11]:
import anndata as ad

rna_data = ad.AnnData(sparse_mat)

In [12]:
rna_data.obs = celltypes
rna_data.var = genes

In [20]:
## remove Nans
celltypes[celltypes.cell_name=="A549"]

Unnamed: 0,sample,cell_name,experiment,treatment_time
1,sci-RNA-A-001.AAGTACGTTA,A549,coassay,3.0
3,sci-RNA-A-001.TCTCTCATCC,A549,coassay,0.0
4,sci-RNA-A-001.TCCGCCGGTC,A549,coassay,3.0
5,sci-RNA-A-001.TTCTATAGAG,A549,coassay,1.0
7,sci-RNA-A-001.CGTCTATGAA,A549,coassay,1.0
...,...,...,...,...
6088,sci-RNA-E-096.CGAATCTCCT,A549,coassay,3.0
6089,sci-RNA-E-096.ATATGCCATC,A549,coassay,3.0
6090,sci-RNA-E-096.TTGCAGCATT,A549,coassay,1.0
6091,sci-RNA-E-096.ACTCTACTGG,A549,coassay,1.0


In [21]:
rna_data = rna_data[celltypes[celltypes.cell_name=="A549"].index, :]
rna_data

View of AnnData object with n_obs × n_vars = 4277 × 113153
    obs: 'sample', 'cell_name', 'experiment', 'treatment_time'
    var: 'gene_id', 'gene_type', 'gene_short_name'

In [22]:
## preprocessing RNA data
rna_data.var["modality"] = "Gene Expression"
rna_data.var["feature_types"] = "Gene Expression"

In [24]:
rna_data.write_h5ad("data/datasets/A549_rna.h5ad")

In [28]:
## Read ATAC data

cells =  pd.read_csv("/mnt/data0/halo/A594/GSM3271041_ATAC_sciCAR_A549_cell.txt.gz")
cells[cells.group=="A549_0h"]

Unnamed: 0,sample,source,group,experiment
6,sci-RNA-A-071.GGCTGCCTTA,Human,A549_0h,co_assay
12,sci-RNA-A-071.TGGCAGAAGT,Human,A549_0h,co_assay
20,sci-RNA-A-023.ATGAGTTCTC,Human,A549_0h,co_assay
26,sci-RNA-A-059.GACCAATGCG,Human,A549_0h,co_assay
28,sci-RNA-A-059.CCTAAGCGGT,Human,A549_0h,co_assay
...,...,...,...,...
6062,sci-RNA-E-070.CCATCGGACC,Human,A549_0h,co_assay
6063,sci-RNA-E-070.ACGCGCTCCT,Human,A549_0h,co_assay
6071,sci-RNA-E-022.CTGGTTGGTT,Human,A549_0h,co_assay
6077,sci-RNA-E-022.CGTAAGGAGT,Human,A549_0h,co_assay


In [29]:
ATAC_feature =  pd.read_csv("/mnt/data0/halo/A594/GSM3271041_ATAC_sciCAR_A549_peak.txt.gz")
ATAC_feature

Unnamed: 0,id,peak,chr,start,end
0,1,1-9963-10665,1,9963,10665
1,2,1-11369-12010,1,11369,12010
2,3,1-24886-25386,1,24886,25386
3,4,1-29054-30366,1,29054,30366
4,5,1-36073-36581,1,36073,36581
...,...,...,...,...,...
189598,189599,hs37d5-35449616-35449816,hs37d5,35449616,35449816
189599,189600,hs37d5-35450394-35450635,hs37d5,35450394,35450635
189600,189601,hs37d5-35454173-35454373,hs37d5,35454173,35454373
189601,189602,hs37d5-35455021-35455259,hs37d5,35455021,35455259


In [30]:
ataccount =np.genfromtxt("/mnt/data0/halo/A594/GSM3271041_ATAC_sciCAR_A549_peak_count.txt.gz", skip_header=1)
rows = ataccount[:, 0].astype(int)
cols = ataccount[:, 1].astype(int)
vals = ataccount[:, 2]

# Create the COO sparse matrix
sparse_mat = coo_matrix((vals, (rows, cols)))
sparse_mat = sparse_mat.tocsr().T
sparse_mat = sparse_mat[1:, 1:]
sparse_mat.shape

(6085, 189603)

In [31]:
atac_data = ad.AnnData(sparse_mat)
atac_data.obs = cells
atac_data.var = ATAC_feature

In [35]:
atac_data.obs = atac_data.obs.set_index("sample")
rna_data.obs = rna_data.obs.set_index("sample")

In [38]:
rna_data.obs

Unnamed: 0_level_0,cell_name,experiment,treatment_time
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sci-RNA-A-001.AAGTACGTTA,A549,coassay,3.0
sci-RNA-A-001.TCTCTCATCC,A549,coassay,0.0
sci-RNA-A-001.TCCGCCGGTC,A549,coassay,3.0
sci-RNA-A-001.TTCTATAGAG,A549,coassay,1.0
sci-RNA-A-001.CGTCTATGAA,A549,coassay,1.0
...,...,...,...
sci-RNA-E-096.CGAATCTCCT,A549,coassay,3.0
sci-RNA-E-096.ATATGCCATC,A549,coassay,3.0
sci-RNA-E-096.TTGCAGCATT,A549,coassay,1.0
sci-RNA-E-096.ACTCTACTGG,A549,coassay,1.0


In [51]:
atac_data.obs[atac_data.obs['group'].str.contains("A549")]

atac_data = atac_data[atac_data.obs[atac_data.obs['group'].str.contains("A549")].index, :]
atac_data

View of AnnData object with n_obs × n_vars = 4258 × 189603
    obs: 'source', 'group', 'experiment'
    var: 'id', 'peak', 'chr', 'start', 'end'

In [56]:
rna_data.obs.index.sort_values()

Index(['sci-RNA-A-001.AAGTACGTTA', 'sci-RNA-A-001.AGGTAGAGCT',
       'sci-RNA-A-001.ATCTAGGTTC', 'sci-RNA-A-001.CGAATCTCCT',
       'sci-RNA-A-001.CGTATTGAGA', 'sci-RNA-A-001.CGTCTATGAA',
       'sci-RNA-A-001.GACCAATGCG', 'sci-RNA-A-001.TAGCCAGCAA',
       'sci-RNA-A-001.TCCGCCGGTC', 'sci-RNA-A-001.TCTCTCATCC',
       ...
       'sci-RNA-E-096.AATCGAACTC', 'sci-RNA-E-096.ACTCTACTGG',
       'sci-RNA-E-096.ATATGCCATC', 'sci-RNA-E-096.CCGCGCAGGT',
       'sci-RNA-E-096.CCTATCATAA', 'sci-RNA-E-096.CGAATCTCCT',
       'sci-RNA-E-096.GGCGGTTGAC', 'sci-RNA-E-096.TCTCTCATCC',
       'sci-RNA-E-096.TGCCTAACTT', 'sci-RNA-E-096.TTGCAGCATT'],
      dtype='object', name='sample', length=4277)

In [62]:
rna_data

AnnData object with n_obs × n_vars = 4277 × 113153
    obs: 'cell_name', 'experiment', 'treatment_time'
    var: 'gene_id', 'gene_type', 'gene_short_name', 'modality', 'feature_types'
    layers: 'counts'

In [57]:
atac_data.obs.index.sort_values()

Index(['sci-RNA-A-001.AAGTACGTTA', 'sci-RNA-A-001.CGTATTGAGA',
       'sci-RNA-A-001.CGTCTATGAA', 'sci-RNA-A-001.GACCAATGCG',
       'sci-RNA-A-001.TAGCCAGCAA', 'sci-RNA-A-001.TCCGCCGGTC',
       'sci-RNA-A-001.TCTCTCATCC', 'sci-RNA-A-001.TTCTATAGAG',
       'sci-RNA-A-002.AAGTACGTTA', 'sci-RNA-A-002.AATCCGGTCA',
       ...
       'sci-RNA-E-096.AATCGAACTC', 'sci-RNA-E-096.ACTCTACTGG',
       'sci-RNA-E-096.ATATGCCATC', 'sci-RNA-E-096.CCTATCATAA',
       'sci-RNA-E-096.CGAATCTCCT', 'sci-RNA-E-096.CTGAAGAGAC',
       'sci-RNA-E-096.GGCGGTTGAC', 'sci-RNA-E-096.TCTCTCATCC',
       'sci-RNA-E-096.TGAGACTCTA', 'sci-RNA-E-096.TGCCTAACTT'],
      dtype='object', name='sample', length=4258)

In [63]:
intersection = atac_data.obs.index.intersection(rna_data.obs.index)


In [64]:
intersection

Index(['sci-RNA-A-071.GCGGAGTCGA', 'sci-RNA-A-071.TTGCAGCATT',
       'sci-RNA-A-071.GCGGCCAATC', 'sci-RNA-A-071.CTGAAGAGAC',
       'sci-RNA-A-071.GGCTCGAGAT', 'sci-RNA-A-071.GGCTTCTGGA',
       'sci-RNA-A-023.GCGGAGTCGA', 'sci-RNA-A-023.AATCGAACTC',
       'sci-RNA-A-023.CTGAAGAGAC', 'sci-RNA-A-023.AGCGATCCGC',
       ...
       'sci-RNA-E-070.TCTATCGGTA', 'sci-RNA-E-070.CGAATCTCCT',
       'sci-RNA-E-022.AATCCGGTCA', 'sci-RNA-E-022.ACTCTACTGG',
       'sci-RNA-E-022.CTGGTTGGTT', 'sci-RNA-E-022.GGCTATTCGA',
       'sci-RNA-E-022.TCTAGTCAAG', 'sci-RNA-E-022.TTCTCTACTA',
       'sci-RNA-E-022.TCCTCTCCGT', 'sci-RNA-E-022.ACTCGACGCC'],
      dtype='object', name='sample', length=3260)

In [65]:
rna_data = rna_data[intersection, :]
atac_data = atac_data[intersection, :]

In [66]:
rna_data

View of AnnData object with n_obs × n_vars = 3260 × 113153
    obs: 'cell_name', 'experiment', 'treatment_time'
    var: 'gene_id', 'gene_type', 'gene_short_name', 'modality', 'feature_types'
    layers: 'counts'

In [55]:
"sci-RNA-A-001.CGAATCTCCT" in atac_data.obs.index

False

In [67]:
atac_data

View of AnnData object with n_obs × n_vars = 3260 × 189603
    obs: 'source', 'group', 'experiment'
    var: 'id', 'peak', 'chr', 'start', 'end'

In [59]:
'sci-RNA-A-001.CGTATTGAGA' in rna_data.obs.index

True

In [72]:
atac_data.var

Unnamed: 0,id,peak,chr,start,end
0,1,1-9963-10665,1,9963,10665
1,2,1-11369-12010,1,11369,12010
2,3,1-24886-25386,1,24886,25386
3,4,1-29054-30366,1,29054,30366
4,5,1-36073-36581,1,36073,36581
...,...,...,...,...,...
189598,189599,hs37d5-35449616-35449816,hs37d5,35449616,35449816
189599,189600,hs37d5-35450394-35450635,hs37d5,35450394,35450635
189600,189601,hs37d5-35454173-35454373,hs37d5,35454173,35454373
189601,189602,hs37d5-35455021-35455259,hs37d5,35455021,35455259


In [70]:
atac_data.var["chr"] = atac_data.var["chr"].astype(str)


In [71]:
rna_data.write_h5ad("data/datasets/A549_rna.h5ad")
atac_data.write_h5ad("data/datasets/A549_atac.h5ad")

## Wrap up the rna and dna data to multi-modal adata

In [1]:
## preprocessing RNA data 

import os
import scipy
import numpy as np
import pandas as pd
import math
import sys
import scanpy as sc
import matplotlib.pyplot as plt

In [65]:
rna_data = sc.read_h5ad("/mnt/data0/halo/A594/A549_rna.h5ad")
rna_data

AnnData object with n_obs × n_vars = 3260 × 113153
    obs: 'cell_name', 'experiment', 'treatment_time'
    var: 'gene_id', 'gene_type', 'gene_short_name', 'modality', 'feature_types'
    layers: 'counts'

In [58]:
rna_data.obs

Unnamed: 0_level_0,cell_name,experiment,treatment_time
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sci-RNA-A-071.GCGGAGTCGA,A549,coassay,3.0
sci-RNA-A-071.TTGCAGCATT,A549,coassay,1.0
sci-RNA-A-071.GCGGCCAATC,A549,coassay,3.0
sci-RNA-A-071.CTGAAGAGAC,A549,coassay,1.0
sci-RNA-A-071.GGCTCGAGAT,A549,coassay,3.0
...,...,...,...
sci-RNA-E-022.GGCTATTCGA,A549,coassay,3.0
sci-RNA-E-022.TCTAGTCAAG,A549,coassay,1.0
sci-RNA-E-022.TTCTCTACTA,A549,coassay,1.0
sci-RNA-E-022.TCCTCTCCGT,A549,coassay,3.0


In [5]:
! pip install --user scikit-misc

Collecting scikit-misc
  Using cached scikit_misc-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.7 MB)
Installing collected packages: scikit-misc
Successfully installed scikit-misc-0.2.0


In [59]:
rna_data.layers["counts"] = rna_data.X.copy()  # preserve counts

# sc.pp.filter_genes_dispersion(rna_data, flavor="seurat", n_top_genes=None)


In [60]:
sc.pp.filter_genes_dispersion(rna_data, flavor="seurat", n_top_genes=None)


In [61]:
rna_data.layers["counts"]

<3260x6778 sparse matrix of type '<class 'numpy.float32'>'
	with 1647801 stored elements in Compressed Sparse Column format>

In [30]:
sc.pp.filter_genes_dispersion(rna_data, flavor="seurat", n_top_genes=None)
rna_data.layers["counts"]

<3260x6778 sparse matrix of type '<class 'numpy.float32'>'
	with 1647801 stored elements in Compressed Sparse Column format>

In [31]:
print(rna_data.X[1, :])

  (0, 37)	1.0
  (0, 84)	2.0
  (0, 90)	2.0
  (0, 109)	2.0
  (0, 175)	4.0
  (0, 212)	3.0
  (0, 217)	1.0
  (0, 227)	7.0
  (0, 269)	1.0
  (0, 283)	1.0
  (0, 291)	1.0
  (0, 292)	2.0
  (0, 293)	2.0
  (0, 302)	3.0
  (0, 313)	2.0
  (0, 367)	1.0
  (0, 382)	4.0
  (0, 398)	1.0
  (0, 451)	3.0
  (0, 479)	4.0
  (0, 480)	3.0
  (0, 514)	1.0
  (0, 599)	1.0
  (0, 646)	3.0
  (0, 651)	1.0
  :	:
  (0, 5465)	1.0
  (0, 5511)	1.0
  (0, 5540)	1.0
  (0, 5544)	1.0
  (0, 5581)	1.0
  (0, 5617)	1.0
  (0, 5667)	1.0
  (0, 5681)	9.0
  (0, 5800)	1.0
  (0, 5804)	2.0
  (0, 5888)	1.0
  (0, 5939)	4.0
  (0, 6119)	5.0
  (0, 6130)	4.0
  (0, 6223)	5.0
  (0, 6231)	4.0
  (0, 6251)	3.0
  (0, 6278)	3.0
  (0, 6287)	1.0
  (0, 6307)	1.0
  (0, 6368)	2.0
  (0, 6460)	2.0
  (0, 6512)	2.0
  (0, 6697)	2.0
  (0, 6728)	2.0


In [62]:
rna_data.var

Unnamed: 0,gene_id,gene_type,gene_short_name,modality,feature_types,means,dispersions,dispersions_norm
63,ENSG00000223764.2,lincRNA,RP11-54O7.3,Gene Expression,Gene Expression,0.020344,1.274681,2.337238
66,ENSG00000188976.6,protein_coding,NOC2L,Gene Expression,Gene Expression,0.158523,1.163348,2.072673
71,ENSG00000188290.6,protein_coding,HES4,Gene Expression,Gene Expression,0.024244,1.453199,2.761457
80,ENSG00000131591.13,protein_coding,C1orf159,Gene Expression,Gene Expression,0.239934,0.970455,1.614294
90,ENSG00000078808.12,protein_coding,SDF4,Gene Expression,Gene Expression,0.226573,1.258805,2.299512
...,...,...,...,...,...,...,...,...
106675,ENSMUSG00000048915.12,protein_coding,Efna5,Gene Expression,Gene Expression,0.026935,0.761691,1.118199
107987,ENSMUSG00000092341.2,lincRNA,Malat1,Gene Expression,Gene Expression,0.066167,0.608672,0.754572
108594,ENSMUSG00000012443.3,protein_coding,Kif11,Gene Expression,Gene Expression,0.023944,0.807492,1.227037
108693,ENSMUSG00000074852.3,protein_coding,Hpse2,Gene Expression,Gene Expression,0.063867,2.156290,4.432243


In [33]:
rna_data.write_h5ad("data/datasets/rna_filtered.h5ad")

In [14]:
atac_data = sc.read_10x_mtx("data/datasets/A549/ATAC")

In [63]:
atac_data = sc.read_h5ad("/mnt/data0/halo/A594/A549_atac.h5ad")

In [66]:
rna_data.var["start"] = " "
rna_data.var["chr"] = " "
rna_data.var["end"] = " "
rna_data.var["peak"] = " "
rna_data.var


Unnamed: 0,gene_id,gene_type,gene_short_name,modality,feature_types,start,chr,end,peak
0,ENSG00000223972.4,pseudogene,DDX11L1,Gene Expression,Gene Expression,,,,
1,ENSG00000227232.4,pseudogene,WASH7P,Gene Expression,Gene Expression,,,,
2,ENSG00000243485.2,lincRNA,MIR1302-11,Gene Expression,Gene Expression,,,,
3,ENSG00000237613.2,lincRNA,FAM138A,Gene Expression,Gene Expression,,,,
4,ENSG00000268020.2,pseudogene,OR4G4P,Gene Expression,Gene Expression,,,,
...,...,...,...,...,...,...,...,...,...
113148,ENSMUSG00000064368.1,protein_coding,mt-Nd6,Gene Expression,Gene Expression,,,,
113149,ENSMUSG00000064369.1,Mt_tRNA,mt-Te,Gene Expression,Gene Expression,,,,
113150,ENSMUSG00000064370.1,protein_coding,mt-Cytb,Gene Expression,Gene Expression,,,,
113151,ENSMUSG00000064371.1,Mt_tRNA,mt-Tt,Gene Expression,Gene Expression,,,,


In [68]:
atac_data.var["modality"] = "Peaks"
atac_data.var["feature_types"] = "Peaks"
atac_data.var = atac_data.var.rename(columns={"id":"gene_id", "feature":"feature_types"})
atac_data.var["gene_short_name"] = "Peaks"
atac_data.var["gene_type"] = "Peaks"
atac_data.var

Unnamed: 0,gene_id,peak,chr,start,end,modality,feature_types,gene_short_name,gene_type
0,1,1-9963-10665,1,9963,10665,Peaks,Peaks,Peaks,Peaks
1,2,1-11369-12010,1,11369,12010,Peaks,Peaks,Peaks,Peaks
2,3,1-24886-25386,1,24886,25386,Peaks,Peaks,Peaks,Peaks
3,4,1-29054-30366,1,29054,30366,Peaks,Peaks,Peaks,Peaks
4,5,1-36073-36581,1,36073,36581,Peaks,Peaks,Peaks,Peaks
...,...,...,...,...,...,...,...,...,...
189598,189599,hs37d5-35449616-35449816,hs37d5,35449616,35449816,Peaks,Peaks,Peaks,Peaks
189599,189600,hs37d5-35450394-35450635,hs37d5,35450394,35450635,Peaks,Peaks,Peaks,Peaks
189600,189601,hs37d5-35454173-35454373,hs37d5,35454173,35454373,Peaks,Peaks,Peaks,Peaks
189601,189602,hs37d5-35455021-35455259,hs37d5,35455021,35455259,Peaks,Peaks,Peaks,Peaks


In [69]:
atac_data.obs["cell_name"] =  "A549"
atac_data.obs["treatment_time"]=" "
atac_data.obs

Unnamed: 0_level_0,source,group,experiment,cell_name,treatment_time
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
sci-RNA-A-071.GCGGAGTCGA,Human,A549_3h,co_assay,A549,
sci-RNA-A-071.TTGCAGCATT,Human,A549_1h,co_assay,A549,
sci-RNA-A-071.GCGGCCAATC,Human,A549_3h,co_assay,A549,
sci-RNA-A-071.CTGAAGAGAC,Human,A549_1h,co_assay,A549,
sci-RNA-A-071.GGCTCGAGAT,Human,A549_3h,co_assay,A549,
...,...,...,...,...,...
sci-RNA-E-022.GGCTATTCGA,Human,A549_3h,co_assay,A549,
sci-RNA-E-022.TCTAGTCAAG,Human,A549_1h,co_assay,A549,
sci-RNA-E-022.TTCTCTACTA,Human,A549_1h,co_assay,A549,
sci-RNA-E-022.TCCTCTCCGT,Human,A549_3h,co_assay,A549,


In [70]:
rna_data.obs["group"] = " "
rna_data.obs["source"]="Human"
rna_data.obs

Unnamed: 0_level_0,cell_name,experiment,treatment_time,group,source
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
sci-RNA-A-071.GCGGAGTCGA,A549,coassay,3.0,,Human
sci-RNA-A-071.TTGCAGCATT,A549,coassay,1.0,,Human
sci-RNA-A-071.GCGGCCAATC,A549,coassay,3.0,,Human
sci-RNA-A-071.CTGAAGAGAC,A549,coassay,1.0,,Human
sci-RNA-A-071.GGCTCGAGAT,A549,coassay,3.0,,Human
...,...,...,...,...,...
sci-RNA-E-022.GGCTATTCGA,A549,coassay,3.0,,Human
sci-RNA-E-022.TCTAGTCAAG,A549,coassay,1.0,,Human
sci-RNA-E-022.TTCTCTACTA,A549,coassay,1.0,,Human
sci-RNA-E-022.TCCTCTCCGT,A549,coassay,3.0,,Human


In [79]:
rna_data.obs = rna_data.obs.set_index("sample")
atac_data.obs = atac_data.obs.set_index("sample")

In [80]:
rna_data.obs

Unnamed: 0_level_0,cell_name,experiment,treatment_time,group,source
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
sci-RNA-A-071.GCGGAGTCGA,A549,coassay,3.0,,Human
sci-RNA-A-071.TTGCAGCATT,A549,coassay,1.0,,Human
sci-RNA-A-071.GCGGCCAATC,A549,coassay,3.0,,Human
sci-RNA-A-071.CTGAAGAGAC,A549,coassay,1.0,,Human
sci-RNA-A-071.GGCTCGAGAT,A549,coassay,3.0,,Human
...,...,...,...,...,...
sci-RNA-E-022.GGCTATTCGA,A549,coassay,3.0,,Human
sci-RNA-E-022.TCTAGTCAAG,A549,coassay,1.0,,Human
sci-RNA-E-022.TTCTCTACTA,A549,coassay,1.0,,Human
sci-RNA-E-022.TCCTCTCCGT,A549,coassay,3.0,,Human


In [81]:
import anndata as ad
adata = ad.concat([rna_data, atac_data], join="inner", axis=1)

  utils.warn_names_duplicates("var")


In [82]:
adata

AnnData object with n_obs × n_vars = 3260 × 302756
    var: 'gene_id', 'gene_type', 'gene_short_name', 'modality', 'feature_types', 'start', 'chr', 'end', 'peak'

In [83]:
adata.obs = rna_data.obs
# adata.obs = adata.obs.drop("group",  axis=1)
adata.obs

Unnamed: 0_level_0,cell_name,experiment,treatment_time,group,source
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
sci-RNA-A-071.GCGGAGTCGA,A549,coassay,3.0,,Human
sci-RNA-A-071.TTGCAGCATT,A549,coassay,1.0,,Human
sci-RNA-A-071.GCGGCCAATC,A549,coassay,3.0,,Human
sci-RNA-A-071.CTGAAGAGAC,A549,coassay,1.0,,Human
sci-RNA-A-071.GGCTCGAGAT,A549,coassay,3.0,,Human
...,...,...,...,...,...
sci-RNA-E-022.GGCTATTCGA,A549,coassay,3.0,,Human
sci-RNA-E-022.TCTAGTCAAG,A549,coassay,1.0,,Human
sci-RNA-E-022.TTCTCTACTA,A549,coassay,1.0,,Human
sci-RNA-E-022.TCCTCTCCGT,A549,coassay,3.0,,Human


In [84]:
adata.var

Unnamed: 0,gene_id,gene_type,gene_short_name,modality,feature_types,start,chr,end,peak
0,ENSG00000223972.4,pseudogene,DDX11L1,Gene Expression,Gene Expression,,,,
1,ENSG00000227232.4,pseudogene,WASH7P,Gene Expression,Gene Expression,,,,
2,ENSG00000243485.2,lincRNA,MIR1302-11,Gene Expression,Gene Expression,,,,
3,ENSG00000237613.2,lincRNA,FAM138A,Gene Expression,Gene Expression,,,,
4,ENSG00000268020.2,pseudogene,OR4G4P,Gene Expression,Gene Expression,,,,
...,...,...,...,...,...,...,...,...,...
189598,189599,Peaks,Peaks,Peaks,Peaks,35449616,hs37d5,35449816,hs37d5-35449616-35449816
189599,189600,Peaks,Peaks,Peaks,Peaks,35450394,hs37d5,35450635,hs37d5-35450394-35450635
189600,189601,Peaks,Peaks,Peaks,Peaks,35454173,hs37d5,35454373,hs37d5-35454173-35454373
189601,189602,Peaks,Peaks,Peaks,Peaks,35455021,hs37d5,35455259,hs37d5-35455021-35455259


In [85]:
adata.var

Unnamed: 0,gene_id,gene_type,gene_short_name,modality,feature_types,start,chr,end,peak
0,ENSG00000223972.4,pseudogene,DDX11L1,Gene Expression,Gene Expression,,,,
1,ENSG00000227232.4,pseudogene,WASH7P,Gene Expression,Gene Expression,,,,
2,ENSG00000243485.2,lincRNA,MIR1302-11,Gene Expression,Gene Expression,,,,
3,ENSG00000237613.2,lincRNA,FAM138A,Gene Expression,Gene Expression,,,,
4,ENSG00000268020.2,pseudogene,OR4G4P,Gene Expression,Gene Expression,,,,
...,...,...,...,...,...,...,...,...,...
189598,189599,Peaks,Peaks,Peaks,Peaks,35449616,hs37d5,35449816,hs37d5-35449616-35449816
189599,189600,Peaks,Peaks,Peaks,Peaks,35450394,hs37d5,35450635,hs37d5-35450394-35450635
189600,189601,Peaks,Peaks,Peaks,Peaks,35454173,hs37d5,35454373,hs37d5-35454173-35454373
189601,189602,Peaks,Peaks,Peaks,Peaks,35455021,hs37d5,35455259,hs37d5-35455021-35455259


In [55]:
rna_data.var = rna_data.var[["modality", "feature_types"]]
rna_data.var


Unnamed: 0_level_0,modality,feature_types
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000223764.2,Gene Expression,Gene Expression
ENSG00000188976.6,Gene Expression,Gene Expression
ENSG00000188290.6,Gene Expression,Gene Expression
ENSG00000131591.13,Gene Expression,Gene Expression
ENSG00000078808.12,Gene Expression,Gene Expression
...,...,...
ENSMUSG00000048915.12,Gene Expression,Gene Expression
ENSMUSG00000092341.2,Gene Expression,Gene Expression
ENSMUSG00000012443.3,Gene Expression,Gene Expression
ENSMUSG00000074852.3,Gene Expression,Gene Expression


In [56]:
atac_data.var["modality"] = "Peaks"
atac_data.var["feature_types"] = "Peaks"


In [57]:
atac_data.var = atac_data.var.rename(columns={"id":"gene_id"})
atac_data.var

Unnamed: 0_level_0,peak,chr,start,end,modality,feature_types
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1-9963-10665,1,9963,10665,Peaks,Peaks
2,1-11369-12010,1,11369,12010,Peaks,Peaks
3,1-24886-25386,1,24886,25386,Peaks,Peaks
4,1-29054-30366,1,29054,30366,Peaks,Peaks
5,1-36073-36581,1,36073,36581,Peaks,Peaks
...,...,...,...,...,...,...
189599,hs37d5-35449616-35449816,hs37d5,35449616,35449816,Peaks,Peaks
189600,hs37d5-35450394-35450635,hs37d5,35450394,35450635,Peaks,Peaks
189601,hs37d5-35454173-35454373,hs37d5,35454173,35454373,Peaks,Peaks
189602,hs37d5-35455021-35455259,hs37d5,35455021,35455259,Peaks,Peaks


In [45]:
atac_data.var = atac_data.var.set_index("gene_id")

AnnData expects .var.index to contain strings, but got values like:
    [1, 2, 3, 4, 5]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


In [47]:
atac_data.var.index = atac_data.var.index.astype(str)

In [48]:
atac_data.var

Unnamed: 0_level_0,peak,chr,start,end
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1-9963-10665,1,9963,10665
2,1-11369-12010,1,11369,12010
3,1-24886-25386,1,24886,25386
4,1-29054-30366,1,29054,30366
5,1-36073-36581,1,36073,36581
...,...,...,...,...
189599,hs37d5-35449616-35449816,hs37d5,35449616,35449816
189600,hs37d5-35450394-35450635,hs37d5,35450394,35450635
189601,hs37d5-35454173-35454373,hs37d5,35454173,35454373
189602,hs37d5-35455021-35455259,hs37d5,35455021,35455259


In [58]:
adata = ad.concat([rna_data, atac_data], join="inner", axis=1)

In [87]:
adata.var["gene_id"] = adata.var.gene_id.astype(str)

In [89]:
adata.var["chr"] = adata.var.chr.astype(str)
adata.var["start"] = adata.var.start.astype(str)
adata.var["end"] = adata.var.end.astype(str)


In [90]:
adata.write_h5ad("data/datasets/A549_multiome_2.h5ad")

In [55]:
## relink the data from Minxue's data


rna_data = sc.read_10x_mtx("data/datasets/A549/RNA/")



In [56]:
rna_data

AnnData object with n_obs × n_vars = 4825 × 113153
    var: 'gene_ids'