### Load the libraries

In [1]:
### Load the libraries 
import os, sys
import warnings
warnings.filterwarnings('ignore')
import pandas as pd 
import numpy as np 
import anndata as ad 
import matplotlib.pyplot as plt 
import scanpy as sc

<i><b> Print the container version </b></i>

In [2]:
# Container used for this analysis: cokorac/cs-core-image-amd64:dev (date of use: 23/01/25)

<i><b> Set the home directory </b></i>

In [3]:
# Project root that every dataset / output path below is joined onto.
# setdefault keeps a HOME_Nikola value that is already exported in the environment
# (e.g. on another machine or in another container) and only falls back to the
# original cluster location when the variable is not set.
os.environ.setdefault('HOME_Nikola', '/group/kalebic/Nikola/1_single_cell_final/Nikola_final/final/scdgomics')
home_path = os.environ['HOME_Nikola']
# Show the resolved path as the cell output
home_path

'/group/kalebic/Nikola/1_single_cell_final/Nikola_final/final/scdgomics'

<i><b> Set the plotting parameters </b></i>

In [4]:
%matplotlib inline
sc.set_figure_params(dpi = 80)

### Assemble the Linnarsson_mm_dg_GSE104323

In [5]:
# Read the tab-separated expression table; its first column becomes the row index
linnarsson_expression_file = os.path.join(
    home_path,
    'datasets/Linnarsson_mm_dg_GSE104323/GSE104323_10X_expression_data_V2.tab',
)
expression_linnarsson_mm_dg = pd.read_csv(linnarsson_expression_file, sep='\t', index_col=0)
# Preview the first rows
expression_linnarsson_mm_dg.head()

Unnamed: 0_level_0,10X79_1_TCTACCATGCCTAA-,10X79_2_GTACTAGTGAACAT-,10X79_2_AATCAGTACCTACA-,10X79_1_CGGGTTCTTGAGGT-,10X79_1_GTGGAAGGCGTACA-,10X79_1_GTCCGCAAGCCATT-,10X79_2_TAAAGCAATACGCT-,10X79_1_AGTGATCAGCAACT-,10X79_1_CTCAATCCCAAGAT-,10X79_1_CCTTGTCGGATGTT-,...,10X79_2_CCAAATCCTCCTAG-,10X79_2_ATGTAGTTATCGGT-,10X80_2_GCTAATCTTATCTG-,10X80_1_CATTTAGTACGCGA-,10X80_2_GAATCTCAGCGACC-,10X80_1_CACGGTCTACGAGT-,10X80_2_ATTGCAGCCACGTC-,10X80_1_GCGGTTCGGCATCG-,10X83_4_TTCAGCATACTCTT-,10X80_1_TTACCAGGAGTAGA-
cellid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0610007P14Rik,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009B22Rik,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009L18Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009O20Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
0610010F05Rik,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Read the tab-separated metadata table (first column becomes the row index)
# and discard every row that contains at least one missing value
linnarsson_metadata_file = os.path.join(
    home_path,
    'datasets/Linnarsson_mm_dg_GSE104323/GSE104323_metadata_barcodes_24185cells.txt',
)
metadata_linnarsson_mm_dg = (
    pd.read_csv(linnarsson_metadata_file, sep='\t', index_col=0)
    .dropna(how='any')
)
# Preview the first rows
metadata_linnarsson_mm_dg.head()

Unnamed: 0_level_0,source name,organism,characteristics: strain,characteristics: age,characteristics: sex of pooled animals,characteristics: cell cluster,molecule,SRR run accession,raw file (original file name),UMI_CellularBarcode
Sample name (24185 single cells),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10X79_1_AAACTAGCTAGCCC-,dentate gyrus,Mus musculus,hGFAP-GFP,P120,2males+1female,Neuroblast,total RNA,SRR6089817,10X79_1_AAACTAGCTAGCCC.fq.gz,CGGCGATCCC_AAACTAGCTAGCCC
10X79_1_AAACTAGGATGTAT-,dentate gyrus,Mus musculus,hGFAP-GFP,P120,2males+1female,OPC,total RNA,SRR6089947,10X79_1_AAACTAGGATGTAT.fq.gz,AGTGGTAATG_AAACTAGGATGTAT
10X79_1_AAACTCACGGCGTT-,dentate gyrus,Mus musculus,hGFAP-GFP,P120,2males+1female,GC-adult,total RNA,SRR6089529,10X79_1_AAACTCACGGCGTT.fq.gz,GGGTGCGCTC_AAACTCACGGCGTT
10X79_1_AAACTGTCGGCTCA-,dentate gyrus,Mus musculus,hGFAP-GFP,P120,2males+1female,MOL,total RNA,SRR6089595,10X79_1_AAACTGTCGGCTCA.fq.gz,CCTTTCAACG_AAACTGTCGGCTCA
10X79_1_AAACTGTGATAAGT-,dentate gyrus,Mus musculus,hGFAP-GFP,P120,2males+1female,OPC,total RNA,SRR6090058,10X79_1_AAACTGTGATAAGT.fq.gz,CCTTTCAGGT_AAACTGTGATAAGT


In [7]:
# Name of the metadata index, i.e. the column that holds the cell barcodes after reset_index()
barcode_column = 'Sample name (24185 single cells)'
# Cell barcodes in the order in which they appear as columns of the expression table
expression_barcodes = expression_linnarsson_mm_dg.columns.tolist()

# The metadata file lists the cells in a different order than the expression table
# (compare the two .head() outputs above), so the metadata must be matched to the
# expression data by barcode, not by row position.
missing_barcodes = set(expression_barcodes).difference(metadata_linnarsson_mm_dg.index)
if missing_barcodes or not metadata_linnarsson_mm_dg.index.is_unique:
    raise ValueError(
        f'Metadata cannot be matched to the expression data: '
        f'{len(missing_barcodes)} expression barcodes have no metadata row '
        f'(metadata barcodes unique: {metadata_linnarsson_mm_dg.index.is_unique})'
    )

# Put the metadata rows in expression order, then reset the index so that the
# barcodes are kept as a regular column
metadata_linnarsson_mm_dg = (
    metadata_linnarsson_mm_dg
    .loc[expression_barcodes]
    .rename_axis(barcode_column)
    .reset_index()
)
# Build the initial dataframe (cells as rows, genes as columns)
expression_linnarsson_mm_dg = expression_linnarsson_mm_dg.T
adata_linnarsson_mm_dg = ad.AnnData(expression_linnarsson_mm_dg)
# Add var_names and obs_names (the barcodes are now in the same order as the rows of the matrix)
adata_linnarsson_mm_dg.obs_names = metadata_linnarsson_mm_dg[barcode_column]
adata_linnarsson_mm_dg.var_names = expression_linnarsson_mm_dg.columns

In [8]:
# Add the metadata
# NOTE(review): the metadata frame is attached as-is; this presumably relies on its rows
# being in the same order as the cells of the AnnData object — confirm the ordering
# before trusting the per-cell annotations.
adata_linnarsson_mm_dg.obs = metadata_linnarsson_mm_dg

In [9]:
# Write the assembled AnnData object to an .h5ad file below the project root
linnarsson_output_file = os.path.join(home_path, 'data_versions/Adata_linnarsson_mm_dg_raw.h5ad')
adata_linnarsson_mm_dg.write_h5ad(linnarsson_output_file)

### Assemble the Zylka_mm_ncx_GSE123335

In [10]:
# Read the tab-separated expression table; its first column becomes the row index
zylka_expression_file = os.path.join(
    home_path,
    'datasets/Zylka_mm_ncx_GSE123335/GSE123335_E14_combined_matrix.txt',
)
expression_zylka_mm_ncx = pd.read_csv(zylka_expression_file, sep='\t', index_col=0)
# Preview the first rows
expression_zylka_mm_ncx.head()

Unnamed: 0_level_0,e14-WT10_AAAAAGCAAGAA,e14-WT10_AAAAATCTCTCC,e14-WT10_AAAACACATTCC,e14-WT10_AAAACCGTGGAT,e14-WT10_AAAACGCGCCGA,e14-WT10_AAAAGAGCCGAG,e14-WT10_AAAATGGACTCA,e14-WT10_AAAATGTCACAA,e14-WT10_AAAATTCCGCTT,e14-WT10_AAAATTGCAGAG,...,e14-WT9-2_TTTTATTGTCAA,e14-WT9-2_TTTTCCCAGATA,e14-WT9-2_TTTTCCTAATAT,e14-WT9-2_TTTTCGTTACCG,e14-WT9-2_TTTTCTATCGTT,e14-WT9-2_TTTTGGTCCCGN,e14-WT9-2_TTTTGTCAGTCT,e14-WT9-2_TTTTTAGATGTN,e14-WT9-2_TTTTTCATCGGG,e14-WT9-2_TTTTTTTACTTG
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0610005C13Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610007N19Rik,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
0610007P14Rik,0,1,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
0610009B14Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009B22Rik,0,0,0,0,0,0,0,0,1,1,...,0,0,2,0,0,0,0,0,0,0


In [11]:
# Read the tab-separated cluster annotations; the first column becomes the row index
zylka_metadata_file = os.path.join(
    home_path,
    'datasets/Zylka_mm_ncx_GSE123335/GSE123335_E14_combined_matrix_ClusterAnnotations.txt',
)
metadata_zylka_mm_ncx = pd.read_csv(zylka_metadata_file, sep='\t', index_col=0)
# Preview the first rows
metadata_zylka_mm_ncx.head()

Unnamed: 0_level_0,Cluster
CellID,Unnamed: 1_level_1
e14.WT10_AAAAAGCAAGAA,RG1 [8-E]
e14.WT10_AAAAATCTCTCC,RG1 [8-E]
e14.WT10_AAAACACATTCC,LayerV-VI [3-E]
e14.WT10_AAAACCGTGGAT,LayerV-VI [3-E]
e14.WT10_AAAACGCGCCGA,LayerV-VI [5-E]


In [12]:
# Show the dimensions of the expression table first, then of the metadata table
for zylka_table in (expression_zylka_mm_ncx, metadata_zylka_mm_ncx):
    print(zylka_table.shape)

# The expression matrix contains more cells (11069 columns) than the metadata has rows (10931).
# The paper (https://www.nature.com/articles/s41467-018-08079-9) reports 10,931 identified cells,
# so the expression matrix is subset to the cell IDs present in the metadata.

(21313, 11069)
(10931, 1)


In [13]:
# Reset the index so that the cell IDs become the regular column 'CellID'
metadata_zylka_mm_ncx = metadata_zylka_mm_ncx.reset_index()
# The metadata writes the cell IDs with '.' where the expression table uses '-'
# (e14.WT10_... vs e14-WT10_...). regex=False makes this a literal replacement:
# interpreted as a regular expression, '.' would match every character.
metadata_zylka_mm_ncx['CellID'] = metadata_zylka_mm_ncx['CellID'].str.replace('.', '-', regex=False)
# Transpose (cells as rows, genes as columns)
expression_zylka_mm_ncx = expression_zylka_mm_ncx.T
# Build the initial anndata object
adata_zylka_mm_ncx = ad.AnnData(expression_zylka_mm_ncx)

In [14]:
# Keep only cellIDs present in the metadata
metadata_barcodes = metadata_zylka_mm_ncx['CellID'].tolist()
adata_zylka_mm_ncx = adata_zylka_mm_ncx[adata_zylka_mm_ncx.obs_names.isin(metadata_barcodes)].copy()

# Every metadata row must correspond to exactly one retained cell; otherwise the
# metadata cannot be attached to the object cell by cell.
if adata_zylka_mm_ncx.n_obs != len(metadata_barcodes) or not metadata_zylka_mm_ncx['CellID'].is_unique:
    raise ValueError(
        f'Metadata cannot be matched to the expression data: {adata_zylka_mm_ncx.n_obs} cells retained '
        f'for {len(metadata_barcodes)} metadata rows '
        f'(metadata cell IDs unique: {metadata_zylka_mm_ncx["CellID"].is_unique})'
    )

# The subset keeps the cell order of the expression matrix, so put the metadata rows
# in that same order; the metadata is matched by cell ID here because it is attached
# to the object as a whole table afterwards.
metadata_zylka_mm_ncx = (
    metadata_zylka_mm_ncx
    .set_index('CellID')
    .loc[adata_zylka_mm_ncx.obs_names.tolist()]
    .rename_axis('CellID')
    .reset_index()
)

In [15]:
# Add the metadata to the anndata object
# NOTE(review): the metadata frame is attached as-is; this presumably relies on its rows
# being in the same order as the cells kept in the AnnData object — confirm the ordering
# before trusting the per-cell cluster labels.
adata_zylka_mm_ncx.obs = metadata_zylka_mm_ncx
# Display the object summary as the cell output
adata_zylka_mm_ncx

AnnData object with n_obs × n_vars = 10931 × 21313
    obs: 'CellID', 'Cluster'

In [16]:
# Write the assembled AnnData object to an .h5ad file below the project root
zylka_output_file = os.path.join(home_path, 'data_versions/Adata_zylka_mm_ncx_raw.h5ad')
adata_zylka_mm_ncx.write_h5ad(zylka_output_file)