In [73]:
#How to find, load and process snRNA-seq data

In [None]:
from pathlib import Path

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import wget

In [None]:
#Gene network analysis is a method designed to identify sub-networks (modules) of correlated genes, which are likely to be co-expressed.
#This can be helpful in identification of sub-networks (modules) of genes that contribute to disease.
#In this example, we will cover how to create a pairwise correlation matrix of genes, as well as how to associate them with disease.

In [None]:
#First we will cover how to find, load and process the snRNA-seq data

In [None]:
#Acquire snRNA-seq data from cellxgene portal: https://cellxgene.cziscience.com/collections/180bff9c-c8a5-4539-b13b-ddbc00d643e6
#Chosen microglia cell type to focus on from this paper: Molecular characterization of selectively vulnerable neurons in Alzheimer's Disease
#https://www.nature.com/articles/s41593-020-00764-7

In [None]:
#For this tutorial, we will be using an open access freely available dataset that has been generated from microglia of the entorhinal cortex within the brain.
#This dataset is available from the cellxgene portal, accessible here: https://cellxgene.cziscience.com/collections/180bff9c-c8a5-4539-b13b-ddbc00d643e6 entitled "Molecular characterization of selectively vulnerable neurons in Alzheimer’s Disease: EC microglia".
#SnRNA-seq was performed for Controls and donors with Alzheimer's Disease.
#This dataset was chosen due to its small size and compatibility with the purpose of the pipeline.
#This data will be available in the data/test/ directory.
#The generated dataset is stored in h5ad format.
#By the end of this section, we will have loaded and explored the dataset.


In [None]:
#Start by downloading the dataset from the original portal.
# URL of the dataset
url = "https://datasets.cellxgene.cziscience.com/1f0cd8ed-94c6-440c-bd5b-bad55e2666b1.h5ad"

# Destination path where the dataset will be saved.
# It is relative to the notebook and is the same path that the loading cell reads from,
# so the tutorial does not depend on an absolute, user-specific directory.
destination_path = "dataset/mic_leng21.h5ad"

# Download the dataset, but only when it is not already on disk, so that re-running
# this cell does not repeat the download.
destination_file = Path(destination_path)
if not destination_file.exists():
    # Create the target directory first so the download has somewhere to write to
    destination_file.parent.mkdir(parents=True, exist_ok=True)
    wget.download(url, destination_path)

#Alternatively, the dataset can be found in the dataset/test/ directory saved as mic_leng21.h5ad.

In [46]:
# Read the test dataset (h5ad file) into an AnnData object called `mic`
mic = sc.read_h5ad('dataset/mic_leng21.h5ad')

In [6]:
#inspect the loaded data
mic

AnnData object with n_obs × n_vars = 5572 × 32743
    obs: 'SampleID', 'donor_id', 'BraakStage', 'SampleBatch', 'nUMI', 'nGene', 'initialClusterAssignments', 'seurat.subclusters', 'subclusterAssignment', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length'
    uns: 'citation', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_cca', 'X_cca.aligned', 'X_tsne'

In [8]:
#Check if the gene names are in the correct format of gene symbols and not Ensembl IDs which are also common.
mic.var

Unnamed: 0,feature_is_filtered,feature_name,feature_reference,feature_biotype,feature_length
ENSG00000100568,False,VTI1B,NCBITaxon:9606,gene,5768
ENSG00000101440,False,ASIP,NCBITaxon:9606,gene,845
ENSG00000249847,False,ENSG00000249847.1,NCBITaxon:9606,gene,646
ENSG00000136630,False,HLX,NCBITaxon:9606,gene,5629
ENSG00000231731,False,ENSG00000231731.7,NCBITaxon:9606,gene,5510
...,...,...,...,...,...
ENSG00000163162,False,RNF149,NCBITaxon:9606,gene,5326
ENSG00000185189,False,NRBP2,NCBITaxon:9606,gene,4934
ENSG00000188352,False,FOCAD,NCBITaxon:9606,gene,9754
ENSG00000008394,False,MGST1,NCBITaxon:9606,gene,7441


In [47]:
#As can be seen from the gene features dataframe, the rows are currently named using the Ensembl gene naming system.
#However, this isn't helpful for our analyses as Ensembl IDs are not intuitively easy to interpret: you would need to research each Ensembl ID to identify that particular gene's name and function.
#From the second column feature_name, it appears that the original authors have converted the Ensembl IDs to gene symbol names.

#Let's go ahead and map the values in the feature_name column to the rownames of the dataframe:
# Set the "feature_name" column as the index (row names); drop=False also keeps it as a regular column
mic.var.set_index("feature_name", drop = False, inplace=True)

#It is important to note that not all Ensembl IDs map to Gene symbol names, as can be seen within rows 3 and 5 within the top of the dataframe.
#Therefore, since there is not a mapping for all Ensembl IDs, we shall remove these rows from the dataframe as they will be difficult to interpret in subsequent analyses.
# Boolean mask over the genes: True for names that do NOT start with "ENSG" (i.e. genes to keep)
filter_genes = ~mic.var.index.str.startswith("ENSG")

# Filter genes based on the condition.
# .copy() turns the subset into an independent AnnData object instead of a view of the
# unfiltered one, so that the in-place preprocessing steps that follow operate on real data.
mic = mic[:, filter_genes].copy()


In [26]:
mic

View of AnnData object with n_obs × n_vars = 5572 × 24751
    obs: 'SampleID', 'donor_id', 'BraakStage', 'SampleBatch', 'nUMI', 'nGene', 'initialClusterAssignments', 'seurat.subclusters', 'subclusterAssignment', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length'
    uns: 'citation', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_cca', 'X_cca.aligned', 'X_tsne'

In [27]:
mic.var

Unnamed: 0_level_0,feature_is_filtered,feature_name,feature_reference,feature_biotype,feature_length
feature_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
VTI1B,False,VTI1B,NCBITaxon:9606,gene,5768
ASIP,False,ASIP,NCBITaxon:9606,gene,845
HLX,False,HLX,NCBITaxon:9606,gene,5629
C16orf87,False,C16orf87,NCBITaxon:9606,gene,7216
HMBOX1,False,HMBOX1,NCBITaxon:9606,gene,6633
...,...,...,...,...,...
RNF149,False,RNF149,NCBITaxon:9606,gene,5326
NRBP2,False,NRBP2,NCBITaxon:9606,gene,4934
FOCAD,False,FOCAD,NCBITaxon:9606,gene,9754
MGST1,False,MGST1,NCBITaxon:9606,gene,7441


In [None]:
#As can be seen, the number of genes has now been reduced from 32743 to 24751, as any rows whose gene name was still an Ensembl ID have been removed.

In [None]:
#Also calculate the highly variable genes.

In [None]:
#Calculating highly variable genes on gene expression data that has not been log-transformed or normalized appropriately can lead to issues, including the presence of infinity values.
#Log transformation is a common preprocessing step for scRNA-seq data, especially when dealing with count data, to stabilize the variance and make the data more amenable to downstream analysis. 
#It helps to mitigate the impact of high expression values and reduce the influence of technical noise.

In [5]:
# Log-transform the gene expression data in place (sc.pp.log1p applies log(1 + x) to the values in mic.X).
# NOTE(review): no library-size normalisation step (e.g. sc.pp.normalize_total) is visible before this call,
# so this is a log transform only rather than a full "log normalisation" - confirm that this is intended.
sc.pp.log1p(mic)

  if isinstance(data, AnnData) and data.isview:


In [6]:
# Flag the most variable genes; the result is stored as extra columns in mic.var
# (for example 'highly_variable'), the expression matrix itself is not subset here.
N_TOP_GENES = 1000
sc.pp.highly_variable_genes(mic, n_top_genes=N_TOP_GENES)

In [7]:
mic

AnnData object with n_obs × n_vars = 5572 × 24751
    obs: 'SampleID', 'donor_id', 'BraakStage', 'SampleBatch', 'nUMI', 'nGene', 'initialClusterAssignments', 'seurat.subclusters', 'subclusterAssignment', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'citation', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_cca', 'X_cca.aligned', 'X_tsne'

In [75]:
#Lets save the filtered object
mic.write_h5ad('dataset/mic_leng21_filtered.h5ad')

In [None]:
#We will now explore the associated metadata 

In [30]:
mic.obs.columns

Index(['SampleID', 'donor_id', 'BraakStage', 'SampleBatch', 'nUMI', 'nGene',
       'initialClusterAssignments', 'seurat.subclusters',
       'subclusterAssignment', 'tissue_ontology_term_id',
       'cell_type_ontology_term_id', 'assay_ontology_term_id',
       'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'development_stage_ontology_term_id', 'sex_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id', 'suspension_type',
       'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex',
       'tissue', 'self_reported_ethnicity', 'development_stage',
       'observation_joinid'],
      dtype='object')

In [None]:
#As can be seen, this dataset contains 5572 cells and, after the gene filtering above, 24751 genes (down from 32743 in the original download).
#It also has relevant metadata in the obs section, such as BraakStage. 
#The metadata may need to be encoded into the correct format for subsequent analyses, so let's have a look at the current format.

In [7]:
mic.obs

Unnamed: 0,SampleID,donor_id,BraakStage,SampleBatch,nUMI,nGene,initialClusterAssignments,seurat.subclusters,subclusterAssignment,tissue_ontology_term_id,...,tissue_type,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage,observation_joinid
EC3_AACCATGTCACGAAGG,EC3,3,0,C,219.0,193,EC:c1,0,EC:Micro.s0,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,normal,Homo sapiens,male,entorhinal cortex,unknown,71-year-old human stage,K#V~xdAtWl
EC3_AATCGGTCACTTACGA,EC3,3,0,C,266.0,223,EC:c6,1,EC:Micro.s1,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,normal,Homo sapiens,male,entorhinal cortex,unknown,71-year-old human stage,SF!nEev15F
EC3_ACAGCTAAGTGTCCCG,EC3,3,0,C,233.0,204,EC:c1,1,EC:Micro.s1,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,normal,Homo sapiens,male,entorhinal cortex,unknown,71-year-old human stage,-EP32!nM28
EC3_ACAGCTACAAGGACAC,EC3,3,0,C,285.0,242,EC:c1,1,EC:Micro.s1,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,normal,Homo sapiens,male,entorhinal cortex,unknown,71-year-old human stage,LW<Bj-F)!}
EC3_ACAGCTAGTGATGTCT,EC3,3,0,C,959.0,699,EC:c5,2,EC:Micro.s2,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,normal,Homo sapiens,male,entorhinal cortex,unknown,71-year-old human stage,xqpir=h}cz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EC10_TTTCCTCGTGGTAACG,EC10,10,6,D,1010.0,710,EC:c1,1,EC:Micro.s1,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,Alzheimer disease,Homo sapiens,male,entorhinal cortex,unknown,82-year-old human stage,YGUBOx%{wc
EC10_TTTCCTCGTTTGACAC,EC10,10,6,D,1355.0,1013,EC:c1,1,EC:Micro.s1,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,Alzheimer disease,Homo sapiens,male,entorhinal cortex,unknown,82-year-old human stage,1Ns|7j@C8q
EC10_TTTCCTCTCACAGGCC,EC10,10,6,D,1971.0,1358,EC:c1,0,EC:Micro.s0,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,Alzheimer disease,Homo sapiens,male,entorhinal cortex,unknown,82-year-old human stage,=JuQ#I!6*4
EC10_TTTGTCACAAGCGTAG,EC10,10,6,D,403.0,342,EC:c1,2,EC:Micro.s2,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,Alzheimer disease,Homo sapiens,male,entorhinal cortex,unknown,82-year-old human stage,GA7~u!F(fS


In [78]:
#Lets create a separate dataframe with the metadata information as this will be needed for the correlation analysis.
#Currently we want to create a copy of the metadata so as not to alter the original adata object.
metadata = mic.obs.copy()
metadata

Unnamed: 0,SampleID,donor_id,BraakStage,SampleBatch,nUMI,nGene,initialClusterAssignments,seurat.subclusters,subclusterAssignment,tissue_ontology_term_id,...,tissue_type,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage,observation_joinid
EC3_AACCATGTCACGAAGG,EC3,3,0,C,219.0,193,EC:c1,0,EC:Micro.s0,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,normal,Homo sapiens,male,entorhinal cortex,unknown,71-year-old human stage,K#V~xdAtWl
EC3_AATCGGTCACTTACGA,EC3,3,0,C,266.0,223,EC:c6,1,EC:Micro.s1,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,normal,Homo sapiens,male,entorhinal cortex,unknown,71-year-old human stage,SF!nEev15F
EC3_ACAGCTAAGTGTCCCG,EC3,3,0,C,233.0,204,EC:c1,1,EC:Micro.s1,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,normal,Homo sapiens,male,entorhinal cortex,unknown,71-year-old human stage,-EP32!nM28
EC3_ACAGCTACAAGGACAC,EC3,3,0,C,285.0,242,EC:c1,1,EC:Micro.s1,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,normal,Homo sapiens,male,entorhinal cortex,unknown,71-year-old human stage,LW<Bj-F)!}
EC3_ACAGCTAGTGATGTCT,EC3,3,0,C,959.0,699,EC:c5,2,EC:Micro.s2,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,normal,Homo sapiens,male,entorhinal cortex,unknown,71-year-old human stage,xqpir=h}cz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EC10_TTTCCTCGTGGTAACG,EC10,10,6,D,1010.0,710,EC:c1,1,EC:Micro.s1,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,Alzheimer disease,Homo sapiens,male,entorhinal cortex,unknown,82-year-old human stage,YGUBOx%{wc
EC10_TTTCCTCGTTTGACAC,EC10,10,6,D,1355.0,1013,EC:c1,1,EC:Micro.s1,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,Alzheimer disease,Homo sapiens,male,entorhinal cortex,unknown,82-year-old human stage,1Ns|7j@C8q
EC10_TTTCCTCTCACAGGCC,EC10,10,6,D,1971.0,1358,EC:c1,0,EC:Micro.s0,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,Alzheimer disease,Homo sapiens,male,entorhinal cortex,unknown,82-year-old human stage,=JuQ#I!6*4
EC10_TTTGTCACAAGCGTAG,EC10,10,6,D,403.0,342,EC:c1,2,EC:Micro.s2,UBERON:0002728,...,tissue,mature microglial cell,10x 3' v2,Alzheimer disease,Homo sapiens,male,entorhinal cortex,unknown,82-year-old human stage,GA7~u!F(fS


In [79]:
# Many of the per-cell annotation columns are not needed for the correlation analysis.
# They are listed here so that they can be removed from the metadata dataframe in the next step.
columns_to_remove = [
    'SampleID',
    'SampleBatch',
    'initialClusterAssignments',
    'subclusterAssignment',
    'tissue_ontology_term_id',
    'cell_type_ontology_term_id',
    'assay_ontology_term_id',
    'disease_ontology_term_id',
    'self_reported_ethnicity_ontology_term_id',
    'development_stage_ontology_term_id',
    'sex_ontology_term_id',
    'is_primary_data',
    'organism_ontology_term_id',
    'suspension_type',
    'tissue_type',
    'cell_type',
    'assay',
    'organism',
    'tissue',
    'self_reported_ethnicity',
    'observation_joinid',
]

In [80]:
# DataFrame.drop returns a new DataFrame with the listed columns removed and leaves its input unchanged,
# so the result is assigned back to `metadata` to replace the wider dataframe.
metadata = metadata.drop(columns=columns_to_remove)

In [81]:
metadata

Unnamed: 0,donor_id,BraakStage,nUMI,nGene,seurat.subclusters,disease,sex,development_stage
EC3_AACCATGTCACGAAGG,3,0,219.0,193,0,normal,male,71-year-old human stage
EC3_AATCGGTCACTTACGA,3,0,266.0,223,1,normal,male,71-year-old human stage
EC3_ACAGCTAAGTGTCCCG,3,0,233.0,204,1,normal,male,71-year-old human stage
EC3_ACAGCTACAAGGACAC,3,0,285.0,242,1,normal,male,71-year-old human stage
EC3_ACAGCTAGTGATGTCT,3,0,959.0,699,2,normal,male,71-year-old human stage
...,...,...,...,...,...,...,...,...
EC10_TTTCCTCGTGGTAACG,10,6,1010.0,710,1,Alzheimer disease,male,82-year-old human stage
EC10_TTTCCTCGTTTGACAC,10,6,1355.0,1013,1,Alzheimer disease,male,82-year-old human stage
EC10_TTTCCTCTCACAGGCC,10,6,1971.0,1358,0,Alzheimer disease,male,82-year-old human stage
EC10_TTTGTCACAAGCGTAG,10,6,403.0,342,2,Alzheimer disease,male,82-year-old human stage


In [82]:
mic

View of AnnData object with n_obs × n_vars = 5572 × 24751
    obs: 'SampleID', 'donor_id', 'BraakStage', 'SampleBatch', 'nUMI', 'nGene', 'initialClusterAssignments', 'seurat.subclusters', 'subclusterAssignment', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length'
    uns: 'citation', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_cca', 'X_cca.aligned', 'X_tsne'

In [None]:
#From investigating the metadata dataframe, BraakStage, nUMI, nGene and seurat.subclusters are all numerical, whilst disease, sex and development_stage are all character strings.
#The columns with character strings will need to be reformatted appropriately so that they can be correlated against.
#Lets first identify the unique labels within each column

In [54]:
metadata['sex'].unique()

['male']
Categories (1, object): ['male']

In [None]:
#Looks like there are only male participants. Since every donor is male, we will not be able to investigate sex differences, so this column can also be removed.

In [83]:
# 'sex' has a single value for every cell, so it is removed from the metadata
column_to_remove = 'sex'
metadata = metadata.drop(columns=[column_to_remove])

In [84]:
metadata

Unnamed: 0,donor_id,BraakStage,nUMI,nGene,seurat.subclusters,disease,development_stage
EC3_AACCATGTCACGAAGG,3,0,219.0,193,0,normal,71-year-old human stage
EC3_AATCGGTCACTTACGA,3,0,266.0,223,1,normal,71-year-old human stage
EC3_ACAGCTAAGTGTCCCG,3,0,233.0,204,1,normal,71-year-old human stage
EC3_ACAGCTACAAGGACAC,3,0,285.0,242,1,normal,71-year-old human stage
EC3_ACAGCTAGTGATGTCT,3,0,959.0,699,2,normal,71-year-old human stage
...,...,...,...,...,...,...,...
EC10_TTTCCTCGTGGTAACG,10,6,1010.0,710,1,Alzheimer disease,82-year-old human stage
EC10_TTTCCTCGTTTGACAC,10,6,1355.0,1013,1,Alzheimer disease,82-year-old human stage
EC10_TTTCCTCTCACAGGCC,10,6,1971.0,1358,0,Alzheimer disease,82-year-old human stage
EC10_TTTGTCACAAGCGTAG,10,6,403.0,342,2,Alzheimer disease,82-year-old human stage


In [59]:
#Now let's have a look at the disease variable
metadata['disease'].unique()

['normal', 'Alzheimer disease']
Categories (2, object): ['Alzheimer disease', 'normal']

In [85]:
#The disease column can be encoded into two binary (0/1) indicator variables:
# The labels are compared as plain strings and the boolean result is cast to int, so that both
# indicator columns are guaranteed to be integer typed. Applying a Python function to a categorical
# column can instead return another categorical column, which is not suitable for correlation.
disease_labels = metadata['disease'].astype(str)
metadata['AD'] = disease_labels.eq("Alzheimer disease").astype(int)
metadata['Normal'] = disease_labels.eq("normal").astype(int)

In [86]:
metadata

Unnamed: 0,donor_id,BraakStage,nUMI,nGene,seurat.subclusters,disease,development_stage,AD,Normal
EC3_AACCATGTCACGAAGG,3,0,219.0,193,0,normal,71-year-old human stage,0,1
EC3_AATCGGTCACTTACGA,3,0,266.0,223,1,normal,71-year-old human stage,0,1
EC3_ACAGCTAAGTGTCCCG,3,0,233.0,204,1,normal,71-year-old human stage,0,1
EC3_ACAGCTACAAGGACAC,3,0,285.0,242,1,normal,71-year-old human stage,0,1
EC3_ACAGCTAGTGATGTCT,3,0,959.0,699,2,normal,71-year-old human stage,0,1
...,...,...,...,...,...,...,...,...,...
EC10_TTTCCTCGTGGTAACG,10,6,1010.0,710,1,Alzheimer disease,82-year-old human stage,1,0
EC10_TTTCCTCGTTTGACAC,10,6,1355.0,1013,1,Alzheimer disease,82-year-old human stage,1,0
EC10_TTTCCTCTCACAGGCC,10,6,1971.0,1358,0,Alzheimer disease,82-year-old human stage,1,0
EC10_TTTGTCACAAGCGTAG,10,6,403.0,342,2,Alzheimer disease,82-year-old human stage,1,0


In [None]:
#Now lets sort out the development_stage column

In [87]:
metadata['development_stage'].unique()

['71-year-old human stage', '50-year-old human stage', '60-year-old human stage', '77-year-old human stage', '87-year-old human stage', '80 year-old and over human stage', '72-year-old human stage', '82-year-old human stage']
Categories (8, object): ['80 year-old and over human stage', '50-year-old human stage', '60-year-old human stage', '71-year-old human stage', '72-year-old human stage', '77-year-old human stage', '82-year-old human stage', '87-year-old human stage']

In [88]:
#There appear to be 8 categories. Lets numerically encode them
# Recode development_stage: each label is replaced by the age in years that it names.
# NOTE: '80 year-old and over human stage' is encoded as 80, which is a lower bound rather than an exact age.
development_stage_mapping = {
    '50-year-old human stage': 50,
    '60-year-old human stage': 60,
    '71-year-old human stage': 71,
    '72-year-old human stage': 72,
    '77-year-old human stage': 77,
    '80 year-old and over human stage': 80,
    '82-year-old human stage': 82,
    '87-year-old human stage': 87
}

# Work on plain strings so that the mapped result is an ordinary numeric column
stage_labels = metadata['development_stage'].astype(str)

# Fail loudly on any label that has no entry in the mapping: Series.map would otherwise turn it into
# NaN without warning (which is also what would happen if this cell were run a second time).
unmapped_labels = sorted(set(stage_labels) - set(development_stage_mapping))
if unmapped_labels:
    raise ValueError(f"No age defined for development_stage label(s): {unmapped_labels}")

metadata['development_stage'] = stage_labels.map(development_stage_mapping).astype(int)

In [89]:
metadata

Unnamed: 0,donor_id,BraakStage,nUMI,nGene,seurat.subclusters,disease,development_stage,AD,Normal
EC3_AACCATGTCACGAAGG,3,0,219.0,193,0,normal,71,0,1
EC3_AATCGGTCACTTACGA,3,0,266.0,223,1,normal,71,0,1
EC3_ACAGCTAAGTGTCCCG,3,0,233.0,204,1,normal,71,0,1
EC3_ACAGCTACAAGGACAC,3,0,285.0,242,1,normal,71,0,1
EC3_ACAGCTAGTGATGTCT,3,0,959.0,699,2,normal,71,0,1
...,...,...,...,...,...,...,...,...,...
EC10_TTTCCTCGTGGTAACG,10,6,1010.0,710,1,Alzheimer disease,82,1,0
EC10_TTTCCTCGTTTGACAC,10,6,1355.0,1013,1,Alzheimer disease,82,1,0
EC10_TTTCCTCTCACAGGCC,10,6,1971.0,1358,0,Alzheimer disease,82,1,0
EC10_TTTGTCACAAGCGTAG,10,6,403.0,342,2,Alzheimer disease,82,1,0


In [90]:
# The text 'disease' column is no longer necessary, so it is removed and the result displayed
metadata = metadata.drop(columns='disease')
metadata

Unnamed: 0,donor_id,BraakStage,nUMI,nGene,seurat.subclusters,development_stage,AD,Normal
EC3_AACCATGTCACGAAGG,3,0,219.0,193,0,71,0,1
EC3_AATCGGTCACTTACGA,3,0,266.0,223,1,71,0,1
EC3_ACAGCTAAGTGTCCCG,3,0,233.0,204,1,71,0,1
EC3_ACAGCTACAAGGACAC,3,0,285.0,242,1,71,0,1
EC3_ACAGCTAGTGATGTCT,3,0,959.0,699,2,71,0,1
...,...,...,...,...,...,...,...,...
EC10_TTTCCTCGTGGTAACG,10,6,1010.0,710,1,82,1,0
EC10_TTTCCTCGTTTGACAC,10,6,1355.0,1013,1,82,1,0
EC10_TTTCCTCTCACAGGCC,10,6,1971.0,1358,0,82,1,0
EC10_TTTGTCACAAGCGTAG,10,6,403.0,342,2,82,1,0


In [91]:
#Save the metadata dataframe
metadata.to_csv('data/mic_metadata.csv', index = True)

In [16]:
# Reload the per-cell metadata from the CSV file; index_col=0 uses the first CSV column as the row index
metadata = pd.read_csv('data/mic_metadata.csv', index_col = 0)

In [None]:
#Due to the nature of single-cell data, we naturally have many cells from the same donor.
#However, we cannot simply correlate the gene expression data in its current form, as this would mix within-donor and between-donor correlations.
#Therefore, since we are working with single-cell data, this must first be pseudobulked in order to continue with the analysis.
#This is important as it not only speeds up the computation, but most importantly negates the effects of within sample correlation.
#Also, pseudobulking can help to mitigate the issues commonly found in single-cell data, such as drop outs and high zero value counts.

In [None]:
#First we shall sort out the metadata dataframe so that it only contains one row per donor since the data will be aggregated.

In [18]:
# Convert row names to a column named 'cell_id'
metadata['cell_id'] = metadata.index

In [94]:
# One row per donor: group the cells by 'donor_id' and keep the first entry of each group,
# with 'donor_id' returned as an ordinary column rather than as the index
rows = metadata.groupby('donor_id', as_index=False).first()

In [95]:
rows

Unnamed: 0,donor_id,BraakStage,nUMI,nGene,seurat.subclusters,development_stage,AD,Normal,cell_id
0,1,0,302.0,268,1,50,0,1,EC1_AAAGATGAGGAGTTTA
1,2,0,676.0,553,1,60,0,1,EC2_AAAGCAAAGCTACCGC
2,3,0,219.0,193,0,71,0,1,EC3_AACCATGTCACGAAGG
3,4,2,348.0,295,0,72,1,0,EC4_AAAGCAAGTATGGTTC
4,5,2,447.0,328,1,77,1,0,EC5_AAACCTGAGGCGTACA
5,6,2,376.0,328,2,87,1,0,EC6_AAACGGGTCGAGAGCA
6,7,2,1399.0,965,1,80,1,0,EC7_AAACCTGGTCGTTGTA
7,8,6,619.0,500,0,72,1,0,EC8_AAACCTGAGCCGCCTA
8,9,6,531.0,437,0,82,1,0,EC9_AAACCTGCAGCTTCGG
9,10,6,688.0,549,1,82,1,0,EC10_AAACGGGGTTGGAGGT


In [96]:
# For the first cell of every donor, look up its integer row position within `metadata`
row_list = [metadata.index.get_loc(cell_id) for cell_id in rows['cell_id']]

In [97]:
row_list

[70, 318, 0, 1675, 620, 856, 1140, 1777, 3114, 4812]

In [106]:
# Select the rows at the positions held in row_list, keeping every column, as an independent copy
metadata2 = metadata.iloc[row_list, :].copy()

In [107]:
metadata2

Unnamed: 0,donor_id,BraakStage,nUMI,nGene,seurat.subclusters,development_stage,AD,Normal,cell_id
EC1_AAAGATGAGGAGTTTA,1,0,302.0,268,1,50,0,1,EC1_AAAGATGAGGAGTTTA
EC2_AAAGCAAAGCTACCGC,2,0,676.0,553,1,60,0,1,EC2_AAAGCAAAGCTACCGC
EC3_AACCATGTCACGAAGG,3,0,219.0,193,0,71,0,1,EC3_AACCATGTCACGAAGG
EC4_AAAGCAAGTATGGTTC,4,2,348.0,295,0,72,1,0,EC4_AAAGCAAGTATGGTTC
EC5_AAACCTGAGGCGTACA,5,2,447.0,328,1,77,1,0,EC5_AAACCTGAGGCGTACA
EC6_AAACGGGTCGAGAGCA,6,2,376.0,328,2,87,1,0,EC6_AAACGGGTCGAGAGCA
EC7_AAACCTGGTCGTTGTA,7,2,1399.0,965,1,80,1,0,EC7_AAACCTGGTCGTTGTA
EC8_AAACCTGAGCCGCCTA,8,6,619.0,500,0,72,1,0,EC8_AAACCTGAGCCGCCTA
EC9_AAACCTGCAGCTTCGG,9,6,531.0,437,0,82,1,0,EC9_AAACCTGCAGCTTCGG
EC10_AAACGGGGTTGGAGGT,10,6,688.0,549,1,82,1,0,EC10_AAACGGGGTTGGAGGT


In [108]:
# Use donor_id as the row index; drop=False also keeps it as a regular column
metadata2 = metadata2.set_index('donor_id', drop=False)

In [109]:
metadata2

Unnamed: 0_level_0,donor_id,BraakStage,nUMI,nGene,seurat.subclusters,development_stage,AD,Normal,cell_id
donor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,0,302.0,268,1,50,0,1,EC1_AAAGATGAGGAGTTTA
2,2,0,676.0,553,1,60,0,1,EC2_AAAGCAAAGCTACCGC
3,3,0,219.0,193,0,71,0,1,EC3_AACCATGTCACGAAGG
4,4,2,348.0,295,0,72,1,0,EC4_AAAGCAAGTATGGTTC
5,5,2,447.0,328,1,77,1,0,EC5_AAACCTGAGGCGTACA
6,6,2,376.0,328,2,87,1,0,EC6_AAACGGGTCGAGAGCA
7,7,2,1399.0,965,1,80,1,0,EC7_AAACCTGGTCGTTGTA
8,8,6,619.0,500,0,72,1,0,EC8_AAACCTGAGCCGCCTA
9,9,6,531.0,437,0,82,1,0,EC9_AAACCTGCAGCTTCGG
10,10,6,688.0,549,1,82,1,0,EC10_AAACGGGGTTGGAGGT


In [113]:
# cell_id is discarded from the donor-level dataframe
metadata2 = metadata2.drop(columns=['cell_id'])

In [114]:
metadata2

Unnamed: 0_level_0,donor_id,BraakStage,nUMI,nGene,seurat.subclusters,development_stage,AD,Normal
donor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0,302.0,268,1,50,0,1
2,2,0,676.0,553,1,60,0,1
3,3,0,219.0,193,0,71,0,1
4,4,2,348.0,295,0,72,1,0
5,5,2,447.0,328,1,77,1,0
6,6,2,376.0,328,2,87,1,0
7,7,2,1399.0,965,1,80,1,0
8,8,6,619.0,500,0,72,1,0
9,9,6,531.0,437,0,82,1,0
10,10,6,688.0,549,1,82,1,0


In [177]:
#Save the metadata
metadata2.to_csv('data//mic_metadata_pseudobulk.csv', index = True)

In [None]:
#The metadata dataframe for the pseudobulk is now complete

In [None]:
#Lets proceed to aggregate the gene expression data.
#This involves summing the gene expression data for each gene of each donor.

In [None]:
#First the gene expression matrix will need to be extracted from our mic adata object

In [None]:
#Since we are working with single-cell data, which is stored as a sparse matrix, it must first be converted into a dense matrix so that it can be turned into a dataframe.

In [8]:
# Convert the sparse matrix to a dense NumPy array.
# .toarray() is used rather than .todense() because the latter returns a numpy.matrix, a legacy
# type that NumPy discourages; a plain ndarray works the same way when building the DataFrame.
# If mic.X is already dense (it has no .toarray method) it is simply taken as an array.
dense_matrix = mic.X.toarray() if hasattr(mic.X, "toarray") else np.asarray(mic.X)

In [9]:
datExpr = pd.DataFrame(dense_matrix, index=mic.obs_names, columns=mic.var_names)

In [10]:
datExpr

feature_name,VTI1B,ASIP,HLX,C16orf87,HMBOX1,RECQL5,NXPH1,SLC5A10,VN1R2,PTOV1,...,SIX3,ACP1,TRBV29OR9-2,OR13A1,LINC02715,RNF149,NRBP2,FOCAD,MGST1,RAB5C-AS1
EC3_AACCATGTCACGAAGG,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.693147,0.000000,0.0,0.0,0.0
EC3_AATCGGTCACTTACGA,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.693147,0.0,0.0,0.0
EC3_ACAGCTAAGTGTCCCG,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
EC3_ACAGCTACAAGGACAC,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
EC3_ACAGCTAGTGATGTCT,0.0,0.0,0.0,0.0,0.000000,0.0,0.693147,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EC10_TTTCCTCGTGGTAACG,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.693147,0.000000,0.0,0.0,0.0
EC10_TTTCCTCGTTTGACAC,0.0,0.0,0.0,0.0,0.693147,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
EC10_TTTCCTCTCACAGGCC,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
EC10_TTTGTCACAAGCGTAG,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.693147,0.000000,0.0,0.0,0.0


In [None]:
#Save datExpr
#Write the single-cell gene expression dataframe to a CSV file, keeping the row names (index = True)
datExpr.to_csv('dataset/mic_datExpr_singlecell.csv', index = True)

In [None]:
#Since highly variable genes capture the most informative genes, they will be used to filter the expression matrix further.
#This is also a way to reduce the dimensionality of the data, so that downstream analyses may be more computationally efficient.

In [11]:
hvg = mic.var_names[mic.var['highly_variable']]
hvg

Index(['UBE2D2', 'SAR1A', 'IER3', 'RASGEF1C', 'TTC7A', 'TPK1', 'PTGES3',
       'HMG20B', 'GEM', 'TREM1',
       ...
       'STAB1', 'NFAT5', 'MAN1A2', 'EDA', 'SNCA', 'FAM135A', 'H2BC21',
       'FCGR3A', 'ARL2-SNX15', 'VSIG4'],
      dtype='object', name='feature_name', length=999)

In [12]:
datExpr = datExpr.loc[:,hvg]
datExpr

feature_name,UBE2D2,SAR1A,IER3,RASGEF1C,TTC7A,TPK1,PTGES3,HMG20B,GEM,TREM1,...,STAB1,NFAT5,MAN1A2,EDA,SNCA,FAM135A,H2BC21,FCGR3A,ARL2-SNX15,VSIG4
EC3_AACCATGTCACGAAGG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
EC3_AATCGGTCACTTACGA,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
EC3_ACAGCTAAGTGTCCCG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.693147,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
EC3_ACAGCTACAAGGACAC,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
EC3_ACAGCTAGTGATGTCT,0.693147,0.0,0.693147,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EC10_TTTCCTCGTGGTAACG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
EC10_TTTCCTCGTTTGACAC,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.693147,0.000000,0.0,0.0,0.0,0.693147,0.0,0.0,0.000000,0.000000
EC10_TTTCCTCTCACAGGCC,0.693147,0.0,0.000000,0.693147,0.0,0.693147,0.0,0.0,0.0,0.0,...,0.000000,0.693147,0.0,0.0,0.0,0.000000,0.0,0.0,0.693147,0.693147
EC10_TTTGTCACAAGCGTAG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.693147,0.0,0.0,0.000000,0.000000


In [None]:
#Add the donor_id column to the gene expression dataframe, so we know which cell came from which donor

In [13]:
# Turn the row index (cell names) into an ordinary column so it can be used as a merge key.
# The index is named 'index' first, so the new column is called 'index' whatever the
# index was named before; the merge further down looks the cell names up under that name.
datExpr_donor = datExpr.rename_axis(index='index').reset_index()

In [14]:
# Inspect the result: the cell names now sit in a regular 'index' column and the rows are numbered
datExpr_donor

feature_name,index,UBE2D2,SAR1A,IER3,RASGEF1C,TTC7A,TPK1,PTGES3,HMG20B,GEM,...,STAB1,NFAT5,MAN1A2,EDA,SNCA,FAM135A,H2BC21,FCGR3A,ARL2-SNX15,VSIG4
0,EC3_AACCATGTCACGAAGG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
1,EC3_AATCGGTCACTTACGA,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
2,EC3_ACAGCTAAGTGTCCCG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.693147,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
3,EC3_ACAGCTACAAGGACAC,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
4,EC3_ACAGCTAGTGATGTCT,0.693147,0.0,0.693147,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,EC10_TTTCCTCGTGGTAACG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
5568,EC10_TTTCCTCGTTTGACAC,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.693147,0.000000,0.0,0.0,0.0,0.693147,0.0,0.0,0.000000,0.000000
5569,EC10_TTTCCTCTCACAGGCC,0.693147,0.0,0.000000,0.693147,0.0,0.693147,0.0,0.0,0.0,...,0.000000,0.693147,0.0,0.0,0.0,0.000000,0.0,0.0,0.693147,0.693147
5570,EC10_TTTGTCACAAGCGTAG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.693147,0.0,0.0,0.000000,0.000000


In [19]:
# Attach each cell's donor to its expression row by matching the cell name
# ('index' column of datExpr_donor) against metadata['cell_id'].
# how='left' keeps every cell of the expression matrix; validate='many_to_one' makes
# pandas raise a MergeError if a cell_id occurs more than once in metadata, which would
# otherwise silently duplicate cells and inflate the per-donor sums computed later.
datExpr_donor = pd.merge(
    datExpr_donor,
    metadata[['cell_id', 'donor_id']],
    left_on='index',
    right_on='cell_id',
    how='left',
    validate='many_to_one',
)
# A left merge leaves donor_id as NaN for cells without a metadata match, and groupby()
# drops NaN keys by default, so fail loudly here instead of losing cells later.
assert datExpr_donor['donor_id'].notna().all(), "some cells have no matching cell_id in metadata"

In [20]:
# Inspect the merged frame: 'cell_id' and 'donor_id' are appended as the last two columns
datExpr_donor

Unnamed: 0,index,UBE2D2,SAR1A,IER3,RASGEF1C,TTC7A,TPK1,PTGES3,HMG20B,GEM,...,MAN1A2,EDA,SNCA,FAM135A,H2BC21,FCGR3A,ARL2-SNX15,VSIG4,cell_id,donor_id
0,EC3_AACCATGTCACGAAGG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,EC3_AACCATGTCACGAAGG,3
1,EC3_AATCGGTCACTTACGA,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,EC3_AATCGGTCACTTACGA,3
2,EC3_ACAGCTAAGTGTCCCG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,EC3_ACAGCTAAGTGTCCCG,3
3,EC3_ACAGCTACAAGGACAC,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,EC3_ACAGCTACAAGGACAC,3
4,EC3_ACAGCTAGTGATGTCT,0.693147,0.0,0.693147,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,EC3_ACAGCTAGTGATGTCT,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,EC10_TTTCCTCGTGGTAACG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,EC10_TTTCCTCGTGGTAACG,10
5568,EC10_TTTCCTCGTTTGACAC,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.693147,0.0,0.0,0.000000,0.000000,EC10_TTTCCTCGTTTGACAC,10
5569,EC10_TTTCCTCTCACAGGCC,0.693147,0.0,0.000000,0.693147,0.0,0.693147,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.693147,0.693147,EC10_TTTCCTCTCACAGGCC,10
5570,EC10_TTTGTCACAAGCGTAG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.693147,0.0,0.0,0.000000,0.000000,EC10_TTTGTCACAAGCGTAG,10


In [21]:
# Restore the cell names (held in the 'index' column) as the row index
datExpr_donor = datExpr_donor.set_index('index')


In [22]:
# Inspect: cell names are the row index again; 'cell_id' and 'donor_id' are still columns
datExpr_donor

Unnamed: 0_level_0,UBE2D2,SAR1A,IER3,RASGEF1C,TTC7A,TPK1,PTGES3,HMG20B,GEM,TREM1,...,MAN1A2,EDA,SNCA,FAM135A,H2BC21,FCGR3A,ARL2-SNX15,VSIG4,cell_id,donor_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EC3_AACCATGTCACGAAGG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,EC3_AACCATGTCACGAAGG,3
EC3_AATCGGTCACTTACGA,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,EC3_AATCGGTCACTTACGA,3
EC3_ACAGCTAAGTGTCCCG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,EC3_ACAGCTAAGTGTCCCG,3
EC3_ACAGCTACAAGGACAC,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,EC3_ACAGCTACAAGGACAC,3
EC3_ACAGCTAGTGATGTCT,0.693147,0.0,0.693147,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,EC3_ACAGCTAGTGATGTCT,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EC10_TTTCCTCGTGGTAACG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,EC10_TTTCCTCGTGGTAACG,10
EC10_TTTCCTCGTTTGACAC,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.693147,0.0,0.0,0.000000,0.000000,EC10_TTTCCTCGTTTGACAC,10
EC10_TTTCCTCTCACAGGCC,0.693147,0.0,0.000000,0.693147,0.0,0.693147,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.693147,0.693147,EC10_TTTCCTCTCACAGGCC,10
EC10_TTTGTCACAAGCGTAG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.693147,0.0,0.0,0.000000,0.000000,EC10_TTTGTCACAAGCGTAG,10


In [23]:
# Drop the helper 'cell_id' column: it repeats the row index (cell names) and, being text,
# must not take part in the numeric aggregation per donor.
# errors='ignore' lets this cell be re-run safely once the column is already gone.
datExpr_donor = datExpr_donor.drop(columns=['cell_id'], errors='ignore')

In [24]:
# Inspect: 'cell_id' is gone and 'donor_id' is the last column
datExpr_donor

Unnamed: 0_level_0,UBE2D2,SAR1A,IER3,RASGEF1C,TTC7A,TPK1,PTGES3,HMG20B,GEM,TREM1,...,NFAT5,MAN1A2,EDA,SNCA,FAM135A,H2BC21,FCGR3A,ARL2-SNX15,VSIG4,donor_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EC3_AACCATGTCACGAAGG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,3
EC3_AATCGGTCACTTACGA,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,3
EC3_ACAGCTAAGTGTCCCG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,3
EC3_ACAGCTACAAGGACAC,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,3
EC3_ACAGCTAGTGATGTCT,0.693147,0.0,0.693147,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EC10_TTTCCTCGTGGTAACG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,10
EC10_TTTCCTCGTTTGACAC,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.693147,0.0,0.0,0.000000,0.000000,10
EC10_TTTCCTCTCACAGGCC,0.693147,0.0,0.000000,0.693147,0.0,0.693147,0.0,0.0,0.0,0.0,...,0.693147,0.0,0.0,0.0,0.000000,0.0,0.0,0.693147,0.693147,10
EC10_TTTGTCACAAGCGTAG,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.693147,0.0,0.0,0.000000,0.000000,10


In [25]:
# Write the per-cell expression matrix to CSV; the cell names (row index) are written
# as the first column and donor_id is included as a regular column
singlecell_csv = 'dataset/mic_datExpr_donorid_singlecell.csv'
datExpr_donor.to_csv(singlecell_csv, index=True)

In [None]:
#Now that the gene expression dataframe carries a donor_id for every cell, we can aggregate the cells of each donor (pseudobulking).

In [26]:
# Build the pseudobulk matrix: add up every gene column over all cells of the same donor,
# giving one row per donor_id.
# observed=True only changes the result when donor_id is a categorical column: it then keeps
# just the donors that actually occur instead of adding all-zero rows for unused categories.
# NOTE(review): the single-cell values look log1p-transformed (0.693147 is ln 2), so these are
# sums of log values rather than logs of summed counts - confirm this is the intended pseudobulk.
pseudobulk_df = datExpr_donor.groupby('donor_id', observed=True).sum()

In [27]:
# Inspect the pseudobulk matrix: one row per donor_id, one column per gene
pseudobulk_df

Unnamed: 0_level_0,UBE2D2,SAR1A,IER3,RASGEF1C,TTC7A,TPK1,PTGES3,HMG20B,GEM,TREM1,...,STAB1,NFAT5,MAN1A2,EDA,SNCA,FAM135A,H2BC21,FCGR3A,ARL2-SNX15,VSIG4
donor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,6.931472,3.465736,19.238222,29.347748,6.238325,4.564348,6.238325,1.386294,0.693147,0.0,...,44.374969,11.49582,7.336937,5.950643,10.802673,4.85203,3.465736,4.158883,1.386294,18.13961
2,19.12044,1.386294,5.662961,65.9282,18.545074,19.525904,8.317766,6.931472,0.0,0.0,...,11.090355,11.49582,5.950643,11.090355,20.506733,11.49582,17.040997,5.545177,0.693147,5.545177
3,2.079442,0.0,1.386294,0.693147,0.0,0.693147,3.465736,0.693147,0.0,0.0,...,12.594433,0.0,2.079442,1.386294,1.386294,0.693147,0.0,1.386294,0.0,5.662961
4,6.238325,0.0,3.871201,4.85203,4.564348,2.079442,4.564348,2.079442,0.0,0.0,...,8.723231,5.545177,4.158883,2.772589,2.079442,6.238325,3.465736,4.85203,0.0,3.178054
5,8.317766,0.693147,3.465736,18.13961,28.184597,9.534162,9.416378,2.772589,0.0,0.0,...,13.169796,14.268409,7.336937,6.931472,12.882114,14.673874,6.238325,13.693045,3.465736,6.238325
6,28.079237,9.639523,13.117681,4.158883,30.158678,27.555988,24.090252,10.227308,0.693147,0.0,...,73.301506,35.010708,23.920353,13.575262,33.624413,21.317663,6.931472,40.385986,8.317766,79.844391
7,70.819923,9.704061,35.180607,288.971863,161.505615,110.800446,85.388435,13.575262,1.791759,0.0,...,54.079033,159.883743,76.41835,73.423744,124.665649,142.947891,17.158781,64.018661,6.238325,67.236404
8,74.92556,32.002552,33.50663,153.331543,115.193764,140.264847,50.54763,15.249238,0.693147,0.0,...,56.262703,124.727928,53.032536,58.747612,86.826851,119.117081,119.641464,19.525904,39.627174,56.785954
9,116.632179,68.281776,40.032639,53.032536,29.923111,133.281265,182.492737,9.704061,6.356108,1.791759,...,97.629524,183.47554,52.79697,54.706512,91.391197,103.174698,48.180504,63.429741,432.745911,251.994003
10,146.71283,17.32868,39.575058,142.371399,118.360527,141.037216,139.062012,18.427292,2.772589,0.0,...,123.094772,206.378754,58.813278,80.929451,161.92569,211.728058,32.120338,102.142883,53.373463,191.84993


In [28]:
# Write the donor-level (pseudobulk) matrix to CSV; donor_id is the row index and is
# written as the first column
pseudobulk_csv = 'dataset/mic_datExpr_pseudobulk.csv'
pseudobulk_df.to_csv(pseudobulk_csv, index=True)

In [None]:
#We now have the pseudobulked data and the corresponding metadata dataframe to start the correlation network analysis