In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import os

# Read the processed AnnData object that every later cell operates on.
h5ad_path = '/home/glennrdx/Documents/Research_Project/processed_h5ad/crypt_enriched_final.h5ad'
adata = sc.read_h5ad(h5ad_path)

In [4]:
def filter_genes(adata_filt, obs_name, group1, group2, min_frac=0.1):
    """Keep genes detected in at least ``min_frac`` of the cells of either group.

    Parameters
    ----------
    adata_filt
        AnnData object to filter; it is sliced with boolean masks on both axes.
    obs_name
        Column of ``adata_filt.obs`` that holds the group labels.
    group1, group2
        Label (or collection of labels) selecting each group, matched with
        ``np.isin`` against ``adata_filt.obs[obs_name]``.
    min_frac
        Minimum fraction of a group's cells in which a gene must be detected.
        Defaults to 0.1 (10%).

    Returns
    -------
    A copy of ``adata_filt`` restricted to genes that pass the threshold in
    ``group1`` or in ``group2``.
    """

    def _passes_in_group(group):
        # Per-gene boolean mask: detected in >= min_frac of this group's cells.
        in_group = np.isin(adata_filt.obs[obs_name], group)
        adata_group = adata_filt[in_group].copy()
        # Round the cell threshold UP so "at least min_frac" really holds
        # (the inner round() guards against float noise such as
        # 30 * 0.1 == 3.0000000000000004), and never let it fall to 0:
        # min_cells=0 makes sc.pp.filter_genes accept every gene, which would
        # silently switch the filter off for groups with fewer than 10 cells.
        min_cells = max(1, int(np.ceil(round(adata_group.n_obs * min_frac, 9))))
        gene_mask = sc.pp.filter_genes(adata_group.X, min_cells=min_cells, inplace=False)[0]
        return np.asarray(gene_mask, dtype=bool)

    # A gene is kept when it passes in either group.
    keep_genes = _passes_in_group(group1) | _passes_in_group(group2)

    return adata_filt[:, keep_genes].copy()

In [5]:
def filter_amb(adata):
    """Return a boolean mask over ``adata.var_names`` that drops ambient genes.

    Parameters
    ----------
    adata
        Object exposing ``var_names`` (the gene identifiers).

    Returns
    -------
    Boolean array with one entry per gene: ``True`` for genes to keep,
    ``False`` for genes in the ambient list below.
    """
    # Ambient genes for filtering, see processing notebook.
    ambient_genes = [
        'Itln1', 'Spink4', 'Zg16', 'Lyz1', 'Defa21', 'Gm14851', 'Defa22',
        'Gm15308', 'Gm15284', 'Defa20', 'Gm14850', 'Gm7861', 'Defa17',
        'AY761184', 'Ang4', 'Agr2', 'Clps', 'Tff3', 'Defa24', 'Fcgbp',
    ]
    # np.isin replaces the deprecated np.in1d; invert=True marks the genes
    # that are NOT in the ambient list.
    return np.isin(adata.var_names, ambient_genes, invert=True)

In [8]:
# Export one pseudobulk count matrix and one sample-metadata table per cell
# type, as CSV input for DESeq2.
# NOTE(review): DESeq2 expects raw integer counts -- confirm that adata.X
# holds unnormalised counts at this point.
OUTPUT_DIR = "/home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/3. upstream_analysis/crypt/differential_expression_deseq2/DESeq2_data_files/"

# Get unique cell types
cell_types = adata.obs['leiden15'].unique()

# Create main directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

for cell_type in cell_types:
    # Create directory for each cell type
    cell_type_dir = os.path.join(OUTPUT_DIR, cell_type)
    os.makedirs(cell_type_dir, exist_ok=True)

    # Filter data for the current cell type
    adata_subset = adata[adata.obs['leiden15'] == cell_type]

    # Filter out ambient genes
    print('Removing ambient genes ⌛')
    ix_amb = filter_amb(adata_subset)
    adata_subset = adata_subset[:, ix_amb].copy()

    # Filter out genes that are expressed in less than 10% of cells in either group (CD or HFHSD)
    print('Removing genes expressed in less than 10% of cells ⌛')
    adata_subset = filter_genes(adata_subset, 'Diet', 'CD', 'HFHSD')

    # Create count matrix (cells x genes); densify only when X is sparse
    expression = adata_subset.X
    if hasattr(expression, "toarray"):
        expression = expression.toarray()
    count_matrix = pd.DataFrame(expression,
                                index=adata_subset.obs_names,
                                columns=adata_subset.var_names)

    # Aggregate counts by sample (one pseudobulk row per sample)
    count_matrix = count_matrix.groupby(adata_subset.obs['Sample'], observed=True).sum()

    # Save count matrix
    count_matrix.to_csv(os.path.join(cell_type_dir, "count_matrix.csv"))

    # Create metadata: one row per sample with its diet
    metadata = adata_subset.obs[['Sample', 'Diet']].drop_duplicates().set_index('Sample')

    # drop_duplicates keeps first-appearance order while groupby sorts its
    # keys; put the metadata rows in the same sample order as the count
    # matrix so the two files line up for DESeq2.
    metadata = metadata.loc[count_matrix.index]

    # Save metadata
    metadata.to_csv(os.path.join(cell_type_dir, "metadata.csv"))

    print(f"Processed {cell_type}")

print("Done!")

Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Processed Enterocyte Progenitor
Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Processed Goblet
Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Processed EEC
Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Processed EEC Progenitor
Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Processed Tuft Progenitor
Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Processed ISC
Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Processed Goblet Progenitor
Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Processed Tuft
Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Processed Enterocyte
Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Processed Not Annotated
Remo