In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import os

# Read the AnnData object that the rest of this cell works on from its .h5ad file.
h5ad_path = '/home/glennrdx/Documents/Research_Project/processed_h5ad/1.2 crypt_enriched_main.h5ad'
adata = sc.read_h5ad(h5ad_path)

def _expressed_gene_mask(adata_filt, obs_name, group, min_fraction):
    """Return a boolean per-gene mask: expressed in enough cells of ``group``."""
    in_group = np.isin(adata_filt.obs[obs_name], group)
    group_adata = adata_filt[in_group].copy()
    # Clamp to 1: for groups with fewer than 1/min_fraction cells the floored
    # threshold would be 0, which lets every gene (even never-expressed ones)
    # pass and silently disables the filter.
    min_cells = max(1, int(group_adata.n_obs * min_fraction))
    keep, _ = sc.pp.filter_genes(group_adata.X, min_cells=min_cells, inplace=False)
    return np.asarray(keep, dtype=bool)


def filter_genes(adata_filt, obs_name, group1, group2, min_fraction=0.1):
    """Keep genes expressed in at least ``min_fraction`` of cells in either group.

    Parameters
    ----------
    adata_filt
        AnnData object to filter (not modified; a filtered copy is returned).
    obs_name
        Column of ``adata_filt.obs`` that holds the group labels.
    group1, group2
        Label (or list of labels) selecting the cells of each group.
    min_fraction
        Fraction of a group's cells that must express a gene for it to be
        kept. The per-group cell threshold is ``int(n_cells * min_fraction)``,
        but never less than one cell.

    Returns
    -------
    A copy of ``adata_filt`` restricted to genes that pass the threshold in
    ``group1`` OR in ``group2``.
    """
    keep_genes = (
        _expressed_gene_mask(adata_filt, obs_name, group1, min_fraction)
        | _expressed_gene_mask(adata_filt, obs_name, group2, min_fraction)
    )
    return adata_filt[:, keep_genes].copy()

def filter_amb(adata):
    """Return a boolean mask over ``adata.var_names`` that excludes ambient genes.

    Parameters
    ----------
    adata
        Object exposing ``var_names`` (gene names), e.g. an AnnData.

    Returns
    -------
    numpy.ndarray of bool, one entry per gene in ``adata.var_names``:
    ``True`` for genes to keep, ``False`` for genes in the ambient list.
    Matching is exact and case-sensitive.
    """
    # ambient genes for filtering, see processing notebook
    ambient_genes = ['Itln1', 'Spink4', 'Zg16', 'Lyz1', 'Defa21', 'Gm14851', 'Defa22', 'Gm15308', 'Gm15284',
                     'Defa20', 'Gm14850', 'Gm7861', 'Defa17', 'AY761184', 'Ang4', 'Agr2', 'Clps', 'Tff3',
                     'Defa24', 'Fcgbp']
    # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
    ix_amb_genes = np.isin(np.asarray(adata.var_names), ambient_genes, invert=True)
    return ix_amb_genes

def create_gct_file(count_matrix, output_file):
    """Write ``count_matrix`` to ``output_file`` as a GCT v1.2 text file.

    Parameters
    ----------
    count_matrix
        pandas DataFrame with one row per gene (index = gene names) and one
        column per sample (columns = sample names).
    output_file
        Path of the file to create; an existing file is overwritten.

    The file layout is: a ``#1.2`` version line, a ``rows<TAB>columns`` line,
    a header line (``NAME``, ``Description``, then the sample names) and one
    line per gene. The gene name is used for both the NAME and Description
    fields.
    """
    n_genes, n_samples = count_matrix.shape
    # str() each label so non-string sample names (e.g. integers) do not break join().
    sample_names = "\t".join(map(str, count_matrix.columns))
    # Iterate rows positionally instead of via .loc[gene]: a label lookup
    # returns a DataFrame when gene names are duplicated (corrupting the row)
    # and is much slower. tolist() yields plain Python scalars.
    rows = zip(count_matrix.index, count_matrix.to_numpy().tolist())
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("#1.2\n")
        f.write(f"{n_genes}\t{n_samples}\n")
        f.write("NAME\tDescription\t" + sample_names + "\n")
        f.writelines(
            f"{gene}\t{gene}\t" + "\t".join(map(str, values)) + "\n"
            for gene, values in rows
        )

def create_cls_file(metadata, output_file):
    """Write the ``Diet`` column of ``metadata`` as a categorical CLS file.

    Parameters
    ----------
    metadata
        Table with one row per sample and a ``'Diet'`` column; rows must be in
        the same order as the sample columns of the matching GCT file.
    output_file
        Path of the file to create; an existing file is overwritten.

    The file has three lines: ``<n_samples> <n_classes> 1``, then
    ``# <class names>``, then one numeric label per sample.
    """
    diets = [str(diet) for diet in metadata['Diet']]
    # GSEA assigns the first class name to the first label that appears on the
    # label line, the second name to the second distinct label, and so on.
    # The class names must therefore be listed in order of first appearance;
    # a fixed "CD HFHSD" order swaps the phenotypes when the first sample is
    # not CD.
    class_names = list(dict.fromkeys(diets))
    label_of = {name: str(position) for position, name in enumerate(class_names)}
    # NOTE: the format is space-delimited, so diet names must not contain spaces.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f"{len(diets)} {len(class_names)} 1\n")
        f.write("# " + " ".join(class_names) + "\n")
        f.write(" ".join(label_of[diet] for diet in diets) + "\n")

# Export one GCT (expression) + CLS (phenotype) file pair per cell type for GSEA.

# Get unique cell types
cell_types = adata.obs['leiden15'].unique()

# Create main directory
gsea_dir = "/home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/3. upstream_analysis/crypt/GSEA"
os.makedirs(gsea_dir, exist_ok=True)

for cell_type in cell_types:
    # Create directory for each cell type
    cell_type_dir = os.path.join(gsea_dir, cell_type)
    os.makedirs(cell_type_dir, exist_ok=True)

    # Filter data for the current cell type
    adata_subset = adata[adata.obs['leiden15'] == cell_type]

    # Filter out ambient genes
    print(f'Removing ambient genes for {cell_type} ⌛')
    ix_amb = filter_amb(adata_subset)
    adata_subset = adata_subset[:, ix_amb].copy()

    # Keep only genes expressed in at least 10% of cells in either group (CD or HFHSD)
    print(f'Removing genes expressed in less than 10% of cells for {cell_type} ⌛')
    adata_subset = filter_genes(adata_subset, 'Diet', 'CD', 'HFHSD')

    # Create count matrix (cells x genes); accept both sparse and dense X
    expression = adata_subset.X
    if hasattr(expression, "toarray"):
        expression = expression.toarray()
    count_matrix = pd.DataFrame(np.asarray(expression),
                                index=adata_subset.obs_names,
                                columns=adata_subset.var_names)

    # Aggregate by sample: sum the values in X over all cells of each sample
    # NOTE(review): presumably X holds raw counts here - confirm upstream.
    count_matrix = count_matrix.groupby(adata_subset.obs['Sample'], observed=True).sum()

    # Transpose count matrix for GCT format (genes x samples)
    count_matrix = count_matrix.T

    # Create metadata: one row per sample with its diet
    metadata = adata_subset.obs[['Sample', 'Diet']].drop_duplicates()
    metadata = metadata.set_index('Sample')
    if metadata.index.has_duplicates:
        # A sample annotated with more than one diet would yield more
        # phenotype labels than expression columns.
        raise ValueError(f"Samples with more than one Diet value in {cell_type}")
    # groupby() orders samples by key while drop_duplicates() keeps
    # first-appearance order; re-order the metadata so the phenotype labels
    # line up one-to-one with the expression columns.
    metadata = metadata.loc[list(count_matrix.columns)]

    # Create GCT file
    gct_file = os.path.join(cell_type_dir, f"{cell_type}_expression.gct")
    create_gct_file(count_matrix, gct_file)

    # Create CLS file
    cls_file = os.path.join(cell_type_dir, f"{cell_type}_phenotypes.cls")
    create_cls_file(metadata, cls_file)

    print(f"Processed {cell_type}")

print("Done!")

Removing ambient genes for Enterocyte Progenitor ⌛
Removing genes expressed in less than 10% of cells for Enterocyte Progenitor ⌛
Processed Enterocyte Progenitor
Removing ambient genes for Goblet ⌛
Removing genes expressed in less than 10% of cells for Goblet ⌛
Processed Goblet
Removing ambient genes for EEC ⌛
Removing genes expressed in less than 10% of cells for EEC ⌛
Processed EEC
Removing ambient genes for EEC Progenitor ⌛
Removing genes expressed in less than 10% of cells for EEC Progenitor ⌛
Processed EEC Progenitor
Removing ambient genes for Tuft Progenitor ⌛
Removing genes expressed in less than 10% of cells for Tuft Progenitor ⌛
Processed Tuft Progenitor
Removing ambient genes for ISC ⌛
Removing genes expressed in less than 10% of cells for ISC ⌛
Processed ISC
Removing ambient genes for Goblet Progenitor ⌛
Removing genes expressed in less than 10% of cells for Goblet Progenitor ⌛
Processed Goblet Progenitor
Removing ambient genes for Tuft ⌛
Removing genes expressed in less tha