In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import os

# Read the processed crypt dataset (EEC-enriched) from disk into an AnnData object.
h5ad_path = '/home/glennrdx/Documents/Research_Project/processed_h5ad/1.3 crypt_enriched_eec.h5ad'
adata = sc.read_h5ad(h5ad_path)

In [2]:
# Display the AnnData summary (n_obs x n_vars plus the obs/var field names) as a sanity check of the load.
adata

AnnData object with n_obs × n_vars = 3091 × 17573
    obs: 'Sample', 'Diet', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'n_genes', 'n_counts', 'doublet_score', 'predicted_doublet', '_scvi_batch', '_scvi_labels', 'leiden', 'published_annotations', 'leiden0', 'leiden1', 'agreeance_annotation', 'leiden2', 'leiden3', 'leiden4', 'leiden5', 'leiden6', 'leiden7', 'leiden8', 'leiden9', 'leiden10', 'leiden11', 'leiden12', 'leiden13', 'leiden14', 'leiden15', 'leiden16', 'leiden17', 'EEC_final'
    var: 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cell

In [4]:
import random

# Ambient gene names (the same entries filter_amb excludes); this cell only prints
# them in a shuffled order.
genes = ['Itln1','Spink4','Zg16','Lyz1','Defa21','Gm14851','Defa22','Gm15308','Gm15284',
                   'Defa20','Gm15308','Gm14850','Gm7861','Defa17','AY761184', 'Ang4','Agr2','Clps','Tff3','Defa24','Fcgbp']

# Shuffle with a private, fixed-seed generator so that Restart & Run All prints the
# same order every time and the global `random` state is left untouched.
random.Random(0).shuffle(genes)
print(', '.join(genes))

Defa20, Ang4, Gm14850, Gm7861, Defa22, Gm15308, Itln1, Zg16, Defa17, Lyz1, Gm15284, Defa21, Fcgbp, Agr2, Gm15308, Clps, Spink4, Gm14851, AY761184, Defa24, Tff3


In [14]:
def filter_amb(adata, ambient_genes=None):
    """Build a boolean keep-mask over ``adata.var_names`` that drops ambient genes.

    Parameters
    ----------
    adata
        Any object exposing ``var_names`` (one gene name per variable).
    ambient_genes
        Optional iterable of gene names to exclude. When omitted, the built-in
        list below is used.

    Returns
    -------
    numpy.ndarray of bool
        One entry per gene in ``adata.var_names``; ``True`` means the gene is
        NOT in the ambient list and should be kept.
    """
    if ambient_genes is None:
        # ambient genes for filtering, see processing notebook
        ambient_genes = ['Itln1', 'Spink4', 'Zg16', 'Lyz1', 'Defa21', 'Gm14851', 'Defa22', 'Gm15308', 'Gm15284',
                         'Defa20', 'Gm14850', 'Gm7861', 'Defa17', 'AY761184', 'Ang4', 'Agr2', 'Clps', 'Tff3',
                         'Defa24', 'Fcgbp']
    # np.isin supersedes np.in1d (deprecated since NumPy 2.0); invert=True turns the
    # membership test into a "not ambient" mask.
    ix_amb_genes = np.isin(adata.var_names, list(ambient_genes), invert=True)
    return ix_amb_genes

In [15]:
def _detected_in_group(adata_filt, obs_name, group, min_fraction):
    """Return a boolean gene mask: True where a gene passes the cell-count threshold within `group`."""
    in_group = np.isin(adata_filt.obs[obs_name], group)
    group_adata = adata_filt[in_group].copy()
    # Floor the threshold at one cell. For groups with fewer than 1/min_fraction
    # cells, int() truncates to 0 and min_cells=0 would let every gene pass,
    # including genes never detected in the group.
    min_cells = max(1, int(group_adata.n_obs * min_fraction))
    return sc.pp.filter_genes(group_adata.X, min_cells=min_cells, inplace=False)[0]


def filter_genes(adata_filt, obs_name, group1, group2, min_fraction=0.1):
    """Keep genes detected in enough cells of at least one of two groups.

    Parameters
    ----------
    adata_filt
        AnnData object to filter; it is not modified.
    obs_name
        Column of ``adata_filt.obs`` holding the group labels.
    group1, group2
        Label (or labels) selecting the two groups of cells to compare.
    min_fraction
        Fraction of a group's cells in which a gene must be detected
        (default 0.1). The resulting cell count is truncated to an integer
        and never drops below 1.

    Returns
    -------
    AnnData
        A copy of ``adata_filt`` restricted to genes that pass the threshold
        in ``group1`` or in ``group2``.
    """
    keep_group1 = _detected_in_group(adata_filt, obs_name, group1, min_fraction)
    keep_group2 = _detected_in_group(adata_filt, obs_name, group2, min_fraction)

    # A gene is retained when it passes in either group (vectorised OR).
    ix_genes = np.logical_or(keep_group1, keep_group2)

    return adata_filt[:, ix_genes].copy()

In [16]:
# Define the obs column used for clustering and the directory where the CSVs will be saved
annotation = 'leiden15'
output_dir = "/home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/3. upstream_analysis/crypt/differential_expression_scanpy"

# Alternative configuration (EEC subpopulations) - swap with the two lines above to use it:
# annotation = 'EEC_final'
# output_dir = "/home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/3. upstream_analysis/crypt/differential_expression_scanpy/EEC_subpopulations"

# Diet labels being contrasted. Results are reported for `test_group`; with the
# Diet categories ordered ('CD', 'HFHSD') this is the column the former hard-coded
# positional index 1 selected, but it no longer depends on category order.
diet_key = 'Diet'
reference_group = 'CD'
test_group = 'HFHSD'
# Clusters with fewer cells than this in either diet are skipped instead of
# aborting the whole loop.
min_cells_per_group = 2

# Ensure the directory exists (create it if it doesn't)
os.makedirs(output_dir, exist_ok=True)

# Get the unique cluster identifiers
annot = adata.obs[annotation].unique()

# Perform differential expression analysis for each cluster
for cluster in annot:

    # Subset the data for the cluster
    adata_cluster = adata[adata.obs[annotation] == cluster].copy()

    # A diet contrast needs cells from both diets; report and skip clusters that lack them.
    diet_counts = adata_cluster.obs[diet_key].value_counts()
    n_reference = int(diet_counts.get(reference_group, 0))
    n_test = int(diet_counts.get(test_group, 0))
    if min(n_reference, n_test) < min_cells_per_group:
        print(f"Skipping cluster {cluster}: {n_reference} {reference_group} and {n_test} {test_group} cells "
              f"(need at least {min_cells_per_group} of each)")
        continue

    # Filter out ambient genes
    print('Removing ambient genes ⌛')
    ix_amb = filter_amb(adata_cluster)
    adata_cluster = adata_cluster[:, ix_amb].copy()

    # Keep only genes expressed in at least 10% of cells in either diet group
    print('Removing genes expressed in less than 10% of cells ⌛')
    adata_cluster = filter_genes(adata_cluster, diet_key, reference_group, test_group)

    # Perform differential expression between Diet conditions
    sc.tl.rank_genes_groups(adata_cluster, groupby=diet_key, method='wilcoxon')  # alternatives: 't-test', 'logreg'

    # Store the results; each entry is a structured array with one field per Diet group
    result = adata_cluster.uns['rank_genes_groups']

    # Select the test group's columns by name rather than by position
    logfoldchanges = result['logfoldchanges'][test_group]

    # Build the table in one step. AveExpr / t / B are written as NaN placeholders.
    df = pd.DataFrame({
        '': result['names'][test_group],
        'logFC': logfoldchanges,
        'AveExpr': np.nan,
        't': np.nan,
        'P.Value': result['pvals'][test_group],
        'adj.P.Val': result['pvals_adj'][test_group],
        'B': np.nan,
        # Absolute value, as the column name states (previously the signed logFC was stored)
        'abs.log2FC': np.abs(logfoldchanges),
    })

    # Remove duplicate genes (keeping the first occurrence)
    df_unique = df.drop_duplicates(subset='', keep='first')

    # Define the file path for the current cluster
    file_path = os.path.join(output_dir, f'diff_exp_CD_vs_HFD_{cluster}.csv')

    # Save the DataFrame to a CSV file with the specified format
    df_unique.to_csv(file_path, index=False)

    print(f"Saved results for cluster {cluster} to {file_path}")


Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Saved results for cluster EEC (EC) to /home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/3. upstream_analysis/crypt/differential_expression_scanpy/EEC_subpopulations/diff_exp_CD_vs_HFD_EEC (EC).csv
Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Saved results for cluster EEC progenitor to /home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/3. upstream_analysis/crypt/differential_expression_scanpy/EEC_subpopulations/diff_exp_CD_vs_HFD_EEC progenitor.csv
Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Saved results for cluster EEC (I_L) to /home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/3. upstream_analysis/crypt/differential_expression_scanpy/EEC_subpopulations/diff_exp_CD_vs_HFD_EEC (I_L).csv
Removing ambient genes ⌛
Removing genes expressed in less than 10% of cells ⌛
Saved results for cluster EEC (K) to /hom