In [16]:
import scanpy as sc
import pandas as pd
import numpy as np
import os

# Load the processed crypt AnnData object
adata = sc.read_h5ad('/home/glennrdx/Documents/Research_Project/processed_h5ad/crypt_enriched_final.h5ad')

# Directory where one CSV per cluster will be written
output_dir = "/home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/upstream_analysis/crypt/differential_expression_scanpy"

# Ensure the directory exists (create it if it doesn't)
os.makedirs(output_dir, exist_ok=True)

# Unique cluster labels stored in the 'leiden15' observation column
leiden15 = adata.obs['leiden15'].unique()

# Position (0-based) of the Diet group whose statistics are exported.
# rank_genes_groups stores one field per Diet group; with its default
# reference ('rest') each field holds "that group vs. all other cells".
# NOTE(review): the output file name says "CD_vs_HFD", but the sign of logFC
# depends on which Diet level sits at this position -- presumably the second
# category is HFD, so logFC would be HFD relative to CD. Confirm against
# adata.obs['Diet'].cat.categories.
comparison_index = 1  # Adjust if needed for different comparisons

# Run the Diet comparison separately inside every cluster
for cluster in leiden15:

    # Subset the data for the cluster (copy so .uns results stay per-cluster)
    adata_cluster = adata[adata.obs['leiden15'] == cluster].copy()

    # A Diet contrast needs at least two Diet levels among the cluster's cells;
    # skip explicitly instead of failing later on an empty result table.
    if adata_cluster.obs['Diet'].nunique() < 2:
        print(f"Skipping cluster {cluster}: fewer than two Diet groups present")
        continue

    # Differential expression between Diet conditions
    # (alternative methods include 't-test' and 'logreg')
    sc.tl.rank_genes_groups(adata_cluster, groupby='Diet', method='wilcoxon')

    # Results are structured arrays with one named field per Diet group
    result = adata_cluster.uns['rank_genes_groups']
    genes = result['names']
    pvals = result['pvals']
    pvals_adj = result['pvals_adj']
    logfoldchanges = result['logfoldchanges']

    # Resolve the requested group once instead of indexing every record
    group_names = genes.dtype.names
    if comparison_index >= len(group_names):
        print(
            f"Skipping cluster {cluster}: no Diet group at index "
            f"{comparison_index} (groups: {group_names})"
        )
        continue
    group = group_names[comparison_index]

    # One row per gene, laid out with limma-topTable-style column names.
    # AveExpr, t and B have no counterpart in the values read here, so they
    # are written as NaN placeholders to keep the column layout.
    flat_results = [
        {
            '': gene,
            'logFC': logfc_value,
            'AveExpr': np.nan,
            't': np.nan,
            'P.Value': pval,
            'adj.P.Val': pval_adj,
            'B': np.nan,
            # Fixed: this column previously repeated the signed logFC
            'abs.log2FC': abs(logfc_value),
        }
        for gene, logfc_value, pval, pval_adj in zip(
            genes[group], logfoldchanges[group], pvals[group], pvals_adj[group]
        )
    ]

    # Convert to DataFrame
    df = pd.DataFrame(flat_results)

    # Remove duplicate genes (keeping the first occurrence)
    df_unique = df.drop_duplicates(subset='', keep='first')

    # Define the file path for the current cluster
    file_path = os.path.join(output_dir, f'diff_exp_CD_vs_HFD_{cluster}.csv')

    # Save the table without the DataFrame index; the gene column has an
    # empty header
    df_unique.to_csv(file_path, index=False)

    print(f"Saved results for cluster {cluster} to {file_path}")


Saved results for cluster Enterocyte Progenitor to /home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/upstream_analysis/crypt/differential_expression_scanpy/diff_exp_CD_vs_HFD_Enterocyte Progenitor.csv
Saved results for cluster Goblet to /home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/upstream_analysis/crypt/differential_expression_scanpy/diff_exp_CD_vs_HFD_Goblet.csv
Saved results for cluster EEC to /home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/upstream_analysis/crypt/differential_expression_scanpy/diff_exp_CD_vs_HFD_EEC.csv
Saved results for cluster EEC Progenitor to /home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/upstream_analysis/crypt/differential_expression_scanpy/diff_exp_CD_vs_HFD_EEC Progenitor.csv
Saved results for cluster Tuft Progenitor to /home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/upstream_analysis/crypt/differential_expression_scanpy/diff_exp_CD_vs_HFD_Tuft Progenitor.csv
Saved results for