In [2]:
!pip install pydeseq2

Collecting pydeseq2
  Downloading pydeseq2-0.4.11-py3-none-any.whl.metadata (7.0 kB)
Downloading pydeseq2-0.4.11-py3-none-any.whl (45 kB)
Installing collected packages: pydeseq2
Successfully installed pydeseq2-0.4.11


In [4]:
import scanpy as sc
import pandas as pd
import numpy as np
import os
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from pydeseq2.default_inference import DefaultInference

# Load your AnnData object
adata = sc.read_h5ad('/home/glennrdx/Documents/Research_Project/processed_h5ad/crypt_enriched_final.h5ad')

# Define the directory path where the CSVs will be saved
output_dir = "/home/glennrdx/Documents/Research_Project/scRNAseq-MSc-Analysis/upstream_analysis/crypt/differential_expression_pydeseq2"
os.makedirs(output_dir, exist_ok=True)

# Data preparation
# Round the count matrix to 0 decimals and convert to integers
counts_matrix = np.round(adata.X.toarray(), decimals=0).astype(int)

counts_df = pd.DataFrame(counts_matrix, columns=adata.var_names, index=adata.obs_names)
metadata = adata.obs[['Diet', 'leiden15']]

# Get the unique cluster identifiers
leiden15 = adata.obs['leiden15'].unique()

# Perform differential expression analysis for each cluster
for cluster in leiden15:
    # Subset the data for the cluster
    cluster_mask = metadata['leiden15'] == cluster
    counts_cluster = counts_df[cluster_mask]
    metadata_cluster = metadata[cluster_mask]
    
    # Create DeseqDataSet
    dds = DeseqDataSet(
        counts=counts_cluster,
        metadata=metadata_cluster,
        design_factors="Diet",
        refit_cooks=True,
        inference=DefaultInference(n_cpus=8)
    )
    
    # Fit dispersions and LFCs
    dds.deseq2()
    
    # Perform statistical analysis
    stat_res = DeseqStats(dds, contrast=["Diet", "HFD", "CD"])
    stat_res.summary()
    
    # Process results
    results = stat_res.results_df.reset_index()
    results = results.rename(columns={
        'index': '',
        'log2FoldChange': 'logFC',
        'pvalue': 'P.Value',
        'padj': 'adj.P.Val'
    })
    results['AveExpr'] = np.nan
    results['t'] = np.nan
    results['B'] = np.nan
    results['abs.log2FC'] = abs(results['logFC'])
    
    # Reorder columns to match the original output
    results = results[[
        '', 'logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B', 'abs.log2FC'
    ]]
    
    # Save results
    file_path = os.path.join(output_dir, f'diff_exp_CD_vs_HFD_{cluster}.csv')
    results.to_csv(file_path, index=False)
    print(f"Saved results for cluster {cluster} to {file_path}")

Fitting size factors...
... done in 15.30 seconds.

Fitting dispersions...
... done in 104.59 seconds.

Fitting dispersion trend curve...
  self._fit_parametric_dispersion_trend(vst)
... done in 0.82 seconds.

Fitting MAP dispersions...
... done in 292.58 seconds.

Fitting LFCs...
... done in 71.84 seconds.

Calculating cook's distance...


In [7]:
import scanpy as sc
import pandas as pd
import numpy as np
import os
import matplotlib as plt 

# Load your AnnData object
adata = sc.read_h5ad('/home/glennrdx/Documents/Research_Project/processed_h5ad/crypt_enriched_final.h5ad')

In [8]:
def plot_gene_heatmap_umap(adata, genes, size = 30):
    """
    Plot heatmap UMAPs for a list of genes in an AnnData object.

    Parameters:
    -----------
    adata : AnnData
        The AnnData object containing the single-cell data.
    genes : list
        A list of gene names to plot.

    Returns:
    --------
    None. The function displays the plots.
    """
    num_genes = len(genes)
    fig, axes = plt.subplots(1, num_genes, figsize=(6*num_genes, 6))
    
    if num_genes == 1:
        axes = [axes]
    
    for ax, gene in zip(axes, genes):
        sc.pl.umap(
            adata,
            color=gene,
            title=f'UMAP Heatmap - {gene}',
            frameon=False,
            ax=ax,
            size=size,
            color_map='viridis',
            show=False
        )
        
        ax.set_facecolor('black')
    
    plt.tight_layout()
    plt.show()

# Example usage:
# plot_gene_heatmap_umap(adata, ['Gene1', 'Gene2', 'Gene3'])

In [6]:
plot_gene_heatmap_umap(adata, ['Pdlim2'])

AttributeError: module 'matplotlib' has no attribute 'subplots'