In [None]:
import pandas as pd
import numpy as np
import sys
# --- NEW IMPORTS ---
import matplotlib.pyplot as plt
import seaborn as sns
# -------------------

# --- Configuration ---
# Input CSVs, resolved relative to the current working directory.
DGE_FILE = "DGE_Results_EGCG_vs_Senescent.csv"
ENRICHMENT_FILE = "Enrichment_UpRegulated.csv"
# Header of the column holding gene identifiers in DGE_FILE. pandas assigns
# the name "Unnamed: 0" to a first column whose header cell is blank, which is
# what you get when a DataFrame was saved with its (unnamed) index. Adjust this
# if your CSV labels the gene column differently.
GENE_COLUMN_NAME = "Unnamed: 0"

def analyze_dge_results(dge_file):
    """
    Summarize a DGE results table and save a volcano plot as a PNG.

    Reads ``dge_file`` as CSV, prints the number of tested, significant,
    up-regulated and down-regulated genes plus the top five genes of each
    direction ranked by ``padj``, then writes ``volcano_plot.png`` to the
    current working directory.

    Args:
        dge_file: Path to a CSV containing at least the columns
            ``log2FoldChange``, ``padj`` and ``significant``. The gene
            identifier is taken from the ``GENE_COLUMN_NAME`` column; if that
            is absent an existing ``gene`` column is kept, otherwise the first
            column is used.

    Returns:
        None. Problems (missing file, missing columns, plotting failures) are
        reported on stdout rather than raised, so a following analysis step
        can still run.
    """
    print(f"--- 1. Analyzing DGE Results ({dge_file}) ---")
    try:
        dge_df = pd.read_csv(dge_file)

        # Normalise the gene identifier column to the name 'gene'.
        if GENE_COLUMN_NAME in dge_df.columns:
            dge_df = dge_df.rename(columns={GENE_COLUMN_NAME: 'gene'})
        elif 'gene' not in dge_df.columns:
            print(f"Warning: Gene column '{GENE_COLUMN_NAME}' not found.")
            dge_df['gene'] = dge_df.iloc[:, 0]  # Assume first col is gene

        # Fail early with a readable message instead of a bare KeyError.
        for required in ('significant', 'log2FoldChange', 'padj'):
            if required not in dge_df.columns:
                print(f"Error: '{required}' column not found. Cannot proceed.")
                return

        # --- A. DGE Summary ---
        # .eq(True) maps NaN (and anything that is not literally True) to
        # False, so rows with an undefined significance flag are never counted
        # or coloured as significant. The same masks drive both the printed
        # summary and the plot colours, keeping the two consistent.
        is_sig = dge_df['significant'].eq(True)
        lfc = dge_df['log2FoldChange']
        up_mask = is_sig & (lfc > 0)
        down_mask = is_sig & (lfc < 0)
        sig_up = dge_df[up_mask]
        sig_down = dge_df[down_mask]

        print(f"\nTotal Genes Tested: {len(dge_df)}")
        print(f"Total Significant Genes: {int(is_sig.sum())}")
        print(f"  > Up-regulated: {len(sig_up)}")
        print(f"  > Down-regulated: {len(sig_down)}")

        # --- B. Top Genes ---
        print("\n--- Top 5 Up-Regulated Genes (by padj) ---")
        print(sig_up.sort_values('padj').head().to_markdown(index=False))

        print("\n--- Top 5 Down-Regulated Genes (by padj) ---")
        print(sig_down.sort_values('padj').head().to_markdown(index=False))

        # --- C. Volcano Plot (Using Matplotlib/Seaborn) ---
        print("\nGenerating Volcano Plot...")

        # padj == 0 would give +inf; clamp to a tiny positive value instead.
        dge_df['neg_log10_padj'] = -np.log10(dge_df['padj'].replace(0, 1e-300))

        # Vectorised classification (also works on an empty table).
        dge_df['Change'] = np.select(
            [up_mask, down_mask],
            ['Up-regulated', 'Down-regulated'],
            default='Not Significant',
        )

        palette = {
            'Up-regulated': '#E64B35',
            'Down-regulated': '#3C5488',
            'Not Significant': 'lightgray'
        }

        fig, ax = plt.subplots(figsize=(10, 7))
        try:
            sns.scatterplot(
                data=dge_df,
                x='log2FoldChange',
                y='neg_log10_padj',
                hue='Change',
                palette=palette,
                s=20,           # Smaller marker size
                alpha=0.6,      # Add transparency
                edgecolor=None, # Remove outlines
                ax=ax,
            )

            ax.set_title('Volcano Plot of DGE Results', fontsize=16)
            ax.set_xlabel('Log2 Fold Change (EGCG vs Senescent)', fontsize=12)
            ax.set_ylabel('-log10(Adjusted p-value)', fontsize=12)
            ax.legend(title='Gene Expression', loc='upper center',
                      bbox_to_anchor=(0.5, -0.1), ncol=3)

            # Reference lines: |log2FC| = 1 and padj = 0.05.
            ax.axvline(x=1, linestyle='--', color='gray', linewidth=0.8)
            ax.axvline(x=-1, linestyle='--', color='gray', linewidth=0.8)
            ax.axhline(y=-np.log10(0.05), linestyle='--', color='gray', linewidth=0.8)

            # Save the figure as a PNG
            volcano_filename = 'volcano_plot.png'
            fig.savefig(volcano_filename, dpi=300, bbox_inches='tight')
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

        print(f"Volcano plot saved to '{volcano_filename}'")

    except FileNotFoundError:
        print(f"FATAL ERROR: DGE file not found at '{dge_file}'")
    except Exception as e:
        # Deliberate best-effort boundary: report and let the caller continue.
        print(f"An error occurred during DGE analysis: {e}")

def analyze_enrichment_results(enrichment_file):
    """
    Print an enrichment results table and save a bar chart of the top pathways.

    Reads ``enrichment_file`` as CSV and prints it as a markdown table. When
    both a ``p_value`` and a ``name`` column are present, the 15 rows with the
    smallest p-values are drawn as a horizontal bar chart of -log10(p-value)
    and written to ``enrichment_barchart.png`` in the current working
    directory.

    Args:
        enrichment_file: Path to the enrichment results CSV.

    Returns:
        None. Problems (missing or empty file, missing columns, plotting
        failures) are reported on stdout rather than raised.
    """
    print(f"\n--- 2. Analyzing Enrichment Results ({enrichment_file}) ---")
    try:
        enrich_df = pd.read_csv(enrichment_file)

        # A header-only file parses fine but has no rows.
        if enrich_df.empty:
            print("Enrichment file is empty. No significant pathways found.")
            return

        print("Enrichment results found:")
        print(enrich_df.to_markdown(index=False))

        # --- B. Enrichment Bar Chart (Using Matplotlib/Seaborn) ---
        # Both columns are needed: 'p_value' for bar length, 'name' for labels.
        for required in ('p_value', 'name'):
            if required not in enrich_df.columns:
                print(f"\nNo '{required}' column found, skipping bar chart.")
                return

        # p_value == 0 would give +inf; clamp to a tiny positive value instead.
        enrich_df['neg_log10_p'] = -np.log10(enrich_df['p_value'].replace(0, 1e-300))

        # Sort by p-value and get top 15
        top_pathways = enrich_df.sort_values('p_value').head(15)

        # Height grows with the number of bars so labels stay readable.
        fig, ax = plt.subplots(figsize=(10, max(6, len(top_pathways) * 0.5)))
        try:
            sns.barplot(
                data=top_pathways,
                x='neg_log10_p',
                y='name',
                color='#3C5488',  # Use a consistent blue
                ax=ax,
            )

            ax.set_title('Top Enriched Up-Regulated Pathways', fontsize=16)
            ax.set_xlabel('-log10(p-value)', fontsize=12)
            ax.set_ylabel('Pathway', fontsize=12)

            # Save the figure as a PNG
            barchart_filename = 'enrichment_barchart.png'
            fig.savefig(barchart_filename, dpi=300, bbox_inches='tight')
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

        print(f"\nEnrichment bar chart saved to '{barchart_filename}'")

    except pd.errors.EmptyDataError:
        # Raised by read_csv for a zero-byte file (no header at all).
        print("Enrichment file is empty. No significant pathways found.")
    except FileNotFoundError:
        print(f"FATAL ERROR: Enrichment file not found at '{enrichment_file}'")
    except Exception as e:
        # Deliberate best-effort boundary: report and let the caller continue.
        print(f"An error occurred during enrichment analysis: {e}")

if __name__ == "__main__":
    # Script entry point: run the DGE summary, then the enrichment summary,
    # framed by start/end banners.
    # Required packages: pip install pandas numpy matplotlib seaborn
    # (DataFrame.to_markdown additionally needs the optional 'tabulate' package.)
    rule = "========================================"
    print(rule, "Starting DGE and Enrichment Analysis", rule, sep="\n")

    analyze_dge_results(DGE_FILE)
    analyze_enrichment_results(ENRICHMENT_FILE)

    print(f"\n{rule}", "Analysis complete.", rule, sep="\n")

Starting DGE and Enrichment Analysis
--- 1. Analyzing DGE Results (DGE_Results_EGCG_vs_Senescent.csv) ---

Total Genes Tested: 22379
Total Significant Genes: 2
  > Up-regulated: 2
  > Down-regulated: 0

--- Top 5 Up-Regulated Genes (by padj) ---
| gene   |   baseMean |   log2FoldChange |    lfcSE |    stat |      pvalue |      padj | significant   |
|:-------|-----------:|-----------------:|---------:|--------:|------------:|----------:|:--------------|
| SMN1   |    572.241 |         0.88211  | 0.177736 | 4.96304 | 6.9399e-07  | 0.0155308 | True          |
| RGPD2  |    233.226 |         0.939135 | 0.198298 | 4.73598 | 2.17996e-06 | 0.0243927 | True          |

--- Top 5 Down-Regulated Genes (by padj) ---
| gene   | baseMean   | log2FoldChange   | lfcSE   | stat   | pvalue   | padj   | significant   |
|--------|------------|------------------|---------|--------|----------|--------|---------------|

Generating Volcano Plot...
Volcano plot saved to 'volcano_plot.png'

--- 2. Analyzing E