In [1]:
pip install gradio

Collecting gradio
  Downloading gradio-6.0.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Using cached aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting brotli>=1.1.0 (from gradio)
  Downloading brotli-1.2.0-cp311-cp311-macosx_10_9_universal2.whl.metadata (6.1 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.121.3-py3-none-any.whl.metadata (30 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-1.0.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==2.0.0-dev.3 (from gradio)
  Downloading gradio_client-2.0.0.dev3-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Using cached groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting huggingface-hub<2.0,>=0.33.5 (from gradio)
  Downloading huggingface_hub-1.1.5-py3-none-any.whl.metadata (13 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.11.4-cp311-cp311-macosx_15_0_arm64.whl.metadata (41 kB)
Collecting pillow<

In [6]:
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import os
from PIL import Image

# Check if results exist
def check_results_exist():
    """Report which expected analysis outputs are present on disk.

    Returns:
        dict: maps each short result name (for example ``'dge_results'`` or
        ``'volcano'``) to ``True`` when the corresponding file exists, and to
        ``False`` otherwise. Paths are resolved relative to the current
        working directory.
    """
    expected_outputs = (
        ('dge_results', 'results/dge_results.csv'),
        ('degs_all', 'results/degs_all.csv'),
        ('degs_up', 'results/degs_upregulated.csv'),
        ('degs_down', 'results/degs_downregulated.csv'),
        ('metadata', 'results/sample_metadata.csv'),
        ('normalized', 'results/normalized_expression.csv'),
        ('volcano', 'figures/volcano_plot.png'),
        ('heatmap', 'figures/heatmap_top_degs.png'),
        ('ma_plot', 'figures/ma_plot.png'),
    )
    return {label: os.path.exists(location) for label, location in expected_outputs}

def load_project_summary():
    """Build the Markdown project summary shown on the overview tab.

    The summary starts with a fixed project/dataset description and is then
    extended with counts read from whichever result CSVs exist (as reported by
    ``check_results_exist``), followed by a checklist of completed steps.

    Returns:
        str: Markdown text.

    Raises:
        KeyError: if an existing CSV lacks the column read from it
            (``padj`` in the DGE results, ``condition`` in the metadata).
    """
    available = check_results_exist()

    summary = """
# Sleep Deprivation Transcriptomic Analysis
## CAP 5510 Final Project
### Author: Neha Eshwaragari

## Project Overview
This project investigates the molecular signatures of sleep deprivation using 
blood transcriptome data (GSE98566) through differential gene expression analysis 
and functional enrichment techniques.

## Dataset Information
- **Dataset**: GSE98566 from NCBI GEO
- **Platform**: Affymetrix HuGene 1.0 ST Array
- **Comparison**: Sleep Deprived vs Control (Normal Sleep)

## Analysis Status
"""

    # Counts are only reported for result files that are actually present.
    if available['dge_results']:
        dge_df = pd.read_csv('results/dge_results.csv')
        summary += f"\n- Total Genes Analyzed: {len(dge_df):,}"
        summary += f"\n- Significant Genes (padj<0.05): {(dge_df['padj'] < 0.05).sum():,}"

    if available['degs_all']:
        degs_df = pd.read_csv('results/degs_all.csv')
        summary += f"\n- Total DEGs (padj<0.05, |log2FC|>0.3): {len(degs_df):,}"

    if available['degs_up']:
        up_df = pd.read_csv('results/degs_upregulated.csv')
        summary += f"\n- Upregulated Genes: {len(up_df):,}"

    if available['degs_down']:
        down_df = pd.read_csv('results/degs_downregulated.csv')
        summary += f"\n- Downregulated Genes: {len(down_df):,}"

    if available['metadata']:
        meta_df = pd.read_csv('results/sample_metadata.csv')
        sd_count = (meta_df['condition'] == 'sleep_deprived').sum()
        ns_count = (meta_df['condition'] == 'normal_sleep').sum()
        summary += "\n\n## Sample Distribution"
        summary += f"\n- Sleep Deprived: {sd_count} samples"
        summary += f"\n- Control: {ns_count} samples"

    summary += "\n\n## Completed Steps"
    # The first three steps all produce the normalized expression matrix, so
    # its presence is used as the completion marker for each of them.
    steps = [
        ("Data Acquisition", available['normalized']),
        ("Quality Control", available['normalized']),
        ("Normalization", available['normalized']),
        ("DGE Analysis", available['dge_results']),
        ("DEG Identification", available['degs_all']),
        ("Volcano Plot", available['volcano']),
        ("Heatmap", available['heatmap']),
        ("MA Plot", available['ma_plot'])
    ]

    for step, completed in steps:
        status = "✓" if completed else "✗"
        # Emit each step as a list item: Markdown treats a single newline as a
        # soft break, so bare lines would run together into one paragraph.
        summary += f"\n- {status} {step}"

    return summary

def show_deg_results():
    """Summarize the differentially expressed genes (DEGs) for display.

    Reads ``results/degs_all.csv`` and builds a Markdown summary containing the
    total/up/down counts, the thresholds used, and the ten genes with the
    largest positive and the largest negative ``log2FoldChange``.

    Returns:
        tuple: ``(markdown_text, table)`` where ``table`` is the first 50 rows
        of the DEG file as a DataFrame. When the file does not exist, a short
        notice and ``None`` are returned instead.

    Raises:
        KeyError: if the CSV lacks ``gene_id``, ``log2FoldChange`` or ``padj``.
    """
    degs_path = 'results/degs_all.csv'
    if not os.path.exists(degs_path):
        return "No DEG results found. Please run the analysis first.", None

    degs_df = pd.read_csv(degs_path)

    # Direction masks are computed once and reused for counts and rankings.
    up_mask = degs_df['log2FoldChange'] > 0
    down_mask = degs_df['log2FoldChange'] < 0
    up_df = degs_df[up_mask].nlargest(10, 'log2FoldChange')
    down_df = degs_df[down_mask].nsmallest(10, 'log2FoldChange')

    def _gene_lines(frame):
        # One Markdown list item per gene; bare lines separated by a single
        # newline would be merged into one paragraph when rendered.
        return "".join(
            f"\n- {row['gene_id']}: log2FC = {row['log2FoldChange']:.3f}, padj = {row['padj']:.2e}"
            for _, row in frame.iterrows()
        )

    summary = f"""
## Differential Expression Results

- Total DEGs Identified: {len(degs_df):,}
- Upregulated: {up_mask.sum():,}
- Downregulated: {down_mask.sum():,}

Thresholds Used:
- Adjusted p-value: < 0.05
- Absolute log2 fold change: > 0.3

### Top 10 Upregulated Genes
"""
    summary += _gene_lines(up_df)
    summary += "\n\n### Top 10 Downregulated Genes\n"
    summary += _gene_lines(down_df)

    return summary, degs_df.head(50)

def show_volcano_plot():
    """Return the saved volcano plot as a PIL image, or None if the PNG is absent."""
    volcano_path = 'figures/volcano_plot.png'
    if not os.path.exists(volcano_path):
        return None
    return Image.open(volcano_path)

def show_heatmap():
    """Return the saved top-DEG heatmap as a PIL image, or None if the PNG is absent."""
    heatmap_path = 'figures/heatmap_top_degs.png'
    found = os.path.exists(heatmap_path)
    return Image.open(heatmap_path) if found else None

def show_ma_plot():
    """Return the saved MA plot as a PIL image, or None if the PNG is absent."""
    ma_path = 'figures/ma_plot.png'
    if not os.path.exists(ma_path):
        return None
    return Image.open(ma_path)

def show_pathway_plot():
    """Return the first available pathway figure as a PIL image.

    The literature-based figure (``expected_pathways.png``) takes precedence
    over the KEGG enrichment figure. Returns None when neither file exists.
    """
    candidates = ('figures/expected_pathways.png', 'figures/kegg_enrichment.png')
    for figure_path in candidates:
        if os.path.exists(figure_path):
            return Image.open(figure_path)
    return None

def get_pathway_summary():
    """Assemble the Markdown text for the pathway enrichment tab.

    If KEGG and/or GO enrichment CSVs exist, their row counts and first ten
    terms (with adjusted p-values) are listed. If neither exists, the text
    falls back to the literature-based table in
    ``results/expected_pathways.csv`` when present, and otherwise to a fixed
    description of pathways expected from the literature.

    Returns:
        str: Markdown text.
    """
    kegg_path = 'results/kegg_enrichment.csv'
    go_path = 'results/go_enrichment.csv'
    pieces = ["## Pathway Enrichment Analysis\n\n"]

    # Formal enrichment results, when they were produced.
    kegg_found = os.path.exists(kegg_path)
    if kegg_found:
        kegg_df = pd.read_csv(kegg_path)
        pieces.append(f"### KEGG Pathways Enriched: {len(kegg_df)}\n\n")
        pieces.append("Top 10 KEGG Pathways:\n")
        for _, term in kegg_df.head(10).iterrows():
            pieces.append(f"\n- {term['Term']} (p-adj: {term['Adjusted P-value']:.2e})")

    go_found = os.path.exists(go_path)
    if go_found:
        go_df = pd.read_csv(go_path)
        pieces.append(f"\n\n### GO Terms Enriched: {len(go_df)}\n\n")
        pieces.append("Top 10 GO Terms:\n")
        for _, term in go_df.head(10).iterrows():
            pieces.append(f"\n- {term['Term']} (p-adj: {term['Adjusted P-value']:.2e})")

    if kegg_found or go_found:
        return "".join(pieces)

    # No formal enrichment available: fall back to literature-based text.
    expected_path = 'results/expected_pathways.csv'
    if os.path.exists(expected_path):
        pieces.append("""### Functional Interpretation (Literature-Based)

Since gene IDs are probe IDs rather than gene symbols, formal pathway 
enrichment was not performed. However, based on extensive literature 
on sleep deprivation, the following pathways are expected to be affected:

""")
        pathway_df = pd.read_csv(expected_path)
        for _, entry in pathway_df.iterrows():
            pieces.append(f"\n**{entry['Pathway_Category']}** ({entry['Expected_Direction']})")
            pieces.append(f"\n- Evidence: {entry['Evidence_Level']}")
            pieces.append(f"\n- Clinical Impact: {entry['Clinical_Relevance']}\n")
    else:
        pieces.append("""### Expected Biological Pathways (Based on Literature)

Sleep deprivation is known to affect:

1. **Immune Response Pathways** (UPREGULATED)
   - Inflammatory signaling (NF-κB pathway)
   - Cytokine production (IL-6, IL-1β, TNF-α)
   - Innate immunity activation
   - Acute phase response

2. **Circadian Regulation** (DOWNREGULATED)
   - Clock genes (PER, CRY, CLOCK, BMAL1)
   - Circadian rhythm pathways
   - Circadian entrainment
   
3. **Stress Response** (UPREGULATED)
   - HPA axis activation
   - Cortisol signaling pathways
   - Heat shock proteins
   - Oxidative stress response

4. **Metabolic Pathways** (DYSREGULATED)
   - Glucose metabolism
   - Insulin signaling
   - Lipid metabolism
   - AMPK signaling

5. **Cardiovascular Function** (ALTERED)
   - Blood pressure regulation
   - Vascular reactivity
   - Coagulation pathways

### Clinical Implications

These molecular changes explain:
- Increased susceptibility to infections
- Metabolic syndrome and diabetes risk
- Cardiovascular disease risk
- Cognitive impairment
- Mood disorders and depression
- Accelerated aging
""")

    return "".join(pieces)

def generate_full_report():
    """Generate the plain-text project report and save it to disk.

    The report consists of a fixed abstract, a "KEY FINDINGS" section derived
    from ``results/degs_all.csv`` (only when that file exists), and fixed
    methodology, conclusions and references sections, ending with the current
    timestamp.

    Side effects:
        Creates the ``results/`` directory if needed and (over)writes
        ``results/FINAL_PROJECT_REPORT.txt`` encoded as UTF-8.

    Returns:
        tuple: ``(report_text, report_path)``.

    Raises:
        KeyError: if the DEG file lacks ``gene_id``, ``log2FoldChange`` or
            ``padj``.
    """
    report = """
================================================================================
SLEEP DEPRIVATION TRANSCRIPTOMIC ANALYSIS
Final Project Report - CAP 5510 Bioinformatics
Neha Eshwaragari
================================================================================

ABSTRACT
========

This project investigated the molecular signatures of sleep deprivation using 
blood transcriptome data (GSE98566). Through differential gene expression analysis, 
we identified genes and pathways altered during sleep deprivation compared to 
normal sleep conditions.

"""

    # Add results summary
    degs_path = 'results/degs_all.csv'
    if os.path.exists(degs_path):
        degs_df = pd.read_csv(degs_path)
        total = len(degs_df)
        up_count = int((degs_df['log2FoldChange'] > 0).sum())
        down_count = int((degs_df['log2FoldChange'] < 0).sum())

        if total == 0:
            # An empty DEG table would otherwise produce nan percentages.
            report += """
KEY FINDINGS
============

Differential Expression Summary:
- Total DEGs: 0

No probes passed the differential expression thresholds.
"""
        else:
            up_pct = 100 * up_count / total
            down_pct = 100 * down_count / total

            # Word the interpretation according to the direction that actually
            # dominates instead of always claiming upregulation.
            if up_count > down_count:
                interpretation = (
                    f"The predominance of upregulated genes ({up_pct:.1f}%) suggests \n"
                    "activation of stress and inflammatory pathways, consistent with known physiological \n"
                    "responses to sleep deprivation."
                )
            elif down_count > up_count:
                interpretation = (
                    f"The predominance of downregulated genes ({down_pct:.1f}%) indicates that sleep \n"
                    "deprivation was associated mainly with reduced transcript abundance among the DEGs."
                )
            else:
                interpretation = (
                    "Upregulated and downregulated genes are equally represented among the DEGs, \n"
                    "so no single direction of change dominates the response."
                )

            report += f"""
KEY FINDINGS
============

Differential Expression Summary:
- Total DEGs: {total:,}
- Upregulated: {up_count:,} ({up_pct:.1f}%)
- Downregulated: {down_count:,} ({down_pct:.1f}%)

Biological Interpretation:
{interpretation}

Top 5 Upregulated Probes:
"""
            up_top = degs_df[degs_df['log2FoldChange'] > 0].nlargest(5, 'log2FoldChange')
            for _, row in up_top.iterrows():
                report += f"\n- {row['gene_id']}: log2FC={row['log2FoldChange']:.3f}, padj={row['padj']:.2e}"

            report += "\n\nTop 5 Downregulated Probes:\n"
            down_top = degs_df[degs_df['log2FoldChange'] < 0].nsmallest(5, 'log2FoldChange')
            for _, row in down_top.iterrows():
                report += f"\n- {row['gene_id']}: log2FC={row['log2FoldChange']:.3f}, padj={row['padj']:.2e}"

    # Add methodology
    report += """

METHODOLOGY
===========

1. Data Acquisition and Preprocessing
   - Dataset: GSE98566 from NCBI GEO
   - Platform: Affymetrix HuGene 1.0 ST Array
   - Quality control: Removed genes with >20% missing values
   - Variance filtering: Removed low-variance genes
   - Normalization: Log2 transformation + Quantile normalization

2. Differential Gene Expression Analysis
   - Method: Welch's t-test with Benjamini-Hochberg FDR correction
   - Thresholds: padj < 0.05, |log2FC| > 0.3
   - Implementation: Python (scipy.stats, statsmodels)

3. Visualization
   - Volcano plot: Shows DEG distribution by fold change and significance
   - Heatmap: Hierarchical clustering of top 50 DEGs
   - MA plot: Mean expression vs fold change relationship

4. Pathway Enrichment
   - Approach: Literature-based functional interpretation
   - Focus: Immune, circadian, stress, and metabolic pathways

"""

    # Add conclusions
    report += """
CONCLUSIONS
===========

This transcriptomic analysis revealed significant gene expression changes 
associated with sleep deprivation. The identified DEGs represent potential 
biomarkers for:

1. Molecular detection of sleep deprivation
2. Understanding physiological impacts of insufficient sleep
3. Identifying therapeutic targets for sleep-related disorders

Clinical Relevance:
- Molecular evidence for sleep's physiological importance
- Potential biomarkers for sleep deprivation screening
- Links between sleep loss and immune/metabolic dysfunction

Limitations:
- Probe-level analysis (gene symbols not available in dataset)
- Cross-sectional design
- Modest sample size per group
- Mixed cell population (whole blood)

Future Directions:
1. Map probe IDs to gene symbols for formal pathway enrichment
2. Validate top candidates with qRT-PCR
3. Temporal analysis of gene expression patterns
4. Integration with clinical sleep metrics
5. Functional studies of top candidate genes

REFERENCES
==========
- Möller-Levet et al. (2013). Effects of insufficient sleep on circadian 
  rhythmicity and expression amplitude of the human blood transcriptome. 
  Proceedings of the National Academy of Sciences.
  
- Archer et al. (2014). Sleep deprivation influences on immune-related 
  gene expression. Sleep.
  
- Dataset: GSE98566 (NCBI Gene Expression Omnibus)

================================================================================
Analysis completed: """ + pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    # Save report. The directory is created first so the report can be
    # generated before any analysis output exists, and UTF-8 is requested
    # explicitly because the text contains non-ASCII characters.
    report_path = 'results/FINAL_PROJECT_REPORT.txt'
    os.makedirs('results', exist_ok=True)
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)

    return report, report_path

def download_all_results():
    """Summarize which result files exist, with their sizes, as Markdown.

    Each expected output is listed as a Markdown list item: a check mark with
    the path and size in KB when the file exists, or a cross mark and
    "Not available" otherwise. A fixed note on where the files live follows.

    Returns:
        str: Markdown text.
    """
    files = {
        'DGE Results (All Genes)': 'results/dge_results.csv',
        'DEGs (Significant)': 'results/degs_all.csv',
        'Upregulated DEGs': 'results/degs_upregulated.csv',
        'Downregulated DEGs': 'results/degs_downregulated.csv',
        'Sample Metadata': 'results/sample_metadata.csv',
        'Normalized Expression': 'results/normalized_expression.csv',
        'Volcano Plot': 'figures/volcano_plot.png',
        'Heatmap': 'figures/heatmap_top_degs.png',
        'MA Plot': 'figures/ma_plot.png'
    }

    summary = "## Available Result Files\n\n"

    # Entries are emitted as list items: Markdown treats a single newline as a
    # soft break, so bare lines would be merged into one paragraph.
    for name, path in files.items():
        if os.path.exists(path):
            size = os.path.getsize(path) / 1024  # KB
            summary += f"- ✓ {name}: `{path}` ({size:.1f} KB)\n"
        else:
            summary += f"- ✗ {name}: Not available\n"

    summary += "\n\n### File Locations\n"
    summary += "- Data files: `results/` directory\n"
    summary += "- Plots: `figures/` directory\n"
    summary += "\n### How to Access\n"
    summary += "All files are saved in your working directory and can be:\n"
    summary += "- Opened directly from the file system\n"
    summary += "- Downloaded individually from above\n"
    summary += "- Included in presentations or reports\n"

    return summary

# Create Gradio Interface
# The app is a single gr.Blocks layout bound to `demo`: a header, one tab per
# result view, and a closing checklist. Each interactive tab pairs a button
# with one or more output components; clicking the button calls the matching
# function defined above (none of them take inputs) and shows its return value.
with gr.Blocks(title="CAP 5510 Final Project - Sleep Deprivation Analysis") as demo:
    
    # Page header shown above all tabs.
    gr.Markdown("""
    # Sleep Deprivation Transcriptomic Analysis
    ## CAP 5510 Final Project Results
    ### Neha Eshwaragari
    
    This interface presents the complete analysis of sleep deprivation effects 
    on human gene expression using the GSE98566 dataset.
    """)
    
    # Tab 1: project summary built by load_project_summary(), plus static text.
    with gr.Tab("Project Overview"):
        gr.Markdown("### Project Summary and Status")
        
        overview_btn = gr.Button("Load Project Summary", variant="primary")
        overview_text = gr.Markdown()
        
        overview_btn.click(fn=load_project_summary, outputs=overview_text)
        
        gr.Markdown("""
        ---
        ### Project Objectives (From Proposal)
        
        1. Investigate molecular signatures of sleep deprivation
        2. Identify differentially expressed genes (DEGs)
        3. Perform pathway enrichment analysis
        4. Compare analysis methods (statistical approaches)
        5. Generate comprehensive visualizations
        6. Identify candidate biomarkers
        
        ### Methods Used
        - Data preprocessing and quality control
        - Quantile normalization
        - t-test with FDR correction
        - Functional interpretation
        - Multiple visualizations (volcano, heatmap, MA plot)
        """)
    
    # Tab 2: show_deg_results() returns (markdown, table), mapped in that order
    # onto the Markdown and Dataframe components.
    with gr.Tab("Differential Expression Results"):
        gr.Markdown("### DEG Analysis Results")
        
        deg_btn = gr.Button("Show DEG Results", variant="primary")
        deg_summary = gr.Markdown()
        deg_table = gr.Dataframe(label="Top 50 DEGs")
        
        deg_btn.click(fn=show_deg_results, outputs=[deg_summary, deg_table])
        
        gr.Markdown("""
        ### Interpretation
        
        Differentially expressed genes were identified using:
        - Statistical threshold: adjusted p-value < 0.05
        - Biological threshold: |log2 fold change| > 0.3
        
        These genes represent molecular changes induced by sleep deprivation
        and may serve as potential biomarkers for sleep deprivation status.
        """)
    
    # Tab 3: volcano plot image loaded from disk by show_volcano_plot().
    with gr.Tab("Volcano Plot"):
        gr.Markdown("### Volcano Plot Visualization")
        gr.Markdown("""
        The volcano plot shows the relationship between fold change (x-axis) 
        and statistical significance (y-axis). 
        
        - Red points: Upregulated genes
        - Blue points: Downregulated genes
        - Gray points: Not significant
        """)
        
        volcano_btn = gr.Button("Load Volcano Plot", variant="primary")
        volcano_img = gr.Image(label="Volcano Plot")
        
        volcano_btn.click(fn=show_volcano_plot, outputs=volcano_img)
    
    # Tab 4: heatmap image loaded from disk by show_heatmap().
    with gr.Tab("Heatmap"):
        gr.Markdown("### Expression Heatmap")
        gr.Markdown("""
        Heatmap showing the top differentially expressed genes across all samples.
        
        - Rows: Genes (probes)
        - Columns: Samples (sleep deprived | control)
        - Colors: Z-scored expression values (red=high, blue=low)
        - Yellow line: Separates sleep deprived from control samples
        """)
        
        heatmap_btn = gr.Button("Load Heatmap", variant="primary")
        heatmap_img = gr.Image(label="Expression Heatmap")
        
        heatmap_btn.click(fn=show_heatmap, outputs=heatmap_img)
    
    # Tab 5: MA plot image loaded from disk by show_ma_plot().
    with gr.Tab("MA Plot"):
        gr.Markdown("### MA Plot Visualization")
        gr.Markdown("""
        MA plot shows the relationship between average expression (x-axis) 
        and fold change (y-axis). 
        
        This helps identify expression-dependent bias and validates
        that differential expression is not simply due to expression level.
        """)
        
        ma_btn = gr.Button("Load MA Plot", variant="primary")
        ma_img = gr.Image(label="MA Plot")
        
        ma_btn.click(fn=show_ma_plot, outputs=ma_img)
    
    # Tab 6: pathway text from get_pathway_summary() and, separately, the
    # pathway figure from show_pathway_plot().
    with gr.Tab("Pathway Enrichment"):
        gr.Markdown("### Pathway Enrichment Analysis")
        
        pathway_btn = gr.Button("Show Pathway Results", variant="primary")
        pathway_text = gr.Markdown()
        
        pathway_btn.click(fn=get_pathway_summary, outputs=pathway_text)
        
        gr.Markdown("---")
        gr.Markdown("### Pathway Visualization")
        
        pathway_plot_btn = gr.Button("Load Pathway Plot")
        pathway_img = gr.Image(label="Pathway Enrichment")
        
        pathway_plot_btn.click(fn=show_pathway_plot, outputs=pathway_img)
        
        gr.Markdown("""
        ### Expected Pathways Summary
        
        Based on literature, sleep deprivation affects:
        - Immune response and inflammation
        - Circadian rhythm regulation
        - Stress response pathways
        - Metabolic processes
        - Cardiovascular function
        """)
    
    # Tab 7: generate_full_report() returns (text, file path), mapped onto the
    # Markdown and File components; a second button lists available files.
    with gr.Tab("Final Report"):
        gr.Markdown("### Complete Analysis Report")
        
        report_btn = gr.Button("Generate Full Report", variant="primary")
        report_text = gr.Markdown()
        report_file = gr.File(label="Download Report")
        
        report_btn.click(fn=generate_full_report, outputs=[report_text, report_file])
        
        gr.Markdown("---")
        gr.Markdown("### Download All Results")
        
        download_btn = gr.Button("Show Available Files")
        download_summary = gr.Markdown()
        
        download_btn.click(fn=download_all_results, outputs=download_summary)
    
    # Tab 8: static methodology text only; no callbacks.
    with gr.Tab("Methodology Details"):
        gr.Markdown("""
        ## Detailed Methodology
        
        ### 1. Data Acquisition
        - Dataset: GSE98566 from NCBI GEO
        - Platform: Affymetrix HuGene 1.0 ST Array
        - Samples: Blood transcriptome data
        - Downloaded via GEO series matrix file
        
        ### 2. Preprocessing
        - Quality control: Removed low-quality probes
        - Missing value handling: Median imputation for genes with <20% missing
        - Variance filtering: Removed genes with variance < 0.01
        - Normalization: Log2 transformation followed by quantile normalization
        
        ### 3. Differential Expression
        - Method: Welch's t-test (unequal variance assumption)
        - Multiple testing correction: Benjamini-Hochberg FDR
        - Significance threshold: adjusted p-value < 0.05
        - Effect size threshold: |log2 fold change| > 0.3
        - Software: Python (scipy.stats, statsmodels)
        
        ### 4. Visualization
        - Volcano plot: Overview of differential expression
        - Heatmap: Hierarchical clustering of top DEGs
        - MA plot: Fold change vs average expression
        - All plots: matplotlib and seaborn libraries
        
        ### 5. Pathway Analysis
        - Approach: Literature-based functional interpretation
        - Databases referenced: KEGG, Gene Ontology
        - Focus areas: Immune, circadian, stress, metabolic pathways
        - Note: Probe IDs limit formal enrichment analysis
        
        ### 6. Statistical Validation
        - Effect size consideration (biological significance)
        - Multiple testing correction (statistical significance)
        - Biological interpretation based on literature
        """)
    
    # Footer shown below the tabs. This checklist is static text; it is not
    # derived from the files on disk.
    gr.Markdown("""
    ---
    ## Project Completion Checklist
    
    **From Original Proposal:**
    - ✓ Dataset retrieval and preprocessing
    - ✓ Differential gene expression analysis
    - ✓ Pathway and functional enrichment (literature-based)
    - ✓ Visualization (volcano, heatmap, MA plots)
    - ✓ Identification of candidate biomarkers
    - ✓ Comprehensive reporting
    
    **All analysis results are saved in:**
    - `results/` directory (data files)
    - `figures/` directory (plots)
    """)

# Close any Gradio servers still running from earlier executions of this cell
# before starting a new one (the captured output shows two being closed).
gr.close_all()

# Launch the interface. share=True requests a public *.gradio.live URL in
# addition to the local one, so anyone holding that link can reach the app.
demo.launch(share=True)

Closing server running on port: 7861
Closing server running on port: 7860
* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://2d0e8e15c4d21ada81.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


