In [1]:
"""
================================================================================
Step 1: Preprocess GEO2R Differential Expression Results
================================================================================

Prepare GEO2R output files for g:Profiler KEGG pathway enrichment analysis.

Input:  GEO2R Top Table TSV files (raw differential expression results)
Output: Cleaned gene lists and statistics for g:Profiler input

Author: Jihong Oh
Email: jihong421@gmail.com
Date: February 2026
================================================================================
"""

import pandas as pd
import numpy as np
from google.colab import files

# DEG cut-offs used by load_and_clean_deg when filtering genes
# (0.58 is approximately log2(1.5); presumably chosen as a 1.5-fold change)
ADJ_PVAL_THRESHOLD = 0.01
LOG2FC_THRESHOLD = 0.58

# Banner
print(
    "=" * 80,
    "STEP 1: PREPROCESS GEO2R RESULTS FOR g:PROFILER",
    "=" * 80,
    sep="\n",
)
print(f"\nDEG criteria: adj.p < {ADJ_PVAL_THRESHOLD}, |log2FC| > {LOG2FC_THRESHOLD}")

# Upload prompt
print(
    "\nPlease upload GEO2R Top Table files (TSV format)",
    "Expected files:",
    "  - GSE43292_top_table.tsv (or filename containing '43292')",
    "  - GSE20950_top_table.tsv (or filename containing '20950')",
    sep="\n",
)

# Opens the Colab upload widget; the result is iterated by filename further down
uploaded = files.upload()

def _infer_dataset_id(filename):
    """Derive a dataset label (e.g. ``GSE43292``) from an uploaded file name."""
    import os
    import re

    name = os.path.basename(filename)

    # An explicit GEO series accession in the name is the most reliable signal
    # and also covers datasets other than the two expected ones.
    accession = re.search(r'GSE\d+', name, flags=re.IGNORECASE)
    if accession:
        return accession.group(0).upper()

    # Expected datasets whose files were renamed without the "GSE" prefix.
    if '43292' in name:
        return 'GSE43292'
    if '20950' in name:
        return 'GSE20950'

    # Fallback: strip the extension and the GEO2R suffix. GEO2R downloads are
    # named "<id>.top.table.tsv", so accept '.', '_' or ' ' as separators.
    stem = re.sub(r'\.(tsv|txt)$', '', name, flags=re.IGNORECASE)
    stem = re.sub(r'[._ ]top[._ ]table$', '', stem, flags=re.IGNORECASE)
    return stem or name


def _write_gene_list(path, genes):
    """Write one gene symbol per line to *path* (no trailing newline)."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.write("\n".join(genes))


def load_and_clean_deg(filename, adj_pval_threshold=None, log2fc_threshold=None,
                       download=True, output_dir=None):
    """
    Preprocess GEO2R differential expression results

    Steps:
    1. Remove NaN/empty gene symbols
    2. Remove ambiguous probe mappings (///)
    3. Calculate -log10(adj.P.Val) for QC
    4. Deduplicate: keep probe with lowest adj.p per gene
    5. Filter DEGs by thresholds
    6. Generate files for g:Profiler input

    Args:
        filename: Path to a tab-separated GEO2R top table. It must contain the
            columns 'Gene.symbol', 'adj.P.Val' and 'logFC'.
        adj_pval_threshold: DEGs need adj.P.Val strictly below this value.
            Defaults to the module-level ADJ_PVAL_THRESHOLD.
        log2fc_threshold: DEGs need |logFC| strictly above this value.
            Defaults to the module-level LOG2FC_THRESHOLD.
        download: When True (default) every written file is passed to
            files.download(); set to False to only write the files.
        output_dir: Directory for the output files. None (default) writes to
            the current working directory, as before.

    Returns:
        (df, df_filtered, dataset): the deduplicated per-gene table (one row
        per gene, ordered by gene symbol), the DEG subset with an added
        'Regulation' column, and the dataset label used in output file names.

    Raises:
        KeyError: if a required column is missing from the input table.
    """
    import os  # local import: the file header only imports pandas/numpy/colab

    # Resolve thresholds at call time so the notebook constants stay the default.
    if adj_pval_threshold is None:
        adj_pval_threshold = ADJ_PVAL_THRESHOLD
    if log2fc_threshold is None:
        log2fc_threshold = LOG2FC_THRESHOLD

    print(f"\n{'='*60}")
    print(f"Processing: {filename}")
    print(f"{'='*60}")

    # Load
    df = pd.read_csv(filename, sep='\t')
    print(f"Initial rows: {len(df)}")

    # Fail early with a readable message instead of a bare column KeyError.
    required = ('Gene.symbol', 'adj.P.Val', 'logFC')
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            f"{filename}: missing required column(s) {missing}; "
            f"found {list(df.columns)}"
        )

    # Step 1: Remove NaN and empty gene symbols (whitespace-only counts as empty)
    df = df[df['Gene.symbol'].notna()].copy()
    df['Gene.symbol'] = df['Gene.symbol'].astype(str).str.strip()
    df = df[df['Gene.symbol'] != '']
    print(f"After removing NaN/empty: {len(df)}")

    # Step 2: Remove ambiguous probe mappings ('///' matched literally, not as regex)
    df = df[~df['Gene.symbol'].str.contains('///', regex=False)].copy()
    print(f"After removing ambiguous probes: {len(df)}")

    # Non-numeric statistics become NaN so comparisons below cannot raise.
    for col in ('adj.P.Val', 'logFC'):
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Step 3: QC metric (adj.P.Val == 0 yields +inf; silence the numpy warning)
    with np.errstate(divide='ignore'):
        df['neglog10_adjP'] = -np.log10(df['adj.P.Val'])

    # Step 4: Deduplicate (keep probe with smallest adj.P.Val)
    # drop_duplicates keeps the whole winning row. groupby(...).first() instead
    # takes the first non-null value per column, so a NaN logFC on the best
    # probe would be filled in from a different, weaker probe.
    # mergesort is stable: ties keep input order; NaN adj.P.Val sorts last.
    print(f"Before deduplication: {len(df)}")
    df = (
        df.sort_values('adj.P.Val', kind='mergesort')
        .drop_duplicates(subset='Gene.symbol', keep='first')
        .sort_values('Gene.symbol', kind='mergesort')
        .reset_index(drop=True)
    )
    print(f"After deduplication: {len(df)} unique genes")

    # Step 5: Filter DEGs (rows with NaN statistics never pass)
    df_filtered = df[
        (df['adj.P.Val'] < adj_pval_threshold) &
        (df['logFC'].abs() > log2fc_threshold)
    ].copy()
    print(f"DEGs passing thresholds: {len(df_filtered)}")

    # Add regulation direction
    df_filtered['Regulation'] = np.where(df_filtered['logFC'] > 0, "Up", "Down")

    # Prepare gene lists
    background = sorted(df['Gene.symbol'].unique())
    deg_all = sorted(df_filtered['Gene.symbol'].unique())
    deg_up = sorted(df_filtered[df_filtered['logFC'] > 0]['Gene.symbol'].unique())
    deg_down = sorted(df_filtered[df_filtered['logFC'] < 0]['Gene.symbol'].unique())

    print(f"\n{'='*60}")
    print("RESULTS:")
    print(f"  Background genes: {len(background)}")
    print(f"  Total DEGs: {len(deg_all)}")
    print(f"    ↑ Up-regulated: {len(deg_up)}")
    print(f"    ↓ Down-regulated: {len(deg_down)}")
    print(f"{'='*60}")

    # Determine dataset ID from filename
    dataset = _infer_dataset_id(filename)

    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)

    def out_path(suffix):
        # Bare file name when no directory is given, exactly as before.
        name = f"{dataset}.{suffix}"
        return name if output_dir is None else os.path.join(output_dir, name)

    # Save files
    print(f"\nSaving files for {dataset}...")
    written = []

    # Gene lists: the first two feed g:Profiler, up/down are additional outputs
    for suffix, genes, note in (
        ("background.txt", background, " (g:Profiler custom background)"),
        ("DEG_all.txt", deg_all, " (g:Profiler query)"),
        ("DEG_up.txt", deg_up, ""),
        ("DEG_down.txt", deg_down, ""),
    ):
        path = out_path(suffix)
        _write_gene_list(path, genes)
        written.append(path)
        print(f"  ✓ {dataset}.{suffix}{note}")

    # Stats tables for downstream analysis
    deg_stats = df_filtered[['Gene.symbol', 'logFC', 'adj.P.Val', 'Regulation']].copy()
    deg_stats = deg_stats.sort_values('adj.P.Val', kind='mergesort')
    path = out_path("DEG_with_stats.tsv")
    deg_stats.to_csv(path, sep="\t", index=False)
    written.append(path)
    print(f"  ✓ {dataset}.DEG_with_stats.tsv")

    all_stats = df[['Gene.symbol', 'logFC', 'adj.P.Val']].copy()
    path = out_path("all_genes_with_stats.tsv")
    all_stats.to_csv(path, sep="\t", index=False)
    written.append(path)
    print(f"  ✓ {dataset}.all_genes_with_stats.tsv")

    # Download all files (needs the Colab `files` helper imported by the notebook)
    if download:
        print("\nDownloading files...")
        for path in written:
            files.download(path)

    return df, df_filtered, dataset

# Run the preprocessing once per uploaded file and remember each dataset label
processed_datasets = []

for filename in uploaded:
    df_all, df_deg, dataset_id = load_and_clean_deg(filename)
    processed_datasets.append(dataset_id)

# Completion banner
print(
    "\n" + "=" * 80,
    "PREPROCESSING COMPLETE!",
    "=" * 80,
    sep="\n",
)
print(f"\nProcessed datasets: {', '.join(processed_datasets)}")

# Manual follow-up: instructions for running g:Profiler on the generated files
print(
    "\nNEXT STEP: g:Profiler Analysis",
    "-" * 80,
    "For each dataset, perform the following:",
    "\n1. Visit: https://biit.cs.ut.ee/gprofiler/gost",
    "\n2. Upload query:",
    "   - Paste contents of [DATASET].DEG_all.txt",
    "\n3. Configure settings:",
    "   - Organism: Homo sapiens",
    "   - Statistical domain scope: Custom",
    "   - Upload: [DATASET].background.txt",
    "   - Data sources: KEGG only (uncheck others)",
    "   - Significance: Benjamini-Hochberg FDR < 0.01",
    "\n4. Download results:",
    "   - Click 'Download' → 'Detailed results (CSV)'",
    "   - Save as: [DATASET]_gProfiler_intersections.csv",
    "\n5. Proceed to Step 2 with:",
    "   - [DATASET].all_genes_with_stats.tsv",
    "   - [DATASET]_gProfiler_intersections.csv",
    "=" * 80,
    sep="\n",
)

STEP 1: PREPROCESS GEO2R RESULTS FOR g:PROFILER

DEG criteria: adj.p < 0.01, |log2FC| > 0.58

Please upload GEO2R Top Table files (TSV format)
Expected files:
  - GSE43292_top_table.tsv (or filename containing '43292')
  - GSE20950_top_table.tsv (or filename containing '20950')


Saving GSE20950.top.table.tsv to GSE20950.top.table.tsv
Saving GSE43292.top.table.tsv to GSE43292.top.table.tsv

Processing: GSE20950.top.table.tsv
Initial rows: 54675
After removing NaN/empty: 45118
After removing ambiguous probes: 42904
Before deduplication: 42904
After deduplication: 20848 unique genes
DEGs passing thresholds: 1997

RESULTS:
  Background genes: 20848
  Total DEGs: 1997
    ↑ Up-regulated: 178
    ↓ Down-regulated: 1819

Saving files for GSE20950...
  ✓ GSE20950.background.txt (g:Profiler custom background)
  ✓ GSE20950.DEG_all.txt (g:Profiler query)
  ✓ GSE20950.DEG_up.txt
  ✓ GSE20950.DEG_down.txt
  ✓ GSE20950.DEG_with_stats.tsv
  ✓ GSE20950.all_genes_with_stats.tsv

Downloading files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing: GSE43292.top.table.tsv
Initial rows: 33297
After removing NaN/empty: 22195
After removing ambiguous probes: 20052
Before deduplication: 20052
After deduplication: 19036 unique genes
DEGs passing thresholds: 877

RESULTS:
  Background genes: 19036
  Total DEGs: 877
    ↑ Up-regulated: 508
    ↓ Down-regulated: 369

Saving files for GSE43292...
  ✓ GSE43292.background.txt (g:Profiler custom background)
  ✓ GSE43292.DEG_all.txt (g:Profiler query)
  ✓ GSE43292.DEG_up.txt
  ✓ GSE43292.DEG_down.txt
  ✓ GSE43292.DEG_with_stats.tsv
  ✓ GSE43292.all_genes_with_stats.tsv

Downloading files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


PREPROCESSING COMPLETE!

Processed datasets: GSE20950, GSE43292

NEXT STEP: g:Profiler Analysis
--------------------------------------------------------------------------------
For each dataset, perform the following:

1. Visit: https://biit.cs.ut.ee/gprofiler/gost

2. Upload query:
   - Paste contents of [DATASET].DEG_all.txt

3. Configure settings:
   - Organism: Homo sapiens
   - Statistical domain scope: Custom
   - Upload: [DATASET].background.txt
   - Data sources: KEGG only (uncheck others)
   - Significance: Benjamini-Hochberg FDR < 0.01

4. Download results:
   - Click 'Download' → 'Detailed results (CSV)'
   - Save as: [DATASET]_gProfiler_intersections.csv

5. Proceed to Step 2 with:
   - [DATASET].all_genes_with_stats.tsv
   - [DATASET]_gProfiler_intersections.csv
