## 1. Setup & Imports

Import required libraries and configure matplotlib for dark mode visualization.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from pathlib import Path

# Switch every subsequent figure to matplotlib's dark theme
plt.style.use('dark_background')

# Let pandas show all columns at full width, but cap printed rows at 100
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

# Report library versions so results can be traced to an environment
print("✓ Libraries imported successfully")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

## 2. Configuration

Define file paths and organism identification patterns. Update the `file_paths` list with your TSV files.

In [None]:
# ============================================
# CONFIGURATION - Update these paths
# ============================================

# Input report matrices, one TSV per sample. Raw strings keep Windows
# backslashes literal; forward slashes work as well.
file_paths = [
    r"C:\path\to\report.pg_matrix_E25_30_4_440960_800.tsv",
    r"C:\path\to\report.pg_matrix_E100_30_4_440960_800.tsv",
    # Add more files as needed
]

# Substrings (matched against the upper-cased protein name) that assign a
# protein to an organism. Key order is also the display order.
ORGANISM_PATTERNS = {
    "HeLa": [
        "_HUMAN",
        "HOMO_SAPIENS",
    ],
    "E.coli": [
        "_ECOLI",
        "_ECOL",
        "_ECO2",
        "_ECO5",
        "_ECO7",
        "_SHIF",
        "_SHIB",
        "_SHIS",
        "ESCHERICHIA",
    ],
    "Yeast": [
        "_YEAST",
        "SACCHAROMYCES",
        "CEREVISIAE",
    ],
}

# Organism labels in plotting/reporting order (same order as the table above)
ORGANISMS = list(ORGANISM_PATTERNS)

print("✓ Configuration loaded")
print(f"Files to process: {len(file_paths)}")

## 3. Data Loading Functions

Define helper functions for organism identification and data loading.

In [None]:
def identify_organism_vectorized(series):
    """Label every entry of a protein-name column with an organism.

    Names are upper-cased (missing values become empty strings) and tested
    against each organism's substrings from ``ORGANISM_PATTERNS`` with one
    vectorized regex search per organism.

    Args:
        series: pandas Series of protein names / group identifiers.

    Returns:
        A ``pd.Categorical`` with categories ``ORGANISMS + ["Unknown"]``.
        Entries matching no pattern are "Unknown". An entry matching several
        organisms keeps the one that comes last in ``ORGANISM_PATTERNS``,
        because later organisms overwrite earlier labels.
    """
    names_upper = series.fillna("").astype(str).str.upper()
    labels = pd.Series("Unknown", index=series.index)

    for organism in ORGANISM_PATTERNS:
        # One alternation per organism: a hit on any substring counts
        alternation = "|".join(ORGANISM_PATTERNS[organism])
        hits = names_upper.str.contains(alternation, regex=True)
        labels = labels.mask(hits, organism)

    all_categories = ORGANISMS + ["Unknown"]
    return pd.Categorical(labels, categories=all_categories)

print("✓ Organism identification function defined")

## 4. Load Data

Load TSV files and identify organisms. This creates a combined DataFrame with all samples.

In [None]:
# Per-file DataFrames, concatenated after the loop
all_data = []
# Maps each file stem to the name of its intensity (".raw") column
file_to_raw_column = {}

# Fail early with an actionable message instead of pandas' generic
# "No objects to concatenate" error further down
if not file_paths:
    raise ValueError(
        "file_paths is empty - list at least one TSV file in the Configuration cell"
    )

for filepath in file_paths:
    print(f"Loading: {Path(filepath).name}")

    df = pd.read_csv(filepath, sep="\t", low_memory=False)
    source_name = Path(filepath).stem
    df["Source_File"] = source_name

    # Find and map the .raw column (the first match is used as the intensity column)
    raw_cols = [col for col in df.columns if ".raw" in col.lower()]
    if raw_cols:
        file_to_raw_column[source_name] = raw_cols[0]
        print(f"  → Intensity column: {raw_cols[0]}")
    else:
        # Without a mapping the consensus calculation returns (None, None)
        # for this file, so make the reason visible here
        print("  ⚠ No '.raw' intensity column found - file will be excluded from comparisons")

    # Identify organism from protein name column: prefer the well-known
    # column names, otherwise fall back to any column mentioning "protein"
    protein_col = next(
        (col for col in ["Protein.Names", "Protein.Group"] if col in df.columns), None
    ) or next((col for col in df.columns if "protein" in col.lower()), None)

    if protein_col is None:
        print("  ⚠ No protein column found - all rows labelled 'Unknown'")

    df["Organism"] = (
        identify_organism_vectorized(df[protein_col]) if protein_col else "Unknown"
    )

    print(f"  → Rows: {len(df):,}")
    all_data.append(df)

# Combine all data into one long table with a fresh 0..n-1 index
data = pd.concat(all_data, ignore_index=True)

print(f"\n✓ Data loaded successfully")
print(f"Total rows: {len(data):,}")
print(f"Total columns: {len(data.columns)}")
print(f"\nSample files: {sorted(data['Source_File'].unique())}")

## 5. Data Overview

Inspect the loaded data structure and organism distribution.

In [None]:
# Show the top five rows of the combined table as a quick sanity check
print("Data Preview:")
data.head(5)

In [None]:
# Count rows per organism label over every loaded file
print("Organism Distribution:")
organism_counts = data["Organism"].value_counts()
print(organism_counts)

# Grand total of labelled rows, printed with thousands separators
total_proteins = organism_counts.sum()
print(f"\nTotal proteins: {total_proteins:,}")

In [None]:
# Organism distribution by sample file: one row per file, one column per organism.
# "Organism" can be categorical; observed=False is passed explicitly so that
# organisms with zero rows still get a column, and the result does not depend
# on the pandas version's default for `observed`.
print("Organism Distribution by Sample:")
data.groupby(['Source_File', 'Organism'], observed=False).size().unstack(fill_value=0)

## 6. Helper Functions for Analysis

Define utility functions for data extraction and normalization.

In [None]:
def get_organism_data(file_data, intensity_col, organism):
    """Return the strictly positive numeric intensities of one organism.

    Args:
        file_data: DataFrame with an "Organism" column and the intensity column.
        intensity_col: Name of the column holding intensities.
        organism: Organism label to select.

    Returns:
        Series of intensities for the selected rows; values that cannot be
        parsed as numbers, are missing, or are <= 0 are dropped.
    """
    selected = file_data.loc[file_data["Organism"] == organism, intensity_col]
    # Unparseable entries become NaN and fail the positivity test below
    numeric = pd.to_numeric(selected, errors="coerce")
    positive = numeric[numeric.gt(0)]
    return positive.dropna()

def get_hela_median(file_data, intensity_col):
    """Return the median positive HeLa intensity used as normalization factor.

    Falls back to 1.0 (i.e. no scaling) when the sample has no usable HeLa
    intensities.
    """
    hela_values = get_organism_data(file_data, intensity_col, 'HeLa')
    if len(hela_values) == 0:
        return 1.0
    return hela_values.median()

def extract_mix_identifier(filename):
    """Derive the mix identifier shared by the E25/E100 files of one mix.

    Every "E<digits>" token (optionally with "-" or "_" around the digits,
    case-insensitive) is removed first. If the remainder contains four
    underscore-separated numbers, that group is the identifier. Otherwise the
    remainder without "report.pg_matrix_" and ".tsv" is used, or the original
    filename when nothing is left.
    """
    stripped = re.compile(r'E[-_]?\d+[-_]?', re.IGNORECASE).sub('', filename)

    numeric_id = re.search(r'(\d+_\d+_\d+_\d+)', stripped)
    if numeric_id is not None:
        return numeric_id.group(1)

    # No numeric signature: fall back to the name minus the report boilerplate
    for fragment in ('report.pg_matrix_', '.tsv'):
        stripped = stripped.replace(fragment, '')
    return stripped or filename

print("✓ Helper functions defined")

## 7. Consensus Protein Calculation

This is the core algorithm that ensures we only compare proteins present in **BOTH** E25 and E100 samples.

### Why Consensus Proteins?
- Prevents "dilution" effect where missing proteins skew fold change medians toward zero
- Ensures true 4-fold difference is captured (E25 ≈ -1.0, E100 ≈ +1.0 in log2 space)
- Only compares proteins with valid intensity measurements in both conditions

In [None]:
def _positive_intensities(sample_rows, protein_col, intensity_col):
    """Return (protein_ids, intensities) for rows with a positive numeric intensity."""
    # Coerce first: non-numeric entries become NaN, and NaN > 0 is False,
    # so one comparison drops both missing and non-positive values
    intensities = pd.to_numeric(sample_rows[intensity_col], errors="coerce")
    keep = (intensities > 0).to_numpy()
    return sample_rows[protein_col][keep], intensities[keep]


def calculate_consensus_fold_changes(data, e25_file, e100_file, organism):
    """Compute HeLa-normalized log2 intensities for proteins found in BOTH samples.

    Only proteins with a positive numeric intensity in the E25 sample and in
    the E100 sample ("consensus" proteins) are kept. Each sample's intensities
    are divided by that sample's own HeLa median (see ``get_hela_median``) and
    log2-transformed. Progress figures are printed along the way.

    Args:
        data: Full DataFrame containing both samples ("Source_File",
            "Organism", a protein identifier column and the intensity columns).
        e25_file: E25 sample file name (a "Source_File" value).
        e100_file: E100 sample file name (a "Source_File" value).
        organism: 'E.coli' or 'Yeast'

    Returns:
        Tuple of (e25_log2_values, e100_log2_values) as NumPy arrays for the
        consensus proteins only. The two arrays are filtered independently and
        are not aligned row-for-row. Returns (None, None) when an intensity
        column is unknown for either file, the organism is absent from either
        sample, no protein identifier column exists, or no protein is shared.
    """
    # The intensity column of each file is recorded in the module-level
    # mapping when the file is loaded
    if e25_file not in file_to_raw_column or e100_file not in file_to_raw_column:
        return None, None

    e25_intensity_col = file_to_raw_column[e25_file]
    e100_intensity_col = file_to_raw_column[e100_file]

    # Get data for each sample
    e25_data = data[data["Source_File"] == e25_file]
    e100_data = data[data["Source_File"] == e100_file]

    # Filter by organism
    e25_org = e25_data[e25_data["Organism"] == organism]
    e100_org = e100_data[e100_data["Organism"] == organism]

    if len(e25_org) == 0 or len(e100_org) == 0:
        return None, None

    # Find protein identifier column
    protein_col = next(
        (col for col in ["Protein.Group", "Protein.Ids", "Protein.Names"] if col in e25_org.columns),
        None
    )

    if protein_col is None:
        return None, None

    # Valid = positive after numeric coercion, the same rule get_organism_data
    # applies to the HeLa intensities used for normalization
    e25_ids, e25_values = _positive_intensities(e25_org, protein_col, e25_intensity_col)
    e100_ids, e100_values = _positive_intensities(e100_org, protein_col, e100_intensity_col)

    # Find consensus proteins present in BOTH
    e25_proteins = set(e25_ids)
    e100_proteins = set(e100_ids)
    consensus_proteins = e25_proteins & e100_proteins

    print(f"  {organism}:")
    print(f"    E25 proteins: {len(e25_proteins)}")
    print(f"    E100 proteins: {len(e100_proteins)}")
    print(f"    Consensus proteins: {len(consensus_proteins)}")

    if len(consensus_proteins) == 0:
        return None, None

    # Filter to consensus proteins only
    e25_intensities = e25_values[e25_ids.isin(consensus_proteins).to_numpy()]
    e100_intensities = e100_values[e100_ids.isin(consensus_proteins).to_numpy()]

    # Calculate HeLa median for each sample (from the whole sample, not just
    # the organism subset)
    e25_hela_median = get_hela_median(e25_data, e25_intensity_col)
    e100_hela_median = get_hela_median(e100_data, e100_intensity_col)

    print(f"    E25 HeLa median: {e25_hela_median:.2e}")
    print(f"    E100 HeLa median: {e100_hela_median:.2e}")

    # Calculate log2 normalized intensities for consensus proteins
    e25_array = np.log2(e25_intensities / e25_hela_median).to_numpy(dtype=float)
    e100_array = np.log2(e100_intensities / e100_hela_median).to_numpy(dtype=float)

    # Filter out invalid values (inf / NaN) as a final safeguard
    e25_valid_vals = e25_array[np.isfinite(e25_array)]
    e100_valid_vals = e100_array[np.isfinite(e100_array)]

    print(f"    E25 median log2: {np.median(e25_valid_vals):.2f}")
    print(f"    E100 median log2: {np.median(e100_valid_vals):.2f}")

    return e25_valid_vals, e100_valid_vals

print("✓ Consensus protein calculation function defined")

## 8. Calculate Consensus Proteins for All Mixes

Identify E25/E100 pairs and calculate consensus proteins for each mix.

In [None]:
# Bucket every loaded sample under the mix identifier parsed from its name,
# so the E25 and E100 files of the same mix end up together
sample_files = sorted(data["Source_File"].unique())
mix_groups = {}

for source_file in sample_files:
    mix_groups.setdefault(extract_mix_identifier(source_file), []).append(source_file)

print("Mix Groups:")
for mix_id, files in sorted(mix_groups.items()):
    print(f"  {mix_id}: {files}")

In [None]:
# For every mix, pair its E25 and E100 files and collect the normalized
# log2 intensities per organism as (label, values, mix_id) tuples
ecoli_results = []
yeast_results = []
sorted_mixes = sorted(mix_groups.keys())

for mix_id in sorted_mixes:
    print(f"\nProcessing mix: {mix_id}")
    mix_samples = sorted(mix_groups[mix_id])

    # Classify each file by its E25 / E100 tag; if several files carry the
    # same tag, the last one in sorted order is used
    e25_file = None
    e100_file = None
    for source_file in mix_samples:
        upper = source_file.upper()
        if re.search(r'E[-_]?25', upper):
            e25_file = source_file
        elif re.search(r'E[-_]?100', upper):
            e100_file = source_file

    if not (e25_file and e100_file):
        print(f"  ⚠ Missing E25 or E100 file")
        continue

    print(f"  E25: {e25_file}")
    print(f"  E100: {e100_file}")

    # Compute both organisms first (E.coli, then Yeast) so the progress
    # output keeps its order, then store whatever succeeded
    computed = [
        (bucket, calculate_consensus_fold_changes(data, e25_file, e100_file, organism))
        for organism, bucket in (("E.coli", ecoli_results), ("Yeast", yeast_results))
    ]
    for bucket, (e25_values, e100_values) in computed:
        if e25_values is not None and e100_values is not None:
            bucket.append(("E25", e25_values, mix_id))
            bucket.append(("E100", e100_values, mix_id))

print(f"\n✓ Consensus calculation complete")
print(f"E.coli results: {len(ecoli_results)} samples")
print(f"Yeast results: {len(yeast_results)} samples")

## 9. Protein ID Bar Chart

Visualize protein ID counts by organism for each sample.

In [None]:
# Calculate protein ID counts: rows per (sample file, organism).
# "Organism" can be categorical; observed=False is passed explicitly so that
# the table shape does not depend on the pandas version's default for
# `observed` (organisms with zero rows keep a column of zeros).
counts = (
    data.groupby(["Source_File", "Organism"], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Put the organism columns into the canonical display order
org_order = ORGANISMS + ["Unknown"]
counts = counts.reindex(
    columns=[col for col in org_order if col in counts.columns], fill_value=0
)

print("Protein ID Counts:")
counts

In [None]:
# Stacked bar chart: one bar per sample, one coloured segment per organism
COLORS = {
    "HeLa": "#9b59b6",
    "E.coli": "#e67e22",
    "Yeast": "#16a085",
    "Unknown": "#95a5a6",
}
plot_colors = [COLORS[organism_name] for organism_name in counts.columns]

fig, ax = plt.subplots(figsize=(12, 7))
bar_style = dict(
    kind="bar",
    stacked=True,
    color=plot_colors,
    edgecolor="black",
    linewidth=0.5,
    alpha=0.8,
)
counts.plot(ax=ax, **bar_style)

# Write each segment's count at its vertical centre; empty segments get no label
label_style = dict(
    ha='center', va='center',
    fontsize=9, fontweight='bold',
    color='white',
)
for bar_index, (_, sample_counts) in enumerate(counts.iterrows()):
    segment_bottom = 0
    for segment_count in sample_counts:
        if segment_count > 0:
            ax.text(
                bar_index,
                segment_bottom + segment_count / 2,
                str(int(segment_count)),
                **label_style,
            )
            segment_bottom += segment_count

# Axis cosmetics
ax.set_xlabel("Sample", fontsize=12, fontweight="bold")
ax.set_ylabel("Number of Protein IDs", fontsize=12, fontweight="bold")
ax.set_title("Protein ID Counts by Organism", fontsize=14, fontweight="bold")
ax.legend(title="Organism", fontsize=10, loc="upper right")
ax.grid(axis="y", alpha=0.3)
ax.tick_params(axis="x", rotation=45, labelbottom=True)
plt.setp(ax.xaxis.get_majorticklabels(), ha="right")
plt.tight_layout()
plt.show()

## 10. Sample Comparison Box Plots

Visualize E25 vs E100 intensity comparisons for E.coli and Yeast using consensus proteins.

In [None]:
def add_mix_separators(ax, mix_boundaries, sorted_mixes, total_samples):
    """Add vertical separator lines and mix labels to plot.

    Args:
        ax: Matplotlib axes to draw on.
        mix_boundaries: Cumulative box counts; entry k is the position of the
            last box of mix k. A separator is drawn after every boundary
            except the last one.
        sorted_mixes: Mix labels, in the same order as ``mix_boundaries``.
        total_samples: Number of boxes on the axes; used to centre the label
            of the final mix.
    """
    # x in data coordinates, y as a fraction of the axes height. Labels stay
    # just below the top edge whatever the y-limits are (including all-negative
    # log2 data) and however they change after this call.
    label_transform = ax.get_xaxis_transform()
    label_style = dict(
        ha="center", va="top", fontsize=9,
        color="#888888", fontweight="bold",
        transform=label_transform,
    )
    label_y = 0.95
    prev_boundary = 0

    for idx, boundary in enumerate(mix_boundaries[:-1]):
        # Boxes sit on integer positions, so the gap between two mixes is at +0.5
        ax.axvline(x=boundary + 0.5, color="#555555", linestyle="-",
                   linewidth=2, alpha=0.8)

        mid_point = (prev_boundary + boundary) / 2 + 0.5
        if idx < len(sorted_mixes):
            ax.text(mid_point, label_y, sorted_mixes[idx], **label_style)
        prev_boundary = boundary

    # The final mix has no trailing separator; centre its label over the
    # remaining boxes
    if sorted_mixes:
        mid_point = (prev_boundary + total_samples) / 2 + 0.5
        ax.text(mid_point, label_y, sorted_mixes[-1], **label_style)

def plot_organism_comparison(ax, results, mix_boundaries, sorted_mixes,
                             title, colors, label_map):
    """Draw one organism's E25/E100 box plots on the given axes.

    Args:
        ax: Matplotlib axes to draw on.
        results: Sequence of tuples whose first item is the sample label
            ("E25" / "E100") and whose second item is the array of values.
        mix_boundaries: Passed through to ``add_mix_separators``.
        sorted_mixes: Passed through to ``add_mix_separators``.
        title: Axes title.
        colors: Pair of colours; the first is used for "E25" boxes and for
            outlier markers, the second for all other boxes.
        label_map: Maps sample labels to the tick labels to display.
    """
    sample_labels = [entry[0] for entry in results]
    value_arrays = [entry[1] for entry in results]
    tick_labels = [label_map.get(name, name) for name in sample_labels]
    positions = np.arange(1, len(value_arrays) + 1)

    e25_color, other_color = colors[0], colors[1]
    outlier_style = dict(
        marker="o", markerfacecolor=e25_color, markersize=3,
        alpha=0.4, markeredgecolor="none"
    )
    mean_style = dict(
        marker="s", markerfacecolor="white", markeredgecolor="white", markersize=5
    )
    bp = ax.boxplot(
        value_arrays,
        positions=positions,
        widths=0.6,
        patch_artist=True,
        showfliers=True,
        showmeans=True,
        flierprops=outlier_style,
        meanprops=mean_style,
    )

    # Colour each box by its sample label
    for box, name in zip(bp["boxes"], sample_labels):
        box.set_facecolor(e25_color if name == "E25" else other_color)
        box.set_alpha(0.7)
        box.set_edgecolor("white")
        box.set_linewidth(1.5)

    for part in ("whiskers", "caps"):
        plt.setp(bp[part], color="white", linewidth=1.5)
    plt.setp(bp["medians"], color="#2c3e50", linewidth=2.5)

    # Print the median value to the right of every box
    for i, values in enumerate(value_arrays):
        median_val = np.median(values)
        ax.text(
            i + 1.35, median_val, f"{median_val:.2f}",
            fontsize=9, va="center", color="white", fontweight="bold"
        )

    add_mix_separators(ax, mix_boundaries, sorted_mixes, len(value_arrays))

    # Dashed line at log2 = 0, i.e. equal to the HeLa median
    ax.axhline(y=0, color="#f39c12", linestyle="--", linewidth=2, alpha=0.9,
               label="Reference (1:1)")
    ax.set_ylabel("Log2 Intensity (HeLa-Normalized)", fontsize=12, fontweight="bold")
    ax.set_xlabel("Sample", fontsize=12, fontweight="bold")
    ax.set_title(title, fontsize=14, fontweight="bold")
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels, rotation=0, ha="center", fontsize=10)
    ax.grid(axis="y", alpha=0.3)
    ax.legend(fontsize=9)

print("✓ Plotting functions defined")

In [None]:
# Calculate mix boundaries for visualization: the cumulative number of boxes
# after each mix that has BOTH an E25 and an E100 file.
# NOTE(review): a paired mix can still be missing from an organism's results
# (e.g. no consensus proteins), so these are upper bounds on the plotted boxes.
mix_boundaries = []
current_position = 0

for mix_id in sorted_mixes:
    # Classify the files with the same patterns (and the same E25-before-E100
    # precedence) used when pairing samples, not plain substring tests
    has_e25 = False
    has_e100 = False
    for source_name in mix_groups[mix_id]:
        upper_name = source_name.upper()
        if re.search(r'E[-_]?25', upper_name):
            has_e25 = True
        elif re.search(r'E[-_]?100', upper_name):
            has_e100 = True

    # Each complete mix contributes 2 positions (E25 and E100); incomplete
    # mixes produce no boxes and therefore no boundary
    if has_e25 and has_e100:
        current_position += 2
        mix_boundaries.append(current_position)

print(f"Mix boundaries: {mix_boundaries}")

In [None]:
# Create comparison plots
def _mix_layout(results):
    """Return (mix_ids, boundaries) for consecutive runs of one mix in results.

    ``results`` holds (label, values, mix_id) tuples; ``boundaries[k]`` is the
    1-based position of the last box belonging to ``mix_ids[k]``.
    """
    mix_ids, boundaries = [], []
    for position, (_, _, mix_id) in enumerate(results, start=1):
        if mix_ids and mix_ids[-1] == mix_id:
            boundaries[-1] = position
        else:
            mix_ids.append(mix_id)
            boundaries.append(position)
    return mix_ids, boundaries


if ecoli_results or yeast_results:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7))

    # Separators and mix labels are derived from each organism's own results:
    # a mix can yield results for one organism but not the other, so a single
    # shared boundary list would misplace them.

    # Plot E.coli comparison
    if ecoli_results:
        ecoli_mixes, ecoli_boundaries = _mix_layout(ecoli_results)
        plot_organism_comparison(
            ax1, ecoli_results, ecoli_boundaries, ecoli_mixes,
            title="E.coli Intensity Comparison",
            colors=("#e67e22", "#3498db"),
            label_map={"E25": "E25", "E100": "E100"}
        )
    else:
        ax1.text(0.5, 0.5, "No E.coli data", ha="center", va="center",
                transform=ax1.transAxes, fontsize=14)

    # Plot Yeast comparison (the E25 sample is displayed as Y150, E100 as Y75)
    if yeast_results:
        yeast_mixes, yeast_boundaries = _mix_layout(yeast_results)
        plot_organism_comparison(
            ax2, yeast_results, yeast_boundaries, yeast_mixes,
            title="Yeast Intensity Comparison",
            colors=("#16a085", "#9b59b6"),
            label_map={"E25": "Y150", "E100": "Y75"}
        )
    else:
        ax2.text(0.5, 0.5, "No Yeast data", ha="center", va="center",
                transform=ax2.transAxes, fontsize=14)

    plt.suptitle(
        "E25 vs E100 Intensity Comparison by Mix (HeLa-Normalized)",
        fontsize=15,
        fontweight="bold",
        y=0.98
    )
    plt.tight_layout()
    plt.show()
else:
    print("⚠ No comparison data available")

## 11. Statistical Summary

Calculate and display statistical summaries for validation.

In [None]:
# E.coli statistics: descriptive summary of each sample's normalized log2 values
if not ecoli_results:
    print("No E.coli results")
else:
    print("E.coli Statistics:")
    print("=" * 60)
    for label, data_arr, mix_id in ecoli_results:
        print(f"\n{mix_id} - {label}:")
        print(f"  Count: {len(data_arr)}")
        # Location and spread first, then the quartiles
        for caption, statistic in (("Median", np.median), ("Mean", np.mean), ("Std", np.std)):
            print(f"  {caption}: {statistic(data_arr):.3f}")
        for caption, percent in (("Q1", 25), ("Q3", 75)):
            print(f"  {caption}: {np.percentile(data_arr, percent):.3f}")

In [None]:
# Yeast statistics: same summary as for E.coli, with samples shown under
# their yeast labels (E25 -> Y150, everything else -> Y75)
if not yeast_results:
    print("No Yeast results")
else:
    print("Yeast Statistics:")
    print("=" * 60)
    for label, data_arr, mix_id in yeast_results:
        if label == "E25":
            display_label = "Y150"
        else:
            display_label = "Y75"
        print(f"\n{mix_id} - {display_label}:")
        print(f"  Count: {len(data_arr)}")
        # Location and spread first, then the quartiles
        for caption, statistic in (("Median", np.median), ("Mean", np.mean), ("Std", np.std)):
            print(f"  {caption}: {statistic(data_arr):.3f}")
        for caption, percent in (("Q1", 25), ("Q3", 75)):
            print(f"  {caption}: {np.percentile(data_arr, percent):.3f}")

## 12. Validation Checks

Verify expected fold change patterns:
- **E.coli**: E25 should be ~-1.0, E100 should be ~+1.0 (4-fold difference in log2 space)
- **Yeast**: Y150 should be higher than Y75 (2:1 ratio)

In [None]:
def _medians_for(results, wanted_label):
    """Return the median of every result array stored under ``wanted_label``."""
    return [np.median(entry[1]) for entry in results if entry[0] == wanted_label]


print("Validation Summary:")
print("=" * 60)

if ecoli_results:
    e25_ecoli_medians = _medians_for(ecoli_results, "E25")
    e100_ecoli_medians = _medians_for(ecoli_results, "E100")

    print("\nE.coli Expected Pattern:")
    print(f"  E25 median range: {min(e25_ecoli_medians):.2f} to {max(e25_ecoli_medians):.2f}")
    print(f"  E100 median range: {min(e100_ecoli_medians):.2f} to {max(e100_ecoli_medians):.2f}")
    print(f"  Expected: E25 ≈ -1.0, E100 ≈ +1.0")

    # Gap between the mean E100 median and the mean E25 median
    avg_diff = np.mean(e100_ecoli_medians) - np.mean(e25_ecoli_medians)
    print(f"  Average difference: {avg_diff:.2f} (expected ≈ 2.0 for 4-fold)")

if yeast_results:
    # Yeast samples are stored under the E25/E100 labels of their files
    y150_medians = _medians_for(yeast_results, "E25")   # Y150
    y75_medians = _medians_for(yeast_results, "E100")   # Y75

    print("\nYeast Expected Pattern:")
    print(f"  Y150 median range: {min(y150_medians):.2f} to {max(y150_medians):.2f}")
    print(f"  Y75 median range: {min(y75_medians):.2f} to {max(y75_medians):.2f}")
    print(f"  Expected: Y150 > Y75 (2:1 ratio)")

    # Gap between the mean Y150 median and the mean Y75 median
    avg_diff = np.mean(y150_medians) - np.mean(y75_medians)
    print(f"  Average difference: {avg_diff:.2f} (expected ≈ 1.0 for 2-fold)")

print("\n✓ Analysis complete!")

## Notes

### Expected Results
- **E.coli**: 4-fold spike-in ratio (E100 carries 4× the E.coli of E25, so the two medians should sit log2(4) = 2.0 apart)
  - E25: half the 1:1 reference amount → log2(0.5) = -1.0
  - E100: twice the 1:1 reference amount → log2(2) = +1.0
  
- **Yeast**: 2-fold spike-in ratio
  - Y150 (E25): 150 fmol
  - Y75 (E100): 75 fmol
  - Expect Y150 > Y75 by approximately 1.0 in log2 space

### Consensus Protein Algorithm
The consensus protein approach ensures accurate fold change measurements by:
1. Identifying proteins with valid intensity in BOTH E25 and E100
2. Calculating HeLa median independently for each sample
3. Normalizing only the consensus proteins
4. Preventing dilution effects from missing values

### Next Steps
- Use this notebook to validate new data files
- Test algorithm changes before updating the web app
- Export results to CSV for further analysis
- Document unexpected patterns or outliers