In [1]:
# this cell is tagged parameters
# Every value defaults to None. Presumably a parameter-injection tool (e.g. papermill)
# overrides them at run time -- TODO confirm.

# Directory that the following cell prepends to sys.path before importing BenchmarkingRoutines.
PYLIB_DIR = None

########################
# inputs for quant-only
########################

# Reference info
# REF_gtf_file is paired with every program below that has no *_gtf_file parameter of its own.
REF_gtf_file = None
# NOTE(review): REF_quant_file is not read by any cell visible in this notebook -- confirm it is still needed.
REF_quant_file = None

# Predictions
# A program whose *_quant_file stays None is skipped when the quant tables are loaded.
FLAMES_gtf_file = None
FLAMES_quant_file = None

IsoQuant_quant_file = None

IsoSeq_gtf_file = None
IsoSeq_quant_file = None

LRAA_quant_file = None

Mandalorion_gtf_file = None
Mandalorion_quant_file = None

Oarfish_align_quant_file = None

Oarfish_reads_quant_file = None

Bambu_quant_file = None

ESPRESSO_quant_file = None

FLAIR_quant_file = None

Isosceles_gtf_file = None
Isosceles_quant_file = None

StringTie_quant_file = None

TALON_gtf_file = None
TALON_quant_file = None

In [None]:
import sys, os, re

# Make the helper library directory importable.
# - Skip the step when PYLIB_DIR was left unset, so that None is never placed on sys.path.
# - Accept path-like objects as well as plain strings (sys.path entries should be strings).
# - Drop any earlier copy of the entry first, so that re-running this cell keeps the
#   directory at the front of sys.path without piling up duplicates.
if PYLIB_DIR is not None:
    pylib_path = os.fspath(PYLIB_DIR)
    sys.path[:] = [entry for entry in sys.path if entry != pylib_path]
    sys.path.insert(0, pylib_path)


In [None]:
# Import the module object first: reload() needs it as its argument.
import BenchmarkingRoutines
from importlib import reload
# Re-execute the module so that edits made on disk are picked up without restarting the kernel.
reload(BenchmarkingRoutines)
# Star-import only after the reload, so the notebook namespace is bound to the refreshed definitions.
# NOTE(review): helpers called later without a definition in this notebook (set_color_palette,
# parseGTFtoIntronIDsandQuants, get_clusters, ...) presumably come from this import -- confirm.
from BenchmarkingRoutines import *

In [None]:
import warnings
# Install a blanket "ignore" filter: every warning category is silenced from here on,
# presumably to keep the cell outputs clean.
# NOTE(review): this also hides deprecation and numerical warnings raised by the cells below.
warnings.filterwarnings('ignore')

In [None]:
# colors for plots
# One (program name, color name) entry per benchmarked program, registered in this order.
_PROGRAM_COLORS = (
    ("FLAMES", "gainsboro"),
    ("IsoQuant", "blue"),
    ("IsoSeq", "orchid"),
    ("LRAA", "teal"),
    ("Mandalorion", "lightblue"),
    ("Oarfish_align", "khaki"),
    ("Oarfish_reads", "peachpuff"),
    ("Bambu", "forestgreen"),
    ("ESPRESSO", "brown"),
    ("FLAIR", "pink"),
    ("Isosceles", "red"),
    ("StringTie", "aquamarine"),
    ("TALON", "orange"),
)

# Every program is registered with the same third argument, "solid".
for _program, _color in _PROGRAM_COLORS:
    set_color_palette(_program, _color, "solid")


In [None]:
quant_only_dir = "processed_prog_results"

# Programs that come with a GTF parameter of their own; every other program is
# paired with the reference GTF.
_own_gtf_by_program = {
    "FLAMES": FLAMES_gtf_file,
    "IsoSeq": IsoSeq_gtf_file,
    "Mandalorion": Mandalorion_gtf_file,
    "Isosceles": Isosceles_gtf_file,
    "TALON": TALON_gtf_file,
}

# (program name, quant TSV) in the order the programs are loaded and reported.
_quant_file_by_program = [
    ("FLAMES", FLAMES_quant_file),
    ("IsoQuant", IsoQuant_quant_file),
    ("IsoSeq", IsoSeq_quant_file),
    ("LRAA", LRAA_quant_file),
    ("Mandalorion", Mandalorion_quant_file),
    ("Oarfish_align", Oarfish_align_quant_file),
    ("Oarfish_reads", Oarfish_reads_quant_file),
    ("ESPRESSO", ESPRESSO_quant_file),
    ("FLAIR", FLAIR_quant_file),
    ("Isosceles", Isosceles_quant_file),
    ("Bambu", Bambu_quant_file),
    ("StringTie", StringTie_quant_file),
    ("TALON", TALON_quant_file),
]

# program name -> [quant TSV, GTF]
prog_quant_files = {
    _name: [_quant_file, _own_gtf_by_program.get(_name, REF_gtf_file)]
    for _name, _quant_file in _quant_file_by_program
}


fullQuantsDf_dict = {}
for progname, quant_and_gtf in prog_quant_files.items():
    tsv_fname, gtf_fname = quant_and_gtf
    # Only programs whose quant file parameter was provided are loaded.
    if tsv_fname is not None:
        print(progname, tsv_fname, gtf_fname)
        intron_quants = parseGTFtoIntronIDsandQuants(gtf_fname, tsv_fname)
        fullQuantsDf_dict[progname] = indexDfByIntronId(intron_quants)


# Write the per-program tables that were loaded above to a single TSV.
progname_to_i_sample_df_dict_to_tsv(fullQuantsDf_dict, "progname_to_IntronId_expr_vals.tsv")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Build the expression matrix from your dictionary
def build_expression_matrix(fullQuantsDf_dict):
    """
    Assemble one TPM table covering all programs.

    Parameters:
    fullQuantsDf_dict: dict mapping a program name to a pandas DataFrame that
                      has a 'tpm' column; the frame's index supplies the row labels

    Returns:
    pandas.DataFrame: one column per program (in dict order), one row per index
                      label seen in any input; labels a program lacks are set to 0
    """
    # Pull out just the 'tpm' Series of each program, keyed by program name.
    tpm_by_program = {name: quants['tpm'] for name, quants in fullQuantsDf_dict.items()}

    # Series are aligned on their index when combined; gaps left by the alignment become 0.
    return pd.DataFrame(tpm_by_program).fillna(0)

# Step 2: Calculate correlation matrix
def calculate_correlation_matrix(expression_matrix, method='pearson'):
    """
    Correlate the columns of an expression matrix against each other.

    Parameters:
    expression_matrix: pandas.DataFrame, one column per program
    method: str, passed through to DataFrame.corr ('pearson', 'spearman', 'kendall')

    Returns:
    pandas.DataFrame: square matrix of column-vs-column correlations
    """
    # DataFrame.corr works column-wise, i.e. program against program.
    return expression_matrix.corr(method=method)

# Step 3: Visualize the correlation matrix
def plot_correlation_heatmap(correlation_matrix, figsize=(10, 8), title='Program Expression Correlation'):
    """
    Draw an annotated heatmap of a correlation matrix and show it.

    Parameters:
    correlation_matrix: pandas.DataFrame, correlation matrix
    figsize: tuple, figure size
    title: str, plot title
    """
    fig, ax = plt.subplots(figsize=figsize)

    heatmap_options = dict(
        annot=True,                                     # write the value into each cell
        cmap='coolwarm',                                # diverging color scheme
        center=0,                                       # colormap midpoint at 0
        square=True,                                    # square cells
        fmt='.3f',                                      # three decimals per annotation
        cbar_kws={'label': 'Correlation Coefficient'},
    )
    sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)

    ax.set_title(title, fontsize=14, pad=20)
    ax.set_xlabel('Programs', fontsize=12)
    ax.set_ylabel('Programs', fontsize=12)

    # Slant the column labels; keep the row labels horizontal.
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    plt.tight_layout()
    plt.show()

# Step 4: Additional analysis functions
def get_correlation_summary(correlation_matrix):
    """
    Summarize the correlations between distinct pairs of programs.

    Only the strict upper triangle of the matrix is used, so the diagonal is
    excluded and each pair is counted once. NaN entries are ignored.

    Parameters:
    correlation_matrix: pandas.DataFrame, correlation matrix

    Returns:
    dict with the keys 'mean_correlation', 'median_correlation',
    'std_correlation' (population standard deviation), 'min_correlation' and
    'max_correlation'. Every value is NaN when there is no usable pair, e.g.
    when the matrix covers fewer than two programs.
    """
    values = np.asarray(correlation_matrix, dtype=float)

    # Index the strict upper triangle directly. This avoids DataFrame.stack(),
    # whose handling of the masked NaN cells depends on which stack
    # implementation pandas uses.
    rows, cols = np.triu_indices(values.shape[0], k=1, m=values.shape[1])
    correlations = values[rows, cols]
    correlations = correlations[~np.isnan(correlations)]

    # np.min / np.max raise on an empty array, so report NaN instead of crashing.
    if correlations.size == 0:
        return {
            'mean_correlation': np.nan,
            'median_correlation': np.nan,
            'std_correlation': np.nan,
            'min_correlation': np.nan,
            'max_correlation': np.nan,
        }

    return {
        'mean_correlation': np.mean(correlations),
        'median_correlation': np.median(correlations),
        'std_correlation': np.std(correlations),
        'min_correlation': np.min(correlations),
        'max_correlation': np.max(correlations),
    }

def find_highly_correlated_pairs(correlation_matrix, threshold=0.8):
    """
    List the program pairs whose absolute correlation is at least `threshold`.

    Returns a pandas.Series indexed by (row program, column program) and
    sorted from the highest to the lowest signed correlation.
    """
    # Keep only the cells strictly above the diagonal so each pair appears once.
    strictly_upper = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
    pairwise = correlation_matrix.mask(~strictly_upper).stack()

    # Strong negative correlations qualify as well, hence the absolute value.
    is_strong = pairwise.abs() >= threshold
    return pairwise[is_strong].sort_values(ascending=False)



In [None]:
# Main execution example:

# Assemble the per-program TPM matrix from the tables loaded earlier
expression_matrix = build_expression_matrix(fullQuantsDf_dict)

# Report the matrix dimensions and preview its top-left 5x5 corner
print(
    f"Expression matrix shape: {expression_matrix.shape}",
    f"Number of introns: {expression_matrix.shape[0]}",
    f"Number of programs: {expression_matrix.shape[1]}",
    "\nFirst few rows and columns:",
    expression_matrix.iloc[:5, :5],
    sep="\n",
)



In [None]:
# Program-vs-program Pearson correlation of the TPM columns
correlation_matrix = calculate_correlation_matrix(expression_matrix, method='pearson')

# Report the shape, then the full matrix
print(
    f"\nCorrelation matrix shape: {correlation_matrix.shape}",
    "\nCorrelation matrix:",
    correlation_matrix,
    sep="\n",
)



In [None]:
# Plot correlation heatmap
# Called with the function's defaults: figsize (10, 8) and the title 'Program Expression Correlation'.
plot_correlation_heatmap(correlation_matrix)



In [None]:
# Summary statistics over the pairwise program correlations
summary = get_correlation_summary(correlation_matrix)
print("\nCorrelation Summary:")

# One "name: value" line per statistic, four decimals each
summary_lines = [f"{key}: {value:.4f}" for key, value in summary.items()]
if summary_lines:
    print("\n".join(summary_lines))



In [None]:
# Find highly correlated pairs
# The cutoff is defined once so the call and both messages below always agree.
HIGH_CORR_THRESHOLD = 0.8

high_corr_pairs = find_highly_correlated_pairs(correlation_matrix, threshold=HIGH_CORR_THRESHOLD)
if not high_corr_pairs.empty:
    print(f"\nHighly correlated pairs (|r| >= {HIGH_CORR_THRESHOLD}):")
    # The Series is indexed by (program, program) pairs.
    for (prog1, prog2), corr in high_corr_pairs.items():
        print(f"{prog1} - {prog2}: {corr:.4f}")
else:
    print(f"\nNo highly correlated pairs found (threshold = {HIGH_CORR_THRESHOLD})")



In [None]:
# Simple dendrogram
# NOTE(review): plot_dendrogram and plot_dendrogram_with_heatmap are not defined in this
# notebook; presumably they are provided by the BenchmarkingRoutines star import -- confirm.
print("\nPlotting dendrogram...")
# Both calls use method='average'. The returned values are bound to names, but no cell
# visible in this notebook reads them afterwards.
linkage_matrix, dendro = plot_dendrogram(correlation_matrix, method='average')

# Dendrogram with clustered heatmap
print("\nPlotting clustered heatmap with dendrograms...")
# The second returned value is discarded.
reordered_correlation, _ = plot_dendrogram_with_heatmap(correlation_matrix, method='average')



In [None]:
# Assign each program to a cluster (method='average', distance_threshold=0.5)
print("\nAnalyzing clusters...")
cluster_assignments = get_clusters(correlation_matrix, method='average', distance_threshold=0.5)

# One indented line per program with the cluster it was assigned to
print(f"\nCluster assignments:")
assignment_lines = [f"  {program}: Cluster {cluster}" for program, cluster in cluster_assignments.items()]
if assignment_lines:
    print("\n".join(assignment_lines))



In [None]:
# Per-cluster report: size, members and within-cluster correlation statistics
cluster_analysis = analyze_clusters(cluster_assignments, correlation_matrix)
print(f"\nCluster Analysis:")
for cluster_id, analysis in cluster_analysis.items():
    report_lines = [
        f"\nCluster {cluster_id}:",
        f"  Size: {analysis['size']} programs",
        f"  Programs: {', '.join(analysis['programs'])}",
    ]
    # The within-cluster statistics are only reported for clusters with more than one member.
    if analysis['size'] > 1:
        report_lines += [
            f"  Mean within-cluster correlation: {analysis['mean_within_correlation']:.4f}",
            f"  Min within-cluster correlation: {analysis['min_within_correlation']:.4f}",
            f"  Max within-cluster correlation: {analysis['max_within_correlation']:.4f}",
        ]
    print("\n".join(report_lines))



In [None]:
# Same clustering call, but asking for a fixed number of clusters (4)
# instead of cutting at a distance threshold
print(f"\n" + "-"*30, "Alternative clustering (4 clusters):", sep="\n")
cluster_assignments_4 = get_clusters(correlation_matrix, method='average', n_clusters=4)

alternative_lines = [f"  {program}: Cluster {cluster}" for program, cluster in cluster_assignments_4.items()]
if alternative_lines:
    print("\n".join(alternative_lines))



In [None]:
# Save results
# Each table is written to a CSV file in the current working directory.
# NOTE(review): cluster_assignments must provide .to_csv (e.g. a pandas Series);
# a plain dict would fail on the last write -- confirm what get_clusters returns.
_outputs = (
    (expression_matrix, 'expression_matrix.csv'),
    (correlation_matrix, 'correlation_matrix.csv'),
    (cluster_assignments, 'cluster_assignments.csv'),
)
for _table, _csv_path in _outputs:
    _table.to_csv(_csv_path)