In [None]:
# this cell is tagged parameters
# (presumably so a notebook runner such as papermill can override these defaults - confirm)

# Directory prepended to sys.path before BenchmarkingRoutines is imported.
PYLIB_DIR = None

########################
# inputs for quant-only
########################

# Reference info
# REF_gtf_file + REF_quant_file are parsed together into the reference intron-ID table.
# Intron IDs found in REF_reduced_gtf_file are flagged downsampled=True on that table.
REF_gtf_file = None
REF_quant_file = None
REF_reduced_gtf_file = None

# Predictions
# One GTF / quant pair per program. A program whose *_quant_file is left as None
# is skipped when the per-program tables are loaded.
FLAMES_gtf_file = None
FLAMES_quant_file = None

IsoQuant_gtf_file = None
IsoQuant_quant_file = None

IsoSeq_gtf_file = None
IsoSeq_quant_file = None

LRAA_gtf_file = None
LRAA_quant_file = None

Mandalorion_gtf_file = None
Mandalorion_quant_file = None

Bambu_gtf_file = None
Bambu_quant_file = None

ESPRESSO_gtf_file = None
ESPRESSO_quant_file = None

FLAIR_gtf_file = None
FLAIR_quant_file = None

Isosceles_gtf_file = None
Isosceles_quant_file = None

StringTie_gtf_file = None
StringTie_quant_file = None

TALON_gtf_file = None
TALON_quant_file = None

# for de-novo Venn mode - not counting non-unique non-reference splice patterns as FPs
# When True, such intron IDs are removed from every program's table before the
# sensitivity / TPR / FDR cells run.
IGNORE_NONUNIQUE_NONREF = False

In [None]:
import sys, os, re
# Imported explicitly rather than relying on the star import from
# BenchmarkingRoutines to supply these names to the cells that use them.
import csv

import pandas as pd

# Only extend the module search path when a library directory was supplied:
# inserting None would leave a non-string entry in sys.path, which the import
# system ignores but which can break tools that treat every entry as a path.
if PYLIB_DIR is not None:
    sys.path.insert(0, PYLIB_DIR)

In [None]:
# Import the module object first so that it can be passed to reload().
import BenchmarkingRoutines
from importlib import reload
# Re-execute the module so that edits made to it after the kernel first imported
# it are picked up when this cell is re-run.
reload(BenchmarkingRoutines)
# Star import: presumably the source of the helper functions that the rest of
# the notebook calls unqualified (set_color_palette, indexDfByIntronId, ...) -
# confirm against BenchmarkingRoutines.
from BenchmarkingRoutines import *

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:

# (program name, color) pairs, registered in this order; every program is
# registered with "solid" as the third argument.
_PROGRAM_COLORS = [
    ("FLAMES", "gainsboro"),
    ("IsoQuant", "blue"),
    ("IsoSeq", "orchid"),
    ("LRAA", "teal"),
    ("Mandalorion", "lightblue"),
    ("ESPRESSO", "brown"),
    ("Bambu", "forestgreen"),
    ("FLAIR", "pink"),
    ("Isosceles", "red"),
    ("StringTie", "aquamarine"),
    ("TALON", "orange"),
]

for _palette_prog, _palette_color in _PROGRAM_COLORS:
    set_color_palette(_palette_prog, _palette_color, "solid")


In [None]:
# Shared by every GTF parse in this notebook so that reference and prediction
# intron IDs are built the same way.
include_strand_in_intronId = False

# Parse the reference GTF together with its quant file, then index the result
# by intron ID.
_ref_parsed_df = parseGTFtoIntronIDsandQuants(
    REF_gtf_file,
    REF_quant_file,
    include_strand_in_intronId=include_strand_in_intronId,
)
i_ref_df = indexDfByIntronId(_ref_parsed_df)
i_ref_df

In [None]:
# Rescale the reference "tpm" column so that it sums to one million.
_ref_tpm_total = i_ref_df["tpm"].sum()
i_ref_df["tpm"] = i_ref_df["tpm"].div(_ref_tpm_total).mul(1e6)

In [None]:
# Intron IDs present in the reduced reference GTF, each flagged downsampled=True.
downsampled_gtf = REF_reduced_gtf_file
_reduced_ref_parsed_df = parseGTFtoIntronIDs(
    downsampled_gtf,
    include_strand=include_strand_in_intronId,
)
i_downsampled_gtf_df = indexDfByIntronId(_reduced_ref_parsed_df)
i_downsampled_gtf_df['downsampled'] = True


In [None]:
# Attach the 'downsampled' flag to the reference table (left join on the index).
# Any 'downsampled' column left over from a previous run of this cell is dropped
# first; otherwise re-running the cell fails because the joined column name
# overlaps an existing one.
i_ref_df = (
    i_ref_df
    .drop(columns=['downsampled'], errors='ignore')
    .join(i_downsampled_gtf_df['downsampled'])
)
# Reference rows with no match in the reduced GTF get downsampled=False.
# NOTE(review): this fills every NaN in the frame, not only the flag column, so
# missing values in other columns also become False - confirm that is intended.
i_ref_df = i_ref_df.fillna(False)

In [None]:
# Preview the first rows of the reference table after the 'downsampled' flag was added.
i_ref_df.head()

In [None]:
# Write the reference table to a TSV with the index turned back into ordinary
# column(s). reset_index() already returns a new frame and leaves i_ref_df
# untouched, so no explicit copy is needed beforehand.
i_ref_df.reset_index().to_csv("refDf.intron_ids_and_expression.tsv", sep="\t", index=False)

In [None]:
downsampled_quant_files_dir = "processed_prog_results"
downsampled_gtf_files_dir = "raw_prog_results"

# program name -> [quant file, GTF file]; the insertion order here is the order
# in which programs are loaded into downsampled_dict.
prog_quant_files = {
    "FLAMES": [FLAMES_quant_file, FLAMES_gtf_file],
    "IsoQuant": [IsoQuant_quant_file, IsoQuant_gtf_file],
    "IsoSeq": [IsoSeq_quant_file, IsoSeq_gtf_file],
    "LRAA": [LRAA_quant_file, LRAA_gtf_file],
    "Mandalorion": [Mandalorion_quant_file, Mandalorion_gtf_file],
    "ESPRESSO": [ESPRESSO_quant_file, ESPRESSO_gtf_file],
    "FLAIR": [FLAIR_quant_file, FLAIR_gtf_file],
    "Isosceles": [Isosceles_quant_file, Isosceles_gtf_file],
    "Bambu": [Bambu_quant_file, Bambu_gtf_file],
    "StringTie": [StringTie_quant_file, StringTie_gtf_file],
    "TALON": [TALON_quant_file, TALON_gtf_file],
}

# program name -> prediction table indexed by intron ID.
downsampled_dict = {}
for progname, _prog_files in prog_quant_files.items():
    tsv_fname, gtf_fname = _prog_files

    # Only programs with a quant file configured are loaded.
    # NOTE(review): the GTF path is not checked here; a program configured with
    # a quant file but no GTF is passed on to the parser as-is.
    if tsv_fname is not None:
        print(progname, gtf_fname, tsv_fname)
        _prog_parsed_df = parseGTFtoIntronIDsandQuants(
            gtf_fname,
            tsv_fname,
            include_strand_in_intronId=include_strand_in_intronId,
        )
        downsampled_dict[progname] = indexDfByIntronId(_prog_parsed_df)

# Write out the loaded per-program tables.
progname_to_i_sample_df_dict_to_tsv(downsampled_dict, "progname_to_IntronId_expr_vals.tsv")

In [None]:
# Passes the reference table and all loaded per-program tables to scatterplot_adj
# (presumably reference-vs-predicted expression scatterplots - confirm in
# BenchmarkingRoutines).
scatterplot_adj(i_ref_df, downsampled_dict)

In [None]:
# cor_spearman_barplot is given the reference table and the per-program tables;
# the table it returns is saved as a tab-separated file.
# (presumably Spearman correlation of expression per program - confirm)
spearman_df = cor_spearman_barplot(i_ref_df, downsampled_dict)
# csv.QUOTE_NONE: fields are written without any quoting.
spearman_df.to_csv("spearman_expr_cor.tsv", sep="\t", quoting=csv.QUOTE_NONE)

In [None]:
# cor_pearson_barplot is given the reference table and the per-program tables;
# the table it returns is saved as a tab-separated file.
# (presumably Pearson correlation of expression per program - confirm)
pearson_df = cor_pearson_barplot(i_ref_df, downsampled_dict)
# csv.QUOTE_NONE: fields are written without any quoting.
pearson_df.to_csv("pearson_expr_cor.tsv", sep="\t", quoting=csv.QUOTE_NONE)

In [None]:
# rel_diff_barplot called with 'median' as its third argument (presumably the
# statistic used to summarise relative differences - confirm); the returned
# table is saved as a tab-separated file.
median_rel_diff_df = rel_diff_barplot(i_ref_df, downsampled_dict, 'median')
# csv.QUOTE_NONE: fields are written without any quoting.
median_rel_diff_df.to_csv("median_rel_diff.tsv", sep="\t", quoting=csv.QUOTE_NONE)

In [None]:
# Same call as the 'median' variant but with 'mean' as the third argument; the
# returned table is saved as a tab-separated file.
mean_rel_diff_df = rel_diff_barplot(i_ref_df, downsampled_dict, 'mean')
# csv.QUOTE_NONE: fields are written without any quoting.
mean_rel_diff_df.to_csv("mean_rel_diff.tsv", sep="\t", quoting=csv.QUOTE_NONE)

In [None]:
# Relative-difference vs expression-percentile plot over all reference rows,
# using 'median'. The final string is passed as a label/title argument.
# NOTE(review): the meaning of the literal 33 is not visible here - presumably a
# bin or window setting; confirm in BenchmarkingRoutines.
rel_diff_vs_expr_percentile_plot(i_ref_df, downsampled_dict, 33, 'median',
                                 'all ref-reduced sets, all ref transcripts')

In [None]:
# Same plot call as the 'median' variant over all reference rows, but using 'mean'.
# NOTE(review): the meaning of the literal 33 is not visible here - presumably a
# bin or window setting; confirm in BenchmarkingRoutines.
rel_diff_vs_expr_percentile_plot(i_ref_df, downsampled_dict, 33, 'mean',
                                 'all ref-reduced sets, all ref transcripts')

In [None]:
# Reference intron IDs whose 'downsampled' flag is False, i.e. IDs that were NOT
# found in the reduced reference GTF. Despite the variable name, these are the
# IDs missing from the reduced reference; later cells pass them as the
# restricted / novel ID set.
_flag_is_false = i_ref_df['downsampled'].eq(False)
downsampled_kept_intron_ids = i_ref_df.loc[_flag_is_false].index

In [None]:
# Same 'median' plot call, restricted through intron_ids_use to
# downsampled_kept_intron_ids (reference intron IDs whose 'downsampled' flag is False).
# NOTE(review): the meaning of the literal 33 is not visible here - confirm in
# BenchmarkingRoutines.
rel_diff_vs_expr_percentile_plot(i_ref_df, downsampled_dict, 33, 'median', 
                                 'downsampled reduced sets, kept transcripts only', 
                                intron_ids_use = downsampled_kept_intron_ids)

In [None]:
# Identify intron IDs that are non-reference and non-unique (reported by more
# than one program) and remove them from every program's predictions so they
# don't count as FPs. Nothing is done when no program tables were loaded.
if IGNORE_NONUNIQUE_NONREF and downsampled_dict:
    # Step 1: count how many programs reported each intron ID. IDs are
    # de-duplicated within a program first so that a repeated ID inside a
    # single table cannot pass for agreement between programs.
    per_program_intron_ids = [df.index.unique() for df in downsampled_dict.values()]
    all_intron_ids = per_program_intron_ids[0].append(per_program_intron_ids[1:])
    intron_counts = all_intron_ids.value_counts()

    # Step 2: keep the intron IDs reported by at least two programs
    common_introns = intron_counts[intron_counts > 1].index

    # Step 3: of those shared intron IDs, the ones absent from the reference
    # table are the non-reference, non-unique IDs to exclude.
    intron_ids_to_exclude = common_introns.difference(i_ref_df.index)
    print(f"Found {len(intron_ids_to_exclude)} non-ref intronIds to not count as FPs")

    # Step 4: drop the excluded IDs from each program's table. Iterating over a
    # snapshot of the items keeps the reassignment independent of the dict view.
    for progname, df in list(downsampled_dict.items()):
        downsampled_dict[progname] = df[~df.index.isin(intron_ids_to_exclude)]



In [None]:
# Sensitivity plot over the reference table and all loaded program tables.
# NOTE(review): 33 and 'median' are passed through positionally; their meaning
# is not visible here - confirm in BenchmarkingRoutines.
IsoformIdentificationSensitivityPlot(i_ref_df, downsampled_dict, 33, 'median', 'downsampled set')

In [None]:
# Same sensitivity plot call, additionally given downsampled_kept_intron_ids
# (reference intron IDs whose 'downsampled' flag is False) as the final argument
# - presumably to restrict the plot to those IDs; confirm in BenchmarkingRoutines.
IsoformIdentificationSensitivityPlot(i_ref_df, downsampled_dict, 33, 'median', 'downsampled set', downsampled_kept_intron_ids)

In [None]:
# Called without an intron-ID subset. The helper returns two tables, each saved
# as a tab-separated file: the summary table and a second table
# (presumably per-intron-ID TP/FP/FN class assignments - confirm).
full_transcriptome_TPR_FDR_F1_df, all_TP_FP_FN_df = overall_knownTPR_novelTPR_and_FDR_barplot(i_ref_df, downsampled_dict)
# csv.QUOTE_NONE: fields are written without any quoting.
full_transcriptome_TPR_FDR_F1_df.to_csv("full_transcriptome_TPR_FDR_F1.tsv", sep="\t", quoting=csv.QUOTE_NONE)
all_TP_FP_FN_df.to_csv("full_transcriptome_TPR_FDR_F1.class_assignments.tsv", sep="\t", quoting=csv.QUOTE_NONE)

In [None]:
# Same helper, additionally given downsampled_kept_intron_ids (reference intron
# IDs whose 'downsampled' flag is False) as the third argument. Both returned
# tables are saved as tab-separated files.
known_and_novel_TPR_FDR_F1_df, novel_TP_FP_FN_df = overall_knownTPR_novelTPR_and_FDR_barplot(i_ref_df, downsampled_dict, downsampled_kept_intron_ids)
# csv.QUOTE_NONE: fields are written without any quoting.
known_and_novel_TPR_FDR_F1_df.to_csv("known_and_novel_TPR_FDR_F1.tsv", sep="\t", quoting=csv.QUOTE_NONE)
# NOTE(review): this file name starts with "novel_" while its companion summary
# file starts with "known_and_novel_" - confirm the naming is intentional.
novel_TP_FP_FN_df.to_csv("novel_TPR_FDR_F1.class_assignments.tsv", sep="\t", quoting=csv.QUOTE_NONE)

In [None]:
# TPR / F1 / PPV plot (per the helper's name) over the reference table and all
# loaded program tables, with no intron-ID subset supplied.
TPR_F1_PPV_plot(i_ref_df, downsampled_dict)

In [None]:
# Same plot call, passing the reference intron IDs whose 'downsampled' flag is
# False as novel_intron_ids.
TPR_F1_PPV_plot(i_ref_df, downsampled_dict, novel_intron_ids=downsampled_kept_intron_ids)