# isoformsFromFLAIR.ipynb
## Marcus Viscardi,    January 21, 2023

So on Friday Jan 20th, I reran the pipelines for all four new libs and the two pilot libraries with the tag for FLAIR analysis turned on. Here I want to try and dig into those FLAIR outputs and see if I can pick out any isoform dependent effects such as NMD-sensitivity, or tail length differences!

Important detail here: FLAIR adds a transcript_id tag, but this doesn't necessarily match the gene_id tag carried over from featureCounts (I *think*). This means I'll need to add a step to "back calculate" the gene_id/_name.

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sea
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from tqdm import tqdm
from itertools import product
from pathlib import Path
import sys
sys.path.insert(0, '/data16/marcus/scripts/nanoporePipelineScripts')
import nanoporePipelineCommon as npCommon
from nanoporeReadPlotting.finalizingReadAndCoveragePlotting_matplotlib import plot_reads, coverage_plotting_5tera

# Full pipeline library name -> short alias used for that library throughout this notebook.
CONVERSION_DICT = {
    "xrn-1-5tera": "oldN2",
    "xrn-1-5tera-smg-6": "oldS6",
    "5tera_xrn-1-KD_wt": "newN2",
    "5tera_xrn-1-KD_smg-5": "newS5",
    "5tera_xrn-1-KD_smg-6": "newS6",
    "5tera_xrn-1-KD_smg-7": "newS7",
    "5tera_xrn-1-KD_wt_rerun": "newerN2",
    "sPM57": "sPM57",
    "sPM58": "sPM58",
}
# Inverse lookup: short alias -> full pipeline library name.
REV_CONVERSION_DICT = dict(zip(CONVERSION_DICT.values(), CONVERSION_DICT.keys()))

from typing import List

def load_flair_and_filter_assignments_with_genes(target_lib_name: str,
                                                 trust_flair_assignment=True) -> pd.DataFrame:
    """Load one library's FLAIR transcript calls and attach gene-level assignments.

    The library's ``_mergedWithTranscripts.tsv`` (FLAIR) and ``_mergedOnReads.parquet``
    (gene assignment) files are located with ``npCommon.pick_lib_return_path`` and passed
    through ``npCommon.adjust_5_ends``. The FLAIR ``transcript_id`` is converted to
    gene_id/gene_name/biotypes through the pre-parsed GTF, and the gene-assignment
    columns are merged in with a ``_fromGeneAssign`` suffix.

    :param target_lib_name: full pipeline library name (a key of ``CONVERSION_DICT``).
    :param trust_flair_assignment: if True, keep every read and only report how many reads
        disagree between the two methods; if False, keep only reads whose FLAIR-derived
        gene_id equals the gene-assignment gene_id.
    :return: one row per retained read (``read_id`` as a regular column) with a ``lib``
        column holding the short alias from ``CONVERSION_DICT``.
    :raises KeyError: if ``target_lib_name`` is not a key of ``CONVERSION_DICT``.
    """
    # Load the transcripts tsv and the genes parquet:
    lib_txns_df = npCommon.adjust_5_ends(
        pd.read_table(npCommon.pick_lib_return_path(target_lib_name,
                                                    file_midfix='_mergedWithTranscripts',
                                                    file_suffix='.tsv'))).set_index('read_id')
    lib_genes_df = npCommon.adjust_5_ends(
        pd.read_parquet(npCommon.pick_lib_return_path(target_lib_name,
                                                      file_midfix='_mergedOnReads',
                                                      file_suffix='.parquet'))).set_index('read_id')
    # Also load my pre-parsed gtf file which will allow us to convert the FLAIR transcript_id to gene_ids/_names/_biotypes!
    gtf_df = pd.read_parquet("/data16/marcus/genomes/elegansRelease100/Caenorhabditis_elegans.WBcel235.100.gtf.parquet")[['chr', 'feature', 'gene_id', 'gene_name', 'gene_biotype', 'transcript_id', 'transcript_biotype']]
    gtf_df = gtf_df.query("feature == 'transcript'")[['transcript_id', 'gene_id', 'gene_name', 'gene_biotype', 'transcript_biotype']]
    # Merge those. Any clashing columns already present in the FLAIR table get an '_original' suffix,
    # so the un-suffixed gene columns are the ones derived from the FLAIR transcript_id:
    lib_txns_df = lib_txns_df.reset_index().merge(gtf_df, on='transcript_id', how='left',
                                                  suffixes=('_original', ''))

    # Merge the FLAIR dataframe and the gene assignment info from the gene dataframe:
    lib_txns_extended_df = pd.merge(lib_txns_df,
                                    lib_genes_df[['gene_id', 'gene_name', 'chr', 'chr_pos']].reset_index(),
                                    on=['read_id', 'chr', 'chr_pos'],
                                    # inner vs. outer here really just changes whether or not we carry over all the reads that made it through to the gene table but not the transcript table!
                                    how='left',
                                    suffixes=('', '_fromGeneAssign')).set_index('read_id')

    # Rows where both methods agree on the gene. Reads missing either assignment (NaN) never compare
    # equal, so they count as mismatches and are dropped by the conservative filter.
    assignments_match = lib_txns_extended_df.gene_id == lib_txns_extended_df.gene_id_fromGeneAssign

    # Perform the filtering for matched assignment between both methods. This is conservative, but at least it avoids issues arising from FLAIR not giving us enough info to figure out which read on multimappers we are looking at!
    if trust_flair_assignment:
        # Each row is a read, so this is a count of reads (not genes).
        mismatched_read_count = lib_txns_extended_df[~assignments_match].shape[0]
        print(f"We are trusting the flair assignment despite {mismatched_read_count} reads not matching with gene assignment!")
    else:
        read_count_before_filter = lib_txns_extended_df.shape[0]
        # .copy() so the 'lib' assignment below writes to an independent frame rather than a slice.
        lib_txns_extended_df = lib_txns_extended_df[assignments_match].copy()
        read_count_after_filter = lib_txns_extended_df.shape[0]
        reads_lost = read_count_before_filter - read_count_after_filter
        # Guard the percentage against an empty library.
        fraction_lost = reads_lost / read_count_before_filter if read_count_before_filter else 0.0
        print(f"After filtering for reads where both FLAIR and geneAssign/featureCount assignments match, {read_count_after_filter:,} of {read_count_before_filter:,} reads remain.\n"
              f"This is a loss of {reads_lost:,} reads, or {fraction_lost:0.2%}")
    lib_txns_extended_df['lib'] = CONVERSION_DICT[target_lib_name]
    return lib_txns_extended_df.reset_index()


def long_to_wide(input_df, wide_target_cols=None, expand_col='lib') -> pd.DataFrame:
    """Pivot one index level of a long dataframe out into per-value columns.

    :param input_df: dataframe whose (multi-)index contains the level ``expand_col``.
    :param wide_target_cols: list of column names to carry into the wide table;
        defaults to ``['gene_rpm']``.
    :param expand_col: index level to unstack; each of its values becomes a column suffix.
    :return: dataframe with the remaining index levels as regular columns and one
        ``{column}_{level value}`` column per combination. Missing combinations are 0.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if wide_target_cols is None:
        wide_target_cols = ['gene_rpm']
    wide_df = input_df[wide_target_cols].unstack(level=expand_col)
    # Flatten the (column, level value) MultiIndex into "column_levelvalue" names.
    wide_df.columns = [f"{labels[0]}_{labels[1]}" for labels in wide_df.columns]
    return wide_df.reset_index().fillna(0)

def compress_df(input_df, keep_transcript_info=True, group_by_t5=True, additional_groupby_columns: List[str] = None, calc_protein_coding_rpm=True):
    """Collapse a per-read dataframe into per-gene (or per-transcript) summary rows.

    Reads are grouped on lib/chr_id/gene_id/gene_name, plus transcript_id, t5 and any
    extra columns when requested. Each group gets a hit count, poly(A) and read-length
    summaries, and per-library normalised abundances.

    :param input_df: per-read dataframe holding the grouping columns plus ``read_id``,
        ``polya_length`` and ``read_length``.
    :param keep_transcript_info: also group on ``transcript_id``; output columns are then
        prefixed ``transcript_`` instead of ``gene_``.
    :param group_by_t5: also group on ``t5`` and add a ``*_t5group_rpm`` column that is
        normalised within each library separately for t5 '+' and '-'.
    :param additional_groupby_columns: extra column names appended to the grouping.
    :param calc_protein_coding_rpm: add ``gene_biotype`` (from the GTF parquet) and a
        ``*_proteinCoding_rpm`` column normalised to protein-coding hits only.
    :return: dataframe indexed by the grouping columns.
    """
    group_keys = ["lib", "chr_id", "gene_id", "gene_name"]
    print(f"Creating groupby dataframe merged on: {group_keys}")
    if keep_transcript_info:
        print(f"\t+ [transcript_id]")
        group_keys.append("transcript_id")
    if group_by_t5:
        print(f"\t+ [t5] tag")
        group_keys.append("t5")
    for extra_key in additional_groupby_columns or []:
        print(f"\t+ [{extra_key}]")
        group_keys.append(extra_key)

    # observed=True keeps the groupby from expanding to every combination of categorical values.
    grouped = input_df.groupby(group_keys, observed=True)

    # Column names are prefixed by what a row represents: a gene or a transcript.
    compressed_prefix = "transcript" if keep_transcript_info else "gene"
    hits_col = f"{compressed_prefix}_hits"
    rpm_col = f"{compressed_prefix}_rpm"
    frac_col = f"{compressed_prefix}_frac_hits"
    t5_rpm_col = f"{compressed_prefix}_t5group_rpm"
    pc_rpm_col = f"{compressed_prefix}_proteinCoding_rpm"

    tqdm.pandas(desc=f"Counting reads per {compressed_prefix}")
    compressed_df = grouped["read_id"].progress_apply(len).to_frame(name=hits_col)

    # Tail-length summaries; count() only counts reads with a called (non-null) tail.
    polya_by_group = grouped["polya_length"]
    compressed_df["mean_polya_length"] = polya_by_group.mean()
    compressed_df["median_polya_length"] = polya_by_group.median()
    compressed_df["called_polya_count"] = polya_by_group.count()
    compressed_df["called_polya_frac"] = compressed_df["called_polya_count"] / compressed_df[hits_col]

    read_length_by_group = grouped["read_length"]
    compressed_df["mean_read_length"] = read_length_by_group.mean()
    compressed_df["median_read_length"] = read_length_by_group.median()

    # The normalised columns start out all-NA and are filled in one library at a time below.
    compressed_df[rpm_col] = pd.NA
    compressed_df[frac_col] = pd.NA
    if group_by_t5:
        compressed_df[t5_rpm_col] = pd.NA
    if calc_protein_coding_rpm:
        compressed_df[pc_rpm_col] = pd.NA
        gene_id_and_biotype_df = pd.read_parquet("/data16/marcus/genomes/elegansRelease100/Caenorhabditis_elegans.WBcel235.100.gtf.parquet")[['gene_id', 'gene_biotype']].drop_duplicates()
        index_names = compressed_df.index.names
        compressed_df = compressed_df.reset_index().merge(gene_id_and_biotype_df, on='gene_id', how='left').set_index(index_names)

    # Normalise within each library, not across the whole dataframe.
    for lib in compressed_df.index.unique(level='lib').to_list():
        # Only the hits, t5 and gene_biotype values of these rows are read below, and none of
        # those are modified inside the loop, so one selection per library is enough.
        lib_rows = compressed_df.query(f"lib == '{lib}'")
        lib_hits = lib_rows[hits_col]
        total_hits = lib_hits.sum()

        # fillna with a Series aligns on the index, so only this library's still-NA rows are written.
        compressed_df[rpm_col] = compressed_df[rpm_col].fillna(value=lib_hits / (total_hits / 1000000))
        compressed_df[frac_col] = compressed_df[frac_col].fillna(value=lib_hits / total_hits)

        if group_by_t5:
            # Adapted ('+') and unadapted ('-') reads each get their own normalisation factor.
            for adapted_or_not in ["+", "-"]:
                t5_hits = lib_rows.query(f"t5 == '{adapted_or_not}'")[hits_col]
                t5_rpm = t5_hits / (t5_hits.sum() / 1_000_000)
                compressed_df[t5_rpm_col] = compressed_df[t5_rpm_col].fillna(value=t5_rpm, axis='index')

        if calc_protein_coding_rpm:
            # Every row of the library is scaled by the protein-coding hit total only.
            protein_coding_hits = lib_rows.query(f"gene_biotype == 'protein_coding'")[hits_col].sum()
            protein_coding_rpm = lib_hits / (protein_coding_hits / 1000000)
            compressed_df[pc_rpm_col] = compressed_df[pc_rpm_col].fillna(value=protein_coding_rpm)
    return compressed_df

# Print a timestamp (from npCommon.get_dt) to mark that this setup cell finished running.
print(f"Imports done at {npCommon.get_dt(for_print=True)}")

# First lets check to see if my usual method and the FLAIR method at least assigned things to the same genes!

The detail here is that I need to reverse look up the gene_id from the transcript_id that FLAIR provided!!

So, messing around with the FLAIR dataframe has helped me realize/remember an issue. The FLAIR outputs only spit out read_id and transcript_id (at least in the way that I hacked it together). This means that any multimapping reads will have no means to resolve which assignment went with which map! This isn't a large scale issue at the moment, in this case (newN2) I can see 76.5k reads that match gene_ids from gene analysis and FLAIR, but ~18.3k are assigned by FLAIR but not my method, another ~500 reads are explicitly mismatched between the methods. I can toss everything besides the 76.5k out of 95.3k reads

For now, we will go with the conservative method, throw out everything that doesn't perfectly match. Maybe I'll have this spit out a percentage lost when doing this for my records.

## Load libs:

If there isn't an available preprocessed file, this lib loading step will take a little while.

In [None]:
# Set to True to skip the cached parquet files and rebuild everything from the pipeline outputs.
regenerate = False
libs_to_load = sorted({
    "oldN2",
    # "oldS6",
    "newN2",
    # "newS5",
    # "newS6",
    # "newS7",
    "sPM57",
    "sPM58",
    "newerN2"
    })
try:
    # Raising here jumps straight to the rebuild branch below.
    if regenerate:
        raise ValueError

    # Cached files are named after the dash-joined (sorted) library list, with a leading date stamp.
    reads_df_raw_path = npCommon.find_newest_matching_file(
        f"./output_files/*_{'-'.join(libs_to_load)}_5TERA.reads_df.transcripts.parquet")
    txn_df_raw_path = npCommon.find_newest_matching_file(
        f"./output_files/*_{'-'.join(libs_to_load)}_5TERA.compressed_df.transcripts.parquet")
    print(f"Found preprocessed files at:\n\t{reads_df_raw_path}\nand:\n\t{txn_df_raw_path}")

    txn_df_raw = pd.read_parquet(txn_df_raw_path)
    reads_df_raw = pd.read_parquet(reads_df_raw_path)
except ValueError:
    print(f"Could not find preprocessed files matching these libs: {'/'.join(libs_to_load)}\nGoing to create new ones from scratch! This will take longer.")
    # Load each library under its short alias, then stack them into one per-read table.
    lib_dict = {}
    for lib in libs_to_load:
        full_lib_name = REV_CONVERSION_DICT[lib]
        lib_dict[lib] = load_flair_and_filter_assignments_with_genes(full_lib_name)

    reads_df_raw = pd.concat([lib_dict[short_name] for short_name in libs_to_load])

    txn_df_raw = compress_df(reads_df_raw, keep_transcript_info=True)
    txn_df_raw = txn_df_raw.sort_index(level=['lib', 'chr_id', 'gene_id', 'gene_name', 'transcript_id'])
    print(f"Saving new parquets to speed up future runs.")
    reads_df_raw.to_parquet(
        f"./output_files/{npCommon.get_dt()}_{'-'.join(libs_to_load)}_5TERA.reads_df.transcripts.parquet")
    txn_df_raw.to_parquet(
        f"./output_files/{npCommon.get_dt()}_{'-'.join(libs_to_load)}_5TERA.compressed_df.transcripts.parquet")
print(f"Lib load done @ {npCommon.get_dt(for_print=True)}")

In [None]:
# Split the per-read table into the three library families by short-name prefix.
old_df_raw = reads_df_raw.loc[reads_df_raw['lib'].str.startswith('old')].copy()

new_df_raw = reads_df_raw.loc[reads_df_raw['lib'].str.startswith('new')].copy()

sPM_df_raw = reads_df_raw.loc[reads_df_raw['lib'].str.startswith('sPM')].copy()


# For each family build a transcript-level and a gene-level summary, sorted on their index levels.
old_txn_df = compress_df(old_df_raw, keep_transcript_info=True)
old_txn_df = old_txn_df.sort_index(level=['lib', 'chr_id', 'gene_id', 'gene_name', 'transcript_id'])
old_gene_df = compress_df(old_df_raw, keep_transcript_info=False)
old_gene_df = old_gene_df.sort_index(level=['lib', 'chr_id', 'gene_id', 'gene_name'])

new_txn_df = compress_df(new_df_raw, keep_transcript_info=True)
new_txn_df = new_txn_df.sort_index(level=['lib', 'chr_id', 'gene_id', 'gene_name', 'transcript_id'])
new_gene_df = compress_df(new_df_raw, keep_transcript_info=False)
new_gene_df = new_gene_df.sort_index(level=['lib', 'chr_id', 'gene_id', 'gene_name'])

sPM_txn_df = compress_df(sPM_df_raw, keep_transcript_info=True)
sPM_txn_df = sPM_txn_df.sort_index(level=['lib', 'chr_id', 'gene_id', 'gene_name', 'transcript_id'])
sPM_gene_df = compress_df(sPM_df_raw, keep_transcript_info=False)
sPM_gene_df = sPM_gene_df.sort_index(level=['lib', 'chr_id', 'gene_id', 'gene_name'])

# Reproducibility check:

In [None]:
# Per-gene metrics to spread out into one column per library.
col_targets = ['gene_rpm', 'mean_polya_length', 'gene_hits', 'mean_read_length', 'gene_proteinCoding_rpm']

new_wide_df = long_to_wide(new_gene_df, wide_target_cols=col_targets)
new_wide_df = new_wide_df.sort_values('gene_rpm_newN2', ascending=False)

# Default (inner) merges: only gene/t5 rows present in every table survive.
old_wide_for_merge = long_to_wide(old_gene_df, wide_target_cols=col_targets)
triple_wide_df_raw = pd.merge(new_wide_df, old_wide_for_merge,
                              on=['chr_id', 'gene_id', 'gene_name', 't5'])
sPM_wide_for_merge = long_to_wide(sPM_gene_df, wide_target_cols=col_targets)
super_wide_df_raw = pd.merge(triple_wide_df_raw, sPM_wide_for_merge,
                             on=['chr_id', 'gene_id', 'gene_name', 't5'])
print(new_gene_df.columns)
print(super_wide_df_raw.columns)

In [None]:
# Work on copies so the unfiltered wide tables stay untouched when this cell is re-run.
super_wide_df = super_wide_df_raw.copy(deep=True)
triple_wide_df = triple_wide_df_raw.copy(deep=True)

# A gene is kept only if `{prefix_target}_{lib}` reaches this cutoff in every library checked.
cutoff = 5
prefix_target = "gene_hits"
for lib in ["oldN2", "newN2", "newerN2", "sPM57", "sPM58"]:
    # The five-library table is filtered on every library in the list...
    super_wide_df = super_wide_df.query(f"{prefix_target}_{lib} >= {cutoff}")
    if lib.endswith("N2"):
        # ...while the triple table is only filtered on the libraries whose name ends in "N2".
        triple_wide_df = triple_wide_df.query(f"{prefix_target}_{lib} >= {cutoff}")
# Flag mitochondrial genes for the hue below.
# NOTE(review): the right-hand side comes from the *unfiltered* frame, so this relies on pandas
# aligning on the row index; it may also emit a SettingWithCopyWarning - confirm.
super_wide_df['is_MtDNA'] = super_wide_df_raw['chr_id'] == 'MtDNA'


sea.set_style('ticks')
sea.set_context('talk')

# Column-name prefix of the metric to compare across libraries.
prefix_to_plot = "gene_proteinCoding_rpm"  # "mean_read_length" #  "gene_rpm" # "mean_polya_length"  # "gene_proteinCoding_rpm"

# NOTE(review): the columns are picked from triple_wide_df but the data plotted is super_wide_df,
# so only libraries present in the triple table get a panel - presumably intentional; confirm.
cols_to_plot = [col for col in triple_wide_df.columns if col.startswith(prefix_to_plot)]
g = sea.PairGrid(super_wide_df,
                 vars=cols_to_plot,
                 diag_sharey=True,
                 corner=True,
                 hue='is_MtDNA',
                 palette=['0.2', 'r'],
                 )
# The RPM metrics are put on log-log axes; the other metrics keep linear axes.
if prefix_to_plot in ('gene_rpm', 'gene_proteinCoding_rpm'):
    g.set(xscale='log',
          yscale='log',
          )
# else:
#     g.set(xscale='log',
#       yscale='log',
#       )
# g.map_diag(sea.ecdfplot,
#            # sea.kdeplot,
#            # color='0.2',
#            )
g.map_lower(sea.scatterplot,
            alpha=0.4, marker='+',
            # color='0.2',
            )
# g.map_upper(sea.histplot,
#             # color='0.0',
#             )

# One shared axis title for the whole figure; each panel then only shows the library name.
g.figure.supxlabel(prefix_to_plot)
g.figure.supylabel(prefix_to_plot)
###
# Get axis labels for each subplot
real_axes=[]
for ax in g.axes.flat:
    # NOTE(review): presumably skips the empty slots left by corner=True - confirm.
    if ax:
        real_axes.append(ax)
        # Keep only the text after the last underscore of the column name as the label.
        x_label = ax.get_xlabel()
        if x_label:
            ax.set_xlabel(x_label.rsplit('_')[-1])
        y_label = ax.get_ylabel()
        if y_label:
            ax.set_ylabel(y_label.rsplit('_')[-1])
# Tie all panels together so they share one x range and one y range.
# NOTE(review): calling .join() on the result of get_shared_x_axes()/get_shared_y_axes() was
# deprecated in Matplotlib 3.6 - confirm this still works with the installed version.
real_axes[-1].get_shared_x_axes().join(*real_axes)
real_axes[-1].get_shared_y_axes().join(*real_axes)
# Use the same limits on both axes, spanning the wider of the two current ranges.
xlims = real_axes[-1].get_xlim()
ylims = real_axes[-1].get_ylim()
min_lim = min(xlims[0], ylims[0])
max_lim = max(xlims[1], ylims[1])
real_axes[-1].set_xlim(min_lim, max_lim)
real_axes[-1].set_ylim(min_lim, max_lim)
###
plt.tight_layout()

# Figures go into a date-stamped folder; mkdir only creates the last path component,
# so the parent folders must already exist.
save_dir = f"/home/marcus/Insync/marcus.viscardi@gmail.com/Google Drive" \
           f"/insync_folder/5TERA_ReadsAndTails_Plots/raw_figures_from_python" \
           f"/{npCommon.get_dt()}_N2_reproducibility"
save_dir = Path(save_dir)
save_dir.mkdir(exist_ok=True)
save_path = save_dir / f'scatterMatrix_{prefix_to_plot}'
# Save the same figure as both a vector and a raster image.
for file_type in ('.svg', '.png'):
    plt.savefig(str(save_path) + file_type,
                # dpi=150,
                )
plt.show()

# Rewrite of barplot script:

This is just an effort to rewrite some old code in a less verbose way! Took some effort but works great!

The downside here is that the gene-level plots are actually only showing reads that successfully had their isoforms identified. There are a good number of reads that could not be assigned an isoform because they're ambiguous, but they still obviously came from the target gene!!

In [None]:
target_gene = 'rpl-3'  # NOTE: overwritten by the loop below, which plots every gene in nmd_sensitive_genes_and_txns
context = 'talk'  # 'paper' 'talk' 'poster'

plot_column_suffix = "proteinCoding_rpm"  # "t5group_rpm" or "rpm" or "proteinCoding_rpm"

# Adapted-species values are divided by this factor before plotting (0.1 -> drawn 10x larger).
adapted_species_adjustment_factor = 0.1
show_adapted_species = True

# (gene name, [transcript ids treated as the NMD-sensitive isoforms of that gene])
nmd_sensitive_genes_and_txns = (('ubl-1', ['H06I04.4b.1']),
                                ('rpl-3', ['F13B10.2b']),
                                ('odc-1', ['K11C4.4.1']),
                                ('rpl-12', ['JC8.3c.2', 'JC8.3b']),
                                ('rpl-30', ['Y106G6H.3c.1']),
                                ('rpl-1', ['Y71F9AL.13b.4', 'Y71F9AL.13b.2']),
                                ('rpl-26', ['F28C6.7b.1']),
                                ('rps-22', ['F53A3.3b.1']),
                                )

sea.set_style("whitegrid")
sea.set_context(context)
# Look the figure size up explicitly, so an unsupported context fails here with a clear message
# instead of leaving fig_size undefined (or stale from an earlier run of this cell).
fig_size_by_context = {'paper': (8, 8),
                       'talk': (10, 10),
                       'poster': (12, 12)}
if context not in fig_size_by_context:
    raise ValueError(f"Unsupported context {context!r}; expected one of {sorted(fig_size_by_context)}")
fig_size = fig_size_by_context[context]


# One 2x2 figure per gene: rows are pilot/new libraries, columns are gene-/transcript-level bars.
for target_gene, NMD_sensitive_txns in nmd_sensitive_genes_and_txns:
    old_df = old_df_raw.copy(deep=True)
    new_df = new_df_raw.copy(deep=True)
    # Mark every read whose transcript is one of this gene's NMD-sensitive isoforms.
    old_df['NMD Sensitive Isoform'] = old_df['transcript_id'].isin(NMD_sensitive_txns)
    new_df['NMD Sensitive Isoform'] = new_df['transcript_id'].isin(NMD_sensitive_txns)

    fig, axes = plt.subplots(2, 2, figsize=fig_size,
                             sharex=False,
                             sharey='row')

    for row, (libs_name, df) in enumerate([('pilot', old_df), ('new', new_df)]):
        for col, gene_or_transcript_level in enumerate(['gene', 'transcript']):
            cols_to_index = ['lib', 'chr_id', 'gene_id', 'gene_name']
            reset_index_level = ['lib']
            if gene_or_transcript_level == 'transcript':
                cols_to_index.append('NMD Sensitive Isoform')
                additional_groupby = ['NMD Sensitive Isoform']
                reset_index_level.append('NMD Sensitive Isoform')
            else:
                additional_groupby = None

            plot_df = compress_df(df,
                                  keep_transcript_info=False,  # The 'NMD Sensitive Isoform' column will retain this info here!
                                  additional_groupby_columns=additional_groupby,
                                  group_by_t5=True,
                         ).sort_index(level=cols_to_index)
            # Split the target gene's rows into adapted (t5 '+') and unadapted (t5 '-') series.
            adapt_df = plot_df.query(f"gene_name == '{target_gene}'").xs('+', level='t5')[f'gene_{plot_column_suffix}'] / adapted_species_adjustment_factor
            unadapt_df = plot_df.query(f"gene_name == '{target_gene}'").xs('-', level='t5')[f'gene_{plot_column_suffix}']
            # Outer merge so a bar still shows up when only one of the two species was seen.
            plot_df = pd.merge(unadapt_df,adapt_df, on=adapt_df.index.names, how='outer',
                               suffixes=('_unadapted', '_adapted')).fillna(0)
            if gene_or_transcript_level == 'transcript':
                hue_col = plot_df.index.get_level_values('NMD Sensitive Isoform')
            else:
                hue_col = None

            # Bar labels are the lib name, plus "_True"/"_False" for the transcript-level panels.
            bar_names = ["_".join(map(str, a)) for a in zip(*[list(plot_df.index.get_level_values(level)) for level in reset_index_level])]
            bar_unadapted_heights = plot_df[f'gene_{plot_column_suffix}_unadapted']
            bar_adapted_heights = plot_df[f'gene_{plot_column_suffix}_adapted']
            print(bar_names)
            ax = axes[row][col]
            # NMD-sensitive bars (label contains "True") are orange, everything else black.
            ax.bar(bar_names,
                   bar_unadapted_heights,
                   1,
                   color=['orange' if 'True' in label else 'black' for label in bar_names])
            if show_adapted_species:
                # Adapted species are stacked in red on top of the unadapted bars.
                ax.bar(bar_names,
                       bar_adapted_heights,
                       1,
                       bottom=bar_unadapted_heights,
                       color='red')
            ax.set_title(f"{gene_or_transcript_level.title()} RPMs on\n{target_gene} in {libs_name} libs")
            ax.set_xticks(bar_names, bar_names, rotation=45, ha='right')
            ax.set_ylabel(plot_column_suffix)
            # print(plot_df[f'gene_{plot_column_suffix}_adapted'] / (plot_df[f'gene_{plot_column_suffix}_unadapted'] + plot_df[f'gene_{plot_column_suffix}_adapted']))
    plt.tight_layout()
    save_path = f"./output_files/isoform_plots/{npCommon.get_dt()}_{target_gene}_{plot_column_suffix}.new.barPlots"
    for file_type in ['.svg', '.png']:
        plt.savefig(save_path + file_type,
                    dpi=300)
    plt.show()
# Display the last panel's table as the cell output.
plot_df

# Plotting Tails of isoforms

This shows a very interesting effect! It looks like most NMD target transcripts captured are nascent mRNAs that have yet to reach the "steady state" tail lengths that we see with non-NMD target transcripts. We assume these non-target reads are coming from mRNAs that have entered the translation pool.

### Feb 22, 2023 Notes:
I have a few ToDo's remaining for this analysis:
1. First, Josh recommended that I try to subsample the "steady state" mRNA population (the non-NMD targets) and see how often I end up with a distribution similar to the adapted (post cleavage) NMD target reads. This would allow me to get an idea of the variability of the technique, and whether what I am seeing with sPM57/58 is within noise.
2. Second, I should try to set up some kind of "full length" cutoff for all of the reads that are not adapted. This would let me account for the incomplete ligation of the 5TERA adapter that I think I am getting!

In [None]:
import matplotlib
def plot_tails_from_isoforms(target_gene, df_to_plot, plot_type='ecdfPlot', show_adapted_species = False, save_dir=".", width_per_lib=4, height=8):
    """Plot poly(A) tail lengths of one gene, split by NMD-sensitive vs. other isoforms.

    Only unadapted reads (t5 == '-') of ``target_gene`` are plotted, with one panel per
    library found in ``df_to_plot``. The figure is saved as .svg and .png into ``save_dir``
    and then shown.

    :param target_gene: gene_name to plot; its NMD-sensitive transcript ids are looked up in
        the module-level ``nmd_sensitive_genes_and_txns``.
    :param df_to_plot: per-read dataframe with lib, gene_id, gene_name, t5, transcript_id and
        polya_length columns.
    :param plot_type: 'boxPlot' or 'ecdfPlot' or 'violinPlot'
    :param show_adapted_species: True or False (currently not used by the function body)
    :param save_dir: existing folder to write the figure files into.
    :param width_per_lib: figure width, in inches, given to each library panel.
    :param height: figure height in inches.
    :return: None
    :raises NotImplementedError: if ``plot_type`` is not one of the supported kinds.
    """
    lib_set = "all"
    # Display titles for the libraries; libraries without an entry fall back to their short name.
    lib_converter = {'oldN2': 'Wildtype (pilot)',
                     'oldS6': 'Δsmg-6 (pilot)',
                     'newN2': 'Wildtype (new)',
                     'newS5': 'Δsmg-5 (new)',
                     'newS6': 'Δsmg-6 (new)',
                     'sPM57': 'Wildtype (Parissa)',
                     'sPM58': '~Wildtype (Parissa)',
                     }
    # Keep only the unadapted reads of the target gene.
    plot_df = df_to_plot.set_index(['lib', 'gene_id', 'gene_name', 't5', 'transcript_id']).xs(
        (target_gene, '-'), level=('gene_name', 't5')).reset_index()
    for potential_target_gene, NMD_sensitive_txns in nmd_sensitive_genes_and_txns:
        if target_gene == potential_target_gene:
            plot_df['NMD Sensitive Isoform'] = plot_df['transcript_id'].isin(NMD_sensitive_txns)
    unique_libs = plot_df.lib.unique()
    unique_lib_count = len(unique_libs)
    # squeeze=False always returns an array of Axes, so indexing also works for a single library.
    fig, axes = plt.subplots(1, unique_lib_count, figsize=(width_per_lib * unique_lib_count, height),
                             sharex='all',
                             sharey='all',
                             squeeze=False)
    axes = axes.ravel()
    plt.suptitle(f"Poly(A) Tail Lengths for {target_gene}")
    sea.set_style("whitegrid")
    sea.set_context("poster")
    # Log-axis ticks: minor lines at 0.1-1.0x of each decade, labels only on 10/100/1000.
    ticks = [10, 100, 1000]
    tick_line_factors = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    tick_lines = [a * b for a, b in product(ticks, tick_line_factors)]
    tick_labels = [str(int(label)) if label in ticks else '' for label in tick_lines]
    for i, unique_lib in enumerate(unique_libs):
        lib_reads = plot_df.query(f"lib == '{unique_lib}'")
        if plot_type == 'boxPlot':
            sea.boxplot(lib_reads,
                        y='polya_length',
                        x='NMD Sensitive Isoform',
                        ax=axes[i])
            axes[i].set_yscale('log')
            axes[i].set_yticks(tick_lines)
            axes[i].set_ylim((2, 500))
            axes[i].set_yticklabels(tick_labels)
        elif plot_type == 'violinPlot':
            try:
                sea.violinplot(lib_reads,
                               y='polya_length',
                               x='lib',
                               hue='NMD Sensitive Isoform',
                               ax=axes[i],
                               split=True)
            except ValueError:
                # Retry without split=True when seaborn rejects the split violin.
                sea.violinplot(lib_reads,
                               y='polya_length',
                               x='lib',
                               hue='NMD Sensitive Isoform',
                               ax=axes[i])
            axes[i].set_yscale('log')
            axes[i].set_yticks(tick_lines)
            axes[i].set_yticklabels(tick_labels)
            axes[i].get_legend().remove()
        elif plot_type == 'ecdfPlot':
            sea.ecdfplot(lib_reads,
                         x='polya_length',
                         hue='NMD Sensitive Isoform',
                         ax=axes[i],
                         linewidth=5,
                         )
            axes[i].set_xscale('log')
            axes[i].set_xticks(tick_lines)
            axes[i].set_xticklabels(tick_labels)
            axes[i].set_xlim((5, 500))
            axes[i].get_legend().remove()
        else:
            raise NotImplementedError(
                f"{plot_type} is not currently supported! b/c I haven't coded it or you have a typo...")
        # .get() keeps libraries that have no display title (e.g. newerN2, newS7) from raising KeyError.
        axes[i].set_title(f"{lib_converter.get(unique_lib, unique_lib)}")
    # The per-panel legends were removed above; add a single figure-level legend instead.
    if plot_type == 'ecdfPlot':
        # NOTE(review): the label order is hard-coded and assumes the True curve is drawn first - confirm.
        plt.legend(["True", "False"],
                   bbox_to_anchor=(0.9, 0.99),
                   loc="upper right",
                   bbox_transform=fig.transFigure, ncol=2,
                   title=f'NMD Sensitive Isoform',
                   handlelength=1.5,
                   fontsize='x-small',
                   title_fontsize='x-small',)
    elif plot_type == 'violinPlot':
        handles, labels = axes[-1].get_legend_handles_labels()
        fig.legend(handles, labels, bbox_to_anchor=(1, 1),
                   loc="upper right",
                   bbox_transform=fig.transFigure, ncol=2,
                   title=f'NMD Sensitive Isoform',
                   fontsize='x-small',
                   title_fontsize='x-small')
    plt.tight_layout()
    save_path = f"{save_dir}/{npCommon.get_dt()}_{target_gene}_tails_{lib_set}.{plot_type}"
    for file_type in ['.svg', '.png']:
        plt.savefig(save_path + file_type,
                    dpi=300)
    plt.show()

In [None]:
# Genes to draw tail plots for (comment entries in/out as needed). This rebinds the module-level
# tuple that plot_tails_from_isoforms reads to find each gene's NMD-sensitive transcripts.
nmd_sensitive_genes_and_txns = (# ('ubl-1', ['H06I04.4b.1']),
                                #('rpl-3', ['F13B10.2b']),
                                #('odc-1', ['K11C4.4.1']),
                                # ('rpl-12', ['JC8.3c.2', 'JC8.3b']),
                                #('rpl-30', ['Y106G6H.3c.1']),
                                #('rpl-1', ['Y71F9AL.13b.4', 'Y71F9AL.13b.2']),
                                ('rpl-26', ['F28C6.7b.1']),
                                )

target_genes = [gene for gene, _ in nmd_sensitive_genes_and_txns]
plot_types = [
    'ecdfPlot',
    'boxPlot',
    'violinPlot',
]

# Every gene is drawn once per plot type, into a date-stamped folder per gene.
for gene, plot_kind in product(target_genes, plot_types):
    target_gene_folder_path = f"./output_files/isoform_plots/{npCommon.get_dt()}_tailPlots_from_{gene}"
    # Creates missing parent folders too, and is a no-op when the folder already exists.
    os.makedirs(target_gene_folder_path, exist_ok=True)
    plot_df = reads_df_raw.copy(deep=True)
    plot_tails_from_isoforms(gene, plot_df, plot_type=plot_kind,
                             save_dir=target_gene_folder_path,
                             width_per_lib=3.5,
                             height=8)

# Long to wide tool
Let's try and see if I can write a simple long_to_wide tool

This would be great for my general ability to split libs up and compare across them


In [None]:
# Gene-level summary of every loaded library, sorted on its index levels.
df = reads_df_raw.copy(deep=True)
comp_df = compress_df(df, keep_transcript_info=False)
comp_df = comp_df.sort_index(level=['lib', 'chr_id', 'gene_id', 'gene_name'])
# print(comp_df.columns)
# Last expression of the cell: displays one gene_rpm / mean_polya_length column per library.
long_to_wide(comp_df, wide_target_cols=['gene_rpm', 'mean_polya_length'])

# Scatter plot of isoform RPMS
Make a scatter plot of isoforms and their RPMs between N2 and smg-5/6 treatments. Maybe this will better pick out things like rpl-12 which I have missed but is obviously a great target!

This will be a little annoying b/c i'll need to split and re-merge the columns!

#
## Better option (In next cell down!):
I think the above design for a scatter doesn't work because the actual isoforms of interest are not changing that dramatically. A better way to plot this would be to figure out the fraction that each isoform makes up of its gene's total RPM, then plot that ratio between two libs.

A NMD sensitive isoform would light up in this analysis because it's suddenly making up a way larger portion of the total gene's RPM when I break NMD vs in WT.

In [None]:
# Metric compared between the two pilot libraries.
plot_col = 'transcript_rpm'

df = reads_df_raw.query("lib == 'oldN2' | lib == 'oldS6'").copy(deep=True)
compressed_df = compress_df(df, group_by_t5=False)

lib_names = compressed_df.index.get_level_values('lib').to_series().unique()

# One Series per library, named "<metric>_<lib>" and indexed by the remaining index levels.
dfs = []
for lib in lib_names:
    lib_rows = compressed_df.query(f"lib == '{lib}'").reset_index(level='lib')
    dfs.append(lib_rows[plot_col].rename(f"{plot_col}_{lib}"))
# Side-by-side columns; a transcript missing from one library gets 0 there.
plot_df = pd.concat(dfs, axis=1).fillna(0)

scatter_options = dict(x=f'{plot_col}_{lib_names[0]}',
                       y=f'{plot_col}_{lib_names[1]}',
                       hover_name='gene_name',
                       hover_data=['transcript_id'])
fig = px.scatter(plot_df.reset_index(), **scatter_options)
fig.show(renderer='browser')

# Trying to plot change in isoform usage within genes

Like noted in the above markdown cell. This is kinda a weird analysis but it does capture different information than the rocket plots!

In [None]:
df = reads_df_raw.query("lib == 'oldN2' | lib == 'oldS6'").copy(deep=True)
compressed_df = compress_df(df, group_by_t5=False)

abundance_cols = ['transcript_hits', 'transcript_rpm', 'transcript_proteinCoding_rpm']
mini_df = compressed_df.sort_values(["lib", "gene_id"])[abundance_cols]
# Cool way to do this from: https://stackoverflow.com/questions/23377108/pandas-percentage-of-total-with-groupby
# transform('sum') broadcasts each (lib, gene) total back onto every transcript row of that gene.
gene_rpm_totals = mini_df.groupby(['lib', 'gene_id'])['transcript_rpm'].transform('sum')
mini_df['fraction_of_gene_rpm'] = mini_df['transcript_rpm'] / gene_rpm_totals
# Round-trip check column: fraction times the gene total.
mini_df['test_col'] = mini_df['fraction_of_gene_rpm'] * gene_rpm_totals

wide_mini_df = long_to_wide(mini_df, wide_target_cols=['fraction_of_gene_rpm', 'transcript_rpm'])
wide_mini_df['combined_rpm'] = wide_mini_df['transcript_rpm_oldN2'] + wide_mini_df['transcript_rpm_oldS6'] + 5
# Keep well-covered transcripts, and drop those whose fraction sums to 2 across the two libraries
# (i.e. the transcript is the whole gene in both).
wide_mini_df = wide_mini_df.query("combined_rpm >= 100")
wide_mini_df = wide_mini_df.query("fraction_of_gene_rpm_oldN2 + fraction_of_gene_rpm_oldS6 != 2")
wide_mini_df['long_name'] = wide_mini_df['gene_name'] + " (" + wide_mini_df['transcript_id'] + ")"

scatter_options = dict(x='fraction_of_gene_rpm_oldN2',
                       y='fraction_of_gene_rpm_oldS6',
                       size='combined_rpm',
                       size_max=25,
                       hover_name='long_name')
fig = px.scatter(wide_mini_df, **scatter_options)
fig.show(renderer='browser')

# Trying to combine tail and isoform information
So for some genes like rpl-30, the splice junction is long and ahead of the NMD eliciting STOP so that most of the degradation intermediates retain splice information. This makes it clear that they came from the NMD isoform! Genes like ubl-1 are very much the opposite case, in that all degradation intermediates are completely ambiguous as to whether they came from the NMD isoform or not... *(but we ~know they did)*

I want to try and get code working that is able to take advantage of these few genes where deg. intermediates retain isoform information. That's what this next section will do.

Two large pieces fall into this:
1. We have to make sure that reads that do not have enough information to confidently ID the right isoform are not arbitrarily getting assigned
2. Plot tail lengths with the combinations of adapted/unadapted and NMD sensitive/insensitive


### Feb 23, 2023:
I should really try and get a bootstrapping test of this going to see how often I see the NMD adapted type tails coming out of the unadapted NMD insensitive reads


In [None]:
# Configuration for the NMD / t5 tail-length ECDF plots drawn further down in this cell.

# When True, plots_tails draws the x-axis on a log scale, uses fixed 10-200 ticks,
# and appends ".logAxis" to the saved file name.
log_xaxis = True

# Libraries that get a tail plot; the driver loop skips any lib not listed here.
# Comment entries in/out to change the selection.
target_libs = (# "newN2",
               "oldN2",
               "oldS6",
               # "newS6",
               # "newS5",
               # "sPM57",
               # "sPM58",
               )

# gene_name -> [isoform edge position, side]. The edge is the position past which a cleavage
# species' parent isoform can be told apart (NMD-sensitive vs. insensitive).
# The driver loop keeps reads with chr_pos <= edge for 'left' and chr_pos >= edge for 'right';
# ubl-1 is special-cased there and is not position-filtered.
# I manually identified that same isoform edge information for the following genes:
targets_and_iso_edge_dict = {"rpl-30": [10_436_409, 'left'],
                             # "rpl-26": [8_603_272, 'left'],
                             # "rpl-3": [3_868_327, 'left'],
                             # "rpl-1": [2_876_019, 'right'],
                             # "rpl-12": [13_240_023, 'right'],
                             "ubl-1": [3_068_573, 'left'],
                             "rps-22": [1_950_996, 'left']
                             }

# (gene_name, [transcript_ids treated as NMD-sensitive]) pairs; turned into a dict by the
# driver loop and matched against the reads' transcript_id column.
nmd_sensitive_genes_and_txns = (('ubl-1', ['H06I04.4b.1']),
                                ('rpl-3', ['F13B10.2b']),
                                ('odc-1', ['K11C4.4.1']),  # This analysis isn't quite the same...
                                ('rpl-12', ['JC8.3c.2', 'JC8.3b']),
                                ('rpl-30', ['Y106G6H.3c.1']),
                                ('rpl-1', ['Y71F9AL.13b.4', 'Y71F9AL.13b.2']),
                                ('rpl-26', ['F28C6.7b.1']),
                                ('rps-22', ['F53A3.3b.1']),
                                )

sea.set_style("whitegrid")
sea.set_context("poster")

# Line colour per 'NMD_and_t5' label. Keys must match the label text built in the driver loop
# exactly (note the two spaces before "t5"). plots_tails draws "dark*" colours solid and the rest dashed.
palette = {"NMD'ed: +  t5: -":"darkred",
           "NMD'ed: +  t5: +":"red", 
           "NMD'ed: -  t5: -":"darkblue",
           "NMD'ed: -  t5: +":"blue"}


def plots_tails(library_df, targeted_gene, lib_target, save_dir=f"./output_files/isoform_plots/NMD_and_Adapted_tailPlots"):
    """Plot and save poly(A) tail-length ECDFs for one gene in one library.

    One ECDF line is drawn per value of the ``NMD_and_t5`` column. Colours come from the
    module-level ``palette`` dict and the x-axis scaling from the module-level ``log_xaxis``
    flag; both are read as globals.

    Args:
        library_df: DataFrame with at least ``polya_length`` and ``NMD_and_t5`` columns.
        targeted_gene: Gene name; used in the title and the output file name.
        lib_target: Library name; used in the title and the output file name.
        save_dir: Directory for the output files. Created (including missing parents) if absent.

    Side effects:
        Writes ``<date>_<gene>_<lib>_NMD-and-t5_tailLength.ecdf[.logAxis]`` as both ``.svg`` and
        ``.png`` (300 dpi) into ``save_dir``, then shows the figure.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    g = sea.ecdfplot(library_df.sort_values('NMD_and_t5'),
                     ax=ax,
                     x='polya_length',
                     hue='NMD_and_t5',
                     palette=palette,
                     log_scale=log_xaxis,
                     )
    # Solid lines for the "dark*" palette colours, dashed for the others.
    for line in g.lines:
        line.set_linestyle("-" if line.get_color().startswith("dark") else "--")
    ax.set_title(f"Tail lengths from {lib_target} for {targeted_gene}")
    if log_xaxis:
        ax.set_xbound(10, 200)
        ax.set_xticks([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200])
        # None leaves a tick unlabeled so the crowded end of the log axis stays readable.
        ax.set_xticklabels([10, 20, 30, 40, 50, None, 70, None, None, 100, 200])
    else:
        ax.set_xbound(-1, 200)
    # sea.move_legend(g, "lower right")
    # get_legend() returns None when no legend was drawn; only remove one that exists.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    fig.tight_layout()

    save_dir = Path(save_dir)
    if not save_dir.exists():
        print(f"Making new directory at: {save_dir}")
        # parents=True: dated output folders are often nested under a directory that does not exist yet.
        save_dir.mkdir(parents=True, exist_ok=True)

    save_path = str(save_dir) + f"/{npCommon.get_dt()}_{targeted_gene}_{lib_target}_NMD-and-t5_tailLength.ecdf"
    if log_xaxis:
        save_path += ".logAxis"
    for file_type in ('.svg', '.png'):
        fig.savefig(save_path + file_type, dpi=300)
    plt.show()


# gene_name -> list of NMD-sensitive transcript ids; built once instead of once per gene.
nmd_sensitive_genes_and_txns_dict = dict(nmd_sensitive_genes_and_txns)

# side of the isoform edge -> (comparator that keeps reads on that side, its opposite)
comparators_by_side = {'left': ("<=", ">="),
                       'right': (">=", "<=")}

for target_gene, (iso_edge, left_or_right) in targets_and_iso_edge_dict.items():
    if left_or_right not in comparators_by_side:
        # Fail with a readable message instead of an invalid "chr_pos None <edge>" query.
        raise ValueError(f"Isoform edge side for {target_gene} must be 'left' or 'right', got {left_or_right!r}")
    comparator, anti_comparator = comparators_by_side[left_or_right]

    # Filter first and copy afterwards, so only this gene's reads are duplicated.
    gene_reads = reads_df_raw.query(f"gene_name == '{target_gene}'")
    if target_gene != 'ubl-1':
        # Keep only reads on the side of the isoform edge selected by the comparator.
        gene_reads = gene_reads.query(f"chr_pos {comparator} {iso_edge}")
    df = gene_reads.copy()

    is_nmd_sensitive = df['transcript_id'].isin(nmd_sensitive_genes_and_txns_dict[target_gene])
    df['NMD_Sensitive_Isoform'] = is_nmd_sensitive.map({True: '+', False: '-'})
    if target_gene == 'ubl-1':
        # ubl-1 is not position-filtered; instead, t5 '+' reads on the far side of the edge are labelled '+'.
        # Assign the result back: a chained `df.col.mask(..., inplace=True)` may act on a temporary
        # and leave df unchanged under pandas Copy-on-Write.
        far_side_t5_reads = (df['t5'] == '+') & df.eval(f"chr_pos {anti_comparator} {iso_edge}")
        df['NMD_Sensitive_Isoform'] = df['NMD_Sensitive_Isoform'].mask(far_side_t5_reads, '+')

    for lib_for_tail_plot in df['lib'].unique():
        if lib_for_tail_plot not in target_libs:
            continue
        print(lib_for_tail_plot, target_gene)
        lib_df = df.query(f"lib == '{lib_for_tail_plot}'").copy()
        # Combined label used as the plot hue; the text must match the keys of `palette`.
        lib_df['NMD_and_t5'] = ("NMD'ed: " + lib_df['NMD_Sensitive_Isoform'].astype(str)
                                + "  t5: " + lib_df['t5'].astype(str))

        plots_tails(lib_df, target_gene, lib_for_tail_plot,
                    save_dir=f"/home/marcus/Insync/marcus.viscardi@gmail.com/Google Drive/insync_folder/5TERA_ReadsAndTails_Plots/raw_figures_from_python/{npCommon.get_dt()}_isoformTailPlots_{'-'.join(target_libs)}")

# Read Plotting W/ Isoforms

The goal here is to run my read plotting and coverage plotting scripts, but leverage the additional isoform information from FLAIR.

# Make it prettier

The below code shows how to make subsets of plots, which might be really nice for making the plots more manageable!
[Link to StackOverflow where I got this from!](https://stackoverflow.com/a/67694491/13316742)

In [None]:
from nanoporeReadPlotting.finalizingReadAndCoveragePlotting_matplotlib import coverage_plotting_5tera

# Configuration for the per-isoform coverage plots drawn further down in this cell.
sea.set_style("whitegrid")
sea.set_context("talk")

# Genes to make coverage plots for (a set, so iteration order is not fixed).
# Comment entries in/out to change the selection.
target_genes = {# "rpl-30",
                # "rpl-26",
                # "rpl-3",
                # "rpl-1",
                # "rpl-12",
                "ubl-1",
                # 'aly-3',
                # 'K08D12.3',
                # 'hel-1',
                # 'rsp-1',
                # 'rps-22',
                }

# Isoform edge position per gene plus the comparison operators used against chr_pos.
# NOTE: the driver loop below only reads the 'ubl-1' entry (to flag ambiguous reads with '~').
iso_edge_dict = {
    # gene: [edge, compare, anticompare]
    "rpl-30": [10_436_409, "<=", ">="],
    "rpl-26": [8_603_272, "<=", ">="],
    "rpl-3": [3_868_327, "<=", ">="],
    "rpl-1": [2_876_019, ">=", "<="],
    "rpl-12": [13_240_023, ">=", "<="],
    "ubl-1": [3_068_573, "<=", ">="],
    "rps-22": [1_950_996, "<=", ">="]
    }

# (gene_name, [transcript_ids treated as NMD-sensitive]) pairs; turned into a dict by the
# driver loop and matched against the reads' transcript_id column.
nmd_sensitive_genes_and_txns = (('ubl-1', ['H06I04.4b.1']),
                                ('rpl-3', ['F13B10.2b']),
                                ('odc-1', ['K11C4.4.1']),  # This analysis isn't quite the same... b/c only 1 isoform
                                ('rpl-12', ['JC8.3c.2', 'JC8.3b']),
                                ('rpl-30', ['Y106G6H.3c.1']),
                                ('rpl-1', ['Y71F9AL.13b.4', 'Y71F9AL.13b.2']),
                                ('rpl-26', ['F28C6.7b.1']),
                                ('rps-22', ['F53A3.3b.1']),
                                ('rsp-1', ['W02B12.3b.2']),
                                ('hel-1', ['C26D10.2b']),
                                ('K08D12.3', ['K08D12.3b.1']),
                                ('aly-3', ['M18.7b']),
                                )

# Each inner tuple is one figure: the libraries plotted together in that figure.
compare_libs_sets = (("oldN2",),
                     ("oldN2", "oldS6"),
                     # ("sPM57", "sPM58"),
                     # ("newN2", "oldN2", "newS5", "newS6", "oldS6"),
                     )

def plot_coverages_for_isoforms(plot_df, target_gene, compare_libs,
                                targets_and_nontargets_side_by_side=True,
                                plot_ambiguous_reads=False,
                                quiet=True, save_dir=None):
    """Draw coverage panels for one gene, split by NMD isoform class, for each library.

    One sub-figure is made per library in ``compare_libs``. Inside each sub-figure every isoform
    class ('+', '-', and '~' when ``plot_ambiguous_reads``) gets a pair of axes that is handed to
    ``coverage_plotting_5tera``; the first axis of each pair is collected in ``adapted_axes`` and
    the second in ``unadapted_axes``, and each collection is joined to share a y-axis.

    Args:
        plot_df: Reads DataFrame. Needs a ``lib`` column and an ``NMD_Sensitive_Isoform`` column.
            It is passed on without filtering by isoform class: the class filter is applied inside
            ``coverage_plotting_5tera`` because that function uses all reads to calculate rpm.
        target_gene: Gene name forwarded to ``coverage_plotting_5tera``.
        compare_libs: Sequence of library names, one sub-figure each.
        targets_and_nontargets_side_by_side: If True, libraries are stacked vertically and the
            isoform classes are columns of a 2-row grid. If False, libraries sit side by side and
            the isoform classes are stacked in a single column.
        plot_ambiguous_reads: Also plot reads labelled '~'.
        quiet: Forwarded to ``coverage_plotting_5tera``; also silences the initial summary print.
        save_dir: ``str`` or ``Path`` directory for the ``.svg``/``.png`` output, created (with
            parents) if missing. ``None`` skips saving.
    """
    num_compare_libs = len(compare_libs)
    if not quiet:
        print(f"comparing {num_compare_libs} libraries: {' '.join(compare_libs)}")

    # Isoform classes to draw, in plotting order.
    nmd_classes = ['+', '-']
    if plot_ambiguous_reads:
        nmd_classes.append('~')
    num_classes = len(nmd_classes)

    if targets_and_nontargets_side_by_side:
        fig_size = [5, 2.5 * num_compare_libs]  # Width, Height
        if plot_ambiguous_reads:
            fig_size[1] += 1.25 * num_compare_libs
        outer_row_num, outer_col_num = num_compare_libs, 1
        # 2 rows x one column per isoform class.
        subfig_rows, subfig_cols = 2, num_classes
        height_ratios = [1, 2]
    else:
        fig_size = [3.5 * num_compare_libs, 5]  # Width, Height
        if plot_ambiguous_reads:
            fig_size[1] += 2.5
        outer_row_num, outer_col_num = 1, num_compare_libs
        # Single column: one (short, tall) pair of rows per isoform class.
        subfig_rows, subfig_cols = 2 * num_classes, 1
        height_ratios = [1, 2] * num_classes

    fig = plt.figure(constrained_layout=True,
                     figsize=fig_size,
                     )
    # This will produce sets of sub-figures that we can build up figures inside! One 'subfig' per lib.
    subfigs = fig.subfigures(outer_row_num,
                             outer_col_num)  # Rows, Cols.
    # With a 1x1 grid a single SubFigure comes back instead of an array.
    flat_subfigs = [subfigs] if num_compare_libs == 1 else subfigs.flat

    # Collect axes across all libs and classes so each collection can share a y-scale.
    adapted_axes = []
    unadapted_axes = []
    for subfig, lib in zip(flat_subfigs, compare_libs):
        subfig.suptitle(lib)
        lib_df = plot_df.query(f"lib == '{lib}'").copy()

        axes = subfig.subplots(subfig_rows,
                               subfig_cols,
                               height_ratios=height_ratios,
                               )
        # Group the axes into one pair per isoform class based on the requested layout.
        # (Checking axes.shape[0] cannot work for the grid layout: it is 2 with or without the
        # ambiguous column, so the three-class grid used to fail while unpacking.)
        if targets_and_nontargets_side_by_side:
            axes_groups = list(axes.transpose())  # each column of the 2-row grid
        else:
            axes_groups = np.array_split(axes, num_classes)  # consecutive pairs of the column

        for target, target_or_non_axes in zip(nmd_classes, axes_groups):
            adapted_axes.append(target_or_non_axes[0])
            unadapted_axes.append(target_or_non_axes[1])

            print(f"Calculating coverage for NMD-({target}) isoform in {lib} lib:")
            coverage_plotting_5tera(lib_df,
                                    gene_name=target_gene,
                                    provide_axes=target_or_non_axes,
                                    rpm_normalize=True,
                                    additional_plot_df_query=f"NMD_Sensitive_Isoform == '{target}'",
                                    # it's important to filter inside of this method rather than ahead of time, this is b/c the coverage_plotting script uses all the reads to calculate rpm!!
                                    quiet=quiet,
                                    )
            target_or_non_axes[1].set_xlabel(f"NMD ({target})")
            if targets_and_nontargets_side_by_side and target == '-':
                target_or_non_axes[0].set_yticklabels([])
                target_or_non_axes[1].set_yticklabels([])

    # NOTE(review): join() on the shared-axes view is deprecated in Matplotlib 3.6 and is expected
    # to be unavailable in later releases - confirm against the installed version. Axes.sharey()
    # is not a drop-in replacement here: it also appears to share the tick formatter between axes,
    # which would undo the blanked y tick labels set above.
    adapted_axes[0].get_shared_y_axes().join(*adapted_axes)
    unadapted_axes[0].get_shared_y_axes().join(*unadapted_axes)
    # plt.subplots_adjust(wspace=0, hspace=0)
    # plt.suptitle(f"Read Coverage for {target_gene}\nin {'/'.join(compare_libs)}")
    if isinstance(save_dir, (str, Path)):
        save_dir = Path(save_dir)
        if not save_dir.exists():
            print(f"Making new directory at: {save_dir}")
            # parents=True: dated output folders are often nested under a directory that does not exist yet.
            save_dir.mkdir(parents=True, exist_ok=True)
        save_path = str(save_dir) + f"/{target_gene}_{'-'.join(compare_libs)}_coveragePlots"
        if plot_ambiguous_reads:
            save_path += "_withAmbiguouslyAssignedReads"
        print(f"Saving file to {save_path}.svg/png")
        for file_type in ('.svg', '.png'):
            # Save through the figure handle so the right figure is written even if another became current.
            fig.savefig(save_path + file_type)
    plt.show()


# gene_name -> list of NMD-sensitive transcript ids; built once instead of once per gene.
nmd_sensitive_genes_and_txns_dict = dict(nmd_sensitive_genes_and_txns)

for target_gene in target_genes:
    print(f"Making plots for {target_gene}")
    # Work on a copy of the full read table (not just this gene's reads): the coverage plotting
    # uses all reads to calculate rpm, so filtering happens inside the plotting call.
    df = reads_df_raw.copy(deep=True)
    is_nmd_sensitive = df['transcript_id'].isin(nmd_sensitive_genes_and_txns_dict[target_gene])
    df['NMD_Sensitive_Isoform'] = is_nmd_sensitive.map({True: '+', False: '-'})
    if target_gene == 'ubl-1':
        iso_edge, _comparator, anti_comparator = iso_edge_dict[target_gene]
        # t5 '+' reads on the far side of the isoform edge cannot be assigned to an isoform: label them '~'.
        # Assign the result back: a chained `df.col.mask(..., inplace=True)` may act on a temporary
        # and leave df unchanged under pandas Copy-on-Write.
        ambiguous_reads = (df['t5'] == '+') & df.eval(f"chr_pos {anti_comparator} {iso_edge}")
        df['NMD_Sensitive_Isoform'] = df['NMD_Sensitive_Isoform'].mask(ambiguous_reads, '~')
    for compare_libs in compare_libs_sets:
        plot_coverages_for_isoforms(df, target_gene, compare_libs,
                                    targets_and_nontargets_side_by_side=False,
                                    save_dir=f"/home/marcus/Insync/marcus.viscardi@gmail.com/Google Drive/insync_folder/5TERA_ReadsAndTails_Plots/raw_figures_from_python/{npCommon.get_dt()}_coveragePlots3_{'-'.join(compare_libs)}",
                                    plot_ambiguous_reads=True,
                                    quiet=True)

In [None]:
# Inspect the ubl-1 reads from oldN2 that get flagged as ambiguous ('~').
target_gene = 'ubl-1'
df = reads_df_raw.copy(deep=True)
nmd_sensitive_genes_and_txns_dict = dict(nmd_sensitive_genes_and_txns)
is_nmd_sensitive = df['transcript_id'].isin(nmd_sensitive_genes_and_txns_dict[target_gene])
df['NMD_Sensitive_Isoform'] = is_nmd_sensitive.map({True: '+', False: '-'})
iso_edge, comparator, anti_comparator = iso_edge_dict[target_gene]
# t5 '+' reads on the far side of the isoform edge are labelled '~'.
# Assign the result back: a chained `df.col.mask(..., inplace=True)` may act on a temporary
# and leave df unchanged under pandas Copy-on-Write.
ambiguous_reads = (df['t5'] == '+') & df.eval(f"chr_pos {anti_comparator} {iso_edge}")
df['NMD_Sensitive_Isoform'] = df['NMD_Sensitive_Isoform'].mask(ambiguous_reads, '~')

# Last expression of the cell, so the notebook displays the resulting table.
(df.query(f"gene_name == '{target_gene}'")
   .query("NMD_Sensitive_Isoform == '~'")
   .query("lib == 'oldN2'")
   [['read_id', 't5', 'gene_name_original', 'transcript_id', 'gene_name', 'gene_name_fromGeneAssign', 'NMD_Sensitive_Isoform']])

***

# *ets-4* Weirdness
I've had some weirdness going on with *ets-4* where it doesn't seem like it's being assigned correctly. I'm going to try to explore this.

I think part of what's happening is that the annotation for *ets-4* overlaps with another gene (*ceh-60*), and for whatever reason this is tanking assignments(?).

For example, I can see on IGV that there are 30 clear reads in my oldN2 lib on *ets-4*. But **ZERO(?!)** of these make it to this stage... why?

The only read I see from oldN2 is "f1250dbe-d5f9-47a6-a60f-d55ff5a6f909", and this read is obviously from *ceh-60* when you look at it on IGV. It's in the opposite direction ffs!

Is FLAIR not strand aware?! That seems like a massive oversight...

**I need to go look if I am using FLAIR incorrectly, or if I forgot a strand-aware flag of some sort**

In [None]:
# Show every read labelled ets-4 in either gene-name column.
named_ets4 = reads_df_raw['gene_name'] == 'ets-4'
gene_assigned_ets4 = reads_df_raw['gene_name_fromGeneAssign'] == 'ets-4'
reads_df_raw[named_ets4 | gene_assigned_ets4]