# decappingQuantification2_metaPlots.ipynb
## Marcus Viscardi,    July 11, 2024

Check out `README.md` for a more overarching view of what is happening here.

The `decappingQuantification.ipynb` script was getting REALLY bloated.
This script trys to take the parts of that script that were related to producing meta plots and has them a little more streamlined!

In [1]:
from typing import Tuple

import nanoporePipelineCommon as npCommon

import numpy as np
import pandas as pd

import re

import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from tqdm.auto import tqdm

from icecream import ic
from datetime import datetime

from pathlib import Path

from scipy.stats import mannwhitneyu, ks_2samp

import pickle as pkl

pd.set_option('display.width', 200)
pd.set_option('display.max_columns', None)

def __time_formatter__():
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return f"ic: {now} | > "
ic.configureOutput(prefix=__time_formatter__)

_ = ic("Imports done!")

ic: 2024-08-27 11:42:38 | > 'Imports done!'


In [2]:
gtf_parquet_path = Path("/data16/marcus/genomes/plus_cerENO2_elegansRelease100/230327_allChrs_plus-cerENO2.gtf.parquet")

gtf_df = pd.read_parquet(gtf_parquet_path)

current_dir = Path.cwd()

In [3]:
libs_to_run = [
    "oldN2",
    # "oldS6",
    "newerN2",
    "newerS6",
    "newerS5",
    "thirdN2",
    "thirdS5",
    "thirdS6",
    "temp25cN2",
    "temp25cS5",
    "temp25cS6",
    "temp25cS7",
]
genes = [
    'rps-22',
    'ubl-1',
    'rpl-30',
    # 'eef-1A.1',  # rev  # Something weird is going on...
    'rla-1',  # rev
    'rps-25',  # rev
    'Y37E3.8',  # rev
    'rpl-10',
    'rla-0',
    'rpl-18',
    'rps-3',
    'rpl-38',
    'rpl-24.1',
    'hel-1',
    'rpl-7A',
    'ZK228.4',
    'R06C1.4',
    'rpl-12',
    'pqn-70',
    'rsp-5',
    'rpl-1',
    'C53H9.2',
    'rpl-3',
    'rsp-6',
    'rpl-26',
    'K08D12.3',
    'aly-3',
]
genes = [ # Anything still commented below is because it is too long of a gene for use to consistently hit it's TSS
    'rps-22',
    'ubl-1',
    'rpl-30',
    # # 'eef-1A.1',  # rev  # Something weird is going on...
    'rla-1',  # rev
    'rps-25',  # rev
    'Y37E3.8',  # rev
    'rpl-10',
    # 'rla-0',
    'rpl-18',
    'rps-3',
    # 'rpl-38',
    'rpl-24.1',
    # 'hel-1',
    # 'rpl-7A',
    # 'ZK228.4',
    'R06C1.4',
    # 'rpl-12',
    # 'pqn-70',
    # 'rsp-5',
    # 'rpl-1',
    # 'C53H9.2',
    # 'rpl-3',
    # 'rsp-6',
    'rpl-26',
    'K08D12.3',
    # 'aly-3',
    'C45B2.8'  # 240815: New one!
]
plot_genes = genes[:]

run_info = {
    'libs': sorted(libs_to_run),
    'genes': sorted(plot_genes),
}
run_info

{'libs': ['newerN2',
  'newerS5',
  'newerS6',
  'oldN2',
  'temp25cN2',
  'temp25cS5',
  'temp25cS6',
  'temp25cS7',
  'thirdN2',
  'thirdS5',
  'thirdS6'],
 'genes': ['C45B2.8',
  'K08D12.3',
  'R06C1.4',
  'Y37E3.8',
  'rla-1',
  'rpl-10',
  'rpl-18',
  'rpl-24.1',
  'rpl-26',
  'rpl-30',
  'rps-22',
  'rps-25',
  'rps-3',
  'ubl-1']}

In [4]:
load_preprocessed_data = True
preprocessed_data_path = current_dir / 'preprocessed_data' / 'decapping_data.parquet'
preprocessed_info_path = current_dir / 'preprocessed_data' / 'decapping_data_info.pkl'
if not preprocessed_data_path.exists() or not preprocessed_info_path.exists():
    load_preprocessed_data = False
    print("Preprocessed data not found, will reload data.")
if load_preprocessed_data:
    print("Checking preprocessed data...")
    with open(preprocessed_info_path, 'rb') as f:
        preprocessed_info = pkl.load(f)
    if preprocessed_info != run_info:
        print("Preprocessed data does not match current run info, will reload data.")
        load_preprocessed_data = False
    else:
        print("Preprocessed data matches current run info, will load data.")
if not load_preprocessed_data:
    print("Will reload data and save preprocessed data.")
    with open(preprocessed_info_path, 'wb') as f:
        pkl.dump(run_info, f)

Will reload data and save preprocessed data.


In [5]:
if not load_preprocessed_data:
    obj_dict = {}
    for lib in libs_to_run:
        print(f"\nLoading {lib}...", end="")
        obj = npCommon.NanoporeRun(run_nickname=lib)
        obj_dict[lib] = obj
        # obj.load_mergedOnReads()
        obj.load_nmd_targets(return_sambamobj=False)
        print(" Done!")


Loading oldN2... Done!

Loading newerN2... Done!

Loading newerS6... Done!

Loading newerS5... Done!

Loading thirdN2... Done!

Loading thirdS5... Done!

Loading thirdS6... Done!

Loading temp25cN2... Done!

Loading temp25cS5... Done!

Loading temp25cS6... Done!

Loading temp25cS7... Done!


In [6]:
if not load_preprocessed_data:
    gene_ids_to_names_dict = gtf_df.query('feature == "gene"').set_index('gene_id')['gene_name'].to_dict()
    gene_names_to_ids_dict = gtf_df.query('feature == "gene"').set_index('gene_name')['gene_id'].to_dict()
    for gene_name in plot_genes:
        gene_id = gene_names_to_ids_dict[gene_name]
        print(f"{gene_name:>10} -> {gene_id}")
    gene_ids = [gene_names_to_ids_dict[gene_name] for gene_name in plot_genes]

    rps-22 -> WBGene00004491
     ubl-1 -> WBGene00006725
    rpl-30 -> WBGene00004444
     rla-1 -> WBGene00004409
    rps-25 -> WBGene00004494
   Y37E3.8 -> WBGene00021350
    rpl-10 -> WBGene00004421
    rpl-18 -> WBGene00004430
     rps-3 -> WBGene00004472
  rpl-24.1 -> WBGene00004436
   R06C1.4 -> WBGene00011059
    rpl-26 -> WBGene00004440
  K08D12.3 -> WBGene00019537
   C45B2.8 -> WBGene00016662


In [7]:
def load_simple_nmd_targets(nmd_targets_path, target_gene_ids_to_names_dict) -> pd.DataFrame:
    target_gene_names_to_ids_dict = {v: k for k, v in target_gene_ids_to_names_dict.items()}
    reads_dict = {}
    with pysam.AlignmentFile(nmd_targets_path, 'rb') as test_bam:
        for read in test_bam.fetch():
            read_assigned_gene_id = read.get_tag('gA')
            # print(read.query_name, read_assigned_gene_id, target_gene_ids_to_names_dict[read_assigned_gene_id], read.get_tag('t5'), read.get_tag('nC'))
            reads_dict[read.query_name] = {
                'gene_id': read_assigned_gene_id,
                'gene_name': target_gene_ids_to_names_dict[read_assigned_gene_id],
                't5': read.get_tag('t5'),
                'nmd_assignment': read.get_tag('nC'),
                'cigar': read.cigarstring,
                'chr_pos': read.reference_start,
                'chr': read.reference_name,
                'seq': read.query_sequence,
            }
    df = pd.DataFrame(reads_dict).T
    return df

if not load_preprocessed_data:
    import pysam
    
    nmd_genes = []
    
    for lib, lib_obj in obj_dict.items():
        print(f"Loading {lib} NMD Targets from {lib_obj.nmd_targets_bam_path}...", end="")
        lib_obj.load_nmd_targets(return_sambamobj=False)
        print(" Done!")
        lib_obj.nmd_targets_df = load_simple_nmd_targets(lib_obj.nmd_targets_bam_path, gene_ids_to_names_dict)
        # print(lib_obj.nmd_targets_df.gene_name.unique())
        nmd_genes.extend(lib_obj.nmd_targets_df.gene_name.unique())
    nmd_genes = list(set(nmd_genes))
    print(nmd_genes)

Loading oldN2 NMD Targets from /data16/marcus/working/211118_nanoporeRun_totalRNA_5108_xrn-1-KD_5TERA/output_dir/NMD_targets/240702_nmd_targets.all.merge.bam... Done!
Loading newerN2 NMD Targets from /data16/marcus/working/230327_nanoporeRun_totalRNA_wt_xrn-1-KD_5TERA_rerun/output_dir/NMD_targets/240702_nmd_targets.all.merge.bam... Done!
Loading newerS6 NMD Targets from /data16/marcus/working/230403_nanoporeRun_totalRNA_smg-6_xrn-1-KD_5TERA_rerun/output_dir/NMD_targets/240702_nmd_targets.all.merge.bam... Done!
Loading newerS5 NMD Targets from /data16/marcus/working/230410_nanoporeRun_totalRNA_smg-5_xrn-1-KD_5TERA_rerun/output_dir/NMD_targets/240702_nmd_targets.all.merge.bam... Done!
Loading thirdN2 NMD Targets from /data16/marcus/working/230918_nanoporeRun_sMV013_wt_xrn-1-KD_5TERA/output_dir/NMD_targets/240702_nmd_targets.all.merge.bam... Done!
Loading thirdS5 NMD Targets from /data16/marcus/working/230918_nanoporeRun_sMV014_smg-5_xrn-1-KD_5TERA/output_dir/NMD_targets/240702_nmd_target

In [12]:
UNANNOTATED_TSS_DICT = {  # these are manual annotations of transcription start sites that are not in the GTF file
    'rpl-7A': [4_390_579, 4_390_464],
}

UNANNOTATED_STOPS_DICT = {  # these are manual annotations of stops that are not in the GTF file
    'rps-22': [1_950_853],  # cassette exon contains PTC
    'rpl-30': [10_436_332],  # extension of first exon contains PTC
    'rpl-7A': [4_389_880, 4_389_745],  # unannotated extension of 3rd exon contains PTC (second loc due to potential FS)
    'rsp-6': [7_790_576],  # longer unannotated form of cassette exon causes frame shift
    'C53H9.2': [1_833_467],  # shorter first exon leaves frame shift
    'pqn-70': [11_226_879],  # skipped exon leads to a frame change
    'rpl-1': [2_875_908],  # new orf started in the 3' extension of the first exon, causing a frame change
    'R06C1.4': [11_931_081],  # unannotated cassette exon contains a frame shift
    'rpl-3': [3_868_335],  # incorrectly annotated first intron 3' truncation (2nd exon extended) contains stop
    'hel-2': [8_327_970],  # later 5' SS adds to 3rd exon and contains PTC
    'rpl-12': [13_240_078],  # later 5' SS of second intron adds extension to 2nd exon with a PTC
    'aly-3': [12_123_859], # intron retention of the third intron has a PTC
}

BAD_TSS_ANNOTATIONS_DICT = {  # these are annotations that exist in the GTF that are incorrect (based on never being seen in FL nanopore libs)
    'rpl-30': [10_436_466, 10_436_408],
    'rpl-1': [2_876_039, 2_876_034],
    'rpl-12': [13_240_204],
    
}

def get_other_read_end(row):
    cigar = row['cigar']
    read_pos = row['chr_pos']
    if cigar is None:
        raise ValueError("Cigar is None!")
    if read_pos is None:
        raise ValueError("read_pos is None!")
    pattern = r"(\d*)([MDISN])"
    matches = re.findall(pattern, cigar)
    for match in matches:
        count, key = match
        if key in ['M', 'D', 'N']:
            read_pos += int(count)
    return read_pos

def near_a_tss_or_stop(row, _strand, tss_positions, stop_positions, col_for_pos='chr_pos', full_window_size=50, window_bias_downstream=0.50, make_window_correction=True) -> Tuple[bool, bool]:
    chr_pos = row[col_for_pos]
    if _strand == '-' and make_window_correction:
        window_bias_downstream = 1 - window_bias_downstream
    elif _strand != '+' and make_window_correction:
        raise ValueError(f"Strand must be '+' or '-' not {_strand}")
    
    near_tss = False
    near_stop = False
    chr_pos_upstream = chr_pos - full_window_size * (1 - window_bias_downstream)
    chr_pos_downstream = chr_pos + full_window_size * window_bias_downstream
    for tss in set(tss_positions):
        if chr_pos_upstream <= tss <= chr_pos_downstream:
            near_tss = True
            break
    for stop in set(stop_positions):
        if chr_pos_upstream <= stop <= chr_pos_downstream:
            near_stop = True
            break
    return near_tss, near_stop

def convert_near_tss_and_stop_to_hue(row, tss_color='g', stop_color='r', neither_color='k', both_color='y'):
    near_tss, near_stop = row['near_tss'], row['near_stop']
    if near_tss and near_stop:
        return both_color
    elif near_tss:
        return tss_color
    elif near_stop:
        return stop_color
    else:
        return neither_color

def just_extract_gene_data(gtf_df, test_gene, plot_lib_df_dict, window_size=100,
                           up_v_down_fraction=0.25, number_of_bins=40,
                            flanking_gene_fraction=0.05, print_stats=False):    
    extra_stops_dict = UNANNOTATED_STOPS_DICT
    extra_tss_dict = UNANNOTATED_TSS_DICT
    bad_tss_dict = BAD_TSS_ANNOTATIONS_DICT
    
    gtf_for_gene = gtf_df.query("gene_name == @test_gene")
    gene_start, gene_end = gtf_for_gene.query("feature == 'gene'").loc[:, ['start', 'end']].values[0]
    gene_length = gene_end - gene_start
    
    strand = gtf_for_gene.query("feature == 'gene'").loc[:, 'strand'].values[0]
    if print_stats:
        print(f"Gene target ({test_gene}) on strand: {strand}")
    if strand == '-':
        if print_stats:
            print("Because this gene is on the reverse strand, we will have to calculate out the other end of the reads with the CIGAR strings. This will take slightly longer.")
        stop_locations = gtf_for_gene.query("feature == 'stop_codon'").loc[:, 'end'].values
        start_sites = gtf_for_gene.query("feature == 'transcript'").loc[:, 'end'].values
        up_v_down_fraction = 1 - up_v_down_fraction
        flipped = True
    else:
        stop_locations = gtf_for_gene.query("feature == 'stop_codon'").loc[:, 'start'].values
        start_sites = gtf_for_gene.query("feature == 'transcript'").loc[:, 'start'].values
        flipped = False
        
    stop_locations = list(stop_locations)
    start_sites = list(start_sites)
    
    if test_gene in extra_stops_dict:
        for stop_location in extra_stops_dict[test_gene]:
            stop_locations.append(stop_location)
    if test_gene in extra_tss_dict:
        for tss_location in extra_tss_dict[test_gene]:
            start_sites.append(tss_location)
    if test_gene in bad_tss_dict:
        for bad_tss_location in bad_tss_dict[test_gene]:
            if bad_tss_location in start_sites:
                start_sites.remove(bad_tss_location)
            else:
                raise ValueError(f"Bad TSS location {bad_tss_location} not in start_sites for gene {test_gene}\n"
                                 f"Start Sites: {start_sites}")
    summary_dict_for_gene = {}
    for i, (lib, lib_df) in enumerate(plot_lib_df_dict.items()):

        if print_stats:
            print(f"\t{lib}:")
        
        gene_df = lib_df.query('gene_name == @test_gene').copy()
        gene_df.replace({'t5': {0: '-', 1: '+', '+': '+', '-': '-'}}, inplace=True)
        if flipped:
            gene_df.loc[:, 'chr_pos'] = gene_df.apply(get_other_read_end, axis=1)
        gene_df.loc[:, 'near_tss'], gene_df.loc[:, 'near_stop'] = zip(*gene_df.apply(near_a_tss_or_stop, args=(strand, start_sites, stop_locations),
            full_window_size=window_size,window_bias_downstream=up_v_down_fraction, axis=1))
        gene_df.loc[:, 'hue'] = gene_df.apply(convert_near_tss_and_stop_to_hue, axis=1)
        
        unadapted_df = gene_df.query('t5 == "-"').copy()
        adapted_df = gene_df.query('t5 == "+"').copy()
        
        # Create annotations to put in ax5
        ada_near_stop_count = adapted_df.query('near_stop').shape[0]
        ada_near_tss_count = adapted_df.query('near_tss').shape[0]
        unada_near_stop_count = unadapted_df.query('near_stop').shape[0]
        unada_near_tss_count = unadapted_df.query('near_tss').shape[0]
        ada_total_count = adapted_df.shape[0]
        unada_total_count = unadapted_df.shape[0]
        summary_dict_for_gene[lib] = {
            'Adapted Near Stop': ada_near_stop_count,
            'Adapted Near TSS': ada_near_tss_count,
            'Total Adapted': ada_total_count,
            'Unadapted Near Stop': unada_near_stop_count,
            'Unadapted Near TSS': unada_near_tss_count,
            'Total Unadapted': unada_total_count,
        }
    return summary_dict_for_gene

if not load_preprocessed_data:
    plot_libs = libs_to_run
    
    window_size_for_search = 100  # 100nt seems to make sense (+50 and -50 from the TSS or STOP)
    
    target_nmd_types = ('nmd_target', 'ambiguous', 'non_nmd_target')
    subset_nmd_types_list = [target_nmd_types[0], target_nmd_types[1], target_nmd_types[2], target_nmd_types[:2], target_nmd_types[:]]
    subset_nmd_types_names = ['NMD_Targets', 'Ambiguous_Reads', 'NMD_NonTargets', 'NMD_TargetsAndAmbiguous', 'All_Types']
    
    plot_lib_objs_dict = {lib: obj_dict[lib] for lib in plot_libs}
    overall_summaries_nested_dict = {}
    for gene in nmd_genes[::-1]:
        # if gene != "rpl-7A":
        #     continue
        print(f"{gene:#^50}")
        gene_summary_dicts = {}
        
        for subset_nmd_types, subset_nmd_types_name in zip(subset_nmd_types_list, subset_nmd_types_names):
            try:
                lib_df_dict_for_gene = {lib: lib_obj.nmd_targets_df.query('gene_name == @gene & nmd_assignment in @subset_nmd_types')
                                        for lib, lib_obj in plot_lib_objs_dict.items()}
                # summary_dict = plot_gene_data3(gtf_df, gene, lib_df_dict_for_gene, print_stats=False,
                #                                title_override=f"{gene}\nNMD Target Types: {subset_nmd_types}",
                #                                save_name_override=f"{gene}_{subset_nmd_types_name}",
                #                                current_dir=current_dir,)
                summary_dict = just_extract_gene_data(gtf_df, gene, lib_df_dict_for_gene, print_stats=False, window_size=window_size_for_search)
                gene_summary_dicts[subset_nmd_types_name] = summary_dict
            except Exception as e:
                print(f"Error: {e}")
                print(f"Gene: {gene}")
                print(f"NMD Types: {subset_nmd_types}")
                print(f"Continuing...")
        overall_summaries_nested_dict[gene] = gene_summary_dicts
    
    plotting_list_for_df = []
    
    dict_index = 0
    for gene, gene_dict in overall_summaries_nested_dict.items():
        for nmd_target_type, target_type_dict in gene_dict.items():
            print(f"{gene} ({nmd_target_type}):")
            for lib, lib_dict in target_type_dict.items():
                print(f"\t{lib}:")
                print(f"\t\tAdapted Near Stop:   {lib_dict['Adapted Near Stop']}")
                print(f"\t\tAdapted Near TSS:    {lib_dict['Adapted Near TSS']}")
                print(f"\t\tTotal Adapted:       {lib_dict['Total Adapted']}")
                print(f"\t\tUnadapted Near Stop: {lib_dict['Unadapted Near Stop']}")
                print(f"\t\tUnadapted Near TSS:  {lib_dict['Unadapted Near TSS']}")
                print(f"\t\tTotal Unadapted:     {lib_dict['Total Unadapted']}")
                try:
                    print(f"\t\tAda Stop/Unada FL: {lib_dict['Adapted Near Stop']/lib_dict['Unadapted Near TSS']:.2%}")
                except ZeroDivisionError:
                    print(f"\t\tAda Stop/Unada FL: -- NO UNADAPTED FL --")
                plotting_list_for_df.append(
                    {
                        'gene_name': gene,
                        'nmd_target_type': nmd_target_type,
                        'lib': lib,
                        'adapted_near_stop': lib_dict['Adapted Near Stop'],
                        'adapted_near_tss': lib_dict['Adapted Near TSS'],
                        'total_adapted': lib_dict['Total Adapted'],
                        'unadapted_near_stop': lib_dict['Unadapted Near Stop'],
                        'unadapted_near_tss': lib_dict['Unadapted Near TSS'],
                        'total_unadapted': lib_dict['Total Unadapted'],
                    }
                )
    
    plotting_summary_df = pd.DataFrame.from_records(plotting_list_for_df)
    plotting_summary_df

######################rsp-6#######################
######################hel-1#######################
######################rpl-7A######################
#####################ZK228.4######################
Error: not enough values to unpack (expected 2, got 0)
Gene: ZK228.4
NMD Types: non_nmd_target
Continuing...
#####################K08D12.3#####################
######################rpl-3#######################
######################aly-3#######################
######################rpl-26######################
######################rpl-30######################
######################pqn-70######################
######################rsp-5#######################
#####################C53H9.2######################
######################rpl-12######################
#####################R06C1.4######################
######################rpl-1#######################
######################ubl-1#######################
######################rps-22######################
rsp-6 (NMD_Targets):
	ol

In [13]:
if not load_preprocessed_data:
    # Before anything let's copy down code so we can regenerate the input df
    summary_df = plotting_summary_df.copy()
    
    lib_strain_dict = {'N2': 'wildtype',
                       'S5': 'smg-5',
                       'S6': 'smg-6',
                       'S7': 'smg-7'}
    
    summary_df['strain'] = summary_df.lib.str[-2:].map(lib_strain_dict)
    
    summary_df['total_near_tss'] = summary_df['adapted_near_tss'] + summary_df['unadapted_near_tss']
    summary_df['total_near_stop'] = summary_df['adapted_near_stop'] + summary_df['unadapted_near_stop']
    summary_df['total_reads'] = summary_df['total_adapted'] + summary_df['total_unadapted']
    
    # There are a few different ways to do this!
    summary_df['frac_near_tss'] = summary_df['total_near_tss'] / summary_df['total_reads']
    
    summary_df['frac_near_tss'] = summary_df['adapted_near_tss'] / summary_df['total_reads']
    
    print(f"Saving preprocessed data to {preprocessed_data_path}")
    summary_df.to_parquet(preprocessed_data_path)
else:
    print("Loading preprocessed data...")
    summary_df = pd.read_parquet(preprocessed_data_path)
summary_df

Saving preprocessed data to /data16/marcus/scripts/nanoporePipelineScripts/deadenylationPaperFigureUpdates2/preprocessed_data/decapping_data.parquet


Unnamed: 0,gene_name,nmd_target_type,lib,adapted_near_stop,adapted_near_tss,total_adapted,unadapted_near_stop,unadapted_near_tss,total_unadapted,strain,total_near_tss,total_near_stop,total_reads,frac_near_tss
0,rsp-6,NMD_Targets,oldN2,2,0,4,2,9,17,wildtype,9,4,21,0.000000
1,rsp-6,NMD_Targets,newerN2,0,0,9,4,4,19,wildtype,4,4,28,0.000000
2,rsp-6,NMD_Targets,newerS6,0,0,0,53,115,235,smg-6,115,53,235,0.000000
3,rsp-6,NMD_Targets,newerS5,1,0,1,20,61,110,smg-5,61,21,111,0.000000
4,rsp-6,NMD_Targets,thirdN2,2,0,4,4,3,18,wildtype,3,6,22,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,rps-22,All_Types,thirdS6,3,3,13,90,660,930,smg-6,663,93,943,0.003181
920,rps-22,All_Types,temp25cN2,4,3,22,6,663,822,wildtype,666,10,844,0.003555
921,rps-22,All_Types,temp25cS5,0,3,4,51,1806,2098,smg-5,1809,51,2102,0.001427
922,rps-22,All_Types,temp25cS6,0,4,5,56,1713,2083,smg-6,1717,56,2088,0.001916


In [None]:
import scipy.stats as stats
from colorama import Fore, Style

def run_stats_intralib(df_in, compare_col, sig_cutoff=0.05, steps=[100, 10, 1],
                       strains=('wildtype', 'smg-5', 'smg-6'),
                       output_file: Path = None):
    test_results = {}
    output_string = ""
    for strain in strains:
        nmd_df = df_in.query("strain == @strain & nmd_target_type == 'NMD_Targets'")
        non_nmd_df = df_in.query("strain == @strain & nmd_target_type == 'NMD_NonTargets'")
        merge_df = nmd_df.merge(non_nmd_df,
                                on=('gene_name', 'strain', 'rep', 'lib', 'name'),
                                suffixes=('_nmd', '_non_nmd'))
        paired_test = stats.ttest_rel(merge_df[f'{compare_col}_nmd'], merge_df[f'{compare_col}_non_nmd'])
        unpaired_test = stats.ttest_ind(nmd_df[compare_col], non_nmd_df[compare_col])
        test_results[strain] = {'paired': paired_test, 'unpaired': unpaired_test}
    output_string += f"{'Strain':^8} | {'Paired p-val (df)':^15} | {'Unpaired p-val (df)':^15}\n"
    output_string += '-' * 50 + '\n'
    for strain, test_dict in test_results.items():
        paired_str = f" {test_dict['paired'].pvalue:^1.3e}   ({test_dict['paired'].df:>2})"
        unpaired_str = f"   {test_dict['unpaired'].pvalue:^1.3e}   ({int(test_dict['unpaired'].df):>2})"
        if test_dict['paired'].pvalue < sig_cutoff / steps[0]:
            paired_str = Fore.MAGENTA + paired_str + Style.RESET_ALL
        elif test_dict['paired'].pvalue < sig_cutoff / steps[1]:
            paired_str = Fore.RED + paired_str + Style.RESET_ALL
        elif test_dict['paired'].pvalue < sig_cutoff / steps[2]:
            paired_str = Fore.YELLOW + paired_str + Style.RESET_ALL
        if test_dict['unpaired'].pvalue < sig_cutoff / steps[0]:
            unpaired_str = Fore.MAGENTA + unpaired_str + Style.RESET_ALL
        elif test_dict['unpaired'].pvalue < sig_cutoff / steps[1]:
            unpaired_str = Fore.RED + unpaired_str + Style.RESET_ALL
        elif test_dict['unpaired'].pvalue < sig_cutoff / steps[2]:
            unpaired_str = Fore.YELLOW + unpaired_str + Style.RESET_ALL
        output_string += f"{strain:>8} | {paired_str} | {unpaired_str}\n"
    key_string = (f"Key: "
                  f"{Fore.MAGENTA}p < {sig_cutoff / steps[0]}{Style.RESET_ALL} "
                  f"{Fore.RED}p < {sig_cutoff / steps[1]}{Style.RESET_ALL} "
                  f"{Fore.YELLOW}p < {sig_cutoff / steps[2]}{Style.RESET_ALL}\n")
    print(output_string + key_string)
    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
    output_string = ansi_escape.sub('', output_string)
    if output_file is not None:
        with open(output_file, 'a') as f:
            f.write(output_string)
    return output_string

def run_stats_interlib(df_in, compare_col, sig_cutoff=0.05, steps=[100, 10, 1],
                       comparisons=(('wildtype', 'smg-5'), ('wildtype', 'smg-6'), ('smg-5', 'smg-6')),
                       nmd_type='NMD_NonTargets',
                       output_file=None):
    # Now let's do a similar comparison but between non-NMD-targets between libs
    test_results = {}
    output_string = ""
    for comp in comparisons:
        nmd_df = df_in.query("strain == @comp[0] & nmd_target_type == @nmd_type")
        non_nmd_df = df_in.query("strain == @comp[1] & nmd_target_type == @nmd_type")
        drop_cols = ['nmd_target_type', 'strain', 'lib', 'name']
        merge_df = nmd_df.drop(columns=drop_cols).merge(non_nmd_df.drop(columns=drop_cols), on=('gene_name', 'rep'), suffixes=('_lib1', '_lib2'))
        paired_test = stats.ttest_rel(merge_df[f'{compare_col}_lib1'], merge_df[f'{compare_col}_lib2'])
        unpaired_test = stats.ttest_ind(nmd_df[compare_col], non_nmd_df[compare_col])
        test_results[comp] = {'paired': paired_test, 'unpaired': unpaired_test}
    
    output_string += f"{'Comp':^14} | {'Paired p-val (df)':^15} | {'Unpaired p-val (df)':^15}\n"
    output_string += '-'*56 + '\n'
    
    for (lib1, lib2), test_dict in test_results.items():
        paired_str = f" {test_dict['paired'].pvalue:^1.3e}   ({test_dict['paired'].df:>2})"
        unpaired_str = f"   {test_dict['unpaired'].pvalue:^1.3e}   ({int(test_dict['unpaired'].df):>2})"
        if lib1 == 'wildtype':
            lib1 = 'wt'
        if test_dict['paired'].pvalue < sig_cutoff / steps[0]:
            paired_str = Fore.MAGENTA + paired_str + Style.RESET_ALL
        elif test_dict['paired'].pvalue < sig_cutoff / steps[1]:
            paired_str = Fore.RED + paired_str + Style.RESET_ALL
        elif test_dict['paired'].pvalue < sig_cutoff / steps[2]:
            paired_str = Fore.YELLOW + paired_str + Style.RESET_ALL
        if test_dict['unpaired'].pvalue < sig_cutoff / steps[0]:
            unpaired_str = Fore.MAGENTA + unpaired_str + Style.RESET_ALL
        elif test_dict['unpaired'].pvalue < sig_cutoff / steps[1]:
            unpaired_str = Fore.RED + unpaired_str + Style.RESET_ALL
        elif test_dict['unpaired'].pvalue < sig_cutoff / steps[2]:
            unpaired_str = Fore.YELLOW + unpaired_str + Style.RESET_ALL
        output_string += f"{lib1:>5} vs {lib2:<5} | {paired_str} | {unpaired_str}\n"
    key_string = (f"Key: "
                  f"{Fore.MAGENTA}p < {sig_cutoff / steps[0]}{Style.RESET_ALL} "
                  f"{Fore.RED}p < {sig_cutoff / steps[1]}{Style.RESET_ALL} "
                  f"{Fore.YELLOW}p < {sig_cutoff / steps[2]}{Style.RESET_ALL}\n")
    print(output_string + key_string)
    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
    output_string = ansi_escape.sub('', output_string)
    if output_file is not None:
        with open(output_file, 'a') as f:
            f.write(output_string)
    return output_string

In [None]:
# Maybe the NMD targets have higher adapted_TSS fractions in the WT not because they have high numbers of adapted_TSS reads, but b/c they have lower numbers of the denominator (all reads)
# To remedy this, let's try comparing the adapted_TSS to the unadapted_TSS

# Additionally, it would be nice to do an adapted RPM comparison against Adapt TSS reads. But this needs more reads!!

# I need to circle WAY back to retain both adapt and unadapted TSSes.. UGH LETS DO IT
in_df = summary_df.copy()
set_dict = {'old': 1, 'newer': 2, 'third': 3, 'temp25c': 4}

in_df['rep'] = in_df.lib.str[:-2].map(set_dict)
in_df = in_df.set_index(['gene_name', 'strain', 'rep','lib', 'nmd_target_type']).sort_index()
better_df = in_df.loc[:, :, :, :, ['NMD_Targets', 'NMD_NonTargets'], :].drop(columns=['frac_near_tss'])
better_df

In [None]:
min_total = 1
min_adapted = 5

def run_box_and_ttests(input_df: pd.DataFrame,
                       target_num: str, target_denom: str,
                       min_total_reads: int = 1,
                       min_adapted_reads: int = 1,
                       min_tss_ada_reads: int = 0,
                       add_tss_ada_pseudo: float = -1.0,
                       output_dir: Path = current_dir / 'plots' / 'decapping',
                       log_y: bool = True,
                       log2_y: bool = False,
                       template: str = 'none') -> None:
    input_df = input_df.copy()
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
        print(f"Made {output_dir}")
    
    target_col = f"{target_num} / {target_denom}"
    
    input_df['adapted_near_tss'] = input_df['adapted_near_tss'].fillna(0)
    if add_tss_ada_pseudo > 0:
        input_df['adapted_near_tss'] += add_tss_ada_pseudo
    input_df['unadapted_near_tss'] = input_df['unadapted_near_tss'].fillna(0)
    input_df[target_col] = input_df[target_num] / input_df[target_denom]
    
    box_df = input_df.copy().reset_index()
    
    box_df['strain'] = pd.Categorical(box_df['strain'], ['wildtype', 'smg-5', 'smg-6', 'smg-7'])
    box_df.sort_values(by=['strain', 'nmd_target_type'], inplace=True)
    
    box_df = box_df[box_df['strain'] != 'smg-7']
    box_df['name'] = box_df['gene_name'] + ' ' + box_df['strain'].astype(str) + ' rep' + box_df['rep'].astype(str)
    
    box_df = box_df[box_df['total_reads'] >= min_total_reads].dropna(axis=0)
    box_df = box_df[box_df['total_adapted'] >= min_adapted_reads].dropna(axis=0)
    
    if log2_y:
        box_df = box_df[box_df[target_col] > 0]
        box_df[f"log2({target_col})"] = np.log2(box_df[target_col])
        target_col = f"log2({target_col})"
        log_y = False
    
    if log_y:
        box_df = box_df[box_df[target_col] > 0]
    
    if min_tss_ada_reads > 0:
        box_df = box_df[box_df['adapted_near_tss'] >= min_tss_ada_reads]
    
    # These colors are updated from the final plots!
    plot_colors = (('#37c871', '#16502d'),  # NMD greens
                   ('#64a7ff', '#00008b'),  # Non NMD blues
                   ('#676767', '#414141'),  # ambiguous greys
                   )
    colors_dict = {'NMD_Targets': plot_colors[0][0],
                   'NMD_NonTargets': plot_colors[1][0],}
    
    fig = px.box(box_df,
                 x='strain',
                 y=target_col,
                 color='nmd_target_type',
                 color_discrete_map=colors_dict,
                 points='all',
                 hover_name='name',
                 log_y=log_y,
                 hover_data=[target_num, target_denom],
                 template=template,
                 title=f"{target_col} by Strain<br>Min Total Reads: {min_total_reads}, Min Adapted Reads: {min_adapted_reads}, Y Scale: {'Log' if log_y else 'Linear'}",
                 height=500,
                 width=1000,
                 )
    if target_denom == 'total_reads':
        fig.update_yaxes(title_text=f'{target_num} / <br>{target_denom}')
    else:
        fig.update_yaxes(# range=[-0.1, 1.1],
                         title_text=f'{target_num} / <br>{target_denom}')
    fig.update_layout(
        font=dict(
            # family="Courier New, monospace",
            size=18,
            # color="RebeccaPurple",
        ),
                 )
    fig.update_yaxes(nticks=7, ticklabelposition="inside")
    save_friendly_target_col = target_col.replace(' ', '_').replace('/', '_vs_')
    if log_y:
        save_friendly_target_col += '_logY'
    else:
        save_friendly_target_col += '_linY'
        
    if add_tss_ada_pseudo > 0:
        save_friendly_target_col += f"_pseudoTSS{add_tss_ada_pseudo}"
    save_friendly_target_col += f"_minTotal{min_total_reads}_minAdapted{min_adapted_reads}_minTSSAdapted{min_tss_ada_reads}"
    fig.write_html(output_dir / f"{npCommon.get_dt(for_file=True)}_boxPlot_{save_friendly_target_col}.html")
    fig.write_image(output_dir / f"{npCommon.get_dt(for_file=True)}_boxPlot_{save_friendly_target_col}.png", scale=5)
    fig.show(renderer='firefox')
    
    output_file = output_dir / f'{npCommon.get_dt(for_file=True)}_boxPlotStats_{save_friendly_target_col}.txt'
    output_file.touch(exist_ok=True)
    with open(output_file, 'w') as f:
        f.write(f"Stats for {target_col}\n")
        f.write("\nIntra Library Stats w/ NMD Targets:\n")
    print("Targets")
    run_stats_interlib(box_df, target_col, nmd_type='NMD_Targets', output_file=output_file)
    with open(output_file, 'a') as f:
        f.write("\nIntra Library Stats w/ Non Targets:\n")
    print("NonTargets")
    run_stats_interlib(box_df, target_col, nmd_type='NMD_NonTargets', output_file=output_file)
    
    with open(output_file, 'a') as f:
        f.write("\nInter Library Stats:\n")
    run_stats_intralib(box_df, target_col, output_file=output_file)

def run_cdf_and_ttests(input_df: pd.DataFrame,
                       target_num: str, target_denom: str,
                       min_total_reads: int = 1,
                       min_adapted_reads: int = 1,
                       min_tss_ada_reads: int = 0,
                       output_dir: Path = current_dir / 'plots' / 'decapping',
                       log_y: bool = True,
                       log2_y: bool = False,
                       template: str = 'none') -> None:
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
        print(f"Made {output_dir}")
    
    target_col = f"{target_num} / {target_denom}"
    
    input_df['adapted_near_tss'] = input_df['adapted_near_tss'].fillna(0)
    input_df['unadapted_near_tss'] = input_df['unadapted_near_tss'].fillna(0)
    input_df[target_col] = input_df[target_num] / input_df[target_denom]
    
    box_df = input_df.copy().reset_index()
    
    box_df['strain'] = pd.Categorical(box_df['strain'], ['wildtype', 'smg-5', 'smg-6', 'smg-7'])
    box_df.sort_values(by=['strain', 'nmd_target_type'], inplace=True)
    
    box_df = box_df[box_df['strain'] != 'smg-7']
    box_df['name'] = box_df['gene_name'] + ' ' + box_df['strain'].astype(str) + ' rep' + box_df['rep'].astype(str)
    
    box_df = box_df[box_df['total_reads'] >= min_total_reads].dropna(axis=0)
    box_df = box_df[box_df['total_adapted'] >= min_adapted_reads].dropna(axis=0)
    
    if log2_y:
        box_df = box_df[box_df[target_col] > 0]
        box_df[f"log2({target_col})"] = np.log2(box_df[target_col])
        target_col = f"log2({target_col})"
        log_y = False
    
    if log_y:
        box_df = box_df[box_df[target_col] > 0]
    
    if min_tss_ada_reads > 0:
        box_df = box_df[box_df['adapted_near_tss'] >= min_tss_ada_reads]
    
    # These colors are updated from the final plots!
    plot_colors = (('#37c871', '#16502d'),  # NMD greens
                   ('#64a7ff', '#00008b'),  # Non NMD blues
                   ('#676767', '#414141'),  # ambiguous greys
                   )
    colors_dict = {'NMD_Targets': plot_colors[0][0],
                   'NMD_NonTargets': plot_colors[1][0],}
    
    fig = px.ecdf(box_df,
                 facet_col='strain',
                 x=target_col,
                 color='nmd_target_type',
                 color_discrete_map=colors_dict,
                 hover_name='name',
                 log_y=log_y,
                 hover_data=[target_num, target_denom],
                 template=template,
                 title=f"{target_col} by Strain<br>Min Total Reads: {min_total_reads}, Min Adapted Reads: {min_adapted_reads}, Y Scale: {'Log' if log_y else 'Linear'}",
                 # height=500,
                 # width=1000,
                 )
    fig.update_xaxes(title_text=f'{target_num} / <br>{target_denom}')
    fig.update_layout(
        font=dict(
            # family="Courier New, monospace",
            size=18,
            # color="RebeccaPurple",
        ),
                 )
    # fig.update_yaxes(nticks=7, ticklabelposition="inside")
    save_friendly_target_col = target_col.replace(' ', '_').replace('/', '_vs_')
    save_friendly_target_col += f"_minTotal_{min_total_reads}_minAdapted_{min_adapted_reads}"
    fig.write_html(output_dir / f"cdfPlot_{save_friendly_target_col}.html")
    fig.write_image(output_dir / f"cdfPlot_{save_friendly_target_col}.png", scale=5)
    fig.show(renderer='firefox')
    
    output_file = output_dir / f'cdfPlotStats_{save_friendly_target_col}.txt'
    output_file.touch(exist_ok=True)
    with open(output_file, 'w') as f:
        f.write(f"Stats for {target_col}\n")
        f.write("\nIntra Library Stats w/ NMD Targets:\n")
    print("Targets")
    run_stats_interlib(box_df, target_col, nmd_type='NMD_Targets', output_file=output_file)
    with open(output_file, 'a') as f:
        f.write("\nIntra Library Stats w/ Non Targets:\n")
    print("NonTargets")
    run_stats_interlib(box_df, target_col, nmd_type='NMD_NonTargets', output_file=output_file)
    
    with open(output_file, 'a') as f:
        f.write("\nInter Library Stats:\n")
    run_stats_intralib(box_df, target_col, output_file=output_file)

def run_xy_and_ttests(input_df: pd.DataFrame,
                      target_num: str, target_denom: str,
                      min_total_reads: int = 1,
                      min_adapted_reads: int = 1,
                      min_tss_ada_reads: int = 0,
                      add_tss_ada_pseudo: int = -1,
                      log_y: bool = True,
                      extra_savename: str = '',
                      output_dir: Path = current_dir / 'plots' / 'decapping',
                      template: str = 'none') -> None:
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
        print(f"Made {output_dir}")
    
    target_col = f"{target_num} / {target_denom}"
    
    input_df['adapted_near_tss'] = input_df['adapted_near_tss'].fillna(0)
    if add_tss_ada_pseudo > 0:
        input_df['adapted_near_tss'] += add_tss_ada_pseudo
    input_df['unadapted_near_tss'] = input_df['unadapted_near_tss'].fillna(0)
    input_df[target_col] = input_df[target_num] / input_df[target_denom]
    
    box_df = input_df.copy().reset_index()
    
    box_df['strain'] = pd.Categorical(box_df['strain'], ['wildtype', 'smg-5', 'smg-6', 'smg-7'])
    box_df.sort_values(by=['strain', 'nmd_target_type'], inplace=True)
    
    box_df = box_df[box_df['strain'] != 'smg-7']
    box_df['name'] = box_df['gene_name'] + ' ' + box_df['strain'].astype(str) + ' rep' + box_df['rep'].astype(str)
    
    box_df = box_df[box_df['total_reads'] >= min_total_reads].dropna(axis=0)
    box_df = box_df[box_df['total_adapted'] >= min_adapted_reads].dropna(axis=0)
    
    if min_tss_ada_reads > 0:
        box_df = box_df[box_df['adapted_near_tss'] >= min_tss_ada_reads]
    
    # These colors are updated from the final plots!
    plot_colors = (('#37c871', '#16502d'),  # NMD greens
                   ('#64a7ff', '#00008b'),  # Non NMD blues
                   ('#676767', '#414141'),  # ambiguous greys
                   )
    colors_dict = {'NMD_Targets': plot_colors[0][0],
                   'NMD_NonTargets': plot_colors[1][0],}
    
    fig = px.scatter(box_df,
                 x=target_num,
                 y=target_denom,
                 facet_col='strain',
                 color='nmd_target_type',
                 color_discrete_map=colors_dict,
                 hover_name='name',
                 hover_data=[target_num, target_denom],
                 template=template,
                 log_y=log_y,
                 title=f"{target_num} x {target_denom} by Strain<br>"
                       f"Min Total: {min_total_reads}, Min Adapt: {min_adapted_reads}, Min TSS Adapt: {min_tss_ada_reads}",
                 opacity=0.7,
                 )
    # fig2 = px.density_contour(box_df,
    #              x=target_num,
    #              y=target_denom,
    #              color='nmd_target_type',
    #              color_discrete_map=colors_dict,
    #              hover_name='name',
    #              hover_data=[target_num, target_denom],
    #              template=template,
    #              log_y=log_y,
    #              title=f"{target_num} x {target_denom} by Strain<br>"
    #                    f"Min Total: {min_total_reads}, Min Adapt: {min_adapted_reads}, Min TSS Adapt: {min_tss_ada_reads}",
    #              height=500,
    #              width=700,
    #              )
    # fig = go.Figure(data=fig1.data + fig2.data)
    fig.update_layout(
        font=dict(
            # family="Courier New, monospace",
            size=18,
            # color="RebeccaPurple",
        ),
                 )
    save_friendly_target_col = target_col.replace(' ', '_').replace('/', '_vs_')
    
    save_friendly_target_col += f"_minTotal{min_total_reads}_minAdapted{min_adapted_reads}_minTSSAdapted{min_tss_ada_reads}"
    if log_y:
        save_friendly_target_col += '_logY'
    else:
        save_friendly_target_col += '_linY'
    
    save_friendly_target_col += extra_savename
    
    fig.write_html(output_dir / f"xyPlot_{save_friendly_target_col}.html")
    fig.write_image(output_dir / f"xyPlot_{save_friendly_target_col}.png", scale=5,
                 height=500,
                 width=700,)
    fig.show(renderer='firefox')
    
    # output_file = output_dir / f'boxPlotStats_{save_friendly_target_col}.txt'
    # output_file.touch(exist_ok=True)
    # with open(output_file, 'w') as f:
    #     f.write(f"Stats for {target_col}\n")
    #     f.write("\nIntra Library Stats w/ NMD Targets:\n")
    # print("Targets")
    # run_stats_interlib(box_df, target_col, nmd_type='NMD_Targets', output_file=output_file)
    # with open(output_file, 'a') as f:
    #     f.write("\nIntra Library Stats w/ Non Targets:\n")
    # print("NonTargets")
    # run_stats_interlib(box_df, target_col, nmd_type='NMD_NonTargets', output_file=output_file)
    # 
    # with open(output_file, 'a') as f:
    #     f.write("\nInter Library Stats:\n")
    # run_stats_intralib(box_df, target_col, output_file=output_file)

min_adapteds = [
    1,
    # 2, 3, 4,
    5,
    # 10,
]

target_denominators = [
    'total_reads',
    # 'unadapted_near_tss',
    'total_adapted',
    # 'total_unadapted',
]
for min_adapted in min_adapteds:
    for target_denominator in target_denominators:
        print(f"\n\nUsing {target_denominator=} for denominator... And {min_adapted=}...\n")
        # run_cdf_and_ttests(better_df, 'adapted_near_tss', target_denominator, log_y=True, min_adapted_reads=min_adapted,
        #                    min_tss_ada_reads=1)
        # run_xy_and_ttests(
        #     better_df, # better_df.query("strain == 'wildtype'"),
        #     'adapted_near_tss', target_denominator,
        #     extra_savename=f"_wFacets",
        #     min_adapted_reads=min_adapted, min_tss_ada_reads=0, log_y=True)
        run_box_and_ttests(
            better_df, # better_df.query("strain == 'wildtype'"),
            'adapted_near_tss', target_denominator,
            min_adapted_reads=min_adapted, min_tss_ada_reads=0, log_y=True,
            add_tss_ada_pseudo=0.1,
        )
print("Done!")

In [None]:
# Below was an individual gene plotter that doesn't really fit in this analysis now
# ToDo: Move it to `decappingQuantification2_perGenePlots.ipynb` and update it to work in there

# # Let's see if we can make quick bar plots for individual gene species
# indv_df = better_df.copy()
# 
# per_gene_plots_dir = current_dir / 'plots' / 'decappingPerGene'
# per_gene_plots_dir.mkdir(exist_ok=True)
# 
# target_num = 'adapted_near_tss'
# target_denom = 'total_adapted'
# 
# target_col = f"{target_num} / {target_denom}"
# add_pseudocount = False
# log_y = False
# 
# # Let's try adding a pseudo-count to the adapted_near_tss and total_adapted columns
# if add_pseudocount:
#     indv_df['adapted_near_tss'] += 0.1
#     indv_df['total_adapted'] += 0.1
#     indv_df['adapted_near_tss / total_adapted'] = indv_df['adapted_near_tss'] / indv_df['total_adapted']
# 
# # Drop replicate 4 (the temp 25C data)
# indv_df = indv_df.query("rep != 4")
# 
# 
# genes_to_ind_plot = [
#     'rpl-30',
#     'rps-22',
#     'rpl-7A',
#     'ubl-1',
# ]
# 
# genes_to_ind_plot = indv_df.index.get_level_values('gene_name').unique()
# 
# # These colors are updated from the final plots!
# plot_colors = (('#37c871', '#16502d'),  # NMD greens
#                ('#64a7ff', '#00008b'),  # Non NMD blues
#                ('#676767', '#414141'),  # ambiguous greys
#                )
# colors_dict = {'NMD_Targets': plot_colors[0][1],
#                'NMD_NonTargets': plot_colors[1][1],}
# 
# for gene_name in genes_to_ind_plot:
#     print(f"Working on {gene_name}...")
#     gene_df = indv_df.query("gene_name == @gene_name")
#     # Drop things with NA values:
#     gene_df = gene_df.dropna(axis=0)
#     # Drop things without any adapted reads near the TSS:
#     # gene_df = gene_df.query("adapted_near_tss > 0")
#     # Simplify the indexes:
#     gene_df = gene_df.reset_index()
#     gene_df['strain'] = pd.Categorical(gene_df['strain'], ['wildtype', 'smg-5', 'smg-6', 'smg-7'])
#     gene_df.sort_values(by=['strain', 'nmd_target_type'], inplace=True)
#     gene_df['name'] = gene_df['gene_name'] + ' ' + gene_df['strain'].astype(str) + ' rep' + gene_df['rep'].astype(str)
#     fig = px.box(gene_df,
#                  x='strain',
#                  y=target_col,
#                  color='nmd_target_type',
#                  color_discrete_map={'NMD_Targets': plot_colors[0][0],
#                                      'NMD_NonTargets': plot_colors[1][0],},
#                  points='all',
#                  hover_name='name',
#                  log_y=log_y,
#                  title=f"{gene_name} Adapted Near TSS / Total Adapted",
#                  hover_data=['adapted_near_tss', 'total_adapted', 'total_reads'],
#                  template='plotly_white',
#                  height=500,
#                  width=1000,
#                  )
#     fig.update_yaxes(range=[-0.1, 1.1], title_text=f'{target_num} / <br>{target_denom}')
#     fig.update_layout(
#         font=dict(
#             # family="Courier New, monospace",
#             size=20,
#             # color="RebeccaPurple",
#         ),
#     )
#     # fig = px.bar(gene_df,
#     #          x='strain',
#     #          y=target_col,
#     #          color='nmd_target_type',
#     #          color_discrete_map={'NMD_Targets': plot_colors[0][1],
#     #                              'NMD_NonTargets': plot_colors[1][1],},
#     #          hover_name='name',
#     #          barmode='group',
#     #          facet_col='rep',
#     #          log_y=log_y,
#     #          title=f"{gene_name} Adapted Near TSS / Total Adapted",
#     #          hover_data=['adapted_near_tss', 'total_adapted', 'total_reads'],
#     #          )
#     fig.write_image(per_gene_plots_dir / f"{gene_name}_adaptedNearTSS_vs_totalAdapted.png")
#     fig.write_image(per_gene_plots_dir / f"{gene_name}_adaptedNearTSS_vs_totalAdapted.svg")
#     fig.write_html(per_gene_plots_dir / f"{gene_name}_adaptedNearTSS_vs_totalAdapted.html")
#     fig.show()

# 8/16/24 After meeting with Josh
One of the major factors that confuses the box plots above is the massive difference in depth between the NMD-targets and the Non-targets in the WT libraries

Another question is about if each gene has similar effects across the different replicates.

The solution for the first issue is to bootstrap the non-NMD targets to the same depth as the NMD targets and then compare the two.

The solution for the second is to try the means of the replicates AND look at genes on their own.

# First let's work on bootstrapping the libraries

In [64]:
better_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,adapted_near_stop,adapted_near_tss,total_adapted,unadapted_near_stop,unadapted_near_tss,total_unadapted,total_near_tss,total_near_stop,total_reads
gene_name,strain,rep,lib,nmd_target_type,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
C53H9.2,smg-5,2,newerS5,NMD_Targets,0,0,1,0,92,139,92,0,140
C53H9.2,smg-5,3,thirdS5,NMD_Targets,0,0,2,0,124,211,124,0,213
C53H9.2,smg-5,4,temp25cS5,NMD_Targets,0,0,0,0,87,115,87,0,115
C53H9.2,smg-6,2,newerS6,NMD_Targets,0,2,3,0,126,199,128,0,202
C53H9.2,smg-6,3,thirdS6,NMD_Targets,0,0,1,0,15,34,15,0,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ubl-1,smg-7,4,temp25cS7,NMD_NonTargets,0,1,1,21,452,516,453,21,517
ubl-1,wildtype,1,oldN2,NMD_NonTargets,0,12,13,26,1516,1589,1528,26,1602
ubl-1,wildtype,2,newerN2,NMD_NonTargets,0,1,2,46,1168,1296,1169,46,1298
ubl-1,wildtype,3,thirdN2,NMD_NonTargets,1,5,8,41,1131,1248,1136,42,1256


In [190]:
def bootstrap_row_optimized(index: tuple, row: pd.Series, num_samples: int, sample_size: int) -> pd.Series:

    num_ada_tss = row['adapted_near_tss']
    num_unada_tss = row['unadapted_near_tss']
    num_ada_not_tss = row['total_adapted'] - row['adapted_near_tss']
    num_unada_not_tss = row['total_unadapted'] - row['unadapted_near_tss']

    set_up_array = np.array([
        [1, 1],  # adapted, near TSS
        [1, 0],  # adapted, not TSS
        [0, 1],  # unadapted, near TSS
        [0, 0],  # unadapted, not TSS
    ])
    bootstrap_array = np.repeat(set_up_array, [num_ada_tss, num_ada_not_tss, num_unada_tss, num_unada_not_tss], axis=0)

    bootstrap_results = []
    for cycle in range(num_samples):
        sample_indices = np.random.choice(bootstrap_array.shape[0], size=sample_size, replace=True)
        sample = bootstrap_array[sample_indices]
        unique, counts = np.unique(sample, axis=0, return_counts=True)
        bootstrap_results.append(dict(zip(map(tuple, unique), counts)))

    col_conversion_dict = {
        (1, 1): 'ada_near_tss',
        (1, 0): 'ada_not_tss',
        (0, 1): 'unada_near_tss',
        (0, 0): 'unada_not_tss',
    }

    # Convert list of dicts to DataFrame
    bootstrap_df = pd.DataFrame(bootstrap_results).fillna(0).astype(int)
    bootstrap_df.columns = [col_conversion_dict[col] for col in bootstrap_df.columns]
    
    means = bootstrap_df.mean().rename(index)
    return means

def bootstrap_row_for_apply(row, bootstrap_cycles: int = 1000, sample_size: int = 10):
    index = row.name
    return bootstrap_row_optimized(index, row, num_samples=bootstrap_cycles, sample_size=sample_size)

In [253]:
tqdm.pandas(desc="Bootstrapping")
bootstrap_df = better_df.query("total_adapted >= 5").progress_apply(bootstrap_row_for_apply, axis=1).fillna(0).astype(float)
bootstrap_df['total_reads'] = bootstrap_df['ada_near_tss'] + bootstrap_df['ada_not_tss'] + bootstrap_df['unada_near_tss'] + bootstrap_df['unada_not_tss']
bootstrap_df['total_adapted'] = bootstrap_df['ada_near_tss'] + bootstrap_df['ada_not_tss']

Bootstrapping:   0%|          | 0/104 [00:00<?, ?it/s]

In [175]:

bootstrap_df['adapted_near_tss / total_reads'] = bootstrap_df['ada_near_tss'] / bootstrap_df['total_reads']
bootstrap_df['adapted_near_tss / total_adapted'] = bootstrap_df['ada_not_tss'] / bootstrap_df['total_adapted']

In [261]:
bootstrap_df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,ada_near_tss,ada_not_tss,unada_near_tss,unada_not_tss,total_reads,total_adapted
gene_name,strain,rep,lib,nmd_target_type,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
R06C1.4,smg-5,2,newerS5,NMD_Targets,0.057,0.303,5.649,3.991,10.0,0.36
R06C1.4,smg-5,3,thirdS5,NMD_Targets,0.116,0.112,6.582,3.19,10.0,0.228
R06C1.4,smg-5,4,temp25cS5,NMD_Targets,0.203,0.08,6.228,3.489,10.0,0.283
R06C1.4,smg-6,2,newerS6,NMD_Targets,0.088,0.076,5.702,4.134,10.0,0.164
R06C1.4,wildtype,1,oldN2,NMD_Targets,0.0,4.301,1.235,4.464,10.0,4.301
R06C1.4,wildtype,2,newerN2,NMD_Targets,0.0,3.003,1.537,5.46,10.0,3.003
R06C1.4,wildtype,3,thirdN2,NMD_Targets,0.0,4.492,0.0,5.508,10.0,4.492
aly-3,wildtype,2,newerN2,NMD_Targets,0.0,7.104,0.0,2.896,10.0,7.104
rpl-1,wildtype,1,oldN2,NMD_Targets,0.911,1.316,7.551,0.222,10.0,2.227
rpl-1,wildtype,2,newerN2,NMD_Targets,1.367,0.559,5.851,2.223,10.0,1.926


In [258]:
better_df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,adapted_near_stop,adapted_near_tss,total_adapted,unadapted_near_stop,unadapted_near_tss,total_unadapted,total_near_tss,total_near_stop,total_reads
gene_name,strain,rep,lib,nmd_target_type,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
C53H9.2,smg-5,2,newerS5,NMD_Targets,0,0,1,0,92,139,92,0,140
C53H9.2,smg-5,3,thirdS5,NMD_Targets,0,0,2,0,124,211,124,0,213
C53H9.2,smg-5,4,temp25cS5,NMD_Targets,0,0,0,0,87,115,87,0,115
C53H9.2,smg-6,2,newerS6,NMD_Targets,0,2,3,0,126,199,128,0,202
C53H9.2,smg-6,3,thirdS6,NMD_Targets,0,0,1,0,15,34,15,0,35
C53H9.2,smg-6,4,temp25cS6,NMD_Targets,0,0,0,0,79,118,79,0,118
C53H9.2,smg-7,4,temp25cS7,NMD_Targets,0,0,0,0,3,3,3,0,3
C53H9.2,wildtype,1,oldN2,NMD_Targets,0,0,2,0,11,16,11,0,18
C53H9.2,wildtype,2,newerN2,NMD_Targets,0,0,0,0,6,10,6,0,10
C53H9.2,wildtype,3,thirdN2,NMD_Targets,0,0,1,0,3,4,3,0,5


In [260]:
def run_box_bootstrapped(input_df: pd.DataFrame,
                         target_num: str, target_denom: str,
                         min_total_reads: int = 1,
                         min_adapted_reads: int = 1,
                         min_tss_ada_reads: int = 0,
                         add_tss_ada_pseudo: float = -1.0,
                         output_dir: Path = current_dir / 'plots' / 'decapping_bootstrapped',
                         bootstrap_cycles: int = 1000,
                         bootstrap_sample_size: int = 10,
                         log_y: bool = True,
                         log2_y: bool = False,
                         template: str = 'none') -> None:
    input_df = input_df.copy()
    input_df = input_df.query("adapted_near_tss >= @min_tss_ada_reads")
    input_df = input_df.query("total_reads >= @min_total_reads")
    input_df = input_df.query("total_adapted >= @min_adapted_reads")
    
    
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
        print(f"Made {output_dir}")
    
    tqdm.pandas(desc="Bootstrapping")
    boot_df = input_df.query("total_adapted >= @min_adapted_reads").progress_apply(bootstrap_row_for_apply, bootstrap_cycles=bootstrap_cycles, sample_size=bootstrap_sample_size, axis=1).fillna(0).astype(float)
    boot_df.rename(columns={'ada_near_tss': 'adapted_near_tss',
                            'ada_not_tss': 'ada_not_tss',
                            'unada_near_tss': 'unadapted_near_tss',
                            'unada_not_tss': 'unada_not_tss'},
                        inplace=True)
    print(boot_df.columns)
    boot_df['total_reads'] = boot_df['adapted_near_tss'] + boot_df['ada_not_tss'] + boot_df['unadapted_near_tss'] + boot_df['unada_not_tss']
    boot_df['total_adapted'] = boot_df['adapted_near_tss'] + boot_df['ada_not_tss']
    boot_df['total_near_tss'] = boot_df['adapted_near_tss'] + boot_df['unadapted_near_tss']
    input_df = boot_df
    
    target_col = f"{target_num} / {target_denom}"
    
    input_df['adapted_near_tss'] = input_df['adapted_near_tss'].fillna(0)
    if add_tss_ada_pseudo > 0:
        input_df['adapted_near_tss'] += add_tss_ada_pseudo
    input_df['unadapted_near_tss'] = input_df['unadapted_near_tss'].fillna(0)
    input_df[target_col] = input_df[target_num] / input_df[target_denom]
    
    box_df = input_df.copy().reset_index()
    
    box_df['strain'] = pd.Categorical(box_df['strain'], ['wildtype', 'smg-5', 'smg-6', 'smg-7'])
    box_df.sort_values(by=['strain', 'nmd_target_type'], inplace=True)
    
    box_df = box_df[box_df['strain'] != 'smg-7']
    box_df['name'] = box_df['gene_name'] + ' ' + box_df['strain'].astype(str) + ' rep' + box_df['rep'].astype(str)
    
    if log2_y:
        box_df = box_df[box_df[target_col] > 0]
        box_df[f"log2({target_col})"] = np.log2(box_df[target_col])
        target_col = f"log2({target_col})"
        log_y = False
    
    if log_y:
        box_df = box_df[box_df[target_col] > 0]
    
    if min_tss_ada_reads > 0:
        box_df = box_df[box_df['adapted_near_tss'] >= min_tss_ada_reads]
    
    # These colors are updated from the final plots!
    plot_colors = (('#37c871', '#16502d'),  # NMD greens
                   ('#64a7ff', '#00008b'),  # Non NMD blues
                   ('#676767', '#414141'),  # ambiguous greys
                   )
    colors_dict = {'NMD_Targets': plot_colors[0][0],
                   'NMD_NonTargets': plot_colors[1][0],}
    
    fig = px.box(box_df,
                 x='strain',
                 y=target_col,
                 color='nmd_target_type',
                 color_discrete_map=colors_dict,
                 points='all',
                 hover_name='name',
                 log_y=log_y,
                 hover_data=[target_num, target_denom],
                 template=template,
                 title=f"{target_col} by Strain<br>"
                       f"Min Total Reads: {min_total_reads}, Min Adapted Reads: {min_adapted_reads}, Y Scale: {'Log' if log_y else 'Linear'}<br>"
                       f"Bootstrapped {bootstrap_cycles} cycles of {bootstrap_sample_size} samples",
                 height=500,
                 width=1000,
                 )
    
    # # Can we add a line for the means?
    # means = box_df.groupby(['strain', 'nmd_target_type']).mean().reset_index()
    # fig.add_trace(go.Scatter(x=means['strain'], y=means[target_col], mode='lines+markers', marker=dict(size=10, color='black'), name='Mean'))
    
    if target_denom == 'total_reads':
        fig.update_yaxes(title_text=f'{target_num} / <br>{target_denom}')
    else:
        fig.update_yaxes(# range=[-0.1, 1.1],
                         title_text=f'{target_num} / <br>{target_denom}')
    fig.update_layout(
        font=dict(
            # family="Courier New, monospace",
            size=16,
            # color="RebeccaPurple",
        ),
                 )
    fig.update_yaxes(nticks=7, ticklabelposition="inside")
    save_friendly_target_col = target_col.replace(' ', '_').replace('/', '_vs_')
    if log_y:
        save_friendly_target_col += '_logY'
    else:
        save_friendly_target_col += '_linY'
        
    if add_tss_ada_pseudo > 0:
        save_friendly_target_col += f"_pseudoTSS{add_tss_ada_pseudo}"
    save_friendly_target_col += f"_minTotal{min_total_reads}_minAdapted{min_adapted_reads}_minTSSAdapted{min_tss_ada_reads}"
    save_friendly_target_col += f"_bootstrapped{bootstrap_cycles}cycleX{bootstrap_sample_size}samples"
    print(f"Saving file to {output_dir / f'{npCommon.get_dt(for_file=True)}_boxPlot_{save_friendly_target_col}.png/html'}")
    fig.write_html(output_dir / f"{npCommon.get_dt(for_file=True)}_boxPlot_{save_friendly_target_col}.html")
    fig.write_image(output_dir / f"{npCommon.get_dt(for_file=True)}_boxPlot_{save_friendly_target_col}.png", scale=5)
    fig.show(renderer='firefox')


min_adapteds = [
    1,
    # 2,
    3,
    # 4,
    5,
    # 10,
]

target_numerators = [
    'adapted_near_tss',
    'total_near_tss',
]

target_denominators = [
    'total_reads',
    # 'unadapted_near_tss',
    'total_adapted',
    # 'total_unadapted',
]
for min_adapted in min_adapteds:
    for target_numerator in target_numerators:
        for target_denominator in target_denominators:
            print(f"\n\nUsing {target_denominator=} for denominator... And {min_adapted=}...\n")
            run_box_bootstrapped(
                better_df,
                target_numerator,
                target_denominator,
                min_adapted_reads=min_adapted,
                min_tss_ada_reads=0,
                bootstrap_cycles=100,
                bootstrap_sample_size=10,
                log_y=True,
                add_tss_ada_pseudo=0.001,
            )
print("Done!")



Using target_denominator='total_reads' for denominator... And min_adapted=1...


Bootstrapping:   0%|          | 0/246 [00:00<?, ?it/s]

Index(['adapted_near_tss', 'ada_not_tss', 'unadapted_near_tss', 'unada_not_tss'], dtype='object')
Saving file to /data16/marcus/scripts/nanoporePipelineScripts/deadenylationPaperFigureUpdates2/plots/decapping_bootstrapped/240820_boxPlot_adapted_near_tss__vs__total_reads_logY_pseudoTSS0.001_minTotal1_minAdapted1_minTSSAdapted0_bootstrapped100cycleX10samples.png/html








Using target_denominator='total_adapted' for denominator... And min_adapted=1...


Bootstrapping:   0%|          | 0/246 [00:00<?, ?it/s]

Index(['adapted_near_tss', 'ada_not_tss', 'unadapted_near_tss', 'unada_not_tss'], dtype='object')
Saving file to /data16/marcus/scripts/nanoporePipelineScripts/deadenylationPaperFigureUpdates2/plots/decapping_bootstrapped/240820_boxPlot_adapted_near_tss__vs__total_adapted_logY_pseudoTSS0.001_minTotal1_minAdapted1_minTSSAdapted0_bootstrapped100cycleX10samples.png/html








Using target_denominator='total_reads' for denominator... And min_adapted=1...


Bootstrapping:   0%|          | 0/246 [00:00<?, ?it/s]

Index(['adapted_near_tss', 'ada_not_tss', 'unadapted_near_tss', 'unada_not_tss'], dtype='object')
Saving file to /data16/marcus/scripts/nanoporePipelineScripts/deadenylationPaperFigureUpdates2/plots/decapping_bootstrapped/240820_boxPlot_total_near_tss__vs__total_reads_logY_pseudoTSS0.001_minTotal1_minAdapted1_minTSSAdapted0_bootstrapped100cycleX10samples.png/html








Using target_denominator='total_adapted' for denominator... And min_adapted=1...


Bootstrapping:   0%|          | 0/246 [00:00<?, ?it/s]

Index(['adapted_near_tss', 'ada_not_tss', 'unadapted_near_tss', 'unada_not_tss'], dtype='object')
Saving file to /data16/marcus/scripts/nanoporePipelineScripts/deadenylationPaperFigureUpdates2/plots/decapping_bootstrapped/240820_boxPlot_total_near_tss__vs__total_adapted_logY_pseudoTSS0.001_minTotal1_minAdapted1_minTSSAdapted0_bootstrapped100cycleX10samples.png/html








Using target_denominator='total_reads' for denominator... And min_adapted=3...


Bootstrapping:   0%|          | 0/145 [00:00<?, ?it/s]

Index(['adapted_near_tss', 'ada_not_tss', 'unadapted_near_tss', 'unada_not_tss'], dtype='object')
Saving file to /data16/marcus/scripts/nanoporePipelineScripts/deadenylationPaperFigureUpdates2/plots/decapping_bootstrapped/240820_boxPlot_adapted_near_tss__vs__total_reads_logY_pseudoTSS0.001_minTotal1_minAdapted3_minTSSAdapted0_bootstrapped100cycleX10samples.png/html








Using target_denominator='total_adapted' for denominator... And min_adapted=3...


Bootstrapping:   0%|          | 0/145 [00:00<?, ?it/s]

Index(['adapted_near_tss', 'ada_not_tss', 'unadapted_near_tss', 'unada_not_tss'], dtype='object')
Saving file to /data16/marcus/scripts/nanoporePipelineScripts/deadenylationPaperFigureUpdates2/plots/decapping_bootstrapped/240820_boxPlot_adapted_near_tss__vs__total_adapted_logY_pseudoTSS0.001_minTotal1_minAdapted3_minTSSAdapted0_bootstrapped100cycleX10samples.png/html








Using target_denominator='total_reads' for denominator... And min_adapted=3...


Bootstrapping:   0%|          | 0/145 [00:00<?, ?it/s]

Index(['adapted_near_tss', 'ada_not_tss', 'unadapted_near_tss', 'unada_not_tss'], dtype='object')
Saving file to /data16/marcus/scripts/nanoporePipelineScripts/deadenylationPaperFigureUpdates2/plots/decapping_bootstrapped/240820_boxPlot_total_near_tss__vs__total_reads_logY_pseudoTSS0.001_minTotal1_minAdapted3_minTSSAdapted0_bootstrapped100cycleX10samples.png/html








Using target_denominator='total_adapted' for denominator... And min_adapted=3...


Bootstrapping:   0%|          | 0/145 [00:00<?, ?it/s]

Index(['adapted_near_tss', 'ada_not_tss', 'unadapted_near_tss', 'unada_not_tss'], dtype='object')
Saving file to /data16/marcus/scripts/nanoporePipelineScripts/deadenylationPaperFigureUpdates2/plots/decapping_bootstrapped/240820_boxPlot_total_near_tss__vs__total_adapted_logY_pseudoTSS0.001_minTotal1_minAdapted3_minTSSAdapted0_bootstrapped100cycleX10samples.png/html








Using target_denominator='total_reads' for denominator... And min_adapted=5...


Bootstrapping:   0%|          | 0/104 [00:00<?, ?it/s]

Index(['adapted_near_tss', 'ada_not_tss', 'unadapted_near_tss', 'unada_not_tss'], dtype='object')
Saving file to /data16/marcus/scripts/nanoporePipelineScripts/deadenylationPaperFigureUpdates2/plots/decapping_bootstrapped/240820_boxPlot_adapted_near_tss__vs__total_reads_logY_pseudoTSS0.001_minTotal1_minAdapted5_minTSSAdapted0_bootstrapped100cycleX10samples.png/html








Using target_denominator='total_adapted' for denominator... And min_adapted=5...


Bootstrapping:   0%|          | 0/104 [00:00<?, ?it/s]

Index(['adapted_near_tss', 'ada_not_tss', 'unadapted_near_tss', 'unada_not_tss'], dtype='object')
Saving file to /data16/marcus/scripts/nanoporePipelineScripts/deadenylationPaperFigureUpdates2/plots/decapping_bootstrapped/240820_boxPlot_adapted_near_tss__vs__total_adapted_logY_pseudoTSS0.001_minTotal1_minAdapted5_minTSSAdapted0_bootstrapped100cycleX10samples.png/html








Using target_denominator='total_reads' for denominator... And min_adapted=5...


Bootstrapping:   0%|          | 0/104 [00:00<?, ?it/s]

Index(['adapted_near_tss', 'ada_not_tss', 'unadapted_near_tss', 'unada_not_tss'], dtype='object')
Saving file to /data16/marcus/scripts/nanoporePipelineScripts/deadenylationPaperFigureUpdates2/plots/decapping_bootstrapped/240820_boxPlot_total_near_tss__vs__total_reads_logY_pseudoTSS0.001_minTotal1_minAdapted5_minTSSAdapted0_bootstrapped100cycleX10samples.png/html








Using target_denominator='total_adapted' for denominator... And min_adapted=5...


Bootstrapping:   0%|          | 0/104 [00:00<?, ?it/s]

Index(['adapted_near_tss', 'ada_not_tss', 'unadapted_near_tss', 'unada_not_tss'], dtype='object')
Saving file to /data16/marcus/scripts/nanoporePipelineScripts/deadenylationPaperFigureUpdates2/plots/decapping_bootstrapped/240820_boxPlot_total_near_tss__vs__total_adapted_logY_pseudoTSS0.001_minTotal1_minAdapted5_minTSSAdapted0_bootstrapped100cycleX10samples.png/html






Done!


In [252]:
def run_box_with_means(input_df: pd.DataFrame,
                       target_num: str, target_denom: str,
                       min_total_reads: int = 1,
                       min_adapted_reads: int = 1,
                       min_tss_ada_reads: int = 0,
                       add_tss_ada_pseudo: float = -1.0,
                       min_number_reps_observed: int = -1,
                       output_dir: Path = current_dir / 'plots' / 'decapping',
                       log_y: bool = True,
                       log2_y: bool = False,
                       template: str = 'none') -> pd.DataFrame:
    input_df = input_df.copy()
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
        print(f"Made {output_dir}")
    
    target_col = f"{target_num} / {target_denom}"
    
    input_df['adapted_near_tss'] = input_df['adapted_near_tss'].fillna(0)
    if add_tss_ada_pseudo > 0:
        input_df['adapted_near_tss'] += add_tss_ada_pseudo
    input_df['unadapted_near_tss'] = input_df['unadapted_near_tss'].fillna(0)
    input_df[target_col] = input_df[target_num] / input_df[target_denom]
    
    box_df = input_df.copy().reset_index()
    
    box_df['strain'] = pd.Categorical(box_df['strain'], ['wildtype', 'smg-5', 'smg-6', 'smg-7'])
    box_df.sort_values(by=['strain', 'nmd_target_type'], inplace=True)
    
    box_df = box_df[box_df['strain'] != 'smg-7']
    box_df['name'] = box_df['gene_name'] + ' ' + box_df['strain'].astype(str)
    
    box_df = box_df[box_df['total_reads'] >= min_total_reads].dropna(axis=0)
    box_df = box_df[box_df['total_adapted'] >= min_adapted_reads].dropna(axis=0)
    
    if log2_y:
        box_df = box_df[box_df[target_col] > 0]
        box_df[f"log2({target_col})"] = np.log2(box_df[target_col])
        target_col = f"log2({target_col})"
        log_y = False
    
    if log_y:
        box_df = box_df[box_df[target_col] > 0]
    
    if min_tss_ada_reads > 0:
        box_df = box_df[box_df['adapted_near_tss'] >= min_tss_ada_reads]
    
    groupby_obj = box_df.reset_index().set_index(['strain', 'gene_name', 'nmd_target_type']).groupby(['strain', 'gene_name', 'nmd_target_type'])
    keep_cols = [target_col, target_num, target_denom]
    groupby_df = groupby_obj[keep_cols].mean()
    std_cols = [f'{col}_std' for col in keep_cols]
    groupby_df[std_cols] = groupby_obj[keep_cols].std()
    groupby_df['successful_reps'] = groupby_obj['rep'].apply(lambda x: ','.join(map(str, x)))
    groupby_df['num_successful_reps'] = groupby_obj['rep'].count()
    groupby_df.reset_index(inplace=True)
    if min_number_reps_observed > 0:
        groupby_df = groupby_df.query("num_successful_reps >= @min_number_reps_observed")
    
    # These colors are updated from the final plots!
    plot_colors = (('#37c871', '#16502d'),  # NMD greens
                   ('#64a7ff', '#00008b'),  # Non NMD blues
                   ('#676767', '#414141'),  # ambiguous greys
                   )
    colors_dict = {'NMD_Targets': plot_colors[0][0],
                   'NMD_NonTargets': plot_colors[1][0],}
    
    fig = px.box(groupby_df,
                 x='strain',
                 y=target_col,
                 color='nmd_target_type',
                 color_discrete_map=colors_dict,
                 points='all',
                 hover_name='gene_name',
                 log_y=log_y,
                 hover_data=[target_num, target_denom, 'successful_reps', 'num_successful_reps'] + std_cols,
                 template=template,
                 title=f"{target_col} by Strain<br>"
                       f"Min Total Reads: {min_total_reads}, Min Adapted Reads: {min_adapted_reads}, Y Scale: {'Log' if log_y else 'Linear'}<br>"
                       f"Means of Replicates",
                 height=500,
                 width=1000,
                 )
    if target_denom == 'total_reads':
        fig.update_yaxes(title_text=f'{target_num} / <br>{target_denom}')
    else:
        fig.update_yaxes(# range=[-0.1, 1.1],
                         title_text=f'{target_num} / <br>{target_denom}')
    fig.update_layout(
        font=dict(
            # family="Courier New, monospace",
            size=16,
            # color="RebeccaPurple",
        ),
                 )
    fig.update_yaxes(nticks=7, ticklabelposition="inside")
    save_friendly_target_col = target_col.replace(' ', '_').replace('/', '_vs_')
    if log_y:
        save_friendly_target_col += '_logY'
    else:
        save_friendly_target_col += '_linY'
        
    if add_tss_ada_pseudo > 0:
        save_friendly_target_col += f"_pseudoTSS{add_tss_ada_pseudo}"
    save_friendly_target_col += f"_minTotal{min_total_reads}_minAdapted{min_adapted_reads}_minTSSAdapted{min_tss_ada_reads}"
    save_friendly_target_col += f"_meanOnGene"
    fig.write_html(output_dir / f"{npCommon.get_dt(for_file=True)}_boxPlot_{save_friendly_target_col}.html")
    fig.write_image(output_dir / f"{npCommon.get_dt(for_file=True)}_boxPlot_{save_friendly_target_col}.png", scale=5)
    fig.show(renderer='firefox')
    return box_df

def run_box_with_means2(input_df: pd.DataFrame,
                       target_num: str, target_denom: str,
                       min_total_reads: int = 1,
                       min_adapted_reads: int = 1,
                       min_tss_ada_reads: int = 0,
                       add_tss_ada_pseudo: float = -1.0,
                       min_number_reps_observed: int = -1,
                       output_dir: Path = current_dir / 'plots' / 'decapping',
                       log_y: bool = True,
                       log2_y: bool = False,
                       template: str = 'none') -> pd.DataFrame:
    input_df = input_df.copy()
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
        print(f"Made {output_dir}")
    
    target_col = f"{target_num} / {target_denom}"
    
    input_df['adapted_near_tss'] = input_df['adapted_near_tss'].fillna(0)
    if add_tss_ada_pseudo > 0:
        input_df['adapted_near_tss'] += add_tss_ada_pseudo
    input_df['unadapted_near_tss'] = input_df['unadapted_near_tss'].fillna(0)
    input_df[target_col] = input_df[target_num] / input_df[target_denom]
    
    box_df = input_df.copy().reset_index()
    
    box_df['strain'] = pd.Categorical(box_df['strain'], ['wildtype', 'smg-5', 'smg-6', 'smg-7'])
    box_df.sort_values(by=['strain', 'nmd_target_type'], inplace=True)
    
    box_df = box_df[box_df['strain'] != 'smg-7']
    box_df['name'] = box_df['gene_name'] + ' ' + box_df['strain'].astype(str)
    
    box_df = box_df[box_df['total_reads'] >= min_total_reads].dropna(axis=0)
    box_df = box_df[box_df['total_adapted'] >= min_adapted_reads].dropna(axis=0)
    
    if log2_y:
        box_df = box_df[box_df[target_col] > 0]
        box_df[f"log2({target_col})"] = np.log2(box_df[target_col])
        target_col = f"log2({target_col})"
        log_y = False
    
    if log_y:
        box_df = box_df[box_df[target_col] > 0]
    
    if min_tss_ada_reads > 0:
        box_df = box_df[box_df['adapted_near_tss'] >= min_tss_ada_reads]
    
    groupby_obj = box_df.reset_index().set_index(['strain', 'gene_name', 'nmd_target_type']).groupby(['strain', 'gene_name', 'nmd_target_type'])
    keep_cols = [target_col, target_num, target_denom]
    groupby_df = groupby_obj[keep_cols].mean()
    std_cols = [f'{col}_std' for col in keep_cols]
    groupby_df[std_cols] = groupby_obj[keep_cols].std()
    groupby_df['successful_reps'] = groupby_obj['rep'].apply(lambda x: ','.join(map(str, x)))
    groupby_df['num_successful_reps'] = groupby_obj['rep'].count()
    groupby_df.reset_index(inplace=True)
    if min_number_reps_observed > 0:
        groupby_df = groupby_df.query("num_successful_reps >= @min_number_reps_observed")
    
    # These colors are updated from the final plots!
    plot_colors = (('#37c871', '#16502d'),  # NMD greens
                   ('#64a7ff', '#00008b'),  # Non NMD blues
                   ('#676767', '#414141'),  # ambiguous greys
                   )
    colors_dict = {'NMD_Targets': plot_colors[0][0],
                   'NMD_NonTargets': plot_colors[1][0],}
    
    fig = px.box(groupby_df,
                 x='strain',
                 y=target_col,
                 color='nmd_target_type',
                 color_discrete_map=colors_dict,
                 # points='all',
                 hover_name='gene_name',
                 log_y=log_y,
                 hover_data=[target_num, target_denom, 'successful_reps', 'num_successful_reps'] + std_cols,
                 template=template,
                 title=f"{target_col} by Strain<br>"
                       f"Min Total Reads: {min_total_reads}, Min Adapted Reads: {min_adapted_reads}, Y Scale: {'Log' if log_y else 'Linear'}<br>"
                       f"Means of Replicates",
                 height=500,
                 width=1000,
                 )
    
    # Add strip plot
    fig.add_trace(go.Scatter(
        x=box_df['strain'],
        y=box_df[target_col],
        mode='markers',
        marker=dict(color=box_df['nmd_target_type'].map(colors_dict)),
        text=box_df['gene_name'],
        name='Strip Plot',
    ))
    
    # Add lines between points of the same gene
    for gene in box_df['gene_name'].unique():
        gene_df = box_df[box_df['gene_name'] == gene]
        if len(gene_df) > 1:
            fig.add_trace(go.Scatter(
                x=gene_df['strain'],
                y=gene_df[target_col],
                mode='lines',
                line=dict(color='gray', dash='dash'),
                showlegend=False
            ))
    
    if target_denom == 'total_reads':
        fig.update_yaxes(title_text=f'{target_num} / <br>{target_denom}')
    else:
        fig.update_yaxes(# range=[-0.1, 1.1],
                         title_text=f'{target_num} / <br>{target_denom}')
    fig.update_layout(
        font=dict(
            # family="Courier New, monospace",
            size=16,
            # color="RebeccaPurple",
        ),
                 )
    fig.update_yaxes(nticks=7, ticklabelposition="inside")
    save_friendly_target_col = target_col.replace(' ', '_').replace('/', '_vs_')
    if log_y:
        save_friendly_target_col += '_logY'
    else:
        save_friendly_target_col += '_linY'
        
    if add_tss_ada_pseudo > 0:
        save_friendly_target_col += f"_pseudoTSS{add_tss_ada_pseudo}"
    save_friendly_target_col += f"_minTotal{min_total_reads}_minAdapted{min_adapted_reads}_minTSSAdapted{min_tss_ada_reads}"
    save_friendly_target_col += f"_meanOnGene"
    fig.write_html(output_dir / f"{npCommon.get_dt(for_file=True)}_boxPlot_{save_friendly_target_col}.html")
    fig.write_image(output_dir / f"{npCommon.get_dt(for_file=True)}_boxPlot_{save_friendly_target_col}.png", scale=5)
    fig.show(renderer='firefox')
    return box_df

min_adapteds = [
    # 0,
    1,
    # 2,
    3,
    # 4,
    # 5,
    # 10,
]

target_denominators = [
    'total_reads',
    # 'unadapted_near_tss',
    'total_adapted',
    # 'total_unadapted',
]
for min_adapted in min_adapteds:
    for target_denominator in target_denominators:
        print(f"\n\nUsing {target_denominator=} for denominator... And {min_adapted=}...\n")
        test_df = run_box_with_means2(
            better_df,
            'adapted_near_tss',
            target_denominator,
            min_adapted_reads=min_adapted,
            min_tss_ada_reads=-1,
            log_y=True,
            add_tss_ada_pseudo=0.001,
            min_number_reps_observed=-1,
        )
print("Done!")



Using target_denominator='total_reads' for denominator... And min_adapted=1...










Using target_denominator='total_adapted' for denominator... And min_adapted=1...










Using target_denominator='total_reads' for denominator... And min_adapted=3...










Using target_denominator='total_adapted' for denominator... And min_adapted=3...








Done!


In [213]:
test_df

Unnamed: 0,gene_name,strain,rep,lib,nmd_target_type,adapted_near_stop,adapted_near_tss,total_adapted,unadapted_near_stop,unadapted_near_tss,total_unadapted,total_near_tss,total_near_stop,total_reads,adapted_near_tss / total_adapted,name
194,C53H9.2,wildtype,1,oldN2,NMD_NonTargets,0,0.001,1,0,34,36,34,0,37,0.001000,C53H9.2 wildtype rep1
205,K08D12.3,wildtype,1,oldN2,NMD_NonTargets,0,8.001,8,0,575,577,583,0,585,1.000125,K08D12.3 wildtype rep1
206,K08D12.3,wildtype,2,newerN2,NMD_NonTargets,0,2.001,2,0,489,491,491,0,493,1.000500,K08D12.3 wildtype rep2
207,K08D12.3,wildtype,3,thirdN2,NMD_NonTargets,0,10.001,10,0,385,389,395,0,399,1.000100,K08D12.3 wildtype rep3
216,R06C1.4,wildtype,1,oldN2,NMD_NonTargets,0,1.001,2,0,201,242,202,0,244,0.500500,R06C1.4 wildtype rep1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,rps-22,smg-6,4,temp25cS6,NMD_Targets,0,2.001,2,56,716,813,718,56,815,1.000500,rps-22 smg-6 rep4
157,rsp-5,smg-6,2,newerS6,NMD_Targets,0,1.001,1,8,31,65,32,8,66,1.001000,rsp-5 smg-6 rep2
179,ubl-1,smg-6,2,newerS6,NMD_Targets,0,17.001,17,0,2363,2512,2380,0,2529,1.000059,ubl-1 smg-6 rep2
180,ubl-1,smg-6,3,thirdS6,NMD_Targets,0,5.001,6,0,613,674,618,0,680,0.833500,ubl-1 smg-6 rep3






Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_reads,adapted_near_tss,adapted_near_tss / total_adapted,successful_reps,num_successful_reps
strain,gene_name,nmd_target_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
wildtype,C53H9.2,NMD_NonTargets,37.000000,0.001000,0.001000,1,1
wildtype,C53H9.2,NMD_Targets,11.500000,0.001000,0.000750,13,2
wildtype,K08D12.3,NMD_NonTargets,492.333333,6.667667,1.000242,123,3
wildtype,K08D12.3,NMD_Targets,11.000000,0.001000,0.001000,3,1
wildtype,R06C1.4,NMD_NonTargets,225.500000,2.251000,0.669206,1234,4
...,...,...,...,...,...,...,...
smg-7,rsp-5,NMD_Targets,,,,,0
smg-7,rsp-6,NMD_NonTargets,,,,,0
smg-7,rsp-6,NMD_Targets,,,,,0
smg-7,ubl-1,NMD_NonTargets,,,,,0
