# lib_v_lib_scatterPlots.ipynb
### Marcus Viscardi,    August 31, 2023

A simple notebook for comparing per-gene read counts (RPM) between sequencing libraries.

In [1]:
import sys
import warnings

from tqdm.notebook import tqdm

import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

import numpy as np
import pandas as pd
import statistics as stats
from pathlib import Path

sys.path.insert(0, '/data16/marcus/scripts/nanoporePipelineScripts')
import nanoporePipelineCommon as npCommon

# Plotly figures are shown through the "browser" renderer by default.
pio.renderers.default = "browser"

# Widen the pandas text repr and never elide columns when printing frames.
for _option_name, _option_value in (('display.width', 200),
                                    ('display.max_columns', None)):
    pd.set_option(_option_name, _option_value)

# Lookup tables between the pipeline's library names and the short names used in this notebook.
CONVERSION_DICT, REV_CONVERSION_DICT = npCommon.CONVERSION_DICT, npCommon.REV_CONVERSION_DICT

print(f"Imports done at {npCommon.get_dt(for_print=True)}")

Imports done at 09/20/23 @ 10:55:51 AM


In [35]:
# Load the merged reads / per-gene tables for the selected libraries, reusing cached
# parquet files when they exist and rebuilding (then caching) them otherwise.
regenerate = False  # Set True to ignore any cached parquets and rebuild from scratch
libs_to_load = sorted({
    'oldN2',
    'oldS6',
    'newerN2',
    'newerS6',
    'newerS5',
    'thirdN2',
    'thirdS5',
    'thirdS6',
})

# Shared pieces of the cache file names, defined once so the search patterns and the
# save paths below cannot drift apart.
parquet_dir = "./output_files/mega_merge_parquets"
libs_tag = '-'.join(libs_to_load)

try:
    if regenerate:
        # Jump straight to the rebuild branch below.
        raise FileNotFoundError

    # NOTE(review): relies on find_newest_matching_file raising FileNotFoundError when
    # nothing matches the glob -- confirm against nanoporePipelineCommon.
    reads_df_raw_path = npCommon.find_newest_matching_file(f"{parquet_dir}/*_{libs_tag}_merged5TERA.reads_df.parquet")
    compressed_df_genes_raw_path = npCommon.find_newest_matching_file(f"{parquet_dir}/*_{libs_tag}_merged5TERA.compressed_df.parquet")
    print(f"Found preprocessed files at:\n\t{reads_df_raw_path}\nand:\n\t{compressed_df_genes_raw_path}")

    reads_df_genes_raw = pd.read_parquet(reads_df_raw_path)
    compressed_df_genes_raw = pd.read_parquet(compressed_df_genes_raw_path)
except FileNotFoundError:
    print(f"Could not find preprocessed files matching these libs: {'/'.join(libs_to_load)}\nGoing to create new ones from scratch! This will take longer.")
    reads_df_genes_raw, compressed_df_genes_raw = npCommon.load_and_merge_lib_parquets(
        [REV_CONVERSION_DICT[lib] for lib in libs_to_load],
        drop_sub_n=1,
        add_tail_groupings=False,
        drop_failed_polya=False,
        group_by_t5=True,
        use_josh_assignment=False,
    )
    print(f"Saving new parquets to speed up future runs.")
    # Take the timestamp once so both files of the pair always share the same prefix.
    save_stamp = npCommon.get_dt()
    reads_df_genes_raw.to_parquet(f"{parquet_dir}/{save_stamp}_{libs_tag}_merged5TERA.reads_df.parquet")
    compressed_df_genes_raw.to_parquet(f"{parquet_dir}/{save_stamp}_{libs_tag}_merged5TERA.compressed_df.parquet")
print(f"Lib load done @ {npCommon.get_dt(for_print=True)}")

# Select the needed columns first, then copy: avoids duplicating the whole frame.
compressed_df_genes_short = compressed_df_genes_raw[["lib", "chr_id", "gene_id", "gene_name", "t5", "gene_hits", "gene_rpm"]].copy()
# Sanity check: show the per-library rows for one well-covered gene.
compressed_df_genes_short.query("gene_name == 'rpl-12'")

Could not find preprocessed files matching these libs: newerN2/newerS5/newerS6/oldN2/oldS6/thirdN2/thirdS5/thirdS6
Going to create new ones from scratch! This will take longer.
Looking for files for libraries: ['5tera_xrn-1-KD_wt_rerun', '5tera_xrn-1-KD_smg-5_rerun', '5tera_xrn-1-KD_smg-6_rerun', 'xrn-1-5tera', 'xrn-1-5tera-smg-6', '5tera_xrn-1-KD_wt_third', '5tera_xrn-1-KD_smg-5_third', '5tera_xrn-1-KD_smg-6_third']
Looking for file for 5tera_xrn-1-KD_wt_rerun, at /data16/marcus/working/230327_nanoporeRun_totalRNA_wt_xrn-1-KD_5TERA_rerun/output_dir/merge_files/*_mergedOnReads.parquet... File Found.
Looking for file for 5tera_xrn-1-KD_smg-5_rerun, at /data16/marcus/working/230410_nanoporeRun_totalRNA_smg-5_xrn-1-KD_5TERA_rerun/output_dir/merge_files/*_mergedOnReads.parquet... File Found.
Looking for file for 5tera_xrn-1-KD_smg-6_rerun, at /data16/marcus/working/230403_nanoporeRun_totalRNA_smg-6_xrn-1-KD_5TERA_rerun/output_dir/merge_files/*_mergedOnReads.parquet... File Found.
Looking f

100%|██████████| 1053185/1053185 [00:13<00:00, 76320.39it/s] 


Loading parquet for 5tera_xrn-1-KD_smg-5_rerun lib... Done.

Making adjustments for 5' ends:


100%|██████████| 557991/557991 [00:09<00:00, 60635.14it/s] 


Loading parquet for 5tera_xrn-1-KD_smg-6_rerun lib... Done.

Making adjustments for 5' ends:


100%|██████████| 1226144/1226144 [00:17<00:00, 70979.39it/s] 


Loading parquet for xrn-1-5tera lib... Done.

Making adjustments for 5' ends:


100%|██████████| 701680/701680 [00:12<00:00, 54143.16it/s]


Loading parquet for xrn-1-5tera-smg-6 lib... Done.

Making adjustments for 5' ends:


100%|██████████| 236286/236286 [00:03<00:00, 64218.75it/s]


Loading parquet for 5tera_xrn-1-KD_wt_third lib... Done.

Making adjustments for 5' ends:


100%|██████████| 1186602/1186602 [00:14<00:00, 79583.80it/s] 


Loading parquet for 5tera_xrn-1-KD_smg-5_third lib... Done.

Making adjustments for 5' ends:


100%|██████████| 1440373/1440373 [00:19<00:00, 72187.54it/s] 


Loading parquet for 5tera_xrn-1-KD_smg-6_third lib... Done.

Making adjustments for 5' ends:


100%|██████████| 560899/560899 [00:06<00:00, 92038.02it/s] 


Skipping assignment with Josh method and relying on whatever assignment was made by the pipeline!
Not keeping transcript information. . . (not using Josh assignment method will also force this!)
Finished dropping dup. columns.
Read counts post gene assignment:  6963160
Read counts post unassigned drop:  5294541
Creating groupby dataframe merged on: ['lib', 'chr_id', 'gene_id', 'gene_name']
	+ [t5] tag


Counting reads per gene: 100%|██████████| 129127/129127 [00:02<00:00, 47237.49it/s]


Gene counts pre sub-1 gene_hits drop:  129127
Gene counts post sub-1 gene_hits drop:  129127
Saving new parquets to speed up future runs.
Lib load done @ 09/20/23 @ 05:09:58 PM


Unnamed: 0,lib,chr_id,gene_id,gene_name,t5,gene_hits,gene_rpm
8494,5tera_xrn-1-KD_smg-5_rerun,IV,WBGene00004424,rpl-12,-,2705,6640.252943
8495,5tera_xrn-1-KD_smg-5_rerun,IV,WBGene00004424,rpl-12,+,37,90.827859
26108,5tera_xrn-1-KD_smg-5_third,IV,WBGene00004424,rpl-12,-,4119,3199.618438
26109,5tera_xrn-1-KD_smg-5_third,IV,WBGene00004424,rpl-12,+,64,49.714877
44454,5tera_xrn-1-KD_smg-6_rerun,IV,WBGene00004424,rpl-12,-,5440,7276.861853
44455,5tera_xrn-1-KD_smg-6_rerun,IV,WBGene00004424,rpl-12,+,53,70.895897
59375,5tera_xrn-1-KD_smg-6_third,IV,WBGene00004424,rpl-12,-,1640,3134.742311
59376,5tera_xrn-1-KD_smg-6_third,IV,WBGene00004424,rpl-12,+,23,43.962849
73739,5tera_xrn-1-KD_wt_rerun,IV,WBGene00004424,rpl-12,-,1739,3489.550412
73740,5tera_xrn-1-KD_wt_rerun,IV,WBGene00004424,rpl-12,+,36,72.239111


In [36]:
# Pivot the long per-(lib, t5) table into one wide, gene-indexed table with a
# gene_hits / gene_rpm column pair for every library and t5 value.
conversion_dict = CONVERSION_DICT
gene_key_cols = ["chr_id", "gene_id", "gene_name"]
value_cols = ["gene_hits", "gene_rpm"]

df_dict = {}
for _, group_df in compressed_df_genes_short.groupby(['lib', 't5'], as_index=False):
    # Every row in a group shares the same lib and t5, so the first row identifies it.
    lib_short = conversion_dict[group_df["lib"].iloc[0]]
    t5_flag = group_df["t5"].iloc[0]
    df_dict[(lib_short, t5_flag)] = (
        group_df[gene_key_cols + value_cols]
        .set_index(gene_key_cols)
        .add_suffix(f"_{lib_short}_t5{t5_flag}")
    )

# Outer join keeps genes seen in any library; genes absent from a library count as 0.
super_df = pd.concat(df_dict.values(), axis=1, join='outer').fillna(0)
super_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gene_hits_newerS5_t5+,gene_rpm_newerS5_t5+,gene_hits_newerS5_t5-,gene_rpm_newerS5_t5-,gene_hits_thirdS5_t5+,gene_rpm_thirdS5_t5+,gene_hits_thirdS5_t5-,gene_rpm_thirdS5_t5-,gene_hits_newerS6_t5+,gene_rpm_newerS6_t5+,gene_hits_newerS6_t5-,gene_rpm_newerS6_t5-,gene_hits_thirdS6_t5+,gene_rpm_thirdS6_t5+,gene_hits_thirdS6_t5-,gene_rpm_thirdS6_t5-,gene_hits_newerN2_t5+,gene_rpm_newerN2_t5+,gene_hits_newerN2_t5-,gene_rpm_newerN2_t5-,gene_hits_thirdN2_t5+,gene_rpm_thirdN2_t5+,gene_hits_thirdN2_t5-,gene_rpm_thirdN2_t5-,gene_hits_oldN2_t5+,gene_rpm_oldN2_t5+,gene_hits_oldN2_t5-,gene_rpm_oldN2_t5-,gene_hits_oldS6_t5+,gene_rpm_oldS6_t5+,gene_hits_oldS6_t5-,gene_rpm_oldS6_t5-
chr_id,gene_id,gene_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
I,WBGene00000013,abf-2,1.0,2.454807,0.0,0.000000,0.0,0.000000,2.0,1.553590,0.0,0.000000,1.0,1.337658,0.0,0.000000,2.0,3.822856,0.0,0.000000,2.0,4.013284,0.0,0.000000,0.0,0.000000,0.0,0.000000,1.0,1.753660,0.0,0.0,0.0,0.000000
I,WBGene00000138,amx-2,1.0,2.454807,7.0,17.183649,3.0,2.330385,21.0,16.312694,0.0,0.000000,27.0,36.116778,1.0,1.911428,7.0,13.379998,0.0,0.000000,7.0,14.046494,1.0,0.926648,16.0,14.826360,1.0,1.753660,11.0,19.290259,0.0,0.0,3.0,16.542415
I,WBGene00000140,anc-1,1.0,2.454807,38.0,93.282666,3.0,2.330385,58.0,45.054108,2.0,2.675317,66.0,88.285456,0.0,0.000000,6.0,11.468569,0.0,0.000000,36.0,72.239111,1.0,0.926648,39.0,36.139253,5.0,8.768299,23.0,40.334177,0.0,0.0,3.0,16.542415
I,WBGene00000150,apm-1,3.0,7.364421,28.0,68.734596,4.0,3.107180,56.0,43.500518,1.0,1.337658,63.0,84.272481,0.0,0.000000,8.0,15.291426,1.0,2.006642,34.0,68.225827,2.0,1.853295,31.0,28.726073,7.0,12.275619,30.0,52.609797,0.0,0.0,11.0,60.655521
I,WBGene00000158,apg-1,4.0,9.819228,21.0,51.550947,1.0,0.776795,58.0,45.054108,2.0,2.675317,60.0,80.259506,0.0,0.000000,13.0,24.848567,5.0,10.033210,41.0,82.272321,5.0,4.633238,36.0,33.359310,1.0,1.753660,33.0,57.870776,0.0,0.0,6.0,33.084830
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X,WBGene00173542,21ur-7915,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,1.0,5.514138
X,WBGene00174909,21ur-14059,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,1.0,5.514138
X,WBGene00199316,PDB1.2,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,1.0,5.514138
X,WBGene00206484,C31H2.14,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,1.0,5.514138


In [37]:
# Regenerate just the total RPM for each gene:
for lib in libs_to_load:
    super_df[f"gene_rpm_{lib}"] = super_df[[f"gene_rpm_{lib}_t5+", f"gene_rpm_{lib}_t5-"]].sum(axis=1)

In [66]:
from plotly.subplots import make_subplots
from plotly import graph_objects as go
# Genes left out of every plot.
genes_to_exclude = ['xrn-1', 'rrn-2.1', 'F23A7.4', 'F23A7.8', 'unNamed']
# Boolean mask over the gene_name index level; True marks a gene to drop.
is_excluded_gene = super_df.index.get_level_values('gene_name').isin(genes_to_exclude)
plot_df = super_df.loc[~is_excluded_gene].sort_index()

def plot_rockets(l1, l2, plotting_df, save_dir=None, force_limits=False, renderer='firefox'):
    """
    Draw three side-by-side log-log scatter plots comparing per-gene RPM between two libraries.

    Panels, left to right: total RPM (``gene_rpm_{lib}``), unadapted RPM
    (``gene_rpm_{lib}_t5-``) and adapted RPM (``gene_rpm_{lib}_t5+``). ``l1`` is always
    on the x axis and ``l2`` on the y axis.

    Args:
        l1: Library name as it appears in the ``gene_rpm_*`` column names (x axes).
        l2: Library name as it appears in the ``gene_rpm_*`` column names (y axes).
        plotting_df: DataFrame holding the ``gene_rpm_*`` columns for both libraries.
            After ``reset_index()`` it must expose a ``gene_name`` column, which is
            used as the hover label.
        save_dir: Optional output directory. When given, the figure is written there as
            ``{l1}_v_{l2}_scatters`` with .html, .png and .svg extensions; the directory
            is created (with a warning) if it does not exist.
        force_limits: When True, pin both axes of each panel to a fixed range:
            [0.5, 4.5] for the total and unadapted panels, [0, 3] for the adapted panel.
            NOTE(review): the axes are log-typed, so these are presumably log10
            exponents rather than raw RPM values -- confirm against the plotly docs.
        renderer: Name of the plotly renderer handed to ``fig.show``. Defaults to
            ``'firefox'``, the value that used to be hard-coded.

    Returns:
        The assembled plotly figure.
    """
    fig = make_subplots(rows=1, cols=3,
                        subplot_titles=[f"{l1} vs {l2} RPM (total)",
                                        f"{l1} vs {l2} RPM (unadapted)",
                                        f"{l1} vs {l2} RPM (adapted)",
                                        ],
                        row_heights=[500],
                        column_widths=[500, 500, 500],
                        )
    # The flattened frame is identical for every panel, so build it once up front.
    flat_df = plotting_df.reset_index()
    for i, t5 in enumerate(['', '_t5-', '_t5+']):
        # Let plotly express build the trace, then move it into the shared figure.
        subplot = px.scatter(flat_df,
                             x=f"gene_rpm_{l1}{t5}",
                             y=f"gene_rpm_{l2}{t5}",
                             hover_name="gene_name",
                             )
        fig.add_trace(subplot.data[0], row=1, col=i + 1)

        fig.update_xaxes(
            title=f"{l1} RPM {t5.strip('_')}",
            ticks="inside", ticklen=5, showgrid=True, gridcolor='lightgrey', type='log',
            minor=dict(ticks="inside", ticklen=5, showgrid=True),
            row=1, col=i + 1,
        )
        fig.update_yaxes(
            title=f"{l2} RPM {t5.strip('_')}",
            ticks="inside", ticklen=5, showgrid=True, gridcolor='lightgrey', type='log',
            minor=dict(ticks="inside", ticklen=5, showgrid=True),
            row=1, col=i + 1,
        )
        if force_limits:
            # The adapted (t5+) panel, index 2, gets its own lower range.
            if i != 2:
                limits = [0.5, 4.5]
            else:
                limits = [0, 3]
            fig.update_xaxes(range=limits, row=1, col=i + 1)
            fig.update_yaxes(range=limits, row=1, col=i + 1)
    fig.update_traces(marker=dict(size=5,
                                  color='black',
                                  ),
                      )
    fig.update_layout(height=500,
                      width=1500,
                      template='none')
    if save_dir:
        if not Path(save_dir).exists():
            warnings.warn(f"Save directory doesn't exist! Making it now at: {save_dir}")
            # exist_ok guards against the directory appearing between the check and the call.
            Path(save_dir).mkdir(parents=True, exist_ok=True)
        fig.write_html(f"{save_dir}/{l1}_v_{l2}_scatters.html")
        fig.write_image(f"{save_dir}/{l1}_v_{l2}_scatters.png")
        fig.write_image(f"{save_dir}/{l1}_v_{l2}_scatters.svg")

    fig.show(renderer=renderer)
    return fig

def plot_rocket_grid(libs, plotting_df, save_dir=None, force_limits=True):
    """
    Draw a scatter-plot matrix (lower half only) of total per-gene RPM across libraries.

    Args:
        libs: Library names as they appear in the ``gene_rpm_{lib}`` column names.
            Panels are laid out in sorted order; the figure title and the output file
            names use the order given by the caller.
        plotting_df: DataFrame with one ``gene_rpm_{lib}`` column per library and an
            index level named ``gene_name`` (used as the point text).
        save_dir: Optional output directory. When given, the figure is written there as
            ``{'-'.join(libs)}_scatters`` with .html, .png and .svg extensions; the
            directory is created (with a warning) if it does not exist.
        force_limits: When True, every axis gets the fixed range [0.5, 4.5].

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    ordered_libs = sorted(libs)
    rpm_columns = [f"gene_rpm_{lib}" for lib in ordered_libs]
    rpm_df = plotting_df[rpm_columns]

    # One matrix dimension per library, labelled with the library name.
    splom_dimensions = []
    for lib_name, rpm_col in zip(ordered_libs, rpm_columns):
        splom_dimensions.append(dict(label=f"{lib_name}", values=rpm_df[rpm_col]))

    point_style = dict(color='black', size=5, opacity=0.5)
    splom_trace = go.Splom(
        dimensions=splom_dimensions,
        showupperhalf=False,
        text=rpm_df.index.get_level_values('gene_name'),
        marker=point_style,
    )
    fig = go.Figure(data=splom_trace)
    fig.update_layout(
        title=f"Gene RPMs for {', '.join(libs)}",
        width=1000,
        height=1000,
    )

    # A single axis style, applied to every x and y axis of the matrix.
    axis_style = dict(ticks="inside", ticklen=5, showgrid=True, gridcolor='lightgrey', type='log',
                      minor=dict(ticks="inside", ticklen=5, showgrid=True))
    if force_limits:
        axis_style['range'] = [0.5, 4.5]

    axis_numbers = range(1, len(libs) + 1)
    axis_updates = {'xaxis': axis_style, 'yaxis': axis_style}
    axis_updates.update({f'xaxis{axis_number}': axis_style for axis_number in axis_numbers})
    axis_updates.update({f'yaxis{axis_number}': axis_style for axis_number in axis_numbers})
    fig.update_layout(**axis_updates)

    if save_dir:
        if not Path(save_dir).exists():
            warnings.warn(f"Save directory doesn't exist! Making it now at: {save_dir}")
            Path(save_dir).mkdir(parents=True)
        fig.write_html(f"{save_dir}/{'-'.join(libs)}_scatters.html")
        for image_ext in ('png', 'svg'):
            fig.write_image(f"{save_dir}/{'-'.join(libs)}_scatters.{image_ext}")
    fig.show()

In [57]:
# Timestamped folder that receives the figures exported from this notebook.
output_directory = f"/home/marcus/Insync/mviscard@ucsc.edu/Google Drive/insync_folder/NMD_cleavage_and_deadenylation_paper/raw_figures_from_python/{npCommon.get_dt()}_scatterPlots"

# Library pairs to compare, as (x-axis library, y-axis library).
# To plot every ordered pair instead:
#   lib_combinations = [(l1, l2) for l1 in libs_to_load for l2 in libs_to_load if l1 != l2]
# Other pairs, currently disabled:
#   ('oldS6', 'newerS6'), ('oldN2', 'newerN2'), ('oldN2', 'oldS6'), ('newerN2', 'newerS5')
lib_combinations = [
    ('newerN2', 'newerS6'),
    ('newerS6', 'thirdS6'),
    ('newerN2', 'thirdS6'),
]

for x_lib, y_lib in lib_combinations:
    plot_rockets(x_lib, y_lib, plot_df, save_dir=output_directory, force_limits=True)

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 52998)
Traceback (most recent call last):
  File "/usr/lib/python3.8/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.8/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.8/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.8/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/lib/python3.8/http/server.py", line 427, in handle
    self.handle_one_request()
  File "/usr/lib/python3.8/http/server.py", line 415, in handle_one_request
    method()
  File "/usr/local/lib/python3.8/dist-packages/plotly/io/_base_renderers.py", line 697, in do_GET
    self.wfile.write(html[i : i + bufferSize])
  File "/usr/lib/python3.8/socketserver.py", line

In [69]:
# Libraries shown in the scatter-plot matrix.
libs_to_plot = [
    'newerN2',
    'newerS6',
    'newerS5',
    'thirdN2',
    'thirdS5',
    'thirdS6',
]

plot_rocket_grid(libs=libs_to_plot, plotting_df=plot_df, save_dir=output_directory, force_limits=True)