In [1]:
# Core data handling.
import numpy as np
import pandas as pd
from tqdm import tqdm
# Widen the pandas text repr and never elide columns when a frame is displayed.
pd.set_option('display.width', 100)
pd.set_option('display.max_columns', None)

# Plotting libraries.
import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Every later `fig.show()` in this notebook uses plotly's "browser" renderer.
pio.renderers.default = "browser"

# NOTE(review): wildcard import - `load_and_merge_lib_parquets` and `get_dt`, used in later
# cells, are not defined anywhere in this notebook and presumably come from here; confirm.
from nanoporePipelineCommon import *

# Simple marker that the setup cell ran to completion.
print("imports done")

imports done


In [3]:
# Load and merge the two xrn-1 5TERA libraries. The helper hands back two frames, kept here
# under *_raw names so later cells can always start again from an untouched copy.
# NOTE(review): the keyword options are forwarded as-is; their exact semantics live in
# nanoporePipelineCommon, not in this notebook.
libs_to_load = ["xrn-1-5tera", "xrn-1-5tera-smg-6"]
reads_df_genes_raw, compressed_df_genes_raw = load_and_merge_lib_parquets(
    libs_to_load,
    drop_sub_n=1,
    add_tail_groupings=False,
    drop_failed_polya=False,
    group_by_t5=True,
)
print("done.")

Loading readAssignments file from: /data16/marcus/genomes/elegansRelease100/Caenorhabditis_elegans.WBcel235.100.allChrs.parquet... Done.
Looking for files for libraries: ['xrn-1-5tera', 'xrn-1-5tera-smg-6']
Looking for file for xrn-1-5tera, at /data16/marcus/working/211118_nanoporeRun_totalRNA_5108_xrn-1-KD_5TERA/output_dir/merge_files/*_mergedOnReads.parquet... File Found.
Looking for file for xrn-1-5tera-smg-6, at /data16/marcus/working/211210_nanoporeRun_totalRNA_2102_xrn-1-KD_5TERA/output_dir/merge_files/*_mergedOnReads.parquet... File Found.
Loading parquet for xrn-1-5tera lib... Done.
'original_chr_pos' column already found in dataframe, skipping adjustment for 5'ends!
Loading parquet for xrn-1-5tera-smg-6 lib... Done.
'original_chr_pos' column already found in dataframe, skipping adjustment for 5'ends!
Finished assignment merge!. . .
Read counts post gene assignment:  937835
Read counts post unassigned drop:  636409
Creating groupby dataframe merged on: ['lib', 'chr_id', 'gene_i

Counting reads per gene: 100%|██████████| 31278/31278 [00:00<00:00, 56603.07it/s]


Gene counts pre sub-1 gene_hits drop:  31278
Gene counts post sub-1 gene_hits drop:  31278
done.


In [4]:
# Work on copies so the freshly loaded *_raw frames stay untouched and this cell can be
# re-run to reset the working frames without reloading the parquet files.
reads_df, compressed_df = (
    reads_df_genes_raw.copy(),
    compressed_df_genes_raw.copy(),
)

In [28]:
# Composite "<lib>_<t5>" label for every read, so that one library/t5 combination can be
# selected or excluded with a single comparison (a later cell filters on
# "lib_t5 != 'xrn-1-5tera-smg-6_+'").
#
# The chained ubl-1 / xrn-1-5tera-smg-6 / t5 '+' query that used to open this cell was
# removed: it was neither assigned nor the last expression of the cell, so its result was
# computed and immediately thrown away without being displayed.
reads_df['lib_t5'] = reads_df.lib.astype(str) + "_" + reads_df.t5.astype(str)

# End on the new column so the cell actually displays it.
reads_df['lib_t5']

0               xrn-1-5tera_-
1               xrn-1-5tera_-
2               xrn-1-5tera_-
3               xrn-1-5tera_-
4               xrn-1-5tera_-
                 ...         
636404    xrn-1-5tera-smg-6_-
636405    xrn-1-5tera-smg-6_-
636406    xrn-1-5tera-smg-6_-
636407    xrn-1-5tera-smg-6_-
636408    xrn-1-5tera-smg-6_+
Name: lib_t5, Length: 636409, dtype: object

In [29]:
# Split violins of polyA tail length calls for the example gene(s): one violin per library,
# with t5 '+' reads on the negative (left) half and t5 '-' reads on the positive (right) half.

# NMD Targets:
gene_violin_list = ['ubl-1']  # 'rpl-7A', 'odc-1', 'Y73B3A.18','rpl-30', 'rpl-1']
# Not NMD Targets:
# gene_violin_list = ['col-125', 'col-160', 'col-129', 'dod-19', 'vit-3', 'vit-4']

is_example_gene = reads_df['gene_name'].isin(gene_violin_list)
plot_df = reads_df.loc[is_example_gene].sort_values('gene_name')
#
# plot_df = reads_df.query("lib == 'xrn-1-5tera'")  #.query("chr_id != 'MtDNA'")

# (t5 value, violin half, fill colour); pastel alternatives: '#fbc6c1' (+), '#c5d8e9' (-).
violin_halves = (
    ('+', 'negative', 'firebrick'),
    ('-', 'positive', 'dimgray'),
)

fig = go.Figure()
for t5_value, half, fill in violin_halves:
    # Select each t5 subset once and reuse it for both axes.
    t5_reads = plot_df.loc[plot_df['t5'] == t5_value]
    half_violin = go.Violin(
        x=t5_reads['lib'],
        y=t5_reads['polya_length'],
        name=f't5 {t5_value}',
        side=half,
        fillcolor=fill,
        spanmode='hard',
    )
    fig.add_trace(half_violin)

# Shared look for both halves.
shared_trace_style = dict(
    meanline_visible=True,
    scalemode='count',
    points='outliers',
    width=0,
    line_color='black',
    box_visible=True,
)
fig.update_traces(**shared_trace_style)

fig.update_layout(
    violinmode='overlay',  # draw both halves at the same x position
    violingap=0,
    margin=dict(l=0, b=40, t=10, r=40),
    yaxis_title="Distribution of PolyA Tail Length Calls",
    template='plotly_white',
    width=1000,
    height=450,
)
fig.update_yaxes(range=[-5, 201])

# Export a static SVG named with the stamp from get_dt(for_file=True), then display the figure.
svg_path = f"./{get_dt(for_file=True)}_splitViolins_exampleGenes.svg"
fig.write_image(svg_path)
fig.show()

In [17]:
# Flag each read by whether it has a polyA length call (non-null `polya_length`).
# `.assign` rebinds `plot_df` to a frame carrying the extra column, so re-running is harmless.
plot_df = plot_df.assign(passed_polya=plot_df['polya_length'].notna())

# Read counts per gene: bar colour encodes t5, bar pattern encodes passed_polya.
# (seaborn alternative: sea.countplot(data=plot_df, x='gene_name', hue='passed_polya'))
histogram_options = dict(
    x='gene_name',
    color='t5',
    pattern_shape='passed_polya',
    color_discrete_sequence=['#c5d8e9', '#fbc6c1'],
)
fig = px.histogram(plot_df, **histogram_options)
fig.update_layout(
    barmode='relative',
    template='plotly_white',
    margin=dict(l=0, b=40, t=10, r=40),
)
fig.show()

In [51]:
# Split violins of polyA tail length calls for the example gene(s), as in the earlier
# split-violin cell, but with the rows labelled 'xrn-1-5tera-smg-6_+' in `lib_t5` removed.
# Depends on earlier cells for `gene_violin_list` and for the `lib_t5` column of `reads_df`.
plot_df = reads_df[reads_df['gene_name'].isin(gene_violin_list)].sort_values('gene_name')
#
# plot_df = reads_df.query("lib == 'xrn-1-5tera'")  #.query("chr_id != 'MtDNA'")
plot_df = plot_df.query("lib_t5 != 'xrn-1-5tera-smg-6_+'")

# Select each t5 subset once instead of re-running the same query for x and for y.
t5_plus_reads = plot_df.query("t5 == '+'")
t5_minus_reads = plot_df.query("t5 == '-'")

fig = go.Figure()
# t5 '+' reads: negative (left) half of each violin.
fig.add_trace(go.Violin(x=t5_plus_reads['lib'],
                        y=t5_plus_reads['polya_length'],
                        name='t5 +',
                        side='negative',
                        # fillcolor='#fbc6c1',
                        fillcolor='firebrick',
                        spanmode='hard',
                        ))
# t5 '-' reads: positive (right) half of each violin.
fig.add_trace(go.Violin(x=t5_minus_reads['lib'],
                        y=t5_minus_reads['polya_length'],
                        name='t5 -',
                        side='positive',
                        # fillcolor='#c5d8e9',
                        fillcolor='dimgray',
                        spanmode='hard',
                        ))
fig.update_traces(meanline_visible=True,
                  scalemode='count',
                  points='outliers',
                  # width=1,
                  line_color='black',
                  box_visible=True,
                  )
fig.update_layout(violinmode='overlay',
                  violingap=0,
                  margin={'l': 0, 'b': 40, 't': 10, 'r': 40},
                  yaxis_title="Distribution of PolyA Tail Length Calls",
                  template='plotly_white',
                  width=500, height=450,
                  )
fig.update_yaxes(range=[-5, 201])
# The earlier split-violin cell exports to "<stamp>_splitViolins_exampleGenes.svg". Writing
# to that same name here would replace that figure whenever get_dt(for_file=True) returns
# the same stamp for both cells (NOTE(review): get_dt's resolution is not visible in this
# notebook - confirm). A distinct suffix keeps both exports on disk.
fig.write_image(f"./{get_dt(for_file=True)}_splitViolins_exampleGenes_noSmg6T5plus.svg")
fig.show()