# testingReadPlotting_plotly.ipynb
## Marcus Viscardi,    May 23, 2022

This notebook is meant to be used alongside SimpleReadPlotting_cigars.py; it is really just a scratch space for iterating on tests more quickly!

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
pd.set_option('display.width', 100)
pd.set_option('display.max_columns', None)

import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "browser"

from nanoporePipelineCommon import *

print("imports done")

imports done


In [3]:
# Draw a segment as one thick horizontal line trace
def add_line(figure, x0, x1, y, width=10, text=None, color='black'):
    """Append a two-point horizontal Scatter trace from (x0, y) to (x1, y) to `figure`.

    `width` and `color` style the line, `text` is used as the trace's hovertext,
    and the point markers are made fully transparent.
    """
    line_style = dict(width=width, color=color)
    hidden_markers = dict(opacity=0)
    segment = go.Scatter(x=[x0, x1], y=[y, y],
                         hovertext=text,
                         line=line_style,
                         marker=hidden_markers)
    figure.add_trace(segment)

# Draw a segment as a filled rectangle shape
def add_box(figure, x0, x1, y, width=4.5, text=None, color='black'):
    """Add a borderless, filled rectangle to `figure` spanning x0..x1 around height `y`.

    The rectangle reaches `width / 10` y-axis units above and below `y` and is
    filled with `color`. `text` is accepted but not used by this function.
    """
    half_height = width / 10
    rect = dict(type='rect',
                xref='x', x0=x0, x1=x1,
                yref='y', y0=y + half_height, y1=y - half_height,
                line=dict(width=0),
                fillcolor=color)
    figure.add_shape(**rect)

In [13]:
import re
def add_patches_from_cigars_and_gen_pos(figure, cigar, gen_start, y):
    """Draw one aligned read on `figure` at height `y`, driven by its CIGAR string.

    The CIGAR is walked left to right with a cursor that starts at `gen_start`:

    * ``M``, ``=``, ``X``: full-height box over the aligned stretch; cursor advances.
    * ``D``, ``N``: box over the gap (thin when longer than 5 bases, otherwise
      full height); cursor advances.
    * ``I``: narrow red box ending at the cursor; cursor does not advance.
    * ``S``: a black scatter marker at the cursor; cursor does not advance.
    * ``H``, ``P``: nothing is drawn; cursor does not advance.

    Parameters:
        figure: plotly figure that receives the shapes and marker traces.
        cigar: CIGAR string, e.g. ``"4S13M10I20D5M2S"``.
        gen_start: genomic coordinate where drawing starts.
            NOTE(review): this is treated as the position of the first
            reference-consuming base (SAM POS semantics) -- confirm against the caller.
        y: row (y-axis value) the read is drawn on; also shown in the hover text.

    Returns:
        ``(gen_start, read_end)`` where ``read_end`` is ``gen_start`` plus the total
        length of all reference-consuming operations (``M``, ``D``, ``N``, ``=``, ``X``).
        The last box drawn ends exactly at ``read_end``.
    """
    # Parse the cigar string into (length, operation) pairs.
    parsed_cigar = [(int(num), char)
                    for num, char in re.findall(r'(\d+)([MIDNSHP=X])', cigar)]
    ref_consuming = "MDN=X"
    read_end = gen_start + sum(length for length, code in parsed_cigar
                               if code in ref_consuming)

    genome_loc = gen_start
    for nucl, code in parsed_cigar:
        if code == 'S':
            # Soft-clipped bases are not aligned to the reference, so only mark the
            # clip boundary; advancing here would shift every later box away from
            # the (gen_start, read_end) span that is returned.
            figure.add_scatter(x=[genome_loc], y=[y], hovertext=f"Read Number {y}<br>{cigar}", name="",
                               marker=dict(color='black'))
        elif code in ('M', '=', 'X'):
            # '=' (sequence match) and 'X' (mismatch) are aligned bases just like 'M'.
            add_box(figure, genome_loc, genome_loc + nucl, y)
            genome_loc += nucl
        elif code == 'I':
            # Insertions take no reference space: draw a thin red tick at the cursor.
            add_box(figure, genome_loc - 0.1, genome_loc, y, color='red')
        elif code in ('D', 'N'):
            # Long gaps are drawn as a thin connector, short ones at full height.
            width = 2 if nucl > 5 else 4.5
            add_box(figure, genome_loc, genome_loc + nucl, y, width=width)
            genome_loc += nucl
        # 'H' and 'P' consume neither read nor reference space here: nothing to draw.
    return gen_start, read_end

# Fake reads for a quick visual check, as (CIGAR string, genomic start) pairs:
test_cigar = "0S10M2D10I25M"
genome_start = 3

test_cigar_and_gen_starts = [
    ("2S4M20I10M13D5M2S", 2),
    ("0S10M2D10I25M", 3),
    ("5S20M3I25M", 3),
    ("4S13M10I20M13I5M2S", 5),
    ("4S13M10I20M5M2S", 1),
    ("20M13I5M2S", 5),
    ("4S13M10I20D5M2S", 1),
    ("4S8M1D14M2I14M1D11M2D23M1I46M1D8M1D8M1I19M1I5M", 3),
    ("0S13M15I5M2S", 3),
]

# Fixed viewport: x from 0 to 50, one y row per read plus one spare row
fig = go.Figure()
fig.update_xaxes(range=[0, 50])
fig.update_yaxes(range=[0, len(test_cigar_and_gen_starts) + 1])

# Draw each read on its own row (rows are numbered from 1) and collect what
# the plotting function returns for it
coverage = []
for i, (cigar, gen_start) in enumerate(test_cigar_and_gen_starts, start=1):
    coverage.append(add_patches_from_cigars_and_gen_pos(fig, cigar, gen_start, i))
    print(coverage)

# No legend, white template, and both axes hidden
(fig.update_layout(showlegend=False, template='plotly_white')
    .update_xaxes(visible=False)
    .update_yaxes(visible=False))
fig

[(2, 34)]
[(2, 34), (3, 40)]
[(2, 34), (3, 40), (3, 48)]
[(2, 34), (3, 40), (3, 48), (5, 43)]
[(2, 34), (3, 40), (3, 48), (5, 43), (1, 39)]
[(2, 34), (3, 40), (3, 48), (5, 43), (1, 39), (5, 30)]
[(2, 34), (3, 40), (3, 48), (5, 43), (1, 39), (5, 30), (1, 39)]
[(2, 34), (3, 40), (3, 48), (5, 43), (1, 39), (5, 30), (1, 39), (3, 165)]
[(2, 34), (3, 40), (3, 48), (5, 43), (1, 39), (5, 30), (1, 39), (3, 165), (3, 21)]


In [5]:
# Load the 'xrn-1-5tera' library; the loader hands back two dataframes
reads_df_genes_raw, compressed_df_genes_raw = load_and_merge_lib_parquets(
    ["xrn-1-5tera"],
    drop_sub_n=1,
    add_tail_groupings=False,
    drop_failed_polya=False,
    group_by_t5=True,
)
print("done.")

Loading readAssignments file from: /data16/marcus/genomes/elegansRelease100/Caenorhabditis_elegans.WBcel235.100.allChrs.parquet... Done.
Looking for files for libraries: ['xrn-1-5tera']
Looking for file for xrn-1-5tera, at /data16/marcus/working/211118_nanoporeRun_totalRNA_5108_xrn-1-KD_5TERA/output_dir/merge_files/*_mergedOnReads.parquet... File Found.
Loading parquet for xrn-1-5tera lib... Done.
'original_chr_pos' column already found in dataframe, skipping adjustment for 5'ends!
Finished assignment merge!. . .
Read counts post gene assignment:  701607
Read counts post unassigned drop:  474099
Creating groupby dataframe merged on: ['lib', 'chr_id', 'gene_id', 'gene_name']
	+ [t5] tag


Counting reads per gene: 100%|██████████| 18756/18756 [00:00<00:00, 60846.54it/s]


Gene counts pre sub-1 gene_hits drop:  18756
Gene counts post sub-1 gene_hits drop:  18756
done.


In [14]:
# Work on copies so the freshly loaded "_raw" objects are left as loaded
reads_df, compressed_df = reads_df_genes_raw.copy(), compressed_df_genes_raw.copy()

In [15]:
# First five reads whose gene_name is 'ubl-1', with the index reset to 0..n-1
ubl_df = (
    reads_df
    .query("gene_name == 'ubl-1'")
    .reset_index(drop=True)
    .head(5)
)

def row_apply_plot_cigar(row, fig):
    """Draw a single dataframe row (one read) onto `fig`.

    Meant to be used with a row-wise ``apply`` (``axis=1``): the row's index label
    (``row.name``) is used as the y position, ``row.cigar`` as the CIGAR string and
    ``row.chr_pos`` as the genomic start. Nothing is returned; the figure is
    modified in place.
    """
    # Only the fields that are actually plotted are read, so the row does not
    # need to carry any other columns.
    add_patches_from_cigars_and_gen_pos(fig, row.cigar, row.chr_pos, row.name)
    
# Plot every selected read onto a fresh figure, one dataframe row at a time
fig = go.Figure()
tqdm.pandas()  # makes DataFrame.progress_apply available
ubl_df.progress_apply(lambda read_row: row_apply_plot_cigar(read_row, fig), axis=1)

# No legend, white template, and both axes hidden
(fig.update_layout(showlegend=False, template='plotly_white')
    .update_xaxes(visible=False)
    .update_yaxes(visible=False))
fig

100%|██████████| 25/25 [04:06<00:00,  9.84s/it]
