# readLengthNotebook.ipynb
## Marcus Viscardi, March 23, 2022

Plan is to take a lot of this from plotlyReadLengths.py

In [5]:
import pandas as pd
import numpy as np

pd.set_option('display.width', 400)
pd.set_option('display.max_columns', None)

import plotly.express as px
import seaborn as sea
import matplotlib.pyplot as plt

from nanoporePipelineCommon import find_newest_matching_file, get_dt, load_read_assignments, pick_libs_return_paths_dict
from plotlyReadLengths import load_for_cds_based_plotting, _reads_past_start, _test_if_cds_len_consistent
print("Imports done.")

Imports done.


In [3]:
run_with = ["polyA3", "totalRNA3", "polyA2", "totalRNA2"]
force_long_df_build = False

if not force_long_df_build:
    search_path = f"../testInputs/*_{'-'.join(run_with)}.longForReadLengths.parquet"
    try:
        longest_df_path = find_newest_matching_file(search_path)
        print(f"Found preprocessed at {longest_df_path}\nLoading now.", end='')
        longest_df = pd.read_parquet(longest_df_path)
        print(" Done.")
    except ValueError:
        print(f"Couldn't find pre-processed file at: {search_path}\nGoing to load from library files!")
        force_long_df_build = True
if force_long_df_build:
    lib_path_dict = pick_libs_return_paths_dict(run_with)
    longest_df = load_for_cds_based_plotting(lib_path_dict, subset=None)
    save_path = f"../testInputs/{get_dt(for_file=True)}_{'-'.join(run_with)}.longForReadLengths.parquet"
    longest_df.to_parquet(save_path)
    print(f"Saved new long form file to: {save_path}")

Found preprocessed at ../testInputs/220302_polyA3-totalRNA3-polyA2-totalRNA2.longForReadLengths.parquet
Loading now. Done.


### Process dataframe:

In [39]:
super_df = longest_df.copy()
distance_from_start_cutoff=50
genes_or_txns="genes"
filter_hits_less_than=40

from tqdm import tqdm
# First lets add a column to hold the past_start information
super_df["to_start"] = super_df["to_start"].astype('int64')
super_df["to_stop"] = super_df["to_stop"].astype('int64')

tqdm.pandas(desc="Calculating dist past start codons")
super_df["past_start"] = super_df.progress_apply(lambda row: _reads_past_start(row["to_start"],
                                                                               cut_off=distance_from_start_cutoff),
                                                 axis=1)

tqdm.pandas(desc="Calculating cds lengths")
super_df["cds_len"] = super_df.progress_apply(lambda row: abs(row["to_start"] - row["to_stop"] + 4),
                                              axis=1)
print("Finished calculating read lengths")

compress_list = ["lib", "gene_id_fromAssign"]
if genes_or_txns == "txns":
    compress_list.append("transcript_id")
group_by_txs = super_df.groupby(by=compress_list, observed=True)
grouped_df = pd.DataFrame(group_by_txs["past_start"].apply(np.mean))
grouped_df["transcript_hits"] = group_by_txs["past_start"].apply(len)
grouped_df = grouped_df[grouped_df["transcript_hits"] > filter_hits_less_than]

if genes_or_txns == 'txns':
    tqdm.pandas(desc="Checking for only one cds per transcript")
else:
    tqdm.pandas(desc="Getting average cds length for genes w/ >1 transcript")

grouped_df["cds_len"] = group_by_txs["cds_len"].progress_apply(lambda cds_lens:
                                                               _test_if_cds_len_consistent(cds_lens,
                                                                                           genes_or_txns=genes_or_txns,
                                                                                           zero_multi_txn_genes=True))
grouped_df = grouped_df.query("cds_len != 0")
print("Done.")

Calculating dist past start codons: 100%|██████████| 7230122/7230122 [00:45<00:00, 159121.85it/s]
Calculating cds lengths: 100%|██████████| 7230122/7230122 [00:58<00:00, 124482.28it/s]


Finished calculating read lengths


Getting average cds length for genes w/ >1 transcript: 100%|██████████| 53293/53293 [00:00<00:00, 63448.14it/s]

Done.





In [40]:
# Binning data with the pandas cut function:
cds_bins = [0, 250, 500, 750, 1000, 1500, 2000, 2500, 3000, 4000, 10000]
cds_bin_names = [f"{bin_start} to {cds_bins[i + 1]}" for i, bin_start in enumerate(cds_bins[:-1])]
cds_bin_names[-1] = f"{cds_bins[-2]} up"
grouped_df["binned_cds_len"] = pd.cut(grouped_df["cds_len"], bins=cds_bins, labels=cds_bin_names)
print("Done.")

Done.


In [21]:
grouped_df["binned_cds_len"] = pd.qcut(grouped_df["cds_len"], q=10, labels=False)
grouped_df["binned_cds_len"] += 1
print("Done.")

Done.


### Make box plot w/ plotly:

In [41]:
plot_df = grouped_df.reset_index().sort_values(['binned_cds_len', 'lib'])
plot_df["set"] = plot_df['lib'].str[-1]

hover_col_list = ["gene_id_fromAssign", "transcript_hits"]
if genes_or_txns == 'txns':
    hover_col_list.append('transcript_id')
fig = px.box(plot_df.sort_values(["set", "lib"]),
                 x="binned_cds_len", y="past_start",
                 color="lib", points="suspectedoutliers",
                 color_discrete_sequence=['#202020', '#A0A0A0', '#606060', '#C0C0C0'],
                 hover_data=hover_col_list,
                 category_orders=dict(zip(cds_bin_names, cds_bin_names)))
fig.update_layout(template='plotly_white')
fig.show()

In [42]:
save_folder = "/home/marcus/Insync/mviscard@ucsc.edu/Google Drive/insync_folder/polyAPaperFigures/figure_readLengths/raw"
save_path = f"{save_folder}/{get_dt(for_file=True)}_fullCDSreads"
fig.write_image(save_path + ".svg")
fig.write_image(save_path + ".png")