# concat_all_BAMs.ipynb
## Marcus Viscardi,    August 20, 2024

Generally, I want to try merging ALL my BAM files into one super file for each treatment and then using that for a meta-analysis type thing.

The main idea here is that any noise between libraries should shake out, while the actual signal of NMD deg species will be easier to pick up with the counts.

I'll really just need to call [`samtools merge`](http://www.htslib.org/doc/samtools-merge.html), which should do it all! The only thing needed before that is producing a file with a list of BAM file paths... Doesn't sound too bad!

In [1]:
from typing import Tuple

import nanoporePipelineCommon as npCommon

import numpy as np
import pandas as pd

import re

import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from tqdm.auto import tqdm

from icecream import ic
from datetime import datetime

from pathlib import Path

from scipy.stats import mannwhitneyu, ks_2samp

import pickle as pkl

import random

# Configure both pandas display options in a single call (set_option takes option/value pairs):
# a 200-character display width and no cap on the number of columns shown.
pd.set_option(
    'display.width', 200,
    'display.max_columns', None,
)

def __time_formatter__():
    """Build a message prefix stamped with the current local date and time.

    Returns:
        str: text of the form ``"ic: YYYY-MM-DD HH:MM:SS | > "``.
    """
    stamp_layout = "%Y-%m-%d %H:%M:%S"
    stamp = datetime.now().strftime(stamp_layout)
    return f"ic: {stamp} | > "
# Pass the formatter function itself (not its result) as icecream's output prefix.
ic.configureOutput(prefix=__time_formatter__)

# The result is bound to `_`, presumably so ic's return value is not echoed as the cell output (confirm).
_ = ic("Imports done!")

ic: 2024-08-20 17:26:51 | > 'Imports done!'


In [11]:
# Library keys selected for this notebook. Entries left commented out are not part of the list;
# uncomment a line to include that library. Each key is used below as a lookup into
# npCommon.REV_CONVERSION_DICT.
plot_libs = [
    ### The classics:
    # "oldN2",
    # "oldS6",
    ### The terrible second replicates:
    # "newN2",
    # "newS5",
    # "newS6",
    ### The "better" second replicates:
    "newerN2",
    "newerS6",
    "newerS5",
    ### The triplicates!
    "thirdN2",
    "thirdS5",
    "thirdS6",
    ### At 25C for smg-7 (and fourth replicates in a way):
    "temp25cN2",
    "temp25cS5",
    "temp25cS6",
    "temp25cS7",
]
# Map each selected library key to its output directory: the key is translated through
# npCommon.REV_CONVERSION_DICT and the result is used to index npCommon.OUTPUT_DIR_DICT.
dir_paths = {
    lib: Path(npCommon.OUTPUT_DIR_DICT[npCommon.REV_CONVERSION_DICT[lib]])
    for lib in plot_libs
}
for lib, out_dir in dir_paths.items():
    # Name the offending library and path on failure; a bare assert would not say which one is missing.
    assert out_dir.exists(), f"Output directory for library {lib!r} does not exist: {out_dir}"
    print(f"{lib}:\n\t{out_dir}")

newerN2:
	/data16/marcus/working/230327_nanoporeRun_totalRNA_wt_xrn-1-KD_5TERA_rerun/output_dir
newerS6:
	/data16/marcus/working/230403_nanoporeRun_totalRNA_smg-6_xrn-1-KD_5TERA_rerun/output_dir
newerS5:
	/data16/marcus/working/230410_nanoporeRun_totalRNA_smg-5_xrn-1-KD_5TERA_rerun/output_dir
thirdN2:
	/data16/marcus/working/230918_nanoporeRun_sMV013_wt_xrn-1-KD_5TERA/output_dir
thirdS5:
	/data16/marcus/working/230918_nanoporeRun_sMV014_smg-5_xrn-1-KD_5TERA/output_dir
thirdS6:
	/data16/marcus/working/230918_nanoporeRun_sMV015_smg-6_xrn-1-KD_5TERA/output_dir
temp25cN2:
	/data16/marcus/working/240626_nanoporeRun_sMV034_xrn-1-kd_25C_wt_5TERA/output_dir
temp25cS5:
	/data16/marcus/working/240626_nanoporeRun_sMV036_xrn-1-kd_25C_smg-5_5TERA/output_dir
temp25cS6:
	/data16/marcus/working/240626_nanoporeRun_sMV037_xrn-1-kd_25C_smg-6_5TERA/output_dir
temp25cS7:
	/data16/marcus/working/240626_nanoporeRun_sMV038_xrn-1-kd_25C_smg-7_5TERA/output_dir


In [12]:
# Build the BAM path for every library; each one is looked for at the same relative
# location ("cat_files/cat.sorted.mappedAndPrimary.bam") inside that library's output directory.
basic_bam_paths = {
    lib: out_dir / "cat_files" / "cat.sorted.mappedAndPrimary.bam"
    for lib, out_dir in dir_paths.items()
}
for lib, bam in basic_bam_paths.items():
    # Name the offending library and path on failure; a bare assert would not say which BAM is missing.
    assert bam.exists(), f"BAM file for library {lib!r} does not exist: {bam}"
    print(f"{lib}:\n\t{bam}")

newerN2:
	/data16/marcus/working/230327_nanoporeRun_totalRNA_wt_xrn-1-KD_5TERA_rerun/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam
newerS6:
	/data16/marcus/working/230403_nanoporeRun_totalRNA_smg-6_xrn-1-KD_5TERA_rerun/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam
newerS5:
	/data16/marcus/working/230410_nanoporeRun_totalRNA_smg-5_xrn-1-KD_5TERA_rerun/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam
thirdN2:
	/data16/marcus/working/230918_nanoporeRun_sMV013_wt_xrn-1-KD_5TERA/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam
thirdS5:
	/data16/marcus/working/230918_nanoporeRun_sMV014_smg-5_xrn-1-KD_5TERA/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam
thirdS6:
	/data16/marcus/working/230918_nanoporeRun_sMV015_smg-6_xrn-1-KD_5TERA/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam
temp25cN2:
	/data16/marcus/working/240626_nanoporeRun_sMV034_xrn-1-kd_25C_wt_5TERA/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam
temp25cS5:
	/data16/marcus/working/240626_nanopo