# quickProduceReadHitsTable.ipynb
## Marcus Viscardi,    April 24, 2024

This notebook collects the first few sections of `DESeq2_fromGeneCountsDF.ipynb` so that they are easier to access on their own.

In [1]:
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
import pandas as pd
from pathlib import Path

import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import nanoporePipelineCommon as npCommon

from icecream import ic
from datetime import datetime

def __time_formatter__():
    """Return the icecream line prefix, stamped with the current local time.

    The result has the form ``ic: YYYY-MM-DD HH:MM:SS | > ``.
    """
    timestamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    return "ic: " + timestamp + " | > "
# Stamp every icecream message with the wall-clock time via the prefix callable.
ic.configureOutput(prefix=__time_formatter__)


ic("Imports done.")
# Outputs written by this notebook are placed relative to the launch directory.
working_dir = Path.cwd()
# Binding the result to `_` keeps the cell from echoing ic()'s return value.
_ = ic(working_dir)

ic: 2024-04-29 11:08:58 | > 'Imports done.'
ic: 2024-04-29 11:08:58 | > working_dir: PosixPath('/data16/marcus/scripts/nanoporePipelineScripts/DESeq2_fromGeneCounts')


In [2]:
# Lookup table relating gene names to gene IDs, provided by the shared pipeline module.
gene_id_gene_name_df = npCommon.gene_names_to_gene_ids()
# Preview the first five rows as a sanity check on the loaded table.
gene_id_gene_name_df.head(n=5)

Unnamed: 0,gene_name,gene_id,chr
0,cTel3X.2,WBGene00197333,V
1,cTel3X.3,WBGene00198386,V
2,B0348.5,WBGene00015153,V
3,ife-3,WBGene00002061,V
4,B0348.10,WBGene00255704,V


In [3]:
# Run nicknames to load; commented-out entries are other libraries that are
# deliberately left out of this analysis.
libs_to_run = [
    # "oldN2",
    # "oldS6",
    # "newerN2",
    # "newerS6",
    # "newerS5",
    # "thirdN2",
    # "thirdS5",
    # "thirdS6",
    # "polyA",
    "polyA1",
    "polyA2",
    "polyA3",
    "totalRNA1",
    "totalRNA2",
    "totalRNA3",
]

# nickname -> NanoporeRun object; insertion order follows libs_to_run.
obj_dict = {}
for lib in libs_to_run:
    # No newline yet: the " Done!" below finishes the line once loading returns.
    print(f"\nLoading {lib}...", end="")
    run_obj = npCommon.NanoporeRun(run_nickname=lib)
    obj_dict[lib] = run_obj
    print(" Done!")


Loading polyA1...Found 4 settings files, we are going to pick the newest one!
 Done!

Loading polyA2...Found 3 settings files, we are going to pick the newest one!
 Done!

Loading polyA3... Done!

Loading totalRNA1...

[E::idx_find_and_load] Could not retrieve index file for '/data16/marcus/working/210709_NanoporeRun_totalRNA_0639_L3/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam'


Could not calculate read counts: [Errno 2] No such file or directory: '/data16/marcus/working/210709_NanoporeRun_totalRNA_0639_L3/output_dir/flair/counts_matrix.tsv'
Found 5 settings files, we are going to pick the newest one!
 Done!

Loading totalRNA2...Found 4 settings files, we are going to pick the newest one!
 Done!

Loading totalRNA3... Done!


In [4]:
# NOTE: this whole cell is intentionally disabled (every line is commented out).
# It is the earlier approach that built the read-hits table from each run's
# compressedOnGenes output; the featureCounts-based cells further down are what
# currently run.
# compressedOnGenes_dict = {}
# for lib, obj in obj_dict.items():
#     compressedOnGenes_dict[lib] = obj.load_compressedOnGenes()  # Looks like the old N2 library had a read cutoff of 5 while everything else had no cutoff!!
#     # ic(obj)
# read_hits_series_dict = {}
# for lib, df in compressedOnGenes_dict.items():
#     print(f"Pre-cutdown:  {lib} - {df.shape[0]:,} Genes", end=" ")
#     # # TODO: Eventually, I should rerun the compressing for oldN2 without the cutoff!!!
#     # df = df.query("read_hits >= 5")
#     print(f"Post-cutdown: {lib} - {df.shape[0]:,} Genes")
#     # print(df.head())
#     hits_series = df[[
#         # 'gene_id',
#         'read_hits',
#                       ]]# .set_index('gene_id')
#     print(hits_series)
#     hits_series.rename(columns={'read_hits': lib}, inplace=True)
#     read_hits_series_dict[lib] = hits_series
# read_hits_table = pd.concat(read_hits_series_dict.values(), axis=1).fillna(0)
# read_hits_table.to_csv(working_dir / f"read_hits_table_{'-'.join(libs_to_run)}.csv")
# print(f"Saved read_hits_table_{'-'.join(libs_to_run)}.csv to {working_dir}!")
# read_hits_table.head()

# From fresh featureCounts runs

Another way to build the read hits table is to run featureCounts on each library's BAM file and then assemble the resulting gene count tables into a single table. This takes a bit more work, but it should also be a bit more accurate. I try this approach in the cells below.

In [10]:
import subprocess

# Per-library featureCounts run (test cell).
#   regen   - when True, featureCounts is re-run even if today's output file exists.
#   threads - value passed to featureCounts via -T.
regen = True
threads = 32
input_bam_paths = {lib_name: obj.bam_path for lib_name, obj in obj_dict.items()}
overall_output_dir = working_dir / "featureCounts_testing"
overall_output_dir.mkdir(exist_ok=True)


for lib, lib_obj in obj_dict.items():
    bam_path = lib_obj.bam_path
    gtf_path = lib_obj.gtf_path
    output_dir = overall_output_dir / lib
    output_dir.mkdir(exist_ok=True)

    # Presumably where `-R CORE` writes the per-read assignments; it is not read back here.
    assigned_read_output_file = output_dir / (str(Path(bam_path).name) + ".featureCounts")
    # Single source of truth for the count table path: the same value is handed to
    # featureCounts (-o) and to pd.read_csv below, so they cannot drift apart.
    # NOTE: the name embeds npCommon.get_dt(), so the "already ran" shortcut only
    # applies to output produced under the same date stamp.
    gene_count_output_file = output_dir / f"{npCommon.get_dt(for_file=True)}_{lib}_featureCounts"
    if regen or not gene_count_output_file.exists():
        # Argument list (no shell): paths containing spaces or shell metacharacters
        # are passed through verbatim instead of being re-parsed by /bin/sh.
        featCounts_call = [
            "featureCounts",
            "-L",
            "-T", str(threads),
            "-R", "CORE",
            "-a", str(gtf_path),
            "-o", str(gene_count_output_file),
            "--largestOverlap",
            "-s", "1",
            str(bam_path),
        ]
        # check=True: stop here with CalledProcessError if featureCounts fails,
        # rather than failing later with a confusing read_csv error.
        subprocess.run(featCounts_call, check=True)
    else:
        print(f"Already ran {lib}! w/ input file {bam_path}, gtf file {gtf_path} and output file {gene_count_output_file}!")

    # Skip the two leading lines of the output file and supply our own column
    # names, labelling the count column with the library nickname.
    featCounts_df = pd.read_csv(gene_count_output_file, sep="\t", skiprows=2, names=["GeneID", "Chr", "Start", "End", "Strand", "Length", lib])
    print(featCounts_df.head())
    # NOTE(review): only the first library is processed because of this break;
    # it looks like a deliberate test shortcut - remove it to run every library.
    break


        =====         / ____| |  | |  _ \|  __ \|  ____|   /\   |  __ \ 
          =====      | (___ | |  | | |_) | |__) | |__     /  \  | |  | |
            ====      \___ \| |  | |  _ <|  _  /|  __|   / /\ \ | |  | |
              ====    ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
	  v2.0.0

||                                                                            ||
||             Input files : 1 BAM file                                       ||
||                           o cat.sorted.mappedAndPrimary.bam                ||
||                                                                            ||
||             Output file : 240429_polyA1_featureCounts                      ||
||                 Summary : 240429_polyA1_featureCounts.summary              ||
||              Annotation : 210928_allChrs_plus-pTRI.gtf (GTF)               ||
||      Dir for temp files : /data16/marcus/scripts/nanoporePipelineScrip ... ||
||      Assignment details : <input_file>.featur

KeyboardInterrupt: 

In [None]:
genes_to_print = [
    'WBGene00023068',
    'WBGene00023067',
]

# featCounts_df is read with explicit `names=` and no index column, so its index is
# a plain integer range and the gene IDs live in the "GeneID" column. Filtering on
# `index` would never match a gene ID string; filter on the column instead.
featCounts_df[featCounts_df["GeneID"].isin(genes_to_print)]

In [12]:
# One featureCounts run over every library's BAM at once, producing a single
# multi-column count table.
regen = False  # False: reuse the output file if one with today's date stamp exists
# NOTE(review): the annotation is taken from the 'polyA1' run only; this assumes
# every library was mapped against the same GTF - confirm before changing libs_to_run.
gtf_path = obj_dict['polyA1'].gtf_path
bam_paths_dict = {lib: obj.bam_path for lib, obj in obj_dict.items()}
bam_paths = [str(lib_bam_path) for lib_bam_path in bam_paths_dict.values()]
libs = list(obj_dict.keys())
output_dir = overall_output_dir / "allLibs_fractional"
output_dir.mkdir(exist_ok=True)

# The file name embeds npCommon.get_dt(), so the cache check below only hits for
# output produced under the same date stamp.
gene_count_alllibs_output_file = output_dir / f"{npCommon.get_dt(for_file=True)}_{'-'.join(libs)}_featureCounts"
if regen or not gene_count_alllibs_output_file.exists():
    # Argument list (no shell): each path is passed as one argument, so spaces or
    # shell metacharacters in a path cannot split or alter the command.
    featCounts_call = [
        "featureCounts",
        "-L",
        "-T", str(threads),
        "-R", "CORE",
        "-a", str(gtf_path),
        "-o", str(gene_count_alllibs_output_file),
        # "-O",  # this will count all features that a read overlaps, instead of tossing it!
        # "--fraction",
        "--largestOverlap",
        "-s", "1",
        *bam_paths,
    ]
    # check=True: raise CalledProcessError on a non-zero exit instead of letting
    # later cells fail on a missing or partial count table.
    subprocess.run(featCounts_call, check=True)
else:
    print(f"Already ran {'-'.join(libs)}! w/ input files {bam_paths}, gtf file {gtf_path} and output file {gene_count_alllibs_output_file}!")


        =====         / ____| |  | |  _ \|  __ \|  ____|   /\   |  __ \ 
          =====      | (___ | |  | | |_) | |__) | |__     /  \  | |  | |
            ====      \___ \| |  | |  _ <|  _  /|  __|   / /\ \ | |  | |
              ====    ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
	  v2.0.0

||                                                                            ||
||             Input files : 6 BAM files                                      ||
||                           o cat.sorted.mappedAndPrimary.bam                ||
||                           o cat.sorted.mappedAndPrimary.bam                ||
||                           o cat.sorted.mappedAndPrimary.bam                ||
||                           o cat.sorted.mappedAndPrimary.bam                ||
||                           o cat.sorted.mappedAndPrimary.bam                ||
||                           o cat.sorted.mappedAndPrimary.bam                ||
||                                              

In [13]:
# The count columns are relabelled below from BAM path to library nickname, so
# invert the nickname -> path mapping to get path -> nickname.
rev_bam_paths_dict = {str(bam): nickname for nickname, bam in bam_paths_dict.items()}
print(rev_bam_paths_dict)

# Skip the first line of the file so that the second line is used as the header,
# then normalise the column names and index the table by gene ID.
featCounts_alllibs_df = (
    pd.read_csv(gene_count_alllibs_output_file, sep="\t", skiprows=1)
    .rename(columns={"Geneid": "Gene_ID"})
    .rename(columns=rev_bam_paths_dict)
    .set_index("Gene_ID")
)

# Keep only the per-library count columns, add a row total across libraries, and
# rank genes from most to fewest total reads.
featCounts_alllibs_simple_df = (
    featCounts_alllibs_df[libs]
    .assign(sum=lambda counts: counts.sum(axis=1))
    .sort_values("sum", ascending=False)
)
featCounts_alllibs_simple_df.head(25)

{'/data16/marcus/working/210528_NanoporeRun_0639_L3s/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam': 'polyA1', '/data16/marcus/working/210719_nanoporeRun_polyA_0639_L3_replicate/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam': 'polyA2', '/data16/marcus/working/220131_nanoporeRun_polyA_0639_L3_third/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam': 'polyA3', '/data16/marcus/working/210709_NanoporeRun_totalRNA_0639_L3/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam': 'totalRNA1', '/data16/marcus/working/210720_nanoporeRun_totalRNA_0639_L3_replicate/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam': 'totalRNA2', '/data16/marcus/working/220131_nanoporeRun_totalRNA_0639_L3_third/output_dir/cat_files/cat.sorted.mappedAndPrimary.bam': 'totalRNA3'}


Unnamed: 0_level_0,polyA1,polyA2,polyA3,totalRNA1,totalRNA2,totalRNA3,sum
Gene_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
WBGene00010965,29388,24489,12365,1581,11435,10403,89661
WBGene00010964,16100,17959,7232,1687,11901,12684,67563
WBGene00010960,16647,16723,7746,1294,9330,9256,60996
WBGene00010962,17495,14532,8226,982,7365,8208,56808
WBGene00004494,11602,9348,14101,516,4109,8954,48630
WBGene00001168,12013,7874,11884,521,4543,8832,45667
WBGene00021350,10041,7417,12343,435,3476,7592,41304
WBGene00004477,9550,7078,11696,408,3121,6633,38486
WBGene00004492,9427,7078,11487,391,3003,6326,37712
WBGene00000829,8739,9066,4791,799,5772,7143,36310


In [14]:
# Spot-check the counts for a couple of genes of interest.
genes_to_print = ['WBGene00023068', 'WBGene00023067']

# Boolean mask on the Gene_ID index; rows come back in the table's current order.
featCounts_alllibs_simple_df[featCounts_alllibs_simple_df.index.isin(genes_to_print)]


Unnamed: 0_level_0,polyA1,polyA2,polyA3,totalRNA1,totalRNA2,totalRNA3,sum
Gene_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
WBGene00023068,6339,5215,8211,444,2290,4176,26675
WBGene00023067,418,367,411,25,154,178,1553


In [25]:
# Narrow the table to the libraries carried forward; this drops the "totalRNA1"
# and "sum" columns.
# NOTE(review): the reason for leaving out totalRNA1 is not recorded in this notebook.
kept_libs = ['polyA1', 'polyA2', 'polyA3', 'totalRNA2', 'totalRNA3']
featCounts_alllibs_simple_df = featCounts_alllibs_simple_df.loc[:, kept_libs].copy()

# Report the total number of assigned reads per library.
for col, counts in featCounts_alllibs_simple_df.items():
    print(f"{col} total assigned reads: {counts.sum():,}")
featCounts_alllibs_simple_df.head(50)

polyA1 total assigned reads: 1,963,087
polyA2 total assigned reads: 1,336,530
polyA3 total assigned reads: 1,747,422
totalRNA2 total assigned reads: 697,736
totalRNA3 total assigned reads: 1,228,387


Unnamed: 0_level_0,polyA1,polyA2,polyA3,totalRNA2,totalRNA3
Gene_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WBGene00010965,29388,24489,12365,11435,10403
WBGene00010964,16100,17959,7232,11901,12684
WBGene00010960,16647,16723,7746,9330,9256
WBGene00010962,17495,14532,8226,7365,8208
WBGene00004494,11602,9348,14101,4109,8954
WBGene00001168,12013,7874,11884,4543,8832
WBGene00021350,10041,7417,12343,3476,7592
WBGene00004477,9550,7078,11696,3121,6633
WBGene00004492,9427,7078,11487,3003,6326
WBGene00000829,8739,9066,4791,5772,7143


In [26]:
# Write the per-library count table next to the notebook; the file name lists the
# library columns it contains, joined by '-'.
readcounts_csv_name = f"featureCounts_readCounts_{'-'.join(featCounts_alllibs_simple_df.columns)}.csv"
featCounts_alllibs_simple_df.to_csv(working_dir / readcounts_csv_name)

In [30]:
# Gene IDs to look up in the final count table.
genes_to_print = [
    'WBGene00023068', 'WBGene00023067',
    'WBGene00004446', 'WBGene00004419',
    'WBGene00004451', 'WBGene00004432',
]

# Boolean mask on the Gene_ID index; rows come back in the table's current order,
# not in the order listed above.
featCounts_alllibs_simple_df[featCounts_alllibs_simple_df.index.isin(genes_to_print)]

Unnamed: 0_level_0,polyA1,polyA2,polyA3,totalRNA2,totalRNA3
Gene_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WBGene00004446,6269,4988,8366,2502,5441
WBGene00023068,6339,5215,8211,2290,4176
WBGene00004419,4127,2675,4985,1825,3563
WBGene00004432,1693,1155,1861,1654,3249
WBGene00004451,511,404,690,184,429
WBGene00023067,418,367,411,154,178
