# stackedBarPlotsOfGeneBiotpes.ipynb
## Marcus Viscardi  June 22, 2022

The general goal here is to produce stacked bar plots, with one bar for each of the four libraries, showing gene biotype information (e.g. protein_coding, tRNA, rRNA, ncRNA).

The plan is to pull the read_ids and assigned gene_ids straight from the featureCounts output, then merge them with a parsed GTF file to get gene biotype information. From there, the pandas `value_counts` method should handle the counting.

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', None)

import scipy.stats as stats

import nanoporePipelineCommon as npCommon

from tqdm.notebook import tqdm
from pprint import pprint

import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
print("imports done!")

imports done!


In [6]:
# Libraries to compare; the keys of the dicts built below are these library names.
libs_to_get = ["polyA2", "polyA3", "totalRNA2", "totalRNA3"]

# Resolve, for each library, the path of its featureCounts output file.
lib_paths_dict = npCommon.pick_libs_return_paths_dict(
    libs_to_get,
    output_dir_folder="featureCounts",
    file_midfix="cat.sorted.mappedAndPrimary.bam",
    file_suffix="featureCounts",
)

# The files are read as headerless tables, so the four column names are supplied here.
libs_dict_raw = {
    lib: pd.read_table(
        path,
        header=None,
        names=["read_id", "featC_QC_tag", "featC_QC_score", "gene_id"],
    )
    for lib, path in lib_paths_dict.items()
}

Looking for file for totalRNA2, at /data16/marcus/working/210720_nanoporeRun_totalRNA_0639_L3_replicate/output_dir/featureCounts/*cat.sorted.mappedAndPrimary.bam.featureCounts... File Found.
Looking for file for polyA2, at /data16/marcus/working/210719_nanoporeRun_polyA_0639_L3_replicate/output_dir/featureCounts/*cat.sorted.mappedAndPrimary.bam.featureCounts... File Found.
Looking for file for polyA3, at /data16/marcus/working/220131_nanoporeRun_polyA_0639_L3_third/output_dir/featureCounts/*cat.sorted.mappedAndPrimary.bam.featureCounts... File Found.
Looking for file for totalRNA3, at /data16/marcus/working/220131_nanoporeRun_totalRNA_0639_L3_third/output_dir/featureCounts/*cat.sorted.mappedAndPrimary.bam.featureCounts... File Found.


In [68]:
# Pre-parsed GTF annotations; only the 'gene' feature rows and three columns are kept.
parsed_gtf_path = "/data16/marcus/genomes/elegansRelease100/Caenorhabditis_elegans.WBcel235.100.gtf.parquet"
gtf_df = pd.read_parquet(parsed_gtf_path)
gtf_df = gtf_df[gtf_df["feature"] == "gene"][["gene_id", "gene_name", "gene_biotype"]]

# Left merge keeps every read; reads whose gene_id has no match in the GTF
# end up with missing gene_name / gene_biotype values.
libs_dict = {
    lib: pd.merge(
        df[["read_id", "gene_id", "featC_QC_tag"]],
        gtf_df,
        on="gene_id",
        how="left",
    )
    for lib, df in libs_dict_raw.items()
}

In [109]:
# Stack the per-library read tables into one frame; the outer index level
# records which library each row came from.
multi_df = (
    pd.concat(libs_dict.values(), keys=libs_dict.keys())
    .rename_axis(index=["lib", "old_index"])
)

# Move the library name into a regular "lib" column and discard the old row index.
# Wherever the gene annotation is missing, fall back to the featureCounts tag so
# those rows carry the tag as their label instead of NaN.
# NOTE: this uses assign() rather than `concat_df_raw.<col>.fillna(..., inplace=True)`:
# an in-place call on a column accessor is chained assignment, which warns on
# pandas 2.2 and silently does nothing once Copy-on-Write is enabled.
concat_df_raw = (
    multi_df
    .reset_index(level="lib")
    .reset_index(drop=True)
    .assign(
        gene_biotype=lambda df: df["gene_biotype"].fillna(df["featC_QC_tag"]),
        gene_id=lambda df: df["gene_id"].fillna(df["featC_QC_tag"]),
        gene_name=lambda df: df["gene_name"].fillna(df["featC_QC_tag"]),
    )
)

In [119]:
# Keep only the library and gene annotation columns. Many reads map to the same
# gene, so dropping duplicate rows leaves one row per (lib, gene) combination.
concat_df: pd.DataFrame = (
    concat_df_raw
    .loc[:, ["lib", "gene_id", "gene_name", "gene_biotype"]]
    .drop_duplicates()
)

In [145]:
def biotypes_stacked_bar_plot(raw_concat_df, plot_non_protein_coding=True, save_file=True, save_file_type="svg",
                              save_dir=None):
    """Draw a stacked bar chart of the number of distinct genes per biotype in each library.

    Parameters
    ----------
    raw_concat_df : pd.DataFrame
        Table with at least the columns "lib", "gene_id", "gene_name" and
        "gene_biotype". Duplicate rows are collapsed before counting. The frame
        passed in is not modified.
    plot_non_protein_coding : bool, default True
        If True, remove "protein_coding", "Unassigned_Ambiguity" and
        "Unassigned_NoFeatures" and plot each remaining biotype separately.
        If False, plot "protein_coding" against every other biotype lumped
        together as "other".
    save_file : bool, default True
        Whether to write the figure to disk with ``fig.write_image``.
    save_file_type : str, default "svg"
        File extension (without the dot) used for the saved figure.
    save_dir : str, optional
        Directory the figure is written to. Defaults to the project's
        ``final_SVGs`` folder.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    if save_dir is None:
        save_dir = ("/home/marcus/Insync/mviscard@ucsc.edu/Google Drive/insync_folder/"
                    "polyAvsTotalRNA_ReviewEditsAndPlots/final_SVGs")
    lib_display_names = {"polyA3": "Selected-1",
                         "polyA2": "Selected-2",
                         "totalRNA3": "Unselected-1",
                         "totalRNA2": "Unselected-2"}
    hidden_biotypes = ["protein_coding", "Unassigned_Ambiguity", "Unassigned_NoFeatures"]

    # Work from the frame that was passed in (not a notebook global) and collapse
    # duplicate rows so that each gene is counted once per library.
    genes_df = raw_concat_df[["lib", "gene_id", "gene_name", "gene_biotype"]].drop_duplicates()
    genes_df = genes_df.assign(lib=genes_df["lib"].replace(lib_display_names))

    if plot_non_protein_coding:
        # Skip the first three palette colours for the non-protein-coding plot.
        color_scale = px.colors.qualitative.T10[3:]
    else:
        is_protein_coding = genes_df["gene_biotype"] == "protein_coding"
        genes_df = genes_df.assign(gene_biotype=genes_df["gene_biotype"].where(is_protein_coding, "other"))
        color_scale = px.colors.qualitative.T10

    # Raw counts only: normalize=True would divide by the total over all libraries,
    # not within each library.
    # The count column is named explicitly because its default name differs
    # between pandas versions (0 before 2.0, "count" from 2.0 on).
    long_df = (
        genes_df
        .value_counts(subset=["lib", "gene_biotype"], normalize=False)
        .rename("gene_count")
        .reset_index()
    )
    if plot_non_protein_coding:
        long_df = long_df[~long_df["gene_biotype"].isin(hidden_biotypes)]

    fig = px.bar(long_df, x="lib", y="gene_count", color="gene_biotype",
                 color_discrete_sequence=color_scale)
    fig.update_layout(
        title="Gene Biotypes Sequenced",
        yaxis_title='Genes',
        xaxis_title='Library',
        template="plotly_white",
        width=400, height=500,
    )

    if save_file:
        # The suffix names what the figure actually shows.
        if plot_non_protein_coding:
            file_suffix = f"_nonProteinCoding.{save_file_type}"
        else:
            file_suffix = f"_proteinCodingAndOther.{save_file_type}"
        fig.write_image(f"{save_dir}/{npCommon.get_dt(for_file=True)}_geneBiotypes{file_suffix}")
    fig.show()


# First figure: only the non-protein-coding biotypes, each as its own colour.
biotypes_stacked_bar_plot(
    concat_df_raw,
    plot_non_protein_coding=True,
    save_file=True,
)
# Second figure: protein_coding versus all other biotypes grouped as "other".
biotypes_stacked_bar_plot(
    concat_df_raw,
    plot_non_protein_coding=False,
    save_file=True,
)