In [None]:
# %load ../snippets/plotly_settings.py
from datetime import date
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import plotly.io as pio
import yaml
import re

# Plot styling: seaborn context first, then matplotlib defaults applied in one update.
sns.set_context("notebook", font_scale=1.4)
plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
})

# pandas display: up to 100 rows/columns, floats with thousands separators and 4 decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option('display.float_format', '{:,.4f}'.format)


# Named hex colours available to the plots in this notebook.
sushi_colors = dict(
    red='#C0504D',
    orange='#F79646',
    medSea='#4BACC6',
    black='#000000',
    dgreen='#00B04E',
    lgreen='#92D050',
    dblue='#366092',
    lblue='#95B3D7',
)

# Today's date as dd-mm-yy.
today = f"{date.today():%d-%m-%y}"

In [None]:
# Base directory of the transcriptomics scratch area; later cells build flagstat and count paths from it.
root = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/03_23_transcriptomics")

# Mapped to mouse rRNA

In [None]:
# Locations of the flagstat reports for the mouse SSU and LSU rRNA mappings.
# The directories are derived from `root` instead of repeating its absolute path.
ssu_flagstat_dir = root/"mouse_rrna_bwa"
lsu_flagstat_dir = root/"mouse_lsu_rrna_bwa"
# NOTE(review): the SSU report ends in ".flagstats" and the LSU report in ".flagstat";
# confirm both names against the files on disk.
ssu_stats = ssu_flagstat_dir/"mouse_ssu_rrna.flagstats"
lsu_stats = lsu_flagstat_dir/"mouse_lsu_rrna.flagstat"

In [None]:
def process_flagstat_file(file, prefix = ''):
    """Parse a concatenated ``samtools flagstat`` report into one row per sample.

    The report is read as a sequence of header lines (any line starting with a
    letter; the text before the first "/" becomes the sample id) each followed by
    the flagstat lines of that sample.

    Parameters
    ----------
    file : str or path-like
        Report to read.
    prefix : str, optional
        Not used by the parser; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``sample_id``, ``total``, ``mapped``, ``paired`` (read counts taken
        from the first field of the matching flagstat line) and ``%mapped``
        (``mapped / total * 100``). A counter missing for a sample is NaN.
    """
    records = []
    current = None
    with open(file, 'r') as fh:
        for line in fh:
            if re.match(r'[A-Za-z]', line):
                # Header line: start a new sample. strip() removes the newline that
                # would otherwise stay in the id when the header has no "/".
                current = {'sample_id': line.split("/")[0].strip()}
                records.append(current)
                continue
            if 'total' in line:
                column = 'total'
            elif 'mapped (' in line and 'primary mapped (' not in line:
                # Newer samtools versions print an extra "primary mapped (" line;
                # counting it too would shift every following sample by one row.
                column = 'mapped'
            elif 'properly paired' in line:
                column = 'paired'
            else:
                continue
            if current is None:
                # Counters that appear before any header still get a row.
                current = {'sample_id': None}
                records.append(current)
            # Keep the first value seen for each counter of a sample.
            current.setdefault(column, int(line.split()[0]))
    df = pd.DataFrame(records, columns=['sample_id', 'total', 'mapped', 'paired'])
    df['%mapped'] = df['mapped']/df['total']*100
    return df

In [None]:
# Run the same parser over the SSU and the LSU rRNA flagstat reports.
ssu_df, lsu_df = (process_flagstat_file(report) for report in (ssu_stats, lsu_stats))

In [None]:
# Write the SSU table, highest mapped percentage first, as CSV under `root`.
(
    ssu_df
    .sort_values('%mapped', ascending=False)
    .to_csv(root/'13-04-23_salmonella_mapped_to_mouse_rrna.csv', index=False)
)


In [None]:
lsu_df
%store lsu_df

In [None]:
# Display the SSU table with the highest mapped percentage first.
ssu_df.sort_values('%mapped', ascending=False)

In [None]:
import re
def parse_flagstat(filename):
    """Collect per-sample counters from a concatenated ``samtools flagstat`` report.

    Lines that start with a letter are taken as sample headers (stored stripped);
    from the remaining lines the text before the first "+" is kept for the
    "total", "mapped (" and "properly" lines.

    Parameters
    ----------
    filename : str or path-like
        Report to read.

    Returns
    -------
    pandas.DataFrame
        One row per sample with columns ``samples``, ``totals``, ``mapped`` and
        ``mapped_paired``; the counters are returned as strings.
    """
    samples, totals, mapped, proper = [], [], [], []
    with open(filename, 'r') as fh:
        for line in fh:
            if re.match(r'^[a-zA-Z]', line):
                samples.append(line.strip())
                continue
            count = line.split("+")[0].strip()
            if 'total' in line:
                totals.append(count)
            elif 'primary mapped (' in line:
                # Newer samtools versions print this line in addition to "mapped (";
                # recording both would misalign the counters with the samples.
                continue
            elif 'mapped (' in line:
                mapped.append(count)
            elif 'properly' in line:
                proper.append(count)
    return pd.DataFrame([samples, totals, mapped, proper], index=['samples', 'totals', 'mapped', 'mapped_paired']).T


# Flagstat counters of the mouse rRNA mapping, indexed by sample.
# Values are halved and divided by 1e6 (presumably reads -> read pairs, in millions; TODO confirm).
df = (
    parse_flagstat(root/'mouse_rrna_bwa/mouse_rrna.flagstats')
    .set_index('samples')
    .astype(int)
    / 2
    / 1e6
)
df

# ASF counts 

In [None]:
# Strain names to load; load_htseq appends "_htseqcount" to each to find its count directory under `root`.
asf_strains = ['ASF519', 'YL31', 'YL58', 'SL1344_asf', 'YL32', 'dsm755']



# Load all of the count files

def load_htseq(strain):
    """Load all htseq-count tables of one count directory into a long DataFrame.

    Parameters
    ----------
    strain : str or os.PathLike
        Either a strain name, in which case counts are read from
        ``root/<strain>_htseqcount``, or the count directory itself as a path
        object (other cells of this notebook call the function that way).

    Returns
    -------
    pandas.DataFrame
        Columns ``Name`` and ``count`` (the two tab-separated fields of every
        ``*htseqcount.txt`` file found recursively), ``sample_id`` (file name up
        to its first ".") and ``strain``. For a path argument ``strain`` is the
        directory name with a trailing "_htseqcount" removed.

    Raises
    ------
    ValueError
        If the directory holds no ``*htseqcount.txt`` file.
    """
    count_suffix = '_htseqcount'
    if isinstance(strain, os.PathLike):
        # A directory was handed in directly; str + Path concatenation would fail.
        dir_name = Path(strain)
        label = dir_name.name
        if label.endswith(count_suffix):
            label = label[:-len(count_suffix)]
    else:
        dir_name = root/(strain+count_suffix)
        label = strain

    # Sorted so the row order does not depend on directory traversal order.
    files = sorted(dir_name.rglob("*htseqcount.txt"))
    if not files:
        raise ValueError(f"No '*htseqcount.txt' files found under {dir_name}")

    df = pd.concat([(pd.read_table(f, header=None, names=['Name', 'count'])
                     .assign(sample_id = f.stem.split('.')[0])) for f in files])
    df['strain'] = label
    return df

In [None]:
def get_stats(counts_df, prefix=''):
    """Summarise htseq-count results per strain and sample, in millions of reads.

    Rows whose ``Name`` starts with "__" are treated as special counters and become
    one column each (leading underscores removed); all other rows are summed into
    the ``to_feature`` column.

    Parameters
    ----------
    counts_df : pandas.DataFrame
        Long table with columns ``Name``, ``count``, ``sample_id`` and ``strain``.
        It must contain the ``__no_feature`` and ``__ambiguous`` counters.
    prefix : str, optional
        When given, every counter column plus ``to_feature`` and ``mapped`` is
        named ``<prefix>_<name>``. ``total`` is never prefixed.

    Returns
    -------
    pandas.DataFrame
        One row per (strain, sample_id) with the counter columns, ``to_feature``,
        ``mapped`` (no_feature + to_feature + ambiguous) and ``total`` (to_feature
        plus every special counter, each counted once).
    """
    def _col(name):
        return f'{prefix}_{name}' if prefix else name

    mapped = _col('mapped')
    no_feat = _col('no_feature')
    to_feat = _col('to_feature')
    amb = _col('ambiguous')
    keys = ['strain', 'sample_id']

    is_special = counts_df.Name.str.startswith("__")

    # Sum only the numeric `count` column, so the `Name` strings are not aggregated as well.
    feat = (counts_df[~is_special]
            .groupby(keys)['count']
            .sum()
            .div(1e6)
            .rename(to_feat)
            .reset_index())

    # One column per special counter; explicit `values` keeps the columns flat.
    stats_df = counts_df[is_special].pivot(index=keys, columns='Name', values='count')/1e6
    special_cols = [_col(c.strip('_')) for c in stats_df.columns]
    stats_df.columns = special_cols
    stats_df = stats_df.reset_index().merge(feat, on=keys)

    stats_df[mapped] = stats_df[no_feat] + stats_df[to_feat] + stats_df[amb]
    # `mapped` already contains `ambiguous`; adding both again would count those
    # reads twice, so the total is built from the raw counters instead.
    stats_df['total'] = stats_df[special_cols].sum(axis=1) + stats_df[to_feat]
    return stats_df

In [None]:
# Load the counts of every strain and stack them into one long table.
# A comprehension keeps the loop from rebinding the notebook-level `df`.
all_strains = [load_htseq(strain) for strain in asf_strains]
asf_df = pd.concat(all_strains)

In [None]:
stats_df.sample_id.unique()

In [None]:
stats_df = get_stats(asf_df)

In [None]:
# Rows of the statistics table for sample 'ae132D1'.
stats_df[stats_df.sample_id == 'ae132D1']

In [None]:
# Rows of the statistics table for strain 'YL32'.
stats_df[stats_df.strain == 'YL32']

In [None]:
# Analysis directories under `root`, one per mapping setup.
sal_alone_dir = root/'sal_only'
sal_metaflye_dir = root/'sal_metaflye'
sal_asf_dir = root/'sal_asf'
# Sub-directory names that get appended to the analysis directories.
# NOTE(review): `bwa_dir` is not referenced in the visible part of the notebook.
bwa_dir = 'bwa'
count_dir = 'htseqcount'

# Salmonella alone: number of reads aligned to genes

In [None]:
# Long count table of the Salmonella-only mapping.
sal_counts = load_htseq(sal_alone_dir/count_dir)
# Wide Name x sample table. `values='count'` keeps the `strain` column out of the
# pivot, so every sample appears as exactly one column.
sal_counts_long = (
    sal_counts
    .pivot(index='Name', columns='sample_id', values='count')
    .rename_axis(columns=None)
    .reset_index()
)
# Drop the rows whose Name contains "__" (the special counters).
sal_counts_long = sal_counts_long[~sal_counts_long.Name.str.contains('__')]
# Sum of the remaining counts, per sample.
sal_alone_summary = sal_counts_long.sum(numeric_only=True)

In [None]:
# Per-sample statistics for the Salmonella-only counts.
# NOTE(review): this rebinds `stats_df`, which earlier cells use for the ASF strains.
stats_df = get_stats(sal_counts)
stats_df

# Salmonella + metaflye 

In [None]:
# Long count table of the Salmonella + metaflye mapping.
flye_counts = load_htseq(sal_metaflye_dir/count_dir)
# Disabled wide-table summary; NOTE(review): it reads `sal_counts`, not `flye_counts`.
# sal_counts_long = sal_counts.pivot(index='Name', columns='sample_id')
# sal_counts_long.columns = [s[1] for s in sal_counts_long]
# sal_counts_long = sal_counts_long.reset_index()
# sal_counts_long = sal_counts_long[~sal_counts_long.Name.str.contains('__')]
# sal_flye_summary = sal_counts_long.sum(numeric_only=True)

In [None]:
# Per-sample statistics for the Salmonella + metaflye counts.
flye_stats = get_stats(flye_counts)

In [None]:
# Display the metaflye statistics table.
flye_stats

# Sal + ASF

In [None]:
# Long count table of the Salmonella + ASF mapping.
asf_counts = load_htseq(sal_asf_dir/count_dir)
# Disabled wide-table summary. NOTE(review): this is the only place that assigns
# `sal_asf_summary`, and it reads `sal_counts` rather than `asf_counts` — confirm the
# intended source before re-enabling it.
# sal_counts_long = sal_counts.pivot(index='Name', columns='sample_id')
# sal_counts_long.columns = [s[1] for s in sal_counts_long]
# sal_counts_long = sal_counts_long.reset_index()
# sal_counts_long = sal_counts_long[~sal_counts_long.Name.str.contains('__')]
# sal_asf_summary = sal_counts_long.sum(numeric_only=True)

In [None]:
# Counts and prefixed statistics for three individual strains.
# NOTE(review): `yl32_dir`, `yl58_dir` and `asf519_dir` are not assigned in the visible
# part of the notebook; define them before these cells or they raise NameError.
yl32_counts = load_htseq(yl32_dir)
yl32_stats = get_stats(yl32_counts, 'yl32')

In [None]:
yl58_counts = load_htseq(yl58_dir)
yl58_stats = get_stats(yl58_counts, 'yl58')

In [None]:
asf519_counts = load_htseq(asf519_dir)
asf519_stats = get_stats(asf519_counts, 'asf519')

In [None]:
# Samples whose 'asf519_mapped' value is below 3 (get_stats reports millions of reads).
asf519_stats[asf519_stats['asf519_mapped'] < 3]

In [None]:
# Display the YL32 statistics table.
yl32_stats

In [None]:
# Display the YL58 statistics table.
yl58_stats

In [None]:
# Turn the per-sample summary into a two-column frame (sample_id, feature).
# NOTE(review): no active line in the visible notebook assigns `sal_asf_summary`.
df4 = pd.DataFrame(sal_asf_summary).reset_index()
df4.columns = ['sample_id', 'feature']
# Attach the `sal_counts` rows whose Name contains '__no_' (presumably the __no_feature counter).
df5 = df4.merge(sal_counts[sal_counts['Name'].str.contains('__no_')].sort_values('count'), on='sample_id')

In [None]:
# Mapped reads in millions: feature sum plus the merged counter row.
# NOTE(review): unlike get_stats, this does not add the ambiguous counter — confirm that is intended.
df5['mapped'] = (df5['feature'] + df5['count'])/1000000

In [None]:
# Display the merged table.
df5

In [None]:
# Scratch note (presumably sample ids), kept as a comment so the cell is valid Python:
# aw9332D4 aw9332D4 aw138D3 aw141D3 aw135D4

In [None]:
# Scratch note, kept as a comment so the cell is valid Python:
# 653 651


In [None]:
# Rows of `sal_counts` whose Name contains "__not", with the count expressed in millions.
not_aligned = (
    sal_counts
    .loc[sal_counts.Name.str.contains("__not")]
    .assign(unmapped=lambda frame: frame['count'] / 1e6)
)

In [None]:
# Mapped and unmapped reads (millions) side by side per sample, plus their sum.
fdf = (
    not_aligned
    .merge(df5, on='sample_id')
    .loc[:, ['sample_id', 'mapped', 'unmapped']]
    .assign(total=lambda frame: frame['mapped'] + frame['unmapped'])
)

In [None]:
# Rows of sample 'AU651' whose Name contains "__" (the special counters).
sal_counts[(sal_counts.sample_id=='AU651') &(sal_counts.Name.str.contains("__"))]

In [None]:
# Samples ordered by unmapped reads, largest first.
fdf.sort_values('unmapped', ascending=False)


In [None]:
# Shape of the subset with `mapped` below 10; the first entry is the number of such samples.
fdf[fdf.mapped < 10].shape