# Settings

In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import plotly.io as pio
import yaml
import pyranges as pr
import requests
from time import sleep
import json

sns.set_context("notebook", font_scale=1.1)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
plt.rcParams["figure.figsize"] = (16, 12)
plt.rcParams['savefig.dpi'] = 200
plt.rcParams['figure.autolayout'] = False
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.size'] = 16
plt.rcParams['lines.linewidth'] = 2.0
plt.rcParams['lines.markersize'] = 8
plt.rcParams['legend.fontsize'] = 14
plt.rcParams['text.usetex'] = False  # True activates latex output in fonts!
pd.set_option('display.float_format', lambda x: '{:,.2f}'.format(x))
from datetime import date
today = date.today().strftime("%d-%m-%y")

In [None]:
sys.path.append("..")
from snippets.utils import syncom_colors, find_pcs, find_umap_df, genome_map, ncbi_taxid_map, link_to_string, string_function

In [None]:
with open("config.yaml", "r") as fh:
    config_dict = yaml.safe_load(fh)['default']
root = Path(config_dict["root"])
out_dir = root/config_dict['output_dir']
sample_data_file = root/config_dict['sample_data_file']
sd = pd.read_csv(sample_data_file)

# Plan

1. How many reads in each file mapped to each genome
    Total numbers:
    
    - median 71% mapped to CDS
    - median of 6% remains unmapped
    
2. Transcriptome composition across samples
    
    
3. Saturation curves for each genome

Conclusion: which genomes can we analyze?

For each genome that we can analyze:
1. PCA: looking for outliers / clustering based on LPS treatment
2. Sanity check: do we see activation of stress response?
3. Run DESeq
    - Can we overlay results on STRING?
    - Add functional annotation from eggNOG for visualisation

# Look at how many reads mapped where

- The read totals reported by featureCounts differ from the per-chromosome counts I obtained separately with an awk(?) command. The cause of the discrepancy is not yet understood.

In [None]:
# Number of reads mapped per chromosome
alignment_dir = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/03_23_transcriptomics/bowtie")
align_seq = list(alignment_dir.rglob("*/*.seq"))
seq_df = pd.concat([pd.read_table(f, header=None, names=['read_counts', 'chr']).assign(sample_id=f.stem) for f in align_seq])
seq_df['genome'] = seq_df['chr'].replace(genome_map)
# Featurecounts summary
featcnts_dir = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/03_23_transcriptomics/bowtie_featurecounts")
featcnt_files = list(featcnts_dir.rglob("*/*.summary"))

In [None]:
seq_df = seq_df.merge(sd, on='sample_id')
seq_df = seq_df.merge(seq_df.groupby('sample_id').read_counts.sum().reset_index().rename(columns={'read_counts':'totals'}), on='sample_id')

In [None]:
def summary_df(files):
    """Combine featureCounts ``.summary`` files into one per-sample table.

    Each file is read with ``pd.read_table``. The sample id is taken from the
    header of the second column: the last path component, up to its first dot.

    Parameters
    ----------
    files : iterable of path-like
        Summary files to read (one per sample).

    Returns
    -------
    pandas.DataFrame
        One row per sample with ``sample_id``, ``total`` (sum over all status
        rows), ``assigned``, ``unmapped``, ``no_feature`` and the matching
        ``percent_*`` columns (each count as a percentage of ``total``).
    """
    per_file = []
    for path in files:
        table = pd.read_table(path)
        sample = table.columns[1].split("/")[-1].split('.')[0]
        table = table.assign(sample_id=sample)
        table.columns = ['status', 'read_counts', 'sample_id']
        per_file.append(table)
    long_df = pd.concat(per_file)

    summary = long_df.groupby('sample_id').read_counts.sum().reset_index()
    summary.columns = ['sample_id', 'total']

    # featureCounts status label -> column name in the summary table
    status_columns = (
        ('Assigned', 'assigned'),
        ('Unassigned_Unmapped', 'unmapped'),
        ('Unassigned_NoFeatures', 'no_feature'),
    )
    for status, column in status_columns:
        rows = long_df[long_df.status == status][['read_counts', 'sample_id']]
        summary = summary.merge(rows, on='sample_id').rename({'read_counts': column}, axis=1)

    for _, column in status_columns:
        summary[f'percent_{column}'] = summary[column] / summary['total'] * 100
    return summary

In [None]:
sum_df = summary_df(featcnt_files)
sum_df['mapped'] = (sum_df['total'] - sum_df['unmapped'])/2

In [None]:
sum_df.merge(seq_df.groupby('sample_id').read_counts.sum().reset_index(), on=['sample_id'], how='outer')

In [None]:
seq_df['RelAb'] = seq_df['read_counts']/seq_df['totals']

In [None]:
# Stacked bar chart of per-genome relative read abundance for the Oligo mice,
# with samples ordered by treatment and then sample id.
fig = px.bar(seq_df[seq_df.Mouse == 'Oligo'].sort_values(['Treatment', 'sample_id']),
             x='sample_id', y='RelAb', color='genome', color_discrete_map=syncom_colors,
             template='plotly_white', width=800, height=800,
             # label keys must name the plotted columns; y is 'RelAb' here
             # (the previous 'genome_perc' key never matched, so no axis title was applied)
             labels={'RelAb': 'Transcriptome abundance', 'sample_id': ''})

# Dashed separator between the two treatment groups.
# NOTE(review): the fixed positions assume six LPS bars followed by six PBS bars - confirm.
fig.add_shape(type="line",
              x0=5.5, y0=0, x1=5.5, y1=1,
              line=dict(color="black", width=2, dash="dash"))
fig.add_annotation(x=3, y=1.03, text="LPS", showarrow=False)
fig.add_annotation(x=8, y=1.03, text="PBS", showarrow=False)

# Look at transcriptome abundances

- Note: Sanne has matching 16S data, would be interesting to compare

In [None]:
raw_counts = pd.read_csv(out_dir/"annotated-featurecounts-for-deseq.csv")
raw_counts = raw_counts.melt(id_vars=['ID', 'genome'], var_name='sample_id', value_name='fcnts')
raw_counts = (raw_counts.merge(raw_counts
                               .groupby('sample_id').fcnts.sum()
                               .reset_index()
                               .rename(columns={'fcnts': 'sample_total'}), on='sample_id', how='left'))
raw_counts = (raw_counts.merge(raw_counts
                               .groupby(['sample_id', 'genome']).fcnts.sum()
                               .reset_index()
                               .rename(columns={'fcnts': 'genome_total'}), on=['sample_id', 'genome'], how='left'))

In [None]:
raw_counts['genome_perc'] = raw_counts['genome_total']/raw_counts['sample_total']
genome_counts = raw_counts[['sample_id', 'genome', 'sample_total', 'genome_total', 'genome_perc']].drop_duplicates()
genome_counts = genome_counts.merge(sd, on='sample_id')

## Oligo mice

In [None]:
oligo_comp = genome_counts[genome_counts.Mouse == 'Oligo']
oligo_comp = oligo_comp[oligo_comp.genome_perc > 0.01].sort_values(['Treatment', 'sample_id'])

In [None]:
fig = px.bar(oligo_comp, x='sample_id', y='genome_perc', color='genome', color_discrete_map=syncom_colors, 
       template='plotly_white', width=800, height=800, labels = {'genome_perc': 'Transcriptome abundance', 'sample_id':''})

fig.add_shape(type="line",
    x0=5.5, y0=0, x1=5.5, y1=1,
    line=dict(
        color="black",
        width=2,
        dash="dash",
    ))
fig.add_annotation(x=3, y=1.03,
            text="LPS",
            showarrow=False,
            )
fig.add_annotation(x=8, y=1.03,
            text="PBS",
            showarrow=False,
            )

## LCM mice

In [None]:
# LCM samples, restricted to genomes above 1% of the sample's transcriptome.
lcm_comp = genome_counts[genome_counts.Mouse == 'LCM']
# .copy(): the Treatment recode below must modify an independent frame rather
# than a filtered slice of genome_counts (avoids chained-assignment problems).
lcm_comp = lcm_comp[lcm_comp.genome_perc > 0.01].copy()
# Recode 'PBS_D1' as '0D1' - presumably so it sorts before the other treatments.
lcm_comp['Treatment'] = lcm_comp['Treatment'].replace({'PBS_D1': '0D1'})
lcm_comp = lcm_comp.sort_values(['Treatment'])

In [None]:
lcm_comp[['sample_id', 'Treatment']].drop_duplicates().sort_values(['Treatment', 'sample_id'])

In [None]:
# Stacked bar chart of transcriptome composition for the LCM samples.
fig = px.bar(lcm_comp, x='sample_id', y='genome_perc', color='genome',
             color_discrete_map=syncom_colors, template='plotly_white',
             width=800, height=800,
             labels={'genome_perc': 'Transcriptome abundance', 'sample_id': ''})

# Dashed vertical separators at fixed positions between bars.
# NOTE(review): positions presumably mark treatment-group boundaries - confirm.
for boundary in (1.5, 6.5, 11.5, 14.5):
    fig.add_shape(type="line",
                  x0=boundary, y0=0, x1=boundary, y1=1,
                  line=dict(color="black", width=2, dash="dash"))
# Last expression of the cell, so the figure is still displayed.
fig

# EDA on Oligo mice

In [None]:
norm_counts = out_dir/config_dict["norm_count_file"]
norm_counts = pd.read_csv(norm_counts, index_col=0)
norm_counts.head()

In [None]:
norm_counts = norm_counts.set_index('ID')
norm_counts = norm_counts[norm_counts.sum(axis=1) > 100]
norm_counts = np.log2(norm_counts+1)

In [None]:
pc_df, pc_var = find_pcs(norm_counts, num_genes=1000)
pc_df = pc_df.reset_index().rename(columns={'index':'sample_id'}).merge(sd, on='sample_id')

In [None]:
fig = px.scatter(pc_df, x='PC1', y='PC2', color='Treatment', width=700, height=600, 
template='plotly_white', hover_data=['sample_id'])
fig.update_traces(marker=dict(size=12,
                              line=dict(width=1,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

In [None]:
umap_df = find_umap_df(norm_counts)
umap_df = umap_df.reset_index().rename(columns={'index':'sample_id'}).merge(sd, on='sample_id')
px.scatter(umap_df, x='UMAP1', y='UMAP2', color='Treatment', width=500, height=400, 
template='plotly_white', hover_data=['sample_id'])
# Sensitive to the pseudocount used

In [None]:

def get_full_annotation(genome_short, genome_long):
    """Merge a genome's GFF3 CDS records with its KEGG annotation table.

    The outer-merged table is written to ``root/<genome_short>.annotation.csv``
    (``root`` is the notebook-level project directory); the genome name and the
    first rows are printed. If either input file is missing, its path is
    printed and nothing is written.

    Parameters
    ----------
    genome_short, genome_long : str
        Name components used to build the GFF3 and KEGG annotation paths.

    Returns
    -------
    None
    """
    gff_dir = Path("/nfs/nas22/fs2202/biol_micro_sunagawa/Projects/PAN/REFERENCE_GENOMES_PAN/data/raw")
    kegg_dir = Path("/nfs/shared/_shared/lilith/annotated_genomes_oligomm12_asf519")
    gff_file = gff_dir/f"{genome_short}/{genome_long}/{genome_long}.gff3"
    kegg_file_path = kegg_dir/f"{genome_long}/assembly/{genome_long}/kegg/{genome_long}-annotations.tsv"
    # Report the first missing input and bail out.
    for required in (gff_file, kegg_file_path):
        if not required.is_file():
            print(required)
            return

    gff = pr.read_gff3(gff_file).as_df()
    # .copy() so the protein_id assignment below does not write into a slice.
    gff = gff[gff.Feature == 'CDS'].copy()
    gff['protein_id'] = gff['protein_id'].fillna(gff['locus_tag'])

    kegg_file = pd.read_table(kegg_file_path)
    # protein_id = text after the first 'prot_' with its last '_'-separated field
    # removed. The vectorised .str accessors leave NaN for queries without
    # 'prot_' instead of raising, as the previous apply(lambda x: x.split(...)) did.
    after_prefix = kegg_file.QUERY.str.split("prot_").str[1]
    kegg_file['protein_id'] = after_prefix.str.split("_").str[:-1].str.join("_")

    full = gff.merge(kegg_file, on='protein_id', how='outer')
    full.to_csv(root/f"{genome_short}.annotation.csv")
    print(genome_short)
    print(full.head())

# Look at mOTUs abundances 

In [None]:
motus_dir = root/config_dict["motus_dir"]
mouts_profiles = list(motus_dir.rglob("*.motus"))

In [None]:
f = mouts_profiles[0]
mdf_list = []
for f in mouts_profiles:
    motu_df = pd.read_table(f, comment='#', header=None, dtype={0:str, 1:str}, names=['motu', 'ncbi', 'motu_cnt']).assign(sample_id=f.stem)
    motu_df = motu_df[motu_df.motu_cnt > 0]
    
    mdf_list.append(motu_df)
mdf = pd.concat(mdf_list)

In [None]:
pd.set_option('display.max_colwidth', None)
test = mdf[mdf.sample_id == 'AU655'].copy()
test[test.motu.str.contains('YL45')]

In [None]:
test['RelAb'] = test['motu_cnt']/test.motu_cnt.sum()

In [None]:
px.bar(test[test.RelAb > 0.01], x='sample_id', y='RelAb', color='motu', height=500, width=600)

In [None]:
test = test[test.motu_cnt > 20]
test['motu_cnt'] = test['motu_cnt']/test['motu_cnt'].sum()

In [None]:
test.sort_values('motu_cnt')

# Functional annotation

In [None]:
genomes_dir = Path("/nfs/shared/_shared/lilith/annotated_genomes_oligomm12_asf519")
annots = list(genomes_dir.rglob("*.emapper.annotations"))

In [None]:
# Parse every eggNOG-mapper annotation table and stack them into one frame.
# Columns kept by position: 0, 4, 7, 8, 11, 12 (renamed below).
pattern = r'_prot_([A-Z0-9_]+(\.|_)[A-Z0-9]+)_'
# The '|' in '@1|root' / '@2|Bacteria' is a literal separator and must be escaped.
# Unescaped it is a regex alternation, so '(COG[0-9]+)@1' also matched COGs
# assigned at any taxid that merely starts with 1 (or 2 for the Bacteria pattern).
pattern_root = r'(COG[0-9]+)@1\|root'
pattern_bac = r'(COG[0-9]+)@2\|Bacteria'

emap_list = []
for annot in annots:
    emap_df = pd.read_table(annot, comment='#', header=None)[[0,4,7,8,11,12]]
    emap_df.columns =['ID', 'COG', 'Description','Name', 'KO', 'KEGG_Pathway']
    emap_df['protein_id'] = emap_df.ID.str.extract(pattern)[0]
    emap_df['COG_root'] = emap_df.COG.str.findall(pattern_root).apply(lambda x: ",".join([a for a in x if a]))
    emap_df['COG_bac'] = emap_df.COG.str.findall(pattern_bac).apply(lambda x: ",".join([a for a in x if a]))
    # Drop the literal 'ko:' prefix from each id. str.lstrip("ko:") strips a
    # *set of characters* and only worked because KO ids start with an upper-case 'K'.
    emap_df['KO'] = emap_df['KO'].apply(
        lambda x: ",".join([a[3:] if a.startswith("ko:") else a for a in x.split(",")]))
    emap_list.append(emap_df)
emapper_all = pd.concat(emap_list)

In [None]:
emapper_all[emapper_all['Name'].str.contains('rnf')].COG_bac.unique()

In [None]:
emapper_all.to_csv(out_dir/"oligo-emmapper-processed.csv", index=False)

In [None]:
gff = pr.read_gff3(root/config_dict['gff_file'])

In [None]:
# CDS records; where protein_id is missing, fall back to the feature ID
# without its 'cds-' prefix.
cds = gff[gff.Feature == 'CDS'].as_df()
# Remove only the leading literal 'cds-'. str.strip('cds-') strips any of the
# characters c, d, s, - from BOTH ends and can eat real id characters.
cds['protein_id'] = cds['protein_id'].fillna(cds['ID'].str.replace(r'^cds-', '', regex=True))

In [None]:
cds_full = cds.merge(emapper_all, on="protein_id", how='outer')

In [None]:
gff = gff.as_df()
gff['locus_tag'] = gff['locus_tag'].fillna(gff['ID'])
final_annotation = gff[['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame',  'locus_tag', 'gene_biotype', 'pseudo', 'product']]
final_annotation = final_annotation[final_annotation.Feature == 'gene']

In [None]:
fa = final_annotation.merge(cds_full[['locus_tag', 'COG', 'Description', 'KO', 'KEGG_Pathway', 'protein_id',
       'COG_root', 'COG_bac']], on='locus_tag', how='left')

In [None]:
fa.groupby('Chromosome').agg({'locus_tag':['nunique'], 'COG_root':['nunique'], 'KO':['nunique']})

# DE Results

In [None]:
res = pd.read_csv(out_dir/"2023-08-03_oligo-alone-within-taxon-LPS_vs_PBS_l0a0.01_results.csv")
res['locus_tag'] = res['ID'].str.replace('gene-', '')
res = res.merge(fa, on='locus_tag')

In [None]:

res['genome'] = res['Chromosome'].replace(genome_map)

In [None]:

def get_term_df(func_analysis, description):
    """Return one row per gene for the enrichment terms matching ``description``.

    Parameters
    ----------
    func_analysis : pandas.DataFrame
        Enrichment table with a ``description`` column and list-valued
        ``inputGenes`` / ``preferredNames`` columns.
    description : str
        Pattern looked up in ``description`` with ``Series.str.contains``
        (pandas treats it as a regular expression by default).

    Returns
    -------
    pandas.DataFrame
        De-duplicated columns ``locus_tag`` (exploded ``inputGenes``),
        ``description`` and ``preferredNames``.
    """
    # na=False: rows with a missing description count as "no match" instead of
    # producing a NaN mask, which pandas refuses to index with.
    matches = func_analysis[func_analysis.description.str.contains(description, na=False)]
    # Both list columns are exploded separately and re-joined on the (duplicated)
    # row index, which lines up when the two lists have equal length per row.
    genes = matches[['inputGenes', 'description']].explode('inputGenes')
    names = matches[['preferredNames']].explode('preferredNames')
    term_df = pd.concat([genes, names], axis=1).drop_duplicates()
    return term_df.rename(columns={'inputGenes': 'locus_tag'})

def process_strain(res, strain, taxid_map, lfc_cutoff=1, padj_cutoff=0.05):
    """Select one strain's DE results and look up its regulated genes.

    Parameters
    ----------
    res : pandas.DataFrame
        DE results with ``genome``, ``ID``, ``log2FoldChange`` and ``padj``.
    strain : str
        Value of ``genome`` to keep; also the key into ``taxid_map``.
    taxid_map : mapping
        Strain name -> taxon id passed on to the lookup helpers.
    lfc_cutoff : float, default 1
        Genes with ``log2FoldChange`` above ``lfc_cutoff`` are "up"; those
        below ``-lfc_cutoff`` are "down".
    padj_cutoff : float, default 0.05
        Upper bound on ``padj`` for both sets.

    Returns
    -------
    tuple
        ``(sres, link_up, link_down, func_up, func_down)``: the strain's rows
        with a ``locus_tag`` column, the ``link_to_string`` results for up/down
        genes, and the ``string_function`` results wrapped in DataFrames.
        NOTE(review): both helpers come from ``snippets.utils`` and presumably
        query STRING - confirm there.
    """
    sres = res[res.genome == strain].copy()
    # literal prefix removal (regex=False makes the intent explicit)
    sres['locus_tag'] = sres['ID'].str.replace("gene-", '', regex=False)
    taxid = taxid_map[strain]

    significant = sres.padj < padj_cutoff
    sup = sres[significant & (sres.log2FoldChange > lfc_cutoff)]
    sdown = sres[significant & (sres.log2FoldChange < -lfc_cutoff)]

    link_up = link_to_string(sup.locus_tag.values, taxid)
    link_down = link_to_string(sdown.locus_tag.values, taxid)
    func_up = pd.DataFrame(string_function(sup.locus_tag.values, taxid))
    func_down = pd.DataFrame(string_function(sdown.locus_tag.values, taxid))
    return sres, link_up, link_down, func_up, func_down

def highlight_term(sres, func_up, func_down, up_terms, down_terms):
    """Label genes in ``sres`` with the enrichment term they belong to.

    Parameters
    ----------
    sres : pandas.DataFrame
        Per-gene results with a ``locus_tag`` column.
    func_up, func_down : pandas.DataFrame
        Enrichment tables for up- and down-regulated genes.
    up_terms, down_terms : list of str
        Term descriptions to highlight, looked up in ``func_up`` and
        ``func_down`` respectively. Either list may be empty.

    Returns
    -------
    pandas.DataFrame
        ``sres`` left-merged with the highlighted terms; genes without a term
        get ``description == 'Other'``.
    """
    frames = [get_term_df(func_up, term) for term in up_terms]
    frames += [get_term_df(func_down, term) for term in down_terms]
    if frames:
        highlights = pd.concat(frames)
    else:
        # pd.concat raises on an empty list; with no terms, every gene is 'Other'.
        highlights = pd.DataFrame(columns=['locus_tag', 'description', 'preferredNames'])
    sres = sres.merge(highlights, on='locus_tag', how='left')
    sres['description'] = sres['description'].fillna('Other')
    return sres

In [None]:
# Chaperones 
import json
chap_file = root/'analysis/ko03110.keg'
with open(chap_file, 'r') as fh:
    lines = fh.readlines()


In [None]:
import re
# Collect KO ids and the first gene symbol from the 'C' rows of the KEGG BRITE file.
chaps = []
chap_names = []
# Raw string so '\s' is a regex class, not a (deprecated) string escape.
# The symbol may be followed by ',' as well as ';': rows listing several symbols
# ("K04043  dnaK, HSPA9; ...") were silently skipped by the old '(\w*);' pattern.
# The first symbol is kept, as for the amino-acid KO table in this notebook.
# NOTE(review): this adds multi-symbol KOs to `chaps`; confirm that is intended.
pattern = r'^C *(K[0-9]*) *([^\s,;]+)[,;]'
for line in lines:
    matches = re.search(pattern, line)
    if matches:
        chaps.append(matches.group(1))
        chap_names.append(matches.group(2))

In [None]:
chap_names = {a:k for a,k in zip(chaps, chap_names)}

In [None]:
ox_stress = """K00383                      
K00384 
K00432                      
K00799                      
K02426                      
K03387                      
K03564                      
K03671                      
K03674                      
K03676                      
K03781                      
K03782                      
K04047
K04063                      
K04487
K04488                      
K04564                      
K04565                      
K04761
K05919                      
K05997                      
K07304                      
K07305                      
K07322                      
K07400                      
K08968                      
K09013
K09014
K09015                      
K11065
K11209                      
K11717
K12262                      
K12267                      
K13639                      
K13643
K24119""".split()

ox_names = """gor
trxB
btuE
gst
sufE
ahpF
bcp
trxA
grxA
grxC
katE
katG
dps
osmC
iscS
iscU
SOD2
SOD1
oxyR
dfx
sufA
msrA
msrB
ytfE
nfuA
msrC
sufC
sufB
sufD
tpx
yfcG
sufS
cybB
msrAB
soxR
iscR
ahpC""".split()
ox_names = {a:b for a,b in zip(ox_stress, ox_names)}

In [None]:
aa ="""K00052               leuB, IMDH; 3-isopropylmalate dehydrogenase [EC:1.1.1.85] 
K00053               ilvC; ketol-acid reductoisomerase [EC:1.1.1.86] 
K00263               E1.4.1.9; leucine dehydrogenase [EC:1.4.1.9] 
K00826               E2.6.1.42, ilvE; branched-chain amino acid aminotransferase [EC:2.6.1.42] 
K00835               avtA; valine--pyruvate aminotransferase [EC:2.6.1.66] 
K01649               leuA, IMS; 2-isopropylmalate synthase [EC:2.3.3.13] 
K01652               E2.2.1.6L, ilvB, ilvG, ilvI; acetolactate synthase I/II/III large subunit [EC:2.2.1.6] 
K01653               E2.2.1.6S, ilvH, ilvN; acetolactate synthase I/III small subunit [EC:2.2.1.6] 
K01687               ilvD; dihydroxy-acid dehydratase [EC:4.2.1.9] 
K01702               LEU1; 3-isopropylmalate dehydratase [EC:4.2.1.33] 
K01703               leuC, IPMI-L; 3-isopropylmalate/(R)-2-methylmalate dehydratase large subunit [EC:4.2.1.33 4.2.1.35] 
K01704               leuD, IPMI-S; 3-isopropylmalate/(R)-2-methylmalate dehydratase small subunit [EC:4.2.1.33 4.2.1.35] 
K01754               E4.3.1.19, ilvA, tdcB; threonine dehydratase [EC:4.3.1.19] 
K09011               cimA; (R)-citramalate synthase [EC:2.3.3.21] 
K11258               ilvM; acetolactate synthase II small subunit [EC:2.2.1.6] 
K14260               alaA; alanine-synthesizing transaminase [EC:2.6.1.66 2.6.1.2] 
K17989               SDS, SDH, CHA1; L-serine/L-threonine ammonia-lyase [EC:4.3.1.17 4.3.1.19] 
K21359               MAM-IS, IPMI2; 3-isopropylmalate/methylthioalkylmalate dehydratase small subunit [EC:4.2.1.33 4.2.1.  """.split("\n")


# From each non-empty "KO  symbol[, alias…]; description" row keep the KO id
# and the first gene symbol.
names = [a.split()[1].split(';')[0].strip().strip(',') for a in aa if a]
aa = [a.split()[0].strip() for a in aa if a]
aa_names = dict(zip(aa, names))
# DE results for the branched-chain amino-acid KOs. A preceding assignment that
# filtered `res` by a list of COG_bac values was overwritten immediately and has
# been dropped. .copy() so the KO relabelling does not write into a slice of `res`.
aa_df = res[res.KO.isin(aa)][['KO', 'log2FoldChange', 'padj', 'genome']].copy()
aa_df['KO'] = aa_df['KO'].replace(aa_names)

In [None]:
ox_df = res[res.KO.isin(ox_stress)][['ID', 'KO', 'log2FoldChange', 'padj', 'genome']]
ox_df = ox_df.groupby(['genome', 'KO']).log2FoldChange.median().reset_index().dropna()
ox_df['name'] = ox_df['KO'].replace(ox_names) 


In [None]:
ox_df

In [None]:
# DE results for chaperone KOs, labelled with the gene symbol from chap_names.
# .copy() so adding the 'name' column does not write into a slice of `res`.
chap_df = res[res.KO.isin(chaps)][['KO', 'log2FoldChange', 'padj', 'genome']].copy()
chap_df['name'] = chap_df['KO'].replace(chap_names)

In [None]:
aa_df.log2FoldChange.min()

In [None]:
aa_df.log2FoldChange.max()

In [None]:
df = pd.read_csv("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/03_23_transcriptomics/analysis/17-08-23-LCM-taxa-dada2.csv")

In [None]:
import plotly.graph_objects as go
fig = go.Figure(data=[go.Heatmap(
            z=aa_df.log2FoldChange,
            x=aa_df.genome,
            y=aa_df.KO,
            xgap=3,
            ygap=3,
            colorscale=["black", "black"],
            showscale=False,
        ),
    
    go.Heatmap(
        z=aa_df.log2FoldChange,
        x=aa_df.genome,
        y=aa_df.KO,
 
        xgap=5,
        ygap=5,
        colorscale= [[0, 'rgba(0, 0, 139, 0.85)'],   
               [0.87, 'rgba(255, 255, 255, 0.85)'],  
               [1, 'rgba(53, 94, 59, 0.85)']],),
        
               ])

fig.update_layout(width=500, height=500, template='simple_white', yaxis_scaleanchor="x",
    plot_bgcolor="#fff",

    margin=dict(b=0, t=20, l=0, r=20))

pio.write_image(fig, out_dir/"aa_heatmap.svg", width=500, height=500, scale=2)

In [None]:
import plotly.graph_objects as go
fig = go.Figure(data=[go.Heatmap(
            z=chap_df.log2FoldChange,
            x=chap_df.genome,
            y=chap_df.name,
            xgap=3,
            ygap=3,
            colorscale=["black", "black"],
            showscale=False,
        ),
    
    go.Heatmap(
        z=chap_df.log2FoldChange,
        x=chap_df.genome,
        y=chap_df.name,
 
        xgap=5,
        ygap=5,
        colorscale= [[0, 'rgba(0, 0, 139, 0.85)'],   
               [0.37, 'rgba(255, 255, 255, 0.85)'],  
               [1, 'rgba(53, 94, 59, 0.85)']],),
        
               ],
        
        )

fig.update_layout(width=500, height=600, template='simple_white', yaxis_scaleanchor="x",
    plot_bgcolor="#fff", 

    margin=dict(b=0, t=20, l=0, r=20))
pio.write_image(fig, out_dir/"chaperones_heatmap.svg", width=500, height=600, scale=2)

In [None]:
#ox_df = ox_df.sort_values('name', ascending=False)
fig = go.Figure(data=[go.Heatmap(
            z=ox_df.log2FoldChange,
            x=ox_df.genome,
            y=ox_df.name,
            xgap=3,
            ygap=3,
            colorscale=["black", "black"],
            showscale=False,
        ),
    
    go.Heatmap(
        z=ox_df.log2FoldChange,
        x=ox_df.genome,
        y=ox_df.name,
 
        xgap=5,
        ygap=5,
        colorscale= [[0, 'rgba(0, 0, 139, 0.85)'],   
               [0.47, 'rgba(255, 255, 255, 0.85)'],  
               [1, 'rgba(53, 94, 59, 0.85)']],),
        
               ],
        
        )

fig.update_layout(width=500, height=700, template='simple_white', yaxis_scaleanchor="x",
    plot_bgcolor="#fff", 

    margin=dict(b=0, t=20, l=0, r=20))

pio.write_image(fig, out_dir/"oxidative_stress_heatmap.svg", width=500, height=700, scale=2)

In [None]:
res['KEGG_Pathway'] = res['KEGG_Pathway'].fillna('-')
clrs = px.colors.qualitative.G10

In [None]:
strain = 'I48'
df48, lup48, ldown48, funcup48, funcdown48 = process_strain(res, strain, ncbi_taxid_map)
up_terms = ['Protein folding', 'ATP synthesis']
up_terms = ['Protein folding']
down_terms = ['Biosynthesis of amino acids', 'Oxidoreductase activity']
down_terms = ['Biosynthesis of amino acids']
fdf48 = highlight_term(df48, funcup48, funcdown48, up_terms, down_terms)

In [None]:
fdf48['description'] = fdf48['description'].replace({"Oxidoreductase activity, acting on NAD(P)H, and protein flavinylation": 'Oxidoreductase activity',
                                                     'Biosynthesis of amino acids':'Amino acid biosynthesis'})

In [None]:
# Volcano plot for I48. The y axis is -log10(padj) so that it agrees with the
# -log10 threshold line below (it was -log2, which put the line at the wrong
# significance level). The values are passed as a named column so the `labels`
# entry applies; the previous 'y' key never matched.
fig = px.scatter(fdf48.assign(neg_log10_padj=-np.log10(fdf48.padj)),
                 x='log2FoldChange', y='neg_log10_padj', color='description',
                 width=1000, height=700,
                 color_discrete_map={'Other': '#BAB0AC',
                                     'Protein folding': clrs[0],
                                     'Oxidoreductase activity': clrs[1],
                                     'ATP synthesis': clrs[2],
                                     'Amino acid biosynthesis': clrs[3]},
                 template='plotly_white', hover_data=['locus_tag', 'preferredNames'],
                 labels={'neg_log10_padj': '-log10(padj)'})

# Significance (padj = 0.01) and fold-change (|log2FC| = 1) guide lines.
fig.add_hline(y=-np.log10(0.01), line_width=2, line_dash='dash')
fig.add_vline(x=-1, line_width=2, line_dash='dash')
fig.add_vline(x=1, line_width=2, line_dash='dash')
fig.update_traces(marker=dict(size=12,
                              line=dict(width=1,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.update_layout(font=dict(size=18))

pio.write_image(fig, out_dir/"i48_volacano.svg", width=1000, height=700, scale=2)

In [None]:
def func_graph(df, c='Blues_r'):
    """Horizontal bar chart of enriched terms, coloured by FDR.

    Keeps rows whose ``term`` starts with 'GO' or 'KW', or whose ``category``
    is 'KEGG'; labels each bar "<description> | <term>" and orders bars by
    ``number_of_genes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Enrichment table with ``term``, ``category``, ``description``,
        ``number_of_genes`` and ``fdr`` columns.
    c : str
        Continuous colour scale name passed to plotly.

    Returns
    -------
    The plotly figure.
    """
    is_go = df.term.str.startswith('GO')
    is_keyword = df.term.str.startswith('KW')
    is_kegg = df.category == 'KEGG'
    selected = df[is_go | (is_keyword | is_kegg)].copy()
    selected['description'] = selected['description'] +" | " + selected['term']
    ordered = selected.sort_values('number_of_genes')
    return px.bar(
        ordered,
        y='description',
        x='number_of_genes',
        color='fdr',
        orientation='h',
        width=900,
        height=500,
        template='plotly_white',
        color_continuous_scale=c,
        labels={'number_of_genes':'Number of genes', 'description': ''},
    )

In [None]:
func_graph(funcdown48, 'Reds_r')

In [None]:
# Same chart as for the down-regulated terms, built with the shared helper
# instead of a copy of its body (its default colour scale is 'Blues_r').
func_graph(funcup48)

In [None]:
strain = 'YL32'
df32, lup32, ldown32, funcup32, funcdown32 = process_strain(res, strain, ncbi_taxid_map)
up_terms = ['Protein folding', 
#'Carbohydrate transport', 
#'glycerol metabolic process',
#'nucleobase catabolic process']
]
down_terms = ['Amino acid transport', 
              #'Peptide transport', 
              'Ribosome', 
              #'Flagellar assembly',
                ]
fdf32 = highlight_term(df32, funcup32, funcdown32, up_terms, down_terms)

In [None]:
fdf32['description'] = fdf32['description'].replace({'Protein folding, and serine-type endopeptidase activity':'Protein folding', 
'Mixed, incl. purine nucleobase catabolic process, and nucleobase transport':'Purine nucleobase catabolic process',
'Mixed, incl. purine nucleobase catabolic process, and nucleotide catabolic process':'Purine nucleobase catabolic process',
'Ribosome, and regulation of translation':'Ribosome',
'Peptide transport': 'Amino acid/peptide transport',
'Amino acid transport':'Amino acid/peptide transport'
})

In [None]:
fdf32.description.unique()

In [None]:
func_graph(funcup32)

In [None]:
fig = func_graph(funcdown32, 'Reds_r')
fig.update_layout(height=1000)

In [None]:
# Volcano plot for YL32. The y axis is -log10(padj) so that it agrees with the
# -log10 threshold line below (it was -log2). The values are passed as a named
# column so the `labels` entry applies; the previous 'y' key never matched.
fig = px.scatter(fdf32.assign(neg_log10_padj=-np.log10(fdf32.padj)),
                 x='log2FoldChange', y='neg_log10_padj', color='description',
                 width=1000, height=700,
                 color_discrete_map={'Other': '#BAB0AC',
                                     'Protein folding': clrs[0],
                                     'Purine nucleobase catabolic process': clrs[1],
                                     'Ribosome': clrs[2],
                                     'Amino acid/peptide transport': clrs[3],
                                     'Flagellar assembly': clrs[4]},
                 template='plotly_white', hover_data=['locus_tag'],
                 labels={'neg_log10_padj': '-log10(padj)'})

# Significance (padj = 0.05) and fold-change (|log2FC| = 1) guide lines.
fig.add_hline(y=-np.log10(0.05), line_width=2, line_dash='dash')
fig.add_vline(x=-1, line_width=2, line_dash='dash')
fig.add_vline(x=1, line_width=2, line_dash='dash')
fig.update_traces(marker=dict(size=12,
                              line=dict(width=1,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

#pio.write_image(fig, out_dir/"YL32_volacano.svg", width=1000, height=700, scale=2)

In [None]:
strain = 'YL27'
df27, lup27, ldown27, funcup27, funcdown27 = process_strain(res, strain, ncbi_taxid_map)
up_terms = ['Chaperone']
down_terms = []
#fdf27 = highlight_term(df27, funcup27, funcdown27, up_terms, down_terms)

In [None]:
lup27

In [None]:
res.query('COG_bac == "COG4656"')

In [None]:
res[res.genome == 'YL44'].head()

In [None]:
strain = 'YL58'
df58, lup58, ldown58, funcup58, funcdown58 = process_strain(res, strain, ncbi_taxid_map)
up_terms = ['Protein folding', 
#'amino acid catabolic process',
#'nucleobase catabolic process
]
down_terms = [#'ABC transporters', 
              'Ribosome', 
              'tyrosine', 
              'leucine',
              'Histidine']
fdf58 = highlight_term(df58, funcup58, funcdown58, up_terms, down_terms)

In [None]:
funcup58.description.unique()

In [None]:
fdf58.description.unique()

In [None]:
fdf58['description'] = fdf58['description'].replace({'Protein folding, and protein stabilization':'Protein folding', 
'Phenylalanine, tyrosine and tryptophan biosynthesis': 'Amino acid biosynthesis',
'Pyruvate metabolism, and Valine, leucine and isoleucine biosynthesis':'Amino acid biosynthesis',
'Histidine biosynthetic process':'Amino acid biosynthesis',
'Histidine biosynthesis': 'Amino acid biosynthesis',
'Mixed, incl. purine nucleobase catabolic process, and deaminase activity':'Purine nucleobase catabolic process',
'Mixed, incl. purine nucleobase catabolic process, and nucleotide catabolic process':'Purine nucleobase catabolic process',
'Ribosome, and translation regulator activity': 'Ribosome',
'Peptide transport': 'Amino acid/peptide transport',
'Amino acid transport':'Amino acid/peptide transport'
})

In [None]:
# Volcano plot for YL58. The y axis is -log10(padj) so that it agrees with the
# -log10 threshold line below (it was -log2). The values are passed as a named
# column so the `labels` entry applies; the previous 'y' key never matched.
fig = px.scatter(fdf58.assign(neg_log10_padj=-np.log10(fdf58.padj)),
                 x='log2FoldChange', y='neg_log10_padj', color='description',
                 width=1000, height=700,
                 color_discrete_map={'Other': '#BAB0AC',
                                     'Protein folding': clrs[0],
                                     # was clrs[2], the same colour as 'Ribosome'
                                     'Alpha-amino acid catabolic process': clrs[1],
                                     'Ribosome': clrs[2],
                                     'Amino acid biosynthesis': clrs[3],
                                     'ABC transporters': clrs[4],
                                     'Purine nucleobase catabolic process': clrs[5]},
                 template='plotly_white', hover_data=['locus_tag'],
                 labels={'neg_log10_padj': '-log10(padj)'})

# Significance (padj = 0.01) and fold-change (|log2FC| = 1) guide lines.
fig.add_hline(y=-np.log10(0.01), line_width=2, line_dash='dash')
fig.add_vline(x=-1, line_width=2, line_dash='dash')
fig.add_vline(x=1, line_width=2, line_dash='dash')
fig.update_traces(marker=dict(size=12,
                              line=dict(width=1,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

pio.write_image(fig, out_dir/"YL58_volacano.svg", width=1000, height=700, scale=2)

In [None]:
fig

In [None]:
fig = func_graph(funcup58)
fig.update_layout(height=300)

In [None]:
func_graph(funcdown58, c='Reds_r')

In [None]:
#Stress/protein folding
folding_genes = func_up[func_up.description == 'Protein folding'].inputGenes.values[0]
#ATP Synthesis
atp_genes = func_up[func_up.description == 'ATP synthesis'].inputGenes.values[0]
#-
#Amino-acid biosynthesis

#Oxidoreductase activity

In [None]:
def clean_kos(k):
    """Split a comma-separated KO string into a list of ids.

    Returns ``None`` for anything that is not a string (e.g. NaN for genes
    without a KO), so callers can filter those entries out.
    """
    # isinstance also accepts str subclasses such as numpy.str_,
    # which the exact type comparison rejected.
    if isinstance(k, str):
        return k.split(",")
    return None
nested_list = [ clean_kos(k) for k in resup.KO.unique()]
KOs = [element for sublist in nested_list if sublist for element in sublist]

In [None]:
for a in resup.sample(10).locus_tag.values:
    print(a)

In [None]:
for ko in KOs:
    print(ko)

## Saturation curves

In [None]:
from numpy.random import RandomState

def rarefy(x, depth=1000, iterations=1, seed=42):
    """Subsample a count vector to ``depth`` reads.

    ``depth`` draws are made with replacement, each feature being picked with
    probability proportional to its count, and tallied per feature. With
    ``iterations > 1`` the tallies of the repeated draws are averaged.

    Parameters
    ----------
    x : array-like of counts
    depth : number
        Number of reads to draw (converted with ``int``).
    iterations : int
        Number of independent subsamples to average (at most 100000).
    seed : int
        Seed of the single draw, or - for ``iterations > 1`` - of the generator
        that produces the per-iteration seeds, so repeated calls give the same
        result. (Previously those seeds came from the unseeded global RNG and
        ``seed`` was ignored.)

    Returns
    -------
    numpy.ndarray or None
        Mean subsampled counts per feature; all-NaN when ``depth`` exceeds the
        total count; ``None`` when ``iterations`` is above the limit.
    """
    if iterations > 100000:
        print('Max number of iterations allowed is 100000')
        return None

    # These do not depend on the iteration, so compute them once.
    noccur = np.sum(x)
    nvar = len(x)
    if depth > noccur:
        return np.array([np.nan]*nvar)
    p = x/noccur

    if iterations > 1:
        seeds = RandomState(seed).choice(100000, size=iterations)
    else:
        seeds = [seed]

    draws = [
        np.bincount(RandomState(s).choice(nvar, size=int(depth), p=p), minlength=nvar)
        for s in seeds
    ]
    return np.nanmean(np.vstack(draws), axis=0)

def rarefy_dataframe(df, depths, seed=0):
    """Rarefy every column of ``df`` at each depth and return a long table.

    Parameters
    ----------
    df : pandas.DataFrame
        Count matrix with one column per sample. It is not modified.
    depths : iterable of numbers
        Sequencing depths to subsample to; must not be empty
        (``pd.concat`` raises ``ValueError`` otherwise).
    seed : int
        Seed forwarded to ``rarefy``.

    Returns
    -------
    pandas.DataFrame
        Long format with the reset row index, ``depth``, ``sample_id`` and
        ``counts`` columns.
    """
    # Name the column axis on a copy rather than on the caller's frame.
    named = df.rename_axis(columns='sample_id')
    frames = [
        named.apply(rarefy, depth=depth, seed=seed).assign(depth=depth)
        for depth in depths
    ]
    # Concatenate once, after the loop, instead of re-concatenating per depth.
    rdf = pd.concat(frames).reset_index()
    to_drop = rdf.columns[0]
    return rdf.melt(id_vars=[to_drop, 'depth'], var_name='sample_id', value_name='counts')

def saturationCurves(df, depths, cutoffs, seed):
    """Count, per sample and depth, how many features exceed each cutoff.

    The table is rarefied with ``rarefy_dataframe``; rows containing NaN are
    dropped. For every (sample_id, depth) group and every cutoff ``c`` the
    number of rows with ``counts > c`` is computed.

    Returns a tuple ``(rarefied, curves)``: the NaN-free long-format rarefied
    table, and a table with columns ``sample_id``, ``depth`` and one
    ``'>{c} reads'`` column per cutoff.
    """
    rarefied = rarefy_dataframe(df, depths, seed).dropna()
    # The default argument freezes each threshold at definition time;
    # a plain closure would make every counter use the last cutoff.
    counters = [lambda counts, thr=thr: (counts > thr).sum() for thr in cutoffs]
    curves = rarefied.groupby(['sample_id', 'depth']).agg({'counts': counters}).reset_index()
    # Replace the two-level aggregation header with flat, readable labels.
    curves.columns = ['sample_id', 'depth'] + [f'>{thr} reads' for thr in cutoffs]
    return rarefied, curves

In [None]:
def saturation_curve_strain(df, strain,
                            depths=(1e5, 3e5, 5e5, 7e5, 1e6, 1.5e6, 2e6,
                                    3e6, 4e6, 5e6, 6e6, 7e6, 8e6),
                            cutoff=5, seed=9, reference_depth=3000000):
    """Plot saturation curves (features above a read cutoff vs. depth) for one strain.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with at least the columns ``genome``, ``gene_id``,
        ``sample_id`` and ``num_reads``.
    strain : str
        Value of ``genome`` to select.
    depths : iterable of numbers
        Subsampling depths passed to ``saturationCurves``.
    cutoff : number
        A gene is counted when its rarefied count is strictly greater than
        this value.
    seed : int
        Seed passed to ``saturationCurves``.
    reference_depth : number or None
        Position of the dashed vertical reference line; ``None`` omits it.

    Returns
    -------
    The plotly figure with one line per sample.
    """
    strain_df = df[df.genome == strain]
    # gene x sample matrix of read counts
    strain_df = strain_df[['gene_id', 'sample_id', 'num_reads']].pivot(index='gene_id', columns='sample_id')
    # pivot leaves ('num_reads', sample_id) column tuples; keep the sample id only
    strain_df.columns = [f[1] for f in strain_df.columns]
    rare_df, sat_curve_df = saturationCurves(strain_df, list(depths), [cutoff], seed)
    # Column label follows the '>{cutoff} reads' naming used by saturationCurves.
    fig = px.line(sat_curve_df, x='depth', y=f'>{cutoff} reads', color='sample_id',
                  template='plotly_white', title=strain)
    if reference_depth is not None:
        fig.add_vline(x=reference_depth, line_width=1, line_dash="dash")
    return fig

In [None]:
saturation_curve_strain(count_df, 'YL32')

In [None]:
saturation_curve_strain(count_df, 'I48')

In [None]:
saturation_curve_strain(count_df, 'YL58')

In [None]:
saturation_curve_strain(count_df, 'YL27')

In [None]:
# Flag, per sample, whether the I48 row has perc_genome above 49, then attach
# that flag to every row of sd. The merge is an inner join on sample_id, so
# samples without an I48 row are dropped from sd.
i48_d = sd[sd.genome == 'I48'].assign(i48_dominant=lambda rows: rows.perc_genome > 49)
sd = pd.merge(sd, i48_d[['sample_id', 'i48_dominant']], on='sample_id')

In [None]:
# Sample sheet pasted as plain text: four consecutive lines per sample, in the
# order of the column labels assigned below. All values, including Cage, stay
# strings.
_sample_info_text = """
AU647
Oligo
PBS
1
AU648
Oligo
PBS
1
AU649
Oligo
PBS
1
AU650
Oligo
LPS
2
AU651
Oligo
LPS
2
AU652
Oligo
LPS
2
AU653
Oligo
LPS
2
AU654
Oligo
PBS
3
AU655
Oligo
PBS
3
AU656
Oligo
PBS
3
AU657
Oligo
LPS
4
AU658
Oligo
LPS
4
"""
_sample_info_fields = _sample_info_text.strip().split("\n")
# 12 samples x 4 fields; the fixed shape doubles as a check that no line of
# the pasted sheet is missing.
sample_info = pd.DataFrame(
    np.array(_sample_info_fields).reshape(12, 4),
    columns=['sample_id', 'Mouse', 'Treatment', 'Cage'],
)

In [None]:
# Attach the Cage label (a string) to every row of sd. Inner join on
# sample_id: rows whose sample_id is absent from sample_info are dropped.
# Re-running this cell on the already merged sd adds suffixed Cage_x/Cage_y columns.
sd = sd.merge(sample_info[['sample_id', "Cage"]], on='sample_id')

In [None]:
sd

In [None]:
fig, pc_df = get_strain_pca(count_df, 'YL32', sd)

In [None]:
fig

In [None]:
fig, pc_df = get_strain_pca(count_df, 'I48', sd, symbol_by='i48_dominant')
fig

In [None]:
fig, pc_df = get_strain_pca(count_df, 'YL58', sd)
fig

In [None]:
fig, pc_df = get_strain_pca(count_df, 'YL27', sd)
fig

In [None]:
fig, pc_df = get_strain_pca(count_df, 'YL44', sd)
fig

### Writing out datasets

In [None]:
def get_strain_df(df, strain, saturation='low'):
    """Return the gene x sample read-count matrix of one strain.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with columns ``genome``, ``gene_id``, ``sample_id``
        and ``num_reads`` (plus ``saturation`` when filtering is requested).
    strain : str
        Value of ``genome`` to keep.
    saturation : {'low', 'med', 'high'}
        Minimum saturation level of the rows to keep: ``'low'`` applies no
        filter, ``'med'`` drops rows labelled ``'low'``, ``'high'`` keeps only
        rows labelled ``'high'``.

    Returns
    -------
    pandas.DataFrame or None
        ``num_reads`` pivoted to ``gene_id`` rows and ``sample_id`` columns,
        or ``None`` when ``saturation`` is not one of the three levels.
    """
    keep = df.genome == strain
    if saturation not in ('low', 'med', 'high'):
        return None
    if saturation == 'med':
        keep = keep & (df.saturation != 'low')
    elif saturation == 'high':
        keep = keep & (df.saturation == 'high')
    counts = df.loc[keep, ['gene_id', 'sample_id', 'num_reads']]
    wide = counts.pivot(index='gene_id', columns='sample_id', values='num_reads')
    # Plain, unnamed sample-id column labels.
    return wide.rename_axis(columns=None)

In [None]:
count_df.sample(10)

In [None]:
# Write one gene x sample count matrix per strain into count_dir; file names
# start with the `today` date string. The default saturation='low' applies no
# saturation filter, hence "all_samples".
yl32 = get_strain_df(count_df, 'YL32')
yl32.to_csv(count_dir/f"{today}_YL32_all_samples.csv")

# NOTE(review): unlike the other strains, I48 is restricted to
# saturation='high' here although the file is named "all_samples" — confirm
# this is intentional.
i48 = get_strain_df(count_df, 'I48', saturation='high')
i48.to_csv(count_dir/f"{today}_I48_all_samples.csv")

yl58 = get_strain_df(count_df, 'YL58')
yl58.to_csv(count_dir/f"{today}_YL58_all_samples.csv")

yl27 = get_strain_df(count_df, 'YL27')
yl27.to_csv(count_dir/f"{today}_YL27_all_samples.csv")

In [None]:
# Same export restricted to rows whose saturation is 'high'.
# The yl32 / i48 / yl58 / yl27 variables are overwritten with the filtered matrices.
yl32 = get_strain_df(count_df, 'YL32', 'high')
yl32.to_csv(count_dir/f"{today}_YL32_high_samples.csv")

i48 = get_strain_df(count_df, 'I48', 'high')
i48.to_csv(count_dir/f"{today}_I48_high_samples.csv")

yl58 = get_strain_df(count_df, 'YL58', 'high')
yl58.to_csv(count_dir/f"{today}_YL58_high_samples.csv")

yl27 = get_strain_df(count_df, 'YL27', 'high')
yl27.to_csv(count_dir/f"{today}_YL27_high_samples.csv")

In [None]:
# Same export with saturation='med', i.e. every row whose saturation is not 'low'.
# The yl32 / i48 / yl58 / yl27 variables are overwritten again.
yl32 = get_strain_df(count_df, 'YL32', 'med')
yl32.to_csv(count_dir/f"{today}_YL32_med_samples.csv")

i48 = get_strain_df(count_df, 'I48', 'med')
i48.to_csv(count_dir/f"{today}_I48_med_samples.csv")

yl58 = get_strain_df(count_df, 'YL58', 'med')
yl58.to_csv(count_dir/f"{today}_YL58_med_samples.csv")

yl27 = get_strain_df(count_df, 'YL27', 'med')
yl27.to_csv(count_dir/f"{today}_YL27_med_samples.csv")