In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import yaml

# Notebook-wide display defaults: seaborn context, pandas table limits, matplotlib styling.
sns.set_context("notebook", font_scale=1.1)

for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 100)

plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
    'font.serif': "cm",
})
#pd.set_option('display.float_format', lambda x: '{:,.2f}'.format(x))

In [None]:
# Root directory of the Deutschbauer comparison data. Set the DBAUER_DIR environment
# variable to run the notebook against a copy stored elsewhere; without it the
# original NFS location is used, so existing behaviour is unchanged.
dbauer_dir = Path(os.environ.get(
    "DBAUER_DIR",
    "/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/deutschbauer/fastq",
))

In [None]:
# Sub-directory holding the per-sample barcode count tables.
counts_dir = dbauer_dir.joinpath("counts")

In [None]:
# mBARq merged barcode counts (comma-separated).
my_cnts = pd.read_csv(
    counts_dir / "SB2B_GACCTGCAGCGTACGB20_mbarq_merged_counts.csv"
)
# Pool-count table shipped with the comparison data (tab-separated).
db_cnts = pd.read_csv(dbauer_dir / "SB_count_table.poolcount", sep="\t")

In [None]:
# Keep only barcodes whose Name is not missing.
my_cnts_annotated = my_cnts.loc[my_cnts["Name"].notna()]

In [None]:
# Preview the first rows of the barcodes that have a non-missing Name.
my_cnts_annotated.head()

In [None]:
# Persist the Name-annotated subset next to the input count table (no index column).
annotated_csv = counts_dir / "SB2B_GACCTGCAGCGTACGB20_mbarq_merged_counts_annotated.csv"
my_cnts_annotated.to_csv(annotated_csv, index=False)

In [None]:
# Restrict the pool-count table to the barcode column plus every column whose name contains 'set1'.
db_cnts = db_cnts.loc[:, ['rcbarcode', *(col for col in db_cnts.columns if 'set1' in col)]]

In [None]:
# Last five rows after an ascending sort on H1, i.e. the largest H1 counts
# (pandas places missing values last by default, so NaNs would show up here).
my_cnts.sort_values('H1').tail(5)

In [None]:
# Peek at five random rows; the fixed seed keeps the displayed rows identical across re-runs.
db_cnts.sample(5, random_state=0)

In [None]:
# (rows, columns) of the pool-count table after restricting it to the 'set1' columns.
db_cnts.shape

In [None]:
# (rows, columns) of the mBARq counts restricted to barcodes with a non-missing Name.
my_cnts.loc[my_cnts["Name"].notna()].shape

In [None]:
# Inner join: keep only barcodes present in both tables
# (the mBARq 'barcode' column is matched against the pool-count 'rcbarcode' column).
cnts = pd.merge(
    my_cnts,
    db_cnts,
    how='inner',
    left_on='barcode',
    right_on='rcbarcode',
)

In [None]:
# (rows, columns) that survive the inner join on barcode.
cnts.shape

In [None]:
# Preview of the joined table (columns from both count tables side by side).
cnts.head()

In [None]:
def comp_2(cnts_df, c1, c2, pseudocount=0.5):
    """Compare two count columns on a log2 scale.

    Parameters
    ----------
    cnts_df : pandas.DataFrame
        Table with a 'barcode' column plus the count columns `c1` and `c2`.
    c1, c2 : str
        Names of the count columns plotted on the x and y axis respectively.
    pseudocount : float, default 0.5
        Value added to every count before taking log2, so zero counts stay
        finite (with the default, a count of 0 maps to exactly -1).

    Returns
    -------
    (pandas.DataFrame, plotly figure)
        The log2-transformed two-column frame indexed by barcode, and a
        scatter plot of `c1` against `c2` built from it.
    """
    log_counts = np.log2(cnts_df[['barcode', c1, c2]].set_index('barcode') + pseudocount)
    scatter = px.scatter(log_counts, x=c1, y=c2)
    return log_counts, scatter

In [None]:
# log2 comparison of the mBARq 'H1' column against the pool-count column 'SB2B_ML5_set1.H1'.
df, f = comp_2(cnts, 'H1', 'SB2B_ML5_set1.H1')
f

In [None]:
# Back-transform a log2 value of 0.58 (≈ 1.5).
# NOTE(review): presumably a raw count of 1 plus the 0.5 pseudocount read off the plot — confirm.
2**0.58

In [None]:
# comp_2 stores log2(count + 0.5), so a value of exactly -1 means the raw H1 count was 0.
df[df.H1 == -1]

In [None]:
# Same comparison for 'H46' vs 'SB2B_ML5_set1.H46'; note this overwrites `df` and `f` from the H1 cell.
df, f = comp_2(cnts, 'H46', 'SB2B_ML5_set1.H46')
f

In [None]:
# Count matrix indexed by barcode, without the duplicate barcode column and the Name annotation.
cnts2 = cnts.set_index('barcode').drop(columns=['rcbarcode', 'Name'])
# Column sums: total count per sample column, used for the CPM scaling in the next cell.
tots = cnts2.sum(axis=0)

In [None]:
# Scale every column to counts per million of its own total, then log2 with a 0.5 pseudocount.
cnts3 = np.log2(cnts2.div(tots, axis='columns') * 1_000_000 + 0.5)

In [None]:
# Preview of the log2 counts-per-million table.
cnts3.head()

In [None]:
# Pairwise correlation between all sample columns of the log2 CPM table
# (DataFrame.corr defaults to Pearson).
cc = cnts3.corr()


In [None]:
# Long-format correlation table: one row per (level_0, level_1) column pair.
# Built directly from cnts3 rather than from `cc` itself, so re-running this cell
# does not try to stack an already-stacked frame.
cc = cnts3.corr().stack().reset_index(name="correlation")
# Keep only cross-pipeline pairs: level_0 without 'SB2B' in its name, level_1 with it.
cc2 = cc[~cc.level_0.str.contains('SB2B') & cc.level_1.str.contains('SB2B')].copy()

In [None]:
# Preview of the zero-padded level_1 labels. The regex is passed explicitly because the
# default of `regex` in Series.str.replace differs between pandas 1.x and 2.x.
"SB2B_ML5_set1_H" + cc2['level_1'].str.replace(r'SB2B_ML5_set1[._]H', '', regex=True).str.zfill(2)

In [None]:
# Zero-pad the sample numbers (H1 -> H01) so the axis labels sort in numeric order.
# `regex` is given explicitly because its default changed between pandas 1.x and 2.x;
# the level_1 pattern accepts both '.' and '_' so re-running the cell leaves
# already-padded labels unchanged instead of prefixing them a second time.
cc2['level_0'] = "H" + cc2['level_0'].str.replace('H', '', regex=False).str.zfill(2)
cc2['level_1'] = "SB2B_ML5_set1_H" + (
    cc2['level_1'].str.replace(r'SB2B_ML5_set1[._]H', '', regex=True).str.zfill(2)
)

In [None]:
# Correlations of the level_0 sample 'H15' against every 'SB2B' column.
cc2[cc2.level_0 == 'H15']

In [None]:
# Dot-matrix of the cross-pipeline sample correlations: colour and size both encode correlation.
g = sns.relplot(
    data=cc2.sort_values(['level_0', 'level_1']),
    x="level_0", y="level_1", hue="correlation", size="correlation",
    palette="vlag", hue_norm=(0, 1), edgecolor=".7",
    height=10, sizes=(10, 100), size_norm=(0, 1),
)

# Tweak the figure to finalize
g.set(xlabel="", ylabel="", aspect="equal")
g.despine(left=True, bottom=True)
g.ax.margins(.02)
for label in g.ax.get_xticklabels():
    label.set_rotation(90)
# Matplotlib 3.7 renamed Legend.legendHandles to Legend.legend_handles and 3.9 removed
# the old attribute, so look up the new name first and fall back for older versions.
legend_handles = getattr(g.legend, "legend_handles", None)
if legend_handles is None:
    legend_handles = g.legend.legendHandles
for artist in legend_handles:
    artist.set_edgecolor(".7")

In [None]:
# Column-wise minimum of the long correlation table; for the two name columns this is the
# lexicographically smallest label, for 'correlation' the lowest pairwise correlation.
cc.min()

In [None]:
# All correlations whose second member (level_1) is the column 'H10'.
cc[cc.level_1=='H10']

In [None]:
def get_carbon_results(carbon, carbon2=""):
    """Combine the mBARq results for one contrast with the matching fitness and t-statistic columns.

    Parameters
    ----------
    carbon : str
        Value of the `contrast` column to keep from the mBARq results. When
        `carbon2` is empty, the same string with underscores replaced by spaces
        is used to pick columns from the fitness and t-statistic tables.
    carbon2 : str, optional
        Substring used instead of `carbon` to pick those columns.

    Returns
    -------
    pandas.DataFrame
        mBARq rows outer-joined to the selected fitness columns
        (old_locus_tag == sysName) and to the same columns from the
        t-statistic table, the latter suffixed with "_t".
    """
    out_dir = dbauer_dir / "test_out"

    # mBARq side: per-gene results annotated with the old locus tag, limited to one contrast.
    locus_map = pd.read_csv(out_dir / "TnSeq_SB2B_ML5_l10_RC_old_loci.annotated.csv")[['Name', 'old_locus_tag']]
    mbarq = pd.read_csv(out_dir / "Set1NoReps_rra_results.csv", index_col=0)
    mbarq = mbarq.merge(locus_map, on='Name', how='left')
    mbarq = mbarq[mbarq.contrast == carbon].copy()

    # Reference side: fitness values and t-statistics; the t-statistic table is indexed
    # with the fitness table's column names, so both are expected to share them.
    fitness = pd.read_table(dbauer_dir / "sb2b_ml5.gene_fitness.tab")
    t_scores = pd.read_table(dbauer_dir / "sb2b_t_stat.tab")

    needle = carbon2 if carbon2 else carbon.replace('_', " ")
    fitness = fitness[['sysName'] + [col for col in fitness.columns if needle in col]]
    matched = [col for col in fitness.columns if needle in col]
    t_scores = t_scores[['sysName'] + matched]
    t_scores.columns = ['sysName'] + [f"{col}_t" for col in matched]

    combined = mbarq.merge(fitness, left_on='old_locus_tag', right_on='sysName', how='outer').drop_duplicates()
    return combined.merge(t_scores, on='sysName', how='outer').drop_duplicates()
    

In [None]:
# Combined table for the "Tween_20" contrast (columns containing "Tween 20" on the reference side).
tween = get_carbon_results("Tween_20")

In [None]:
# Encode the two hit calls as bits (mBARq = 1, FEBA = 2) so their sum identifies the class.
tween['mbarq_hits'] = (tween.neg_selection_fdr < 0.001).astype(int)
tween['db_hits'] = (tween["set1H14 Tween 20_t"] < -4).astype(int) * 2
# Assign the labels directly: the previous `tween.hits.replace(..., inplace=True)` mutated a
# temporary Series, which does not write back to `tween` under pandas Copy-on-Write.
hit_labels = {0: 'Not a hit', 1: 'mBARq Hit', 2: 'FEBA Hit', 3: 'Hit'}
tween['hits'] = (tween['mbarq_hits'] + tween['db_hits']).map(hit_labels)


In [None]:
# Hit-class counts among rows that have no missing value in any column.
tween.dropna().hits.value_counts()

In [None]:
# NOTE(review): hard-coded counts, presumably copied by hand from the value_counts output
# of the previous cell — confirm which classes they are and recompute from `tween` if needed.
288/(288+55)

In [None]:
# Correlation between the mBARq LFC and the 'set1H14 Tween 20' column.
tween[['LFC', 'set1H14 Tween 20']].corr()

In [None]:
# Plotly's qualitative "Alphabet" colour list; entries are picked by position for the hit classes.
clrs = px.colors.qualitative.Alphabet

In [None]:
# Scatter of mBARq LFC against the FEBA column, coloured by hit class.
# NOTE(review): the y axis uses 'set1H33 Tween 20', while the hit calls and the correlation
# cell use 'set1H14 Tween 20' — confirm which experiment is intended.
fig = px.scatter(
    tween,
    x='LFC',
    y='set1H33 Tween 20',
    color='hits',
    color_discrete_map={
        'Not a hit': clrs[8],
        'Hit': clrs[23],
        'mBARq Hit': clrs[14],
        'FEBA Hit': clrs[13],
    },
    hover_data=['Name', 'neg_selection_fdr'],
    labels={'LFC': 'mBARq LFC', 'set1H33 Tween 20': 'FEBA LFC'},
    category_orders={'hits': ['Not a hit', 'FEBA Hit', 'mBARq Hit', 'Hit']},
    height=900,
    width=900,
    template='plotly_white',
)

# Semi-transparent markers with a thin dark outline.
marker_style = {
    'size': 10,
    'line': {'width': 1, 'color': 'DarkSlateGrey'},
    'opacity': 0.6,
}
fig.update_traces(marker=marker_style, selector={'mode': 'markers'})

fig.update_layout(font={'size': 20})

In [None]:
# NOTE(review): `test` is first assigned in a later cell, and the frame built there has the
# columns level_0 / level_1 / correlation rather than "neg|lfc", "set1H20 CAS aminos" or 'id'.
# This cell therefore appears to rely on a frame from an earlier session and will not run
# on a fresh kernel — confirm the intended input or remove the cell.
fig = px.scatter(test, x="neg|lfc", y="set1H20 CAS aminos", trendline='ols', hover_data=['id'] )
# Adds a straight line between opposite corners of the plotting area (paper coordinates 0..1).
fig.update_layout(shapes = [{'type': 'line', 'yref': 'paper', 'xref': 'paper', 'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1}])

In [None]:
# Gene fitness table (tab-separated), reduced to the gene identifier plus all 'set1' columns.
db_res = pd.read_csv(dbauer_dir / "sb2b_ml5.gene_fitness.tab", sep="\t")
db_res = db_res.loc[:, ['sysName', *(col for col in db_res.columns if 'set1' in col)]]
db_res.head()
#db_res.melt(id_vars=['sysName'], var_name='contrast', value_name='LFC')

In [None]:
# Map from mBARq gene Name to the old locus tag.
gene_ann = pd.read_csv(
    dbauer_dir / "test_out/TnSeq_SB2B_ML5_l10_RC_old_loci.annotated.csv"
)[['Name', 'old_locus_tag']]

# mBARq LFCs in wide form: one row per old locus tag, one column per contrast.
my_res = (
    pd.read_csv(dbauer_dir / "test_out/Set1NoReps_rra_results.csv", index_col=0)
    .merge(gene_ann, on='Name', how='left')
    .dropna(subset=['old_locus_tag'])  # genes without an old locus tag cannot be matched
    .loc[:, ['old_locus_tag', 'contrast', 'LFC']]
    .drop_duplicates()
    .pivot(index='old_locus_tag', columns='contrast')
    .reset_index()
)

In [None]:
# Flatten the two-level column labels left by pivot: use the contrast name (second level)
# when present, otherwise the first level (e.g. 'old_locus_tag').
# Labels that are already plain strings are kept as they are, so re-running this cell
# no longer indexes into the string itself and corrupts the column names.
my_res.columns = [
    (col[1] if col[1] else col[0]) if isinstance(col, tuple) else col
    for col in my_res.columns
]


In [None]:
# Genes present in both result sets, matched on old locus tag == sysName.
fdf = pd.merge(
    my_res,
    db_res,
    how='inner',
    left_on='old_locus_tag',
    right_on='sysName',
)

In [None]:
# Pairwise correlations between all numeric columns, in long format.
# The identifier columns ('old_locus_tag', 'sysName') are strings; they are excluded
# explicitly because DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
fdf_corr = (
    fdf.select_dtypes(include='number')
    .corr()
    .stack()
    .reset_index(name="correlation")
)

In [None]:
# Column names available on the first axis of the correlation table.
# Reads from `fdf_corr` because `test` is only created further down, so the original
# reference failed on a fresh kernel.
fdf_corr.level_0.unique()

In [None]:
# Column names available on the second axis of the correlation table.
# Reads from `fdf_corr` because `test` is only created further down, so the original
# reference failed on a fresh kernel.
fdf_corr.level_1.unique()

In [None]:
# Display names for the eight FEBA columns kept in the comparison plot.
level_0_rename = {
    'set1H14 Tween 20': 'FEBA: Tween 20',
    'set1H20 CAS aminos': 'FEBA: CAS amino acids',
    'set1H21 Putrescine': 'FEBA: Putrescine',
    'set1H22 NAG': 'FEBA: NAG',
    'set1H4 D-Maltose': 'FEBA: D-Maltose',
    'set1H28 D-Mannose': 'FEBA: D-Mannose',
    'set1H32 D,L-Lactate': 'FEBA: D,L-Lactate',
    'set1H30 L-Serine': 'FEBA: L-Serine',
}
# The columns to keep are exactly the keys of the rename map, in the same order.
level_0_columns = list(level_0_rename)

# Display names for the eight mBARq contrasts, listed in the same condition order.
level_1_rename = {
    'Tween_20': 'mBARq: Tween 20',
    'CAS_amino_acids': 'mBARq: CAS amino acids',
    'Putrescine_Dihydrochloride': 'mBARq: Putrescine',
    'N-Acetyl-D-Glucosamine': 'mBARq: NAG',
    'D-Maltose_monohydrate': 'mBARq: D-Maltose',
    'D-Mannose': 'mBARq: D-Mannose',
    'Sodium_D,L-Lactate': 'mBARq: D,L-Lactate',
    'L-Serine': 'mBARq: L-Serine',
}
level_1_columns = list(level_1_rename)

# Keep only the FEBA-column x mBARq-contrast pairs selected above (as an independent copy).
test = fdf_corr.loc[
    fdf_corr['level_0'].isin(level_0_columns)
    & fdf_corr['level_1'].isin(level_1_columns)
].copy()

In [None]:
# Swap the raw column names for the display labels. The result is assigned back instead of
# calling replace(..., inplace=True) on the selected column, which acts on a temporary
# Series and does not update `test` under pandas Copy-on-Write.
test['level_0'] = test['level_0'].replace(level_0_rename)
test['level_1'] = test['level_1'].replace(level_1_rename)

In [None]:
# Display the filtered, relabelled correlation pairs.
test

In [None]:
# Dot-matrix of the FEBA vs mBARq condition correlations: colour and size both encode correlation.
g = sns.relplot(
    data=test.sort_values(['level_0', 'level_1']),
    x="level_0", y="level_1", hue="correlation", size="correlation",
    palette="vlag", hue_norm=(0, 1), edgecolor=".5",
    height=8, sizes=(50, 500), size_norm=(0, 1),
)

# Tweak the figure to finalize
g.set(xlabel="", ylabel="", aspect="equal")
g.despine(left=True, bottom=True)
g.ax.margins(.1)
for label in g.ax.get_xticklabels():
    label.set_rotation(90)
# Matplotlib 3.7 renamed Legend.legendHandles to Legend.legend_handles and 3.9 removed
# the old attribute, so look up the new name first and fall back for older versions.
legend_handles = getattr(g.legend, "legend_handles", None)
if legend_handles is None:
    legend_handles = g.legend.legendHandles
for artist in legend_handles:
    artist.set_edgecolor(".7")

In [None]:
# Load the t-statistic column for 'set1H20 CAS aminos' and rename it to a space-free name.
tstat = pd.read_table(dbauer_dir/"sb2b_t_stat.tab")[['sysName', 'set1H20 CAS aminos']]
tstat.columns = ['sysName', "set1H20_CAS_aminos_t"]
# NOTE(review): the `my_res` built by the pivot cell above has 'old_locus_tag' plus one column
# per contrast; no 'sysName' column is visible there, so this merge looks like it depends on an
# earlier version of `my_res` — confirm. The cell also reassigns `my_res` from itself, so
# running it twice merges the t-statistic column a second time.
my_res = my_res.merge(tstat, on='sysName', how='outer')

In [None]:
# Preview `my_res` after attaching the t-statistic column.
my_res.head()

In [None]:
# Rows flagged by either method: FDR below 1e-4 in either selection direction,
# or an absolute t-statistic above 4.
sig_res = my_res.loc[
    (my_res['neg|fdr'] < 0.0001)
    | (my_res['pos|fdr'] < 0.0001)
    | (my_res['set1H20_CAS_aminos_t'].abs() > 4)
]

In [None]:
# (rows, columns) of the rows flagged by either method.
sig_res.shape

In [None]:
# NOTE(review): identical to the previous cell (same expression, same output).
sig_res.shape

In [None]:
# Rows called by both criteria. The t-statistic comparison needs its own parentheses:
# `&` binds tighter than `>`, so the unparenthesised form evaluated `(fdr_mask & abs(t)) > 4`.
sig_res[(sig_res['neg|fdr'] < 0.05) & (sig_res['set1H20_CAS_aminos_t'].abs() > 4)].shape

In [None]:
# Rows with a negative-selection FDR below 0.05.
sig_res.loc[sig_res['neg|fdr'].lt(0.05)].shape

In [None]:
# Rows whose absolute t-statistic exceeds 4.
sig_res.loc[sig_res['set1H20_CAS_aminos_t'].abs().gt(4)].shape

In [None]:
# Scatter of the flagged rows with an OLS trendline.
# NOTE(review): the columns "neg|lfc", "set1H20 CAS aminos" and 'neg|fdr' are not created by any
# visible cell for `my_res`/`sig_res` — confirm they exist in the frame this is run against.
px.scatter(sig_res,  x="neg|lfc", y="set1H20 CAS aminos", trendline='ols', hover_data=['sysName', 'neg|fdr'])