In [None]:
import collections
import pandas as pd
from pathlib import Path
from tnseq2.src.analysis import *
from tnseq2.src.method2_analysis import *
import numpy as np
from scipy.stats import ranksums
import matplotlib.pyplot as plt
import chart_studio
import chart_studio.tools as tls
import chart_studio.plotly as py
import plotly.express as px
%matplotlib inline
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()
import dash_bio as dashbio

# Table of Contents: <a id='start'></a>

1. [Loading the data](#loading-data)
2. [Method 1](#Method-1)
3. [Method 2](#Method-2)
4. [Compare the results](#Compare)

## Setup: <a id='loading-data'></a>

- Loading the example dataset. 

- `lib10_cnt` dataframe contains raw counts and meta info
- Identify `good_samples` using `calculate_correlation`

In [None]:
# Input/output locations.
# NOTE(review): these are absolute paths on the author's machine; the notebook will not
# run elsewhere without editing them.
counts_dir ="/Users/ansintsova/git_repos/nguyenb_tnseq/data/13_04_results/counts"
outdir = '/Users/ansintsova/git_repos/nguyenb_tnseq/data/01_06'
control_file = Path("/Users/ansintsova/git_repos/nguyenb_tnseq/data/13_04_results")/'controls.txt'
# Sequencing runs whose count files are loaded.
dnaids = ['dnaid1315', 'dnaid1428', 'dnaid1429', 'dnaid2015', 'dnaid2016', 'dnaid2017', 'dnaid2018', 'dnaid2019',
         'dnaid2023', 'dnaid2024', 'dnaid2025', 'dnaid2026', 'dnaid2027', 'dnaid2028', 'dnaid2029' ]
cnt_df = load_files(dnaids, Path(counts_dir))
# Keep only library_10_2; .copy() so the new column below is added to an independent frame.
lib10_cnt = cnt_df[cnt_df.library == 'library_10_2'].copy()
# Composite sample key: <sampleID>_<dnaid>_<experiment>.
lib10_cnt['sampleIDExp'] = lib10_cnt['sampleID'] + "_"+ lib10_cnt['dnaid'] + "_" + lib10_cnt['experiment']
# One row per distinct barcode annotation (built from all libraries, not only library_10_2).
annotation_df = cnt_df[['barcode', 'ShortName', 'locus_tag', 'phenotype', 'conc']].drop_duplicates()

# Per-sample control correlation; `good_samples` are the samples that pass
# (threshold presumably 0.8 as described in the markdown — confirm in calculate_correlation).
corr_df, good_samples = calculate_correlation(lib10_cnt, control_file, for_each='sampleIDExp')
good_samples = list(good_samples)
# One inoculum sample is excluded by hand; list.remove raises ValueError if it is absent.
# NOTE(review): the reason for excluding this sample is not recorded here.
good_samples.remove('unenriched_inoculum_d0_dnaid2017_TV4592A')
lib10_cnt[['barcode', 'cnt', 'ShortName', 'sampleIDExp']].head()

In [None]:
# Count retained samples per day. Sample keys are <mouse>_<day>_<dnaid>_<experiment>, but the
# "unenriched_inoculum" prefix contains an extra underscore; normalise it first (as the
# control and VST cells of this notebook do) so the day is always the second field.
n_samples = collections.Counter(
    si.replace("unenriched_", "unenriched-").split("_")[1] for si in good_samples
)
print(f"Number of independent experiments: {lib10_cnt.experiment.nunique()}")
print("Number of Samples per day:")
for day in sorted(n_samples):
    print(f'{day}: {n_samples[day]} samples')

# Method 1:  <a id='Method-1'></a>

0. For each sample, correlation between WITS dilution and counts is calculated, samples with correlations < 0.8 are discarded. 

1. Raw barcode counts are filtered for abundance. [Why?](#method-1-filter)

2. Filtered counts are input into DESeq2. [What does the output look like?](#method-1-deseq-output)

3. For each **gene** calculate mean fitness (on log2 scale) and a Z-score based on log2FoldChanges and lfcSE calculated by DESeq2. [How is Z-score calculated?](#method-1-z-score)

4. Calculate p-value associated with each Z-score and perform multiple test adjustment. [How is p value calculated?](#method-1-p-value)

### [How to interpret Z-score?](#method-1-z-score-interpret)


In [None]:
# Run Method 1 on the library-10 counts, restricted to the samples that passed the
# control-correlation check. `to_filter=1000` is the abundance filter discussed in the
# "Filtering for abundant barcodes" section (exact semantics live in analyze_library).
method1_fitness, method1_results = analyze_library(lib10_cnt, sample_id="sampleIDExp", 
                          good_samples=good_samples, 
                          dnaid='library10', experiment='2', 
                          control_file=control_file, 
                          to_filter=1000, outdir=outdir)

# Optional export of the Method 1 tables (disabled).
#method1_fitness.to_csv(Path("/Users/ansintsova/git_repos/avocado/data")/"method1_fitness.csv")
#method1_results.to_csv(Path("/Users/ansintsova/git_repos/avocado/data")/"method1_results.csv")

In [None]:
# Reproducible 5-row preview of the Method 1 results table (fixed seed).
method1_results.sample(5, random_state =42)

In [None]:
# Method 1 summary: number of rows tested, then the number of rows with padj < 0.05 per day.
print(f'Tested {len(method1_results)} genes/barcodes')
for day in ('d1', 'd2', 'd3', 'd4'):
    # Summing the boolean mask counts the rows below the 0.05 threshold.
    n_hits = (method1_results[f'{day}_padj'] < 0.05).sum()
    print(f'Number of significant hits on {day}: {n_hits}')

# Method 2
<a id='Method-2'></a>

0. For each sample, correlation between WITS dilution and counts is calculated, samples with correlations < 0.8 are discarded.
1. Raw barcode counts are run through DESeq2 VST transformation.
2. Calculate mean inoculum value for each barcode based on all inoculum samples.
3. Calculate fitness for each barcode as abundance on a given day compared to the inoculum.
4. Calculate fitness for each **gene** as the median of fitness values of the barcodes mapped to that gene. WT fitness is the median fitness of all WITS barcodes.
    - [How consistent are fitness values across mice?](#method-2-fitness-conistency)
    - [Are the fitness values for control barcodes within expectations?](#method-2-fitness-controls)
5. P-value is calculated using Mann-Whitney U test. Multi-test correction using Benjamini/Hochberg (non-negative). [How is p-value calculated?](#method-2-p-value)
6. Calculate CI as gene fitness relative to  WT fitness. Same test for significance using ssaV mutants as controls. 


In [None]:
# VST-transform the counts for the retained samples and index the result by barcode.
vst_df = run_VST_transformation(lib10_cnt, "method2-lib10-2", good_samples, outdir, sample_id='sampleIDExp').set_index('barcode')
# Optional export (disabled).
#vst_df.to_csv(Path("/Users/ansintsova/git_repos/avocado/data")/"vst_counts.csv")

In [None]:
# Run Method 2 on the VST counts; returns per-barcode fitness, per-gene fitness, CI values,
# the results table, WT fitness and ssaV CI tables (in that order).
# NOTE(review): sample_id is 'sampleID' here while the VST step used 'sampleIDExp' — confirm
# this is what method2_analysis expects.
all_fitness_df, gene_fitness_df, ci_df, results_df, wt_fitness_df, ssa_ci_df = method2_analysis(vst_df, annotation_df, good_samples, sample_id='sampleID', hits=0.05)
# Optional exports of the Method 2 tables (disabled).
#all_fitness_df.to_csv(Path("/Users/ansintsova/git_repos/avocado/data")/"method2_all_fitness.csv")
#gene_fitness_df.to_csv(Path("/Users/ansintsova/git_repos/avocado/data")/"method2_gene_fitness.csv")
#ci_df.to_csv(Path("/Users/ansintsova/git_repos/avocado/data")/"method2_all_ci.csv")
#results_df.to_csv(Path("/Users/ansintsova/git_repos/avocado/data")/"method2_all_results.csv")

#wt_fitness_df.to_csv(Path("/Users/ansintsova/git_repos/avocado/data")/"method2_wt_fitness.csv")
#ssa_ci_df.to_csv(Path("/Users/ansintsova/git_repos/avocado/data")/"method2_ssa_ci_fitness.csv")

In [None]:
# A notebook cell renders only its last bare expression, so the two table previews are
# passed to display() explicitly; the final line shows the total number of CI hits.
display(vst_df.sample(5, random_state=5))
display(results_df.sample(5, random_state=42))
results_df.ci_hits.sum()

In [None]:
# Method 2 summary. The number of tested genes/barcodes is taken from the data (rows with a
# non-null padj per day) instead of a hard-coded figure, and is reported per day.
tested_per_day = results_df.groupby('day').padj.count()
for day in ['d1', 'd2', 'd3', 'd4']:
    print(f'Tested {tested_per_day.get(day, 0)} genes/barcodes on {day}')
    # Hits are called on the CI-based adjusted p-value.
    print(f'Number of significant hits on {day}: {results_df[(results_df.day == day)&(results_df.ci_padj < 0.05)].shape[0]}')

# Comparing Method 1 and Method 2 results

## Method 1: Filtering for abundant barcodes <a id='method-1-filter'></a>

- For some inoculum samples, there are barcodes present at very low abundances
- Including these in the analysis introduces a lot of noise (See example below)

[Back to Method 1](#Method-1)

In [None]:
# Select the rows of every sample whose key contains 'inoculum'.
inoculum_samples = [s for s in lib10_cnt.sampleIDExp.unique() if 'inoculum' in s]
inoculum_counts = lib10_cnt[lib10_cnt.sampleIDExp.isin(inoculum_samples)]

In [None]:
# Histogram of raw barcode counts in the inoculum samples, one colour per sample.
(px.histogram(inoculum_counts, x='cnt', color='sampleIDExp')
.update_layout(title={"text": "Distribution of barcode counts in the inoculum", "x": 0.5}, 
               yaxis_title="Frequency", xaxis_title="Count"))

## Analyze results with Method 1 without filtering:

In [None]:
# Same Method 1 call as above, but with to_filter=0 (abundance filter disabled) for comparison.
# NOTE(review): this writes to the same `outdir` as the filtered run — confirm that
# analyze_library does not overwrite the earlier outputs.
m1_fitness_unfiltered, method1_results_unfiltered = analyze_library(lib10_cnt, sample_id="sampleIDExp", 
                          good_samples=good_samples, 
                          dnaid='library10', experiment='2', 
                          control_file=control_file, 
                          to_filter=0, outdir=outdir)

In [None]:
# Same summary as for the filtered run: rows tested, then rows with padj < 0.05 per day.
print(f'Tested {len(method1_results_unfiltered)} genes/barcodes')
for day in ('d1', 'd2', 'd3', 'd4'):
    # Summing the boolean mask counts the rows below the 0.05 threshold.
    n_hits = (method1_results_unfiltered[f'{day}_padj'] < 0.05).sum()
    print(f'Number of significant hits on {day}: {n_hits}')

In [None]:
# Day-1 comparison of filtered vs. unfiltered Method 1 results.
to_keep = ['d1_fitness_mean','d1_fitness_std', 'd1_padj']
# Inner join on the index; after the merge the `_x` columns come from the filtered run
# and the `_y` columns from the unfiltered run.
m1_filtering_comp = method1_results[to_keep].merge(method1_results_unfiltered[to_keep], left_index=True, right_index=True)
fig = px.scatter(m1_filtering_comp, x='d1_fitness_mean_x', y='d1_fitness_mean_y', hover_data=[m1_filtering_comp.index],
                width=1000, height=800, color='d1_fitness_std_y',
                labels={'d1_fitness_mean_x': 'Day 1 Fitness Filtered', 
                       'd1_fitness_mean_y': 'Day 1 Fitness Unfiltered',
                       'd1_fitness_std_y':'Day 1 stdev between barcodes (unfiltered)'},
                )
# Fixed, identical axis ranges so both axes are directly comparable; points outside
# [-8, 3] are not visible.
fig.update_xaxes(range=[-8, 3])
fig.update_yaxes(range=[-8, 3])
fig

## Example 1: *rfbI*

- In the unfiltered data, there are 2 barcodes mapping to *rfbI*, one with very low abundance.
- This barcode has an exaggerated log2FoldChange and hides the signal from the other barcode, which is present at high abundance.

In [None]:
# The two barcodes discussed in the rfbI example; show their unfiltered per-barcode rows by day.
rfbi_bc = ['AACCATAATCCCCCGAT', 'AGCTAATCCCCCTGCCG']
m1_fitness_unfiltered[m1_fitness_unfiltered.barcode.isin(rfbi_bc)].sort_values('day')

## Another Example: *rfaL*

In [None]:
# The results table stores a gene's barcodes as one comma-separated string; split and strip it
# to look up the per-barcode rows for rfaL.
rfaL_bc = [c.strip() for c in method1_results_unfiltered.loc['rfaL'].barcode.split(",")]
m1_fitness_unfiltered[m1_fitness_unfiltered.barcode.isin(rfaL_bc)].sort_values('day')

## Method 1: DESeq2 Output  <a id='method-1-deseq-output'></a>

- For each barcode, DESeq2 calculates log2FoldChange, associated error (lfcSE), and FDR corrected p-value (padj)
- Also included is the number of samples (i.e. mice) available for each day

[Back to Method 1](#Method-1)

In [None]:
# Fixed seed so the preview is the same on every run (matches the other .sample() previews).
m1_fitness_unfiltered.sample(5, random_state=42)

## Method 1: Z-Score Calculation: <a id='method-1-z-score'></a>





[Back to Method 1](#Method-1)

## Method 1: p-values
<a id='method-1-p-value'></a>
[Back to Method 1](#Method-1)

## Method 1: How to interpret Z-score? 
<a id='method-1-z-score-interpret'></a>

- Map between CI and Z-score? Z-score as another measure of CI. 

## Method 2: How consistent are fitness values across mice? 
<a id='method-2-fitness-conistency'></a>


[Back to Method 2](#Method-2)

In [None]:
# Mean, standard deviation and relative standard deviation (std / mean) of fitness
# for every gene/day combination.
rsd = gene_fitness_df.groupby(['ShortName', 'day']).agg({'Fitness': ['mean', 'std']}).reset_index()
# Flatten the two-level column index produced by agg.
rsd.columns = ['ShortName', 'day', 'mean', 'std']
rsd['rsd'] = rsd['std']/rsd['mean']


In [None]:
# Take the first '_'-separated field of sampleID as the mouse identifier and attach it
# as a new column (aligned on the row index).
mouse = gene_fitness_df.sampleID.str.split("_", expand=True)[0]
mouse.name = 'mouse'
gene_fitness = pd.concat([gene_fitness_df, mouse], axis=1)

In [None]:
# Per-mouse fitness of dcuB on each day, log-scaled y axis.
px.strip(gene_fitness[gene_fitness['ShortName'] == 'dcuB'], x='day', y='Fitness', hover_data=['mouse'], 
        log_y = True)

In [None]:
# log2 of fitness, then a wide table with one row per (gene, mouse) and one column per day.
# pivot raises if a (gene, mouse, day) combination occurs more than once.
gene_fitness['log2FC'] = np.log2(gene_fitness.Fitness)
# NOTE(review): `t` is not used by any later cell in this notebook.
t = gene_fitness.pivot(index=['ShortName', 'mouse'] , columns='day', values='log2FC').reset_index()


In [None]:
# Fixed seed so the preview is the same on every run (matches the other .sample() previews).
gene_fitness_df.sample(10, random_state=42)

## Method 2: Are the fitness values for control barcodes within expectations? <a id='method-2-fitness-controls'></a>

- Prior to both methods we calculate correlation between different dilutions of WITS barcodes and counts, and discard those with correlation of < 0.8
- For Method 1 we can also look at Z-scores and log2FC for each barcode/each dilution
- For Method 2 we can also look at fitness of each barcode in each mouse and look for outliers 

In [None]:
def get_control_df(fitness, phenotype='wt'):
    """Return long-format fitness values for the control barcodes of one phenotype.

    Parameters
    ----------
    fitness : pandas.DataFrame
        Wide table with the columns ``barcode``, ``phenotype``, ``conc``, ``day`` and
        ``inoculum`` plus one column per sample named
        ``<mouse>_<day>_<dnaid>_<experiment>``. The frame is not modified.
    phenotype : str, default 'wt'
        Value of the ``phenotype`` column to keep.

    Returns
    -------
    pandas.DataFrame
        One row per (barcode, sample) with the columns ``barcode``, ``phenotype``,
        ``conc``, ``sampleExpID``, ``fitness``, ``mouse``, ``day``, ``dnaid`` and
        ``experiment``.
    """
    # Rename on a new frame instead of assigning to ``fitness.columns``, which would
    # silently rename the caller's columns as well. The rename removes the extra
    # underscore in "unenriched_inoculum" so every sample name splits into four fields.
    fitness = fitness.rename(columns=lambda c: c.replace("unenriched_", "unenriched-"))
    fitness = fitness.drop(['day'], axis=1)

    # Keep the requested phenotype, drop every column that has a missing value for it,
    # and drop the 'inoculum' column so only per-sample columns are melted.
    wt = fitness[fitness.phenotype == phenotype].dropna(axis=1).drop(['inoculum'], axis=1)
    wt = wt.melt(id_vars=['barcode', 'phenotype', 'conc'], var_name='sampleExpID', value_name='fitness')
    # Split the sample name into its four components and attach them row by row.
    new = wt.sampleExpID.str.split("_", expand=True)
    new.columns = ['mouse', 'day', 'dnaid', 'experiment']
    wt = wt.merge(new, left_index=True, right_index=True)
    return wt

# Long-format fitness tables for each control phenotype.
wt = get_control_df(all_fitness_df)
hyb = get_control_df(all_fitness_df, phenotype='hyb')
chey = get_control_df(all_fitness_df, phenotype='chey')
ssaV_invG = get_control_df(all_fitness_df, phenotype='ssaV_invG')

In [None]:
# Raw day-1 counts of the wt control barcodes, with log2-transformed concentration and count.
wt_cnt_d1 = lib10_cnt[(lib10_cnt.phenotype == 'wt') & (lib10_cnt.day == 'd1')][['barcode', 'mouse', 'cnt', 'conc']]
wt_cnt_d1['lconc'] = np.log2(wt_cnt_d1.conc)
# +1 pseudocount so zero counts stay finite under log2.
wt_cnt_d1['lcnt'] = np.log2(wt_cnt_d1.cnt +1)

In [None]:
# The ten samples with the smallest total count.
lib10_cnt.groupby('sampleIDExp').cnt.sum().reset_index().sort_values('cnt').head(10)

In [None]:
# log2(count) vs. log2(concentration) for the wt controls, one facet per mouse, with an OLS trendline.
px.scatter(wt_cnt_d1.sort_values('mouse'), x="lconc", y="lcnt", facet_col="mouse", facet_col_wrap=3,height=3000, width=800, trendline='ols')

In [None]:
# Day shown in the four control strip plots below.
# NOTE(review): several loops in this notebook also assign to `day`; re-run this cell
# before re-running the control plots out of order.
day = 'd1'

In [None]:

# log2 fitness of the wt control barcodes per mouse on the selected day.
# NOTE(review): the variable is named *_d1 but holds whichever `day` is currently selected.
wt_d1 = wt[wt.day == day]
fig = px.strip(wt_d1, x='mouse', y=np.log2(wt_d1['fitness']), color='mouse',  hover_data=['conc', 'fitness'],
        template='simple_white', title = f'WT-{day}',
              labels={"y": "log2(Fitness)",
                     "conc": "Dilution", "fitness": "Fitness"})
# Reference line at log2(fitness) = 0, i.e. fitness = 1.
fig.add_hline(y=0, line_width=3, line_dash="dash")

In [None]:

# log2 fitness of the hyb control barcodes per mouse on the selected day.
hyb_d1 = hyb[hyb.day == day]
fig = px.strip(hyb_d1, x='mouse', y=np.log2(hyb_d1['fitness']), color='mouse',  hover_data=['conc', 'fitness'],
               title = f'hyb-{day}',
        template='simple_white', labels={"y": "log2(Fitness)",
                     "conc": "Dilution", "fitness": "Fitness"})
# Reference line at log2(fitness) = 0, i.e. fitness = 1.
fig.add_hline(y=0, line_width=3, line_dash="dash")

In [None]:

# log2 fitness of the chey control barcodes per mouse on the selected day.
chey_d1 = chey[chey.day == day]
fig = px.strip(chey_d1, x='mouse', y=np.log2(chey_d1['fitness']), color='mouse',  hover_data=['conc', 'fitness'],
               title = f'chey-{day}',
        template='simple_white', labels={"y": "log2(Fitness)",
                     "conc": "Dilution", "fitness": "Fitness"})
# Reference line at log2(fitness) = 0, i.e. fitness = 1.
fig.add_hline(y=0, line_width=3, line_dash="dash")

In [None]:

# log2 fitness of the ssaV_invG control barcodes per mouse on the selected day.
ssaV_invG_d1 = ssaV_invG[ssaV_invG.day == day]
fig = px.strip(ssaV_invG_d1, x='mouse', y=np.log2(ssaV_invG_d1['fitness']), color='mouse',  hover_data=['conc', 'fitness'],
               title = f'ssaV_invG-{day}',
        template='simple_white', labels={"y": "log2(Fitness)",
                     "conc": "Dilution", "fitness": "Fitness"})
# Reference line at log2(fitness) = 0, i.e. fitness = 1.
fig.add_hline(y=0, line_width=3, line_dash="dash")

In [None]:
gene = 'dcuB'
test = vst_gene_cnts[(vst_gene_cnts.ShortName == gene)& (vst_gene_cnts.day != 'd0')]
test_inoculum = vst_gene_cnts[(vst_gene_cnts.ShortName == gene)& (vst_gene_cnts.mouse == 'inoculum')]
fig = px.line(test, x='day', y="norm_count", color='mouse',
         hover_data=['mouse'],color_discrete_sequence= px.colors.qualitative.Dark24, 
             template='simple_white'
             )
fig.add_hline(y=test_inoculum.norm_count.mean(), line_width=3, line_dash="dash", annotation_text="Mean Inoculum Count")
fig.add_hline(y=5.2, line_width=3, line_dash="dash", annotation_text="Detection Limit")
fig.update_traces(mode='markers+lines')


## Method 2: How is p-value calculated?
<a id='method-2-p-value'></a>
[Back to Method 2](#Method-2)

In [None]:
# Median of the numeric columns only; the frame also holds string columns (barcode, mouse, ...)
# for which a median is undefined and which make DataFrame.median() raise on pandas >= 2.
wt_d1.median(numeric_only=True)

In [None]:
# Same median with mice am487 and am732 left out; numeric columns only, because the frame
# also holds string columns that make DataFrame.median() raise on pandas >= 2.
wt_d1[(wt_d1.mouse != 'am487') & (wt_d1.mouse != 'am732')].median(numeric_only=True)

# Compare the Results <a id='Compare'></a>

[Back to the start](#start)

## Merge Results from Method 1 and 2.

In [None]:
# Melt method1_results:
# The wide Method 1 table has one column per (day, statistic); it is reshaped to one row
# per (gene, day). `not_to_melt` lists the annotation columns kept as identifiers.

not_to_melt = ['gene', 'locus', 'num_barcodes', 'library', 'barcode', 'sstart', 'sseqid']
def melt_day(df, var, id_vars=None):
    """Reshape the per-day columns of one statistic into long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table whose index becomes a regular column (via ``reset_index``) and whose
        per-day columns are named ``<day>_<var>``.
    var : str
        Statistic to melt; every column whose name contains ``var`` is used, so a name
        that is a substring of another statistic's name would pick up both.
    id_vars : list of str, optional
        Identifier columns to keep. Defaults to the notebook-level ``not_to_melt`` list.

    Returns
    -------
    pandas.DataFrame
        The identifier columns plus ``day`` (text before the first underscore of the
        original column name) and a value column named ``var``.
    """
    if id_vars is None:
        id_vars = not_to_melt
    value_cols = [c for c in df if var in c]
    mdf = df.reset_index().melt(id_vars=id_vars, value_vars=value_cols, value_name=var, var_name='day')
    # 'd1_padj' -> 'd1'
    mdf['day'] = mdf.day.str.split("_", expand=True)[0]
    return mdf
# Statistics to reshape; each is melted on its own and the long tables are outer-joined on
# the identifier columns plus 'day'.
to_melt = ['padj', 'fitness_mean', 'num_samples', 'zscore', 'ci', 'fitness_std', 'pval']
merge_keys = not_to_melt + ['day']
method1_melted = None
for v in to_melt:
    print(v)
    mdf = melt_day(method1_results, v).drop_duplicates()
    # Start from the first melted table rather than an empty all-object frame, so the key
    # columns keep their real dtypes through the merges.
    if method1_melted is None:
        method1_melted = mdf
    else:
        method1_melted = method1_melted.merge(mdf, on=merge_keys, how='outer')
    print('done')
# Keep gene/day plus the melted statistics and tag every column with the method name.
method1_melted = method1_melted[['gene', 'day'] + to_melt]
method1_melted.columns = [f'{c}_method1' for c in method1_melted.columns]

In [None]:

# Restore the un-suffixed names of the two join keys.
method1_melted = method1_melted.rename({'gene_method1': 'gene', 'day_method1': 'day'}, axis=1)


In [None]:
# Tag every Method 2 column with the method name, then restore the two join keys
# (ShortName is renamed to 'gene' to match the Method 1 table).
method2_melted = results_df.reset_index()
method2_melted.columns = [f'{c}_method2' for c in method2_melted.columns]
method2_melted = method2_melted.rename({'ShortName_method2': 'gene', 'day_method2': 'day'}, axis=1)

In [None]:
# Inspect the renamed Method 2 table.
method2_melted

In [None]:
# Inner join: only (gene, day) pairs present in both methods are kept.
compare = method1_melted.merge(method2_melted, on=['gene', 'day'])
# Method 1 hit = adjusted p-value below 0.05.
compare['hits_method1'] = compare['padj_method1'] < 0.05

In [None]:
# Inspect the merged comparison table.
compare

In [None]:
# Encode the two boolean hit flags as one integer (Method 2 weighs 3, Method 1 weighs 1),
# giving 0 = neither, 1 = Method 1 only, 3 = Method 2 only, 4 = both ...
compare['hits'] = compare['ci_hits_method2'].astype(int)*3 + compare['hits_method1'].astype(int)
# ... then replace the codes with readable labels.
compare['hits'] = compare['hits'].replace({0: 'Not a hit', 1: 'Method1 Hit', 3: 'Method2 Hit', 4: 'Method 1&2 Hit'})
compare

In [None]:
# Day-1 fitness: Method 1 mean fitness (x) against Method 2 mean fitness (y, log axis),
# coloured by the Method 2 `hits` column.
day1= compare[compare.day == 'd1']
fig = px.scatter(day1, x='fitness_mean_method1', y='mean_fitness_method2', color='hits_method2', 
                  log_y=True, hover_data=['gene'], 
                template='simple_white', 
                
                )
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig

In [None]:
# Persist the comparison table.
# NOTE(review): absolute path on the author's machine; the file is overwritten on every run.
compare.to_csv("/Users/ansintsova/git_repos/avocado/data/compare.csv")

In [None]:
# Day-1 CI: Method 1 CI (x) against Method 2 median CI (y), both on log axes,
# coloured by the combined hit label.
day1= compare[compare.day == 'd1']
fig = px.scatter(day1, x='ci_method1', y='median_CI_method2', color='hits', 
                 log_x=True, log_y=True, hover_data=['gene'],
                template='simple_white', 
                color_discrete_sequence= px.colors.qualitative.Vivid,
                )
fig.update_traces(marker=dict(size=14,
                              line=dict(width=0.5,
                                        color='black')),
                  selector=dict(mode='markers'))
# Same explicit tick positions on both axes.
fig.update_yaxes(tickvals=[0.01, 0.1, 0.5,1,2, 10, 100])
fig.update_xaxes(tickvals=[0.01, 0.1, 0.5,1,2, 10, 100])
fig

In [None]:

def get_median_for_gene_on_a_day(df, annotation, day, grp_by='ShortName'):
    """Median value per group for every sample column belonging to one day.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-barcode values with ``barcode`` as a column or index level and one column
        per sample.
    annotation : pandas.DataFrame
        Barcode annotation containing ``barcode`` and the ``grp_by`` column.
    day : str
        Substring that identifies the sample columns of the day (e.g. ``'_d1'``).
    grp_by : str, default 'ShortName'
        Annotation column to group by.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``grp_by``, one column per matching sample, holding the median over
        the barcodes of each group.
    """
    df = df.merge(annotation, on='barcode')
    sample_cols = [c for c in df.columns if day in c]
    # Group by the requested column (previously hard-coded to 'ShortName', which made the
    # grp_by argument a no-op).
    return df.groupby(grp_by)[sample_cols].median()

# Per-gene median VST counts for every sample, built one day at a time and then reshaped
# to long format with the sample name split into its components.
days = ['_d0', '_d1', '_d2', '_d3', '_d4']
vst_cnt_genes = []
# The loop variable is deliberately not called `day`: the control-plot cells read a
# notebook-level `day` and would silently plot nothing after this cell overwrote it.
for day_suffix in days:
    print(day_suffix)
    vst_cnt_genes.append(get_median_for_gene_on_a_day(vst_df, annotation_df, day_suffix))
vst_gene_cnts = pd.concat(vst_cnt_genes, axis=1)
# Remove the extra underscore so every sample name splits into exactly four fields.
vst_gene_cnts.columns =[c.replace('unenriched_inoculum', 'unenriched-inoculum') for c in vst_gene_cnts.columns]
vst_gene_cnts = vst_gene_cnts.reset_index().melt(id_vars='ShortName', var_name='SampleID', value_name='norm_count')


new = vst_gene_cnts.SampleID.str.split('_', expand=True)
new.columns = ['mouse', 'day', 'dnaid', 'experiment']
vst_gene_cnts = pd.concat([vst_gene_cnts, new], axis=1)
# NOTE(review): absolute path on the author's machine; the file is overwritten on every run.
vst_gene_cnts.to_csv("/Users/ansintsova/git_repos/avocado/data/vst_gene_counts.csv")

In [None]:
# Inspect the long-format per-gene VST counts.
vst_gene_cnts

In [None]:
# Inspect the comparison table again (now including the combined `hits` label).
compare

In [None]:
# Mean normalised count of dcuB over the rows whose mouse field is exactly 'inoculum'.
vst_gene_cnts[(vst_gene_cnts.mouse == 'inoculum') &(vst_gene_cnts.ShortName == 'dcuB')].norm_count.mean()

# Counts

In [None]:
# Per-mouse normalised counts of one gene over days d1-d4, with the mean inoculum level
# and a fixed detection limit as dashed reference lines.
gene = 'dcuB'
test = vst_gene_cnts[(vst_gene_cnts.ShortName == gene)& (vst_gene_cnts.day != 'd0')]
test_inoculum = vst_gene_cnts[(vst_gene_cnts.ShortName == gene)& (vst_gene_cnts.mouse == 'inoculum')]
fig = px.line(test, x='day', y="norm_count", color='mouse',
         hover_data=['mouse'],color_discrete_sequence= px.colors.qualitative.Dark24, 
             template='simple_white'
             )
fig.add_hline(y=test_inoculum.norm_count.mean(), line_width=3, line_dash="dash", annotation_text="Mean Inoculum Count")
# NOTE(review): 5.2 is a hard-coded detection limit; its derivation is not shown in this notebook.
fig.add_hline(y=5.2, line_width=3, line_dash="dash", annotation_text="Detection Limit")
fig.update_traces(mode='markers+lines')



In [None]:
# Per-mouse normalised counts of one gene against the numeric day, with the mean inoculum
# level and a fixed detection limit as dashed reference lines.
gene = 'rfaI'
# .copy() so the dayN column is added to an independent frame rather than to a slice of
# vst_gene_cnts (avoids pandas' SettingWithCopyWarning and any ambiguity about what is modified).
test = vst_gene_cnts[(vst_gene_cnts.ShortName == gene)& (vst_gene_cnts.day != 'd0')].copy()
test_inoculum = vst_gene_cnts[(vst_gene_cnts.ShortName == gene)& (vst_gene_cnts.mouse == 'inoculum')]
test['dayN'] = test.day.replace({'d1':1, 'd2':2, 'd3':3, 'd4':4})
fig = px.scatter(test, x='dayN', y="norm_count", color='mouse',
         hover_data=['mouse'],color_discrete_sequence= px.colors.qualitative.Dark24, 
             template='simple_white', 
             )
fig.add_hline(y=test_inoculum.norm_count.mean(), line_width=3, line_dash="dash", annotation_text="Mean Inoculum Count")
# NOTE(review): 5.2 is a hard-coded detection limit; its derivation is not shown in this notebook.
fig.add_hline(y=5.2, line_width=3, line_dash="dash", annotation_text="Detection Limit")



In [None]:
# Distribution of normalised counts of one gene per day (d0 included), with the mean
# inoculum level and a fixed detection limit as dashed reference lines.
gene = 'dcuB'
# .copy() so the dayN column is added to an independent frame rather than to a slice of
# vst_gene_cnts (avoids pandas' SettingWithCopyWarning).
test = vst_gene_cnts[(vst_gene_cnts.ShortName == gene)].copy()
# Only d1-d4 are mapped; other day labels (e.g. 'd0') are left unchanged.
test['dayN'] = test.day.replace({'d1':1, 'd2':2, 'd3':3, 'd4':4})
test_inoculum = vst_gene_cnts[(vst_gene_cnts.ShortName == gene)& (vst_gene_cnts.mouse == 'inoculum')]
fig = px.box(test, x='day', y="norm_count", color='day',
         hover_data=['mouse'],
             template='simple_white'
             )
fig.add_hline(y=test_inoculum.norm_count.mean(), line_width=3, line_dash="dash", annotation_text="Mean Inoculum Count")
# NOTE(review): 5.2 is a hard-coded detection limit; its derivation is not shown in this notebook.
fig.add_hline(y=5.2, line_width=3, line_dash="dash", annotation_text="Detection Limit")
fig

In [None]:
# Split sampleID into its four '_'-separated fields; assigning four column names fails
# if any sampleID has a different number of fields.
new = gene_fitness_df.sampleID.str.split('_', expand=True)
new.columns = ['mouse', 'day', 'dnaid', 'experiment']
# 'day' is left out of the concat because gene_fitness_df already carries a day column.
gene_fitness_annotated = pd.concat([gene_fitness_df, new[['mouse', 'dnaid', 'experiment']]], axis=1)
# NOTE(review): absolute path on the author's machine; the file is overwritten on every run.
gene_fitness_annotated.to_csv('/Users/ansintsova/git_repos/avocado/data/gene_fitness_annotated.csv')

In [None]:
# Inspect the annotated per-gene fitness table.
gene_fitness_annotated

In [None]:
# NOTE(review): this bare expression is not the last statement of the cell, so it displays nothing.
ci_df
# Same annotation as for the gene fitness table: split sampleID into four fields and
# attach mouse, dnaid and experiment.
new = ci_df.sampleID.str.split('_', expand=True)
new.columns = ['mouse', 'day', 'dnaid', 'experiment']
ci_annotated = pd.concat([ci_df, new[['mouse', 'dnaid', 'experiment']]], axis=1)
# NOTE(review): absolute path on the author's machine; the file is overwritten on every run.
ci_annotated.to_csv('/Users/ansintsova/git_repos/avocado/data/ci_annotated.csv')

In [None]:
# Per-mouse fitness of one gene over time on a log axis, with a dashed reference line at fitness = 1.
gene = 'dcuB'
test = gene_fitness_annotated[gene_fitness_annotated.ShortName == gene]

fig = px.line(test, x='day', y="Fitness", color='mouse',
         hover_data=['mouse'], color_discrete_sequence= px.colors.qualitative.Dark24, 
             template='simple_white', log_y=True, 
             )
fig.add_hline(y=1, line_width=3, line_dash="dash", annotation_text="Fitness")
fig.update_traces(mode='markers+lines')
fig.update_yaxes(tickvals=[0.01, 0.1, 0.5,1,2, 10, 100])

In [None]:
# Distribution of per-mouse fitness of one gene per day on a log axis.
gene = 'dcuB'
test = gene_fitness_annotated[gene_fitness_annotated.ShortName == gene]

fig = px.box(test, x='day', y="Fitness", color='day',
         hover_data=['mouse'],
             template='simple_white', log_y=True
             )
fig.update_yaxes(tickvals=[0.01, 0.1, 0.5,1,2, 10, 100])
# The dashed line sits at fitness = 1; the previous "Detection Limit" label was carried
# over from the normalised-count plots, where the limit is drawn at 5.2 on a different scale.
fig.add_hline(y=1, line_width=3, line_dash="dash", annotation_text="Fitness = 1")

In [None]:
# Days for which mouse am731 has fitness values (via the derived `mouse` column).
gene_fitness[gene_fitness.mouse == 'am731'].day.unique()

In [None]:
# Same check against the raw sampleID prefix, as a cross-check of the derived `mouse` column.
gene_fitness_df[gene_fitness_df.sampleID.str.startswith('am731')].day.unique()

In [None]:
# All annotated fitness rows for hybA.
gene_fitness_annotated[gene_fitness_annotated.ShortName == 'hybA']