In [1]:
import pandas as pd
import numpy as np
from numpy.random import RandomState
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import plotly.express as px

# Load the maps

In [2]:
def load_map(file, name):
    """Read a barcode map CSV and reshape it into a one-row read-count table.

    Parameters
    ----------
    file : path or file-like
        Anything accepted by ``pandas.read_csv``. The table must contain the
        columns ``barcode`` and ``number_of_reads``.
    name : str
        Label written into the ``sample`` column; it becomes the single row
        label of the returned table.

    Returns
    -------
    pandas.DataFrame
        Index ``sample`` (one row, ``name``), one column per barcode, values
        are ``number_of_reads``.
    """
    wanted_columns = ['barcode', 'number_of_reads', 'sample']
    barcode_reads = pd.read_csv(file).assign(sample=name)
    # Rows that are identical on the three kept columns collapse to one;
    # pivot would otherwise refuse duplicated (sample, barcode) pairs.
    unique_rows = barcode_reads[wanted_columns].drop_duplicates()
    return unique_rows.pivot(index="sample", columns='barcode', values='number_of_reads')
    
    
def rarefaction(M, seed=0, depth=1000000):
    """Randomly subsample every row of a count matrix to at most ``depth`` reads.

    Each row of ``M`` is one sample. Its counts are turned into relative
    abundances and reads are drawn (with replacement) according to those
    probabilities. A sample whose total is not larger than ``depth`` is
    resampled at its own total instead of being up-sampled.

    Parameters
    ----------
    M : pandas.DataFrame
        Count matrix, rows = samples, columns = variables (e.g. barcodes).
    seed : int, default 0
        Seed of the ``RandomState`` used for the draws, so repeated calls
        with the same arguments return the same result.
    depth : int, default 1000000
        Maximum number of reads drawn per sample.

    Returns
    -------
    rarefied : pandas.DataFrame
        Resampled counts with the same index and columns as ``M``.
    depths : list of int
        Number of reads actually drawn for each sample, in row order.
    """
    prng = RandomState(seed)  # reproducible results
    counts = M.to_numpy()
    # Row totals as a plain array. Indexing the pandas Series of totals with
    # an integer is a *label* lookup, so it reads the wrong row (or fails)
    # whenever the index of M is not exactly 0..n-1.
    totals = M.sum(axis=1).to_numpy()
    nvar = M.shape[1]  # number of variables
    rarefied = np.zeros_like(counts)
    depths = []
    for i, total in enumerate(totals):  # for each sample
        n_draws = int(min(depth, total))
        depths.append(n_draws)
        if total == 0:
            # Nothing to sample from: keep the all-zero row instead of
            # dividing by zero and handing NaN probabilities to choice().
            continue
        p = counts[i] / float(total)  # relative abundance -> probability
        choice = prng.choice(nvar, n_draws, p=p)
        rarefied[i] = np.bincount(choice, minlength=nvar)

    return pd.DataFrame(rarefied, index=M.index, columns=M.columns), depths


def sat_curve(df, depths, cutoff=100):
    """Build a saturation curve: detected insertions as a function of depth.

    For every requested depth the table is rarefied and the number of
    columns whose rarefied count is strictly greater than ``cutoff`` is
    recorded. Only the first row (sample) of ``df`` is evaluated.

    Parameters
    ----------
    df : pandas.DataFrame
        Count table passed to ``rarefaction``.
    depths : iterable of int
        Sequencing depths to subsample to.
    cutoff : int, default 100
        A column counts as detected when its rarefied count exceeds this.

    Returns
    -------
    pandas.DataFrame
        One row per depth with columns ``num_insertions`` and ``num_reads``
        (the depth actually realised for the first sample).
    """
    insertion_counts, read_counts = [], []
    for target_depth in depths:
        rarefied, realised = rarefaction(df, depth=target_depth)
        above_cutoff = rarefied > cutoff
        insertion_counts.append(above_cutoff.sum(axis=1).iloc[0])
        read_counts.append(realised[0])
    curve = pd.DataFrame([insertion_counts, read_counts],
                         index=['num_insertions', 'num_reads'])
    return curve.T


# def num_supported(read_range, pivot_lib):
#     supported = []
#     for support in read_range:
#         supported.append((pivot_lib>support).sum(axis=1).iloc[0])
#         sdf = pd.DataFrame([range(0,500,10), supported], index=['num_reads', 'num_bc']).T
#     return sdf


def depth_vs_num_inserts(pivot_map, depth, cutoffs):
    """Compute saturation curves for several read-count cutoffs and stack them.

    Parameters
    ----------
    pivot_map : pandas.DataFrame
        Count table forwarded to ``sat_curve``.
    depth : iterable of int
        Sequencing depths forwarded to ``sat_curve``.
    cutoffs : iterable
        Cutoffs to evaluate; one curve is produced per value.

    Returns
    -------
    pandas.DataFrame
        The per-cutoff curves concatenated row-wise, with an extra
        ``ReadCutoff`` column holding ``'<cutoff>Reads'``.
    """
    per_cutoff_curves = [
        sat_curve(pivot_map, depth, cutoff).assign(ReadCutoff=f'{cutoff}Reads')
        for cutoff in cutoffs
    ]
    return pd.concat(per_cutoff_curves)

In [3]:
# Directory holding the annotated barcode-map CSVs.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
data_dir = Path("/Users/ansintsova/git_repos/nguyenb_tnseq/data/08_22_manuscript")

# One single-row (sample x barcode) read-count table per library.
ec_map = load_map(file=data_dir.joinpath("ec_map.annotated.csv"), name='ec')
lib11_map = load_map(
    file=data_dir.joinpath("library_11_1_clean_R1.map.annotated.csv"),
    name="lib11",
)
db_map = load_map(file=data_dir.joinpath("TnSeq_SB2B_ML5_l10_RC.annotated.csv"), name='DB')

In [16]:
# Total reads per library (row sum of each map); the depth grids below are
# chosen relative to these totals.
print(ec_map.sum(axis=1))
print(lib11_map.sum(axis=1))
print(db_map.sum(axis=1))

# Sequencing depths at which each library is subsampled.
ec_depth = [*range(100_000, 1_400_000, 200_000)]
lib11_depth = [*range(50_000, 9_000_000, 1_000_000)]
db_depth = [*range(100_000, 16_000_000, 1_000_000)]

# A barcode counts as detected when its rarefied read count exceeds the cutoff.
cutoffs = [5, 10, 50, 100, 500]

sample
ec    1347088
dtype: int64
sample
lib11    9023253
dtype: int64
sample
DB    16272047
dtype: int64


In [17]:
# Saturation curves for library 11, one per read-count cutoff.
l11_curves = depth_vs_num_inserts(pivot_map=lib11_map, depth=lib11_depth, cutoffs=cutoffs)


In [6]:
# Saturation curves for the DB library, one per read-count cutoff.
db_curves = depth_vs_num_inserts(pivot_map=db_map, depth=db_depth, cutoffs=cutoffs)

In [7]:
# Saturation curves for the ec library, one per read-count cutoff.
ec_curves = depth_vs_num_inserts(pivot_map=ec_map, depth=ec_depth, cutoffs=cutoffs)

In [18]:
# Saturation plot for library 11: insertions detected vs. reads sampled,
# one line per read-count cutoff.
fig = px.line(l11_curves,
              x="num_reads",
              y="num_insertions",
              color='ReadCutoff',
              # num_reads holds raw read counts (never divided by 1e6), so the
              # axis title must not claim the values are in millions.
              labels = {'num_reads': "Sequencing Depth (number of reads)",
                        'num_insertions': "Number of unique insertions",
                        'ReadCutoff': 'Detection Limit'},
              template = "plotly_white",
              height=600,
              width=1000)

# update_traces returns the figure, so as the last expression it is displayed.
fig.update_traces(mode='markers+lines')

In [9]:
# Saturation plot for the DB library: insertions detected vs. reads sampled,
# one line per read-count cutoff.
fig = px.line(db_curves,
              x="num_reads",
              y="num_insertions",
              color='ReadCutoff',
              # num_reads holds raw read counts (never divided by 1e6), so the
              # axis title must not claim the values are in millions.
              labels = {'num_reads': "Sequencing Depth (number of reads)",
                        'num_insertions': "Number of unique insertions",
                        'ReadCutoff': 'Detection Limit'},
              template = "plotly_white",
              height=600,
              width=1000)

# update_traces returns the figure, so as the last expression it is displayed.
fig.update_traces(mode='markers+lines')

In [10]:
# Saturation plot for the ec library: insertions detected vs. reads sampled,
# one line per read-count cutoff.
fig = px.line(ec_curves,
              x="num_reads",
              y="num_insertions",
              color='ReadCutoff',
              # num_reads holds raw read counts (never divided by 1e6), so the
              # axis title must not claim the values are in millions.
              labels = {'num_reads': "Sequencing Depth (number of reads)",
                        'num_insertions': "Number of unique insertions",
                        'ReadCutoff': 'Detection Limit'},
              template = "plotly_white",
              height=600,
              width=1000)

# update_traces returns the figure, so as the last expression it is displayed.
fig.update_traces(mode='markers+lines')