In [None]:
# %load load_manuscript_data.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import plotly.io as pio
import yaml

# Notebook-wide display defaults for seaborn, pandas and matplotlib.
sns.set_context("notebook", font_scale=1.4)
# Show up to 100 columns / rows before pandas truncates a displayed frame.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
# Matplotlib defaults: figure size, saved-figure DPI, and font / line sizes.
plt.rcParams["figure.figsize"] = (16, 12)
plt.rcParams['savefig.dpi'] = 200
plt.rcParams['figure.autolayout'] = False
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.size'] = 16
plt.rcParams['lines.linewidth'] = 2.0
plt.rcParams['lines.markersize'] = 8
plt.rcParams['legend.fontsize'] = 14
# Render floats with thousands separators and four decimal places.
pd.set_option('display.float_format', lambda x: '{:,.4f}'.format(x))


# Read the manuscript configuration (paths and per-dataset settings) from YAML.
config_file = "manuscript_config.yaml"
with open(config_file) as file:
    # Parse the YAML document into Python objects; the result is indexed
    # below like a nested dictionary.
    # NOTE(review): yaml.FullLoader can construct more than plain data types;
    # if the config only holds plain mappings/strings, yaml.safe_load would be
    # the more defensive choice -- confirm before switching.
    configs = yaml.load(file, Loader=yaml.FullLoader)
    
# Run on server:
# `run_on` selects which entry of each path mapping in the config is used.
run_on = "server"
root = Path(configs['root'][run_on])
scratchDir = Path(configs['scratchDir'][run_on])
figuresDir = Path(configs['figuresDir'][run_on])

# Named colours used by the figures below.
# 'grey' is taken from entry 8 of Plotly's qualitative "Alphabet" palette;
# every other name maps to one of the hex codes in `clrs`.
alphabetClrs = px.colors.qualitative.Alphabet
clrs = ["#f7ba65", "#bf4713", "#9c002f", "#d73d00", "#008080", "#004c4c"]
colors = {'grey': alphabetClrs[8], 
        'light_yellow': clrs[0],
        'darko': clrs[1],
        'maroon':clrs[2],
        'brighto': clrs[3],
        'teal':clrs[4],
        'darkteal':clrs[5]
       }

# Load the maps

In [None]:
from numpy.random import RandomState

In [None]:
def load_map(file, name):
    """Read a barcode map CSV and return it as a one-row, wide count table.

    Parameters
    ----------
    file : path or file-like object accepted by ``pd.read_csv``
        CSV with at least the columns ``barcode`` and ``number_of_reads``.
    name : str
        Sample label; becomes the single row label of the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``sample`` (one row, ``name``) with one column per barcode
        holding its ``number_of_reads``.

    Raises
    ------
    ValueError
        Raised by ``pivot`` when a barcode still appears with more than one
        read count after exact duplicate rows are dropped.
    """
    labelled = pd.read_csv(file).assign(sample=name)
    # Only these three columns matter; identical rows are collapsed so that
    # repeated (barcode, count) records do not break the pivot.
    unique_counts = labelled[['barcode', 'number_of_reads', 'sample']].drop_duplicates()
    return unique_counts.pivot(index="sample", columns='barcode', values='number_of_reads')
    
    
def rarefaction(M, seed=0, depth=1000000):
    """Randomly subsample every row of a count table to at most ``depth`` reads.

    Parameters
    ----------
    M : pandas.DataFrame
        Count table with one row per sample and one column per variable
        (e.g. barcode). Row labels may be of any type.
    seed : int, default 0
        Seed for the ``RandomState`` created on every call, so repeated calls
        with the same arguments return the same table.
    depth : int, default 1000000
        Target number of reads per sample. A sample with fewer reads than
        ``depth`` is resampled at its own total instead.

    Returns
    -------
    (pandas.DataFrame, list of int)
        The subsampled counts (same index and columns as ``M``) and, per
        sample, the number of reads actually drawn.

    Notes
    -----
    Reads are drawn with ``prng.choice`` using probabilities proportional to
    the observed counts. A sample whose total is zero yields an all-zero row
    and a drawn depth of 0.
    """
    prng = RandomState(seed)  # reproducible results
    counts = M.to_numpy()
    totals = counts.sum(axis=1)  # sequencing depth of each sample
    nvar = counts.shape[1]  # number of variables
    Mrarefied = np.zeros_like(counts)
    depths = []
    # Work on positions of the underlying array: row labels of M are not
    # necessarily 0..n-1 (they are sample names in this notebook).
    for i in range(counts.shape[0]):
        total = int(totals[i])
        n_draws = min(int(depth), total)
        if n_draws > 0:
            p = counts[i] / float(total)  # relative abundance -> probability
            choice = prng.choice(nvar, n_draws, p=p)
            Mrarefied[i] = np.bincount(choice, minlength=nvar)
        # else: nothing to draw, the row stays all zeros
        depths.append(n_draws)

    return pd.DataFrame(Mrarefied, index=M.index, columns=M.columns), depths


def sat_curve(df, depths, cutoff=100):
    """Count how many columns exceed ``cutoff`` reads at each rarefaction depth.

    Parameters
    ----------
    df : pandas.DataFrame
        Count table passed to ``rarefaction``; only its first row (sample)
        is evaluated.
    depths : iterable of int
        Sequencing depths to subsample to.
    cutoff : int, default 100
        A column counts as detected when its rarefied count is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        One row per depth with columns ``num_insertions`` (columns above the
        cutoff) and ``num_reads`` (depth reported by ``rarefaction``).
    """
    points = []
    for target in depths:
        rarefied, reached = rarefaction(df, depth=target)
        above_cutoff = (rarefied > cutoff).sum(axis=1)
        # Keep the first sample only, paired with the depth it was drawn at.
        points.append((above_cutoff.iloc[0], reached[0]))

    insertions = [n_ins for n_ins, _ in points]
    reads = [n_reads for _, n_reads in points]
    table = pd.DataFrame([insertions, reads], index=['num_insertions', 'num_reads'])
    return table.T


def depth_vs_num_inserts(pivot_map, depth, cutoffs):
    """Build saturation curves for several read cutoffs in long format.

    Parameters
    ----------
    pivot_map : pandas.DataFrame
        Count table passed to ``rarefaction``; only its first row (sample)
        is evaluated.
    depth : iterable of int
        Sequencing depths to subsample to.
    cutoffs : iterable of int
        Detection limits; a column counts as detected when its rarefied count
        is strictly greater than the cutoff.

    Returns
    -------
    pandas.DataFrame
        Per (cutoff, depth) pair: ``num_insertions``, ``num_reads`` and
        ``ReadCutoff`` (the label ``'<cutoff> Reads'``). Blocks for the
        individual cutoffs are concatenated in the order given.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``cutoffs`` is empty.
    """
    cutoffs = list(cutoffs)
    # Rarefaction does not depend on the cutoff and is deterministic (it
    # re-seeds its generator on every call), so subsample once per depth and
    # reuse the draw for every cutoff instead of repeating it per cutoff.
    subsampled = []
    for d in depth:
        rarefied, reached = rarefaction(pivot_map, depth=d)
        # First sample only, together with the depth it was drawn at.
        subsampled.append((rarefied.iloc[0], reached[0]))

    df_list = []
    for cutoff in cutoffs:
        n_bcs = [(row > cutoff).sum() for row, _ in subsampled]
        n_reads = [reads for _, reads in subsampled]
        df = pd.DataFrame([n_bcs, n_reads], index=['num_insertions', 'num_reads']).T
        df['ReadCutoff'] = f'{cutoff} Reads'
        df_list.append(df)
    return pd.concat(df_list)

In [None]:
# Locations of the two map files, taken from the 'unfiltered' entry of each
# dataset's 'mapFile' section in the config and resolved against `root`.
nguyen_map_file = root/configs['nguyen']['mapFile']['unfiltered']
wetmore_map_file = root/configs['wetmore']['mapFile']['unfiltered']

In [None]:
# Each map becomes a one-row table: the row label is the sample name given
# here and there is one column of read counts per barcode (see load_map).
nguyen_map = load_map(nguyen_map_file, 'nguyen')
wetmore_map = load_map(wetmore_map_file, 'wetmore')

In [None]:
# Small toy table for trying out the rarefaction helpers: the first (up to)
# 20 barcodes of the nguyen map plus a synthetic second sample 'n2'.
test = nguyen_map.T.head(20).reset_index()
# Seeded generator so 'n2' is identical on every Restart & Run All; the size
# follows the frame so a map with fewer than 20 barcodes still works.
test['n2'] = np.random.RandomState(0).randint(0, 1000, size=len(test))
test = test.set_index('barcode')
test

In [None]:
# Total reads per sample; the depth grids below are chosen against these.
print(nguyen_map.sum(axis=1))
# 50k to ~8M reads in 1M steps, plus three extra low-depth points.
nguyen_depth = sorted([*range(50_000, 9_000_000, 1_000_000), 250_000, 500_000, 750_000])
print(wetmore_map.sum(axis=1))
# 100k to ~16M reads in 1M steps.
wetmore_depth = [*range(100_000, 17_000_000, 1_000_000)]
# Detection limits (reads per barcode) compared in the saturation curves.
cutoffs = [5, 10, 50, 100, 500]

In [None]:
def rarefy1(x, depth=1000, seed=42):
    """Subsample one sample's counts to exactly ``depth`` reads.

    Parameters
    ----------
    x : pandas.Series
        Read counts of a single sample, one entry per category.
    depth : int, default 1000
        Number of reads to draw.
    seed : int, default 42
        Seed of the ``RandomState`` created for this call.

    Returns
    -------
    numpy.ndarray or list
        Array of ``len(x)`` rarefied counts summing to ``depth``; an empty
        list when ``depth`` is larger than the sample's total.
    """
    rng = RandomState(seed)
    observed = x.values
    total_reads = np.sum(observed)
    n_categories = len(x)
    weights = observed / total_reads  # relative abundance -> draw probability
    if depth > total_reads:
        # Sample is too shallow for this depth: signal it with an empty result.
        return []
    draws = rng.choice(n_categories, depth, p=weights)
    return np.bincount(draws, minlength=n_categories)

def rarefy(df, depths, seed=0):
    """Rarefy every sample (column) of ``df`` to each depth, in long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Count table with one row per category (e.g. barcode) and one column
        per sample. It is not modified.
    depths : iterable of int
        Depths to subsample every sample to.
    seed : int, default 0
        Seed forwarded to ``rarefy1`` for every sample and depth.

    Returns
    -------
    pandas.DataFrame
        Columns ``index`` (0-based row position within ``df``), ``depth``,
        ``sampleID`` (column label of ``df``) and ``counts`` (rarefied count).
        Rows are ordered by sample, then by depth, then by position.
        A sample whose total is below a requested depth contributes no rows
        for that depth; if nothing could be rarefied the frame is empty.
    """
    depths = list(depths)
    pieces = []
    # Sample-major order keeps the row order of a melt over sample columns.
    for sample_id, sample_counts in df.items():
        for depth in depths:
            rarefied = rarefy1(sample_counts, depth=depth, seed=seed)
            if len(rarefied) == 0:
                # rarefy1 returns an empty result when the sample is
                # shallower than `depth` (and when df has no rows).
                continue
            pieces.append(pd.DataFrame({'index': np.arange(len(rarefied)),
                                        'depth': depth,
                                        'sampleID': sample_id,
                                        'counts': rarefied}))
    if not pieces:
        return pd.DataFrame(columns=['index', 'depth', 'sampleID', 'counts'])
    return pd.concat(pieces, ignore_index=True)


def saturationCurves(df, depths, cutoffs, seed):
    """Count, per sample and depth, how many categories exceed each cutoff.

    Parameters
    ----------
    df : pandas.DataFrame
        Count table with one column per sample, passed to ``rarefy``.
    depths : iterable of int
        Depths to rarefy to.
    cutoffs : sequence of int
        Detection limits; a category is counted when its rarefied count is
        strictly greater than the cutoff.
    seed : int
        Seed forwarded to ``rarefy``.

    Returns
    -------
    pandas.DataFrame
        One row per (sampleID, depth) with the columns ``sampleID``,
        ``depth`` and one ``'>{c} reads'`` column per cutoff.
    """
    def exceeds(limit):
        # Deliberately a lambda. NOTE(review): pandas de-duplicates the names
        # of anonymous functions passed as a list to agg, but presumably
        # rejects several identically named regular functions -- confirm
        # before turning this into a named def.
        return lambda counts: (counts > limit).sum()

    long_counts = rarefy(df, depths, seed)
    per_sample_depth = long_counts.groupby(['sampleID', 'depth'])
    table = per_sample_depth.agg({'counts': [exceeds(c) for c in cutoffs]}).reset_index()
    # Flatten the two-level header produced by agg into plain labels.
    table.columns = ['sampleID', 'depth'] + [f'>{c} reads' for c in cutoffs]
    return table

In [None]:
# Try rarefy on the toy table. The largest depth (100000) is above any total
# the synthetic 'n2' column can reach (at most 20 values below 1000), so this
# call also exercises the "sample shallower than depth" path of rarefy1.
rdf = rarefy(test, depths=[100,500,100000])

In [None]:
# Show the toy table (indexed by barcode, one column per sample).
test

In [None]:
# Saturation table for the toy data: barcodes above 5 / 10 / 100 reads at each
# depth. Depths above a sample's total make rarefy1 return an empty result.
saturationCurves(test, [100, 500, 1000, 2000, 10000, 100000], [5,10,100], seed=89)

In [None]:
#r = saturationCurves(test, [100, 1000, 10000, 100000], [5, 10, 100], seed = 78)
#r[r.sampleID == 'n2'].groupby('depth').value.apply(lambda x: (x>5).sum())

In [None]:
# NOTE(review): `r` is never assigned in the visible notebook -- its only
# assignment (previous cell) is commented out -- so this cell raises NameError
# on Restart & Run All. 8935 is also not one of the depths used anywhere here.
r[(r.sampleID == 'n2') & (r.depth == 8935)]

In [None]:
# Scratch check of lambda default-argument binding: `i=i` freezes the loop
# value in each lambda, so the six functions return 0, 1, 4, 9, 16, 25.
listOfLambdas = [lambda i=i: i*i for i in range(6)]

In [None]:
# Same binding trick with a bound threshold `c`, as used in saturationCurves.
# NOTE(review): the list is displayed but not assigned; a later cell iterates
# over `t3`, which presumably was meant to hold this list -- confirm.
[lambda x,c=c:  x > c for c in range(3)]

In [None]:
# Call each lambda with its bound default to confirm the values differ.
for l in listOfLambdas:
    print(l())

In [None]:
# NOTE(review): `t3` is never assigned in the visible notebook, so this cell
# raises NameError on a fresh kernel.
for f in t3:
    print(f(pd.Series([10, 1])))

In [None]:
# NOTE(review): `t2` is never assigned in the visible notebook, and
# saturationCurves requires four arguments (df, depths, cutoffs, seed) while
# only two are passed here -- this cell cannot run as written.
saturationCurves(t2, [5, 100])

In [None]:
# NOTE(review): `t2` is never assigned in the visible notebook.
t2

In [None]:
# Rarefy the 'nguyen' column of the toy table to 1000 reads (default seed 42).
rarefy1(test.nguyen, 1000)

In [None]:
# Same for the synthetic 'n2' column; yields [] if its total is below 1000.
rarefy1(test.n2, 1000)

In [None]:

# Show the toy table again (indexed by barcode, one column per sample).
test

In [None]:
# Repeat of the earlier call; rarefy1 re-seeds on every call, so the same
# input gives the same result.
rarefy1(test.nguyen, 1000)

In [None]:
# Saturation curves for the nguyen map: one rarefaction per requested depth,
# evaluated at every read cutoff. Sampling up to millions of reads per depth,
# so this cell can take a while.
nguyen_curves = depth_vs_num_inserts(nguyen_map, nguyen_depth, cutoffs)

In [None]:
# Same saturation analysis for the wetmore map, on its own depth grid.
wetmore_curves = depth_vs_num_inserts(wetmore_map, wetmore_depth, cutoffs)

In [None]:
# Look up one palette entry (the hex code used for the '5 Reads' curve below).
colors['darkteal']

In [None]:
# Saturation plot for the nguyen map: unique insertions detected versus
# sequencing depth, one line per read cutoff. The keys of color_discrete_map
# must match the 'ReadCutoff' labels built in depth_vs_num_inserts.
# NOTE(review): `num_reads` holds raw read counts, while the axis label says
# "millions of reads" -- confirm the label agrees with the rendered ticks.
fig = px.line(nguyen_curves, 
              x="num_reads", 
              y="num_insertions", 
              color='ReadCutoff',
              color_discrete_map = {'5 Reads': colors['darkteal'], 
                                   '10 Reads': colors['teal'],
                                   '50 Reads': colors['maroon'],
                                   '100 Reads': colors['brighto'], 
                                   '500 Reads':colors['light_yellow']},
              labels = {'num_reads': "Sequencing Depth (millions of reads)",
                        'num_insertions': "Number of unique insertions",
                        'ReadCutoff': 'Detection Limit'}, 
              template = "plotly_white", 
              height=600, 
              width=1000)

# Draw markers on top of thicker lines and enlarge all text for print.
fig.update_traces(mode='markers+lines', line_width=4, marker_size=12)
fig.update_layout(
    
    font_size=22,
)


In [None]:
# Save the nguyen figure as a PNG under figuresDir.
pio.write_image(fig, figuresDir/'nguyen_sat_curves.png', width=1000, height=600, scale=2)

In [None]:
# Saturation plot for the wetmore map, with the same styling as the nguyen
# figure. This reassigns `fig`, so the previous figure object is replaced.
# NOTE(review): `num_reads` holds raw read counts, while the axis label says
# "millions of reads" -- confirm the label agrees with the rendered ticks.
fig = px.line(wetmore_curves, 
              x="num_reads", 
              y="num_insertions", 
              color='ReadCutoff',
              labels = {'num_reads': "Sequencing Depth (millions of reads)",
                        'num_insertions': "Number of unique insertions",
                        'ReadCutoff': 'Detection Limit'}, 
              color_discrete_map = {'5 Reads': colors['darkteal'], 
                                   '10 Reads': colors['teal'],
                                   '50 Reads': colors['maroon'],
                                   '100 Reads': colors['brighto'], 
                                   '500 Reads':colors['light_yellow']},
              template = "plotly_white", 
              height=600, 
              width=1000)

# Draw markers on top of thicker lines and enlarge all text for print.
fig.update_traces(mode='markers+lines', line_width=4, marker_size=12)
fig.update_layout(    font_size=22)


In [None]:
# Save the wetmore figure as a PNG under figuresDir.
pio.write_image(fig, figuresDir/'wetmore_sat_curves.png', width=1000, height=600, scale=2)

In [None]:
# Reload the wetmore map in its original long layout (first CSV column as the
# index) and keep only rows with at least 5 reads.
wetmore_unfiltered = pd.read_csv(wetmore_map_file, index_col=0)
wetmore_filtered = wetmore_unfiltered[wetmore_unfiltered.number_of_reads >= 5]

In [None]:
# Write the filtered map to the path configured for the wetmore dataset.
# NOTE(review): this reads configs["wetmore"]["map_file"], whereas the cells
# that load the maps read configs[...]['mapFile'] -- confirm that the YAML
# really has a "map_file" section, otherwise this raises KeyError.
wetmore_filtered.to_csv(root/configs["wetmore"]["map_file"]["filtered"])

In [None]:
# Preview the first rows of the filtered map.
wetmore_filtered.head()

In [None]:
# (rows, columns) remaining after the >= 5 reads filter.
wetmore_filtered.shape