In [1]:
### Simulation parameters
n_donors = 2
n_conditions = 3
n_cells_per_donor_cond = 300
n_clones_in_don = 5
n_positions = 100
n_variants_per_clone_lambda = 1
n_donor_variants = 10

seq_error = 0.01
don_var_lim=(0.8,1)
clone_var_lim=(0.1,0.4)
depth_lim = (4,10)
chars = ["A", "G", "C", "T"]


outdir = "/data/Mito_Trace/output/clone_pileups_simulation/samePos"
pileup_outdir = "MT/cellr_True/numread_200/"
ref_fa = "/data/Mito_Trace/data/processed/genomes/mtMasked/GRCh38_MT_blacklist_A2_2020/chrM.fasta"

In [2]:
from Bio import SeqIO
from collections import Counter
import pandas as pd
import numpy as np
import seaborn as sns
from os.path import join, exists, dirname
import os
import matplotlib.pyplot as plt
np.random.seed(42)
%matplotlib inline

from pandarallel import pandarallel
pandarallel.initialize(nb_workers=2)


INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
dirname(outdir)

'/data/Mito_Trace/output/clone_pileups_simulation'

In [4]:
params = {"donors":n_donors,"conditions":n_conditions, "cells_per_donor_cond":n_cells_per_donor_cond,
         "clones_in_don":n_clones_in_don, "positions":n_positions, 
          "variants_per_clone_lambda":n_variants_per_clone_lambda, "donor_variants":n_donor_variants}


seq_params = {"seq_error": seq_error, "don_var_lim":"_".join(np.array(don_var_lim).astype(str)), 
              "clone_var_lim": "_".join(np.array(clone_var_lim).astype(str)),
             "depth_lim":"_".join(np.array(depth_lim).astype(str))}

curr_out_meta = ""
for p in params:
    curr_out_meta = f"{curr_out_meta}__{p}_{params[p]}"

curr_out_meta = curr_out_meta.strip("__")
print(curr_out_meta)


curr_out_seq = ""
for p in seq_params:
    curr_out_seq = f"{curr_out_seq}__{p}_{seq_params[p]}"
curr_out_seq = curr_out_seq.strip("__")
print(curr_out_seq)


pile_in = join(dirname(outdir), curr_out_meta, curr_out_seq, "data")

samePos_outdir = join(outdir, curr_out_meta, curr_out_seq) #join(pile_in, "samePos")
#samePos_meta_outdir = join(pile_in, "samePos")

donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10
seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10


In [5]:
samePos_outdir

'/data/Mito_Trace/output/clone_pileups_simulation/samePos/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10'

## load pileups

In [6]:
## Convert to nt's
pileup_conds = {}
curr_indir = pile_in
for nt in chars:
    for cond in np.arange(n_conditions):
        cond_outdir = join(curr_indir, f"cond{cond}", pileup_outdir)
        print(nt,cond)
        pileup_conds[(nt,cond)] = pd.read_csv(join(cond_outdir, f"cond{cond}.{nt}.strands.txt"), header=None)


A 0
A 1
A 2
G 0
G 1
G 2
C 0
C 1
C 2
T 0
T 1
T 2


In [7]:
pileups_df = pd.concat(pileup_conds).reset_index().rename({"level_0":"nt", "level_1":"cond"}, axis=1)

In [8]:
pileups_df = pileups_df.rename({0:"pos", 1:"cell", 2:"Fw Count", 3:"Fw BQ", 4:"Rev Count", 5:"Rev BQ"}, 
                               axis=1)
pileups_df

pileups_df["cov"] = pileups_df["Fw Count"] + pileups_df["Rev Count"] 

## Load meta

In [9]:
all_cells_df = pd.read_csv(join(dirname(pile_in),"cells_meta.csv"), index_col=0)
clones_meta_cond_df = pd.read_csv(join(dirname(pile_in),"clones_meta.csv"), index_col=0)
clone_vars_df = pd.read_csv(join(dirname(pile_in),"clone_vars.csv"), index_col=0)
don_vars_df = pd.read_csv(join(dirname(pile_in),"donor_vars.csv"), index_col=0)

### Generate cells with donor and clone assignment

In [10]:
def generate_nonspec_vars(cell_ser, cell_nm, seq_err=0.001, chars=["A","C","G","T"]):
#    print(cell_ser)
    reads = cell_ser["counts"]
    ref = cell_ser["ref"]
    ref_id = cell_ser["ref_id"]
    seq_err_counts = np.random.binomial(reads,seq_err)
    oth_nts = list(set(chars)-set([ref]))
    out = {}
    if seq_err_counts>0:
        # Change the nt to the non alt allele
        out = Counter(np.random.choice(oth_nts, size=seq_err_counts,replace=True))
    else:
        out  = {x:0 for x in oth_nts}
    out[ref] = reads - seq_err_counts

    out["pos"] = cell_ser["pos"]
    out["cell"] = cell_nm
    
    return pd.Series(out)

def cell_nonspec_variants(curr_counts, cell_name):
    curr_cell_pile = curr_counts.apply(generate_nonspec_vars, args=(cell_name,), axis=1).reset_index().rename({"index":"ref_id"},axis=1)
    curr_cell_pile = curr_cell_pile.drop("ref_id",axis=1).melt(id_vars=["pos", "cell"], value_name="counts", var_name="nt")
    curr_cell_pile = (curr_cell_pile.loc[curr_cell_pile["counts"]!=0]).dropna()
    return curr_cell_pile

def generate_specific_vars(d_v_ser,  cell_nm, don_var_lim, seq_err=0.001, chars=["A","C","G","T"]):
    #out = {}  
    reads = d_v_ser["cov"]
    ref = d_v_ser["ref"]
    alt = d_v_ser["alt"]
    pos = d_v_ser["pos"]
    ref_id = d_v_ser["ref_id"] 
    
    curr_af = np.random.uniform(don_var_lim[0],don_var_lim[1]) # generate using uniform distribution
    curr_af_counts = int(np.floor(curr_af*reads))
    #curr_af_counts = int(np.floor(curr_af*counts.loc[ref_id, "counts" ]))
    
    seq_err_counts = np.random.binomial(reads,seq_err)
        
    oth_nts = list(set(chars)-set([alt, ref]))
    if seq_err_counts>0:
        # Change the nt to the non alt allele
        out = Counter(np.random.choice(oth_nts, size=seq_err_counts,replace=True))
    else:
        out  = {x:0 for x in oth_nts}
        
    # Add in the reference and alt counts!
    out[alt] = reads - seq_err_counts
    out[ref] = max(reads - curr_af_counts - seq_err_counts,0)
    out["pos"] = d_v_ser["pos"]
    out["cell"] = cell_nm
    
    # Add the reference counts
    
    return pd.Series(out)

def cell_donor_variants(curr_cell_counts, cell_name, don_var_lim):
    """ Generates cell's donor variant counts based on the donor variant limits.
    """
    curr_cell_pile = curr_cell_counts.apply(generate_specific_vars, args=(cell_name,don_var_lim), axis=1).reset_index().rename({"index":"ref_id"},axis=1)
#     print('curr_cell_pile')
#     print(curr_cell_pile.head())
   # print(curr_cell_pile)
    #curr_cell_pile = curr_cell_pile.drop("ref_id",axis=1).melt(id_vars=["pos", "cell"], value_name="counts", var_name="nt")
    curr_cell_pile = curr_cell_pile.drop("ref_id",axis=1).melt(id_vars=["pos", "cell"], value_name="counts", var_name="nt")
    curr_cell_pile = (curr_cell_pile.loc[curr_cell_pile["counts"]!=0]).dropna()
    return curr_cell_pile

def cell_clone_variants(curr_cell_counts, cell_name, clone_var_lim):
    """ Generates cell's counts based on the clone limits.
    """
    curr_cell_pile = curr_cell_counts.apply(generate_specific_vars, args=(cell_name, clone_var_lim), axis=1).reset_index().rename({"index":"ref_id"},axis=1)
    #curr_cell_pile = curr_cell_pile.drop("ref_id",axis=1).melt(id_vars=["pos", "cell"], value_name="counts", var_name="nt")
    curr_cell_pile = curr_cell_pile.drop("ref_id",axis=1).melt(id_vars=["pos", "cell"], value_name="counts", var_name="nt")
    curr_cell_pile = (curr_cell_pile.loc[curr_cell_pile["counts"]!=0]).dropna()
    return curr_cell_pile


## For each cell, generate a 'pileup df', which contains read depth at each position
The columns are ["pos","cell","nt","counts","donor","condition"]

## Add variants with the same position

## Add variant that has same position but different reference

### a. Pick the variants

In [11]:
print(don_vars_df.shape)
don_vars_df.head()

new_don_vars = []

donors=list(set(don_vars_df["donor"].values))
n_same_pos_vars = 2
for i in range(n_same_pos_vars):
    curr = don_vars_df.iloc[i]
    new = curr.copy()
    new["alt"] = np.random.choice(list(set(chars)-set([curr["alt"], curr["ref"]])))
    new["donor"] = np.random.choice(list(set(donors)-set([curr["donor"]]))) # TODO: remove the same donor
    new.name = f"{new['pos']}_{new['ref']}_{new['alt']}"
    don_vars_df = don_vars_df.append(new)
    new_don_vars.append(new)

print(don_vars_df.shape)
don_vars_df.tail()

print(clone_vars_df.shape)
clone_vars_df.tail()

clone_vars_df.head()

n_same_pos_vars = 3
new_clone_vars = []
for i in range(n_same_pos_vars):
    curr = clone_vars_df.iloc[i]
    new = curr.copy()
    new["alt"] = np.random.choice(list(set(chars)-set([curr["alt"], curr["ref"]])))
    new["clone"] = np.random.choice(list(set(clone_vars_df["clone"].values)-set([curr["clone"]])), size=1)[0]
    new["donor"] = new["clone"].split("cl")[0][1:]
    new.name = new["ref_id"]+">"+new["alt"]+ "_" +new["clone"]
    #new.name = f"{new['pos']}_{new['ref']}_{new['alt']}"
    new_clone_vars.append(new)


(20, 5)
(22, 5)
(19, 6)


## b. For each new variant, get the pileup for cells in the new clone and with that position, and re-calculate the pileup with this af

In [12]:
#ref_char = np.random.choice(chars, n_positions)
ref_char = list(str(SeqIO.to_dict(SeqIO.parse(ref_fa, "fasta"))["chrM"].seq))

ref = [f"{i+1}_{c}" for i, c in enumerate(ref_char)]

ref_df = pd.DataFrame(np.array([range(1,1+len(ref_char)),ref_char]).transpose(),index=ref, columns=["pos", "ref"])
ref_df["pos"] = ref_df["pos"].astype(int)
ref_df.head()

Unnamed: 0,pos,ref
1_G,1,G
2_A,2,A
3_T,3,T
4_C,4,C
5_A,5,A


In [13]:
new_clone_vars_df = pd.DataFrame(new_clone_vars)
new_don_vars_df = pd.DataFrame(new_don_vars)
new_clone_vars_df["donor"] = new_clone_vars_df["donor"].astype(int)
new_clone_vars


new_cells_df = all_cells_df.loc[(all_cells_df["clone"].isin(new_clone_vars_df["clone"])) | 
                               (all_cells_df["donor"].isin(new_don_vars_df["donor"])) ]

new_ref_df = ref_df.loc[(ref_df["pos"].isin(new_don_vars_df["pos"])) | (ref_df["pos"].isin(new_clone_vars_df["pos"]))]


In [14]:
def new_cell_variants(cell_ser, ref_df, don_vars_df, clone_vars_df, seq_err=0.001, depth_lim=(2,10), strand_bin=0.5,
                 don_var_lim=(0.8,1), clone_var_lim=(0.1,0.5)):
    #cell_pileups = {}
    curr_clone = cell_ser["clone"]
    curr_don = cell_ser["donor"]
    cell_name = cell_ser.name
   # print('subset vars')
    curr_don_vars = don_vars_df.loc[don_vars_df["donor"]==curr_don].set_index("ref_id")
    curr_cl_vars = clone_vars_df.loc[clone_vars_df["clone"]==curr_clone].set_index("ref_id")
    curr_don_vars.index.name = None
    curr_cl_vars.index.name = None
    #print('after subset')
    # Generate counts at each position
    counts = pd.DataFrame({"cov":np.floor(2**(np.random.randint(2,10, size=ref_df.shape[0]))),
                      "ref":ref_df["ref"], "pos":ref_df["pos"],
                      "ref_id":ref_df.index, "cell": cell_name},
                      index=ref_df.index)
    counts["cov"] = counts["cov"].astype(int)
    
    # First construct donor variants
    #print('donor')
    cell_donSpec_counts = counts.loc[curr_don_vars.index]
    cell_donSpec_counts["alt"] = curr_don_vars["alt"]
    cell_donSpec_pileup = cell_donor_variants(cell_donSpec_counts, cell_name, don_var_lim)
    
    # Construct clone variants
    #print('clone')
    cell_cloneSpec_counts = counts.loc[curr_cl_vars.index]
    cell_cloneSpec_counts["alt"] = curr_cl_vars["alt"]
    cell_cloneSpec_pileup = cell_clone_variants(cell_cloneSpec_counts, cell_name, clone_var_lim)
    
    out_df = pd.concat([cell_donSpec_pileup, cell_cloneSpec_pileup], axis=0)
    out_df["donor"] = cell_ser["donor"]
    out_df["condition"] = cell_ser["condition"]
    
    #out_df["counts"] = out_df["nt_counts"]
    return out_df



In [15]:
new_pileups_all = new_cells_df.parallel_apply(new_cell_variants, 
                               args=(new_ref_df, new_don_vars_df, new_clone_vars_df, seq_error, depth_lim, 0.5, 
                                     don_var_lim, clone_var_lim), axis=1)

new_pileups_df = pd.concat(new_pileups_all.values)

In [16]:
new_pileups_df

Unnamed: 0,pos,cell,nt,counts,donor,condition
2,7151,Cell1donor0condition0cloned0cl4,A,16,0,0
3,7151,Cell1donor0condition0cloned0cl4,C,12,0,0
2,7151,Cell3donor0condition0cloned0cl4,A,64,0,0
3,7151,Cell3donor0condition0cloned0cl4,C,40,0,0
2,7151,Cell4donor0condition0cloned0cl4,A,64,0,0
...,...,...,...,...,...,...
7,14350,Cell298donor1condition2cloned1cl0,T,64,1,2
3,14350,Cell299donor1condition2cloned1cl0,C,6,1,2
4,11719,Cell299donor1condition2cloned1cl0,G,31,1,2
6,11719,Cell299donor1condition2cloned1cl0,T,256,1,2


#### Check results

In [17]:
len(new_pileups_df["pos"].unique())

5

## merge new_pileups_df and pileups_df by replacing counts_old with new if there is one
### Replace the current (cell, pos, nt) with the updated ones

In [18]:
new_pileups_df["cell_pos"] = new_pileups_df.apply(lambda x: f'{x["cell"]}_{x["pos"]}', axis=1)

In [19]:
new_pileups_df.head()

Unnamed: 0,pos,cell,nt,counts,donor,condition,cell_pos
2,7151,Cell1donor0condition0cloned0cl4,A,16,0,0,Cell1donor0condition0cloned0cl4_7151
3,7151,Cell1donor0condition0cloned0cl4,C,12,0,0,Cell1donor0condition0cloned0cl4_7151
2,7151,Cell3donor0condition0cloned0cl4,A,64,0,0,Cell3donor0condition0cloned0cl4_7151
3,7151,Cell3donor0condition0cloned0cl4,C,40,0,0,Cell3donor0condition0cloned0cl4_7151
2,7151,Cell4donor0condition0cloned0cl4,A,64,0,0,Cell4donor0condition0cloned0cl4_7151


### Add the forward and rev info for the new

In [20]:
def split_strand(ser):
    counts = ser["counts"]
    pos = np.random.binomial(counts, 0.5)
    neg = counts-pos
    ser["Fw Count"] = pos
    ser["Rev Count"] = neg
    return ser

new_pileups_df = new_pileups_df.parallel_apply(split_strand, axis=1)
new_pileups_df["Fw BQ"] = 37
new_pileups_df["Rev BQ"] = 37
new_pileups_df["Fw BQ"] = new_pileups_df["Fw BQ"].astype(int)
new_pileups_df["Rev BQ"] = new_pileups_df["Rev BQ"].astype(int)

new_pileups_df["Rev Count"] = new_pileups_df["Rev Count"].astype(int)
new_pileups_df["Fw Count"] = new_pileups_df["Fw Count"].astype(int)

new_pileups_df["pos"] = new_pileups_df["pos"].astype(int)
new_pileups_df.head()

Unnamed: 0,pos,cell,nt,counts,donor,condition,cell_pos,Fw Count,Rev Count,Fw BQ,Rev BQ
2,7151,Cell1donor0condition0cloned0cl4,A,16.0,0,0,Cell1donor0condition0cloned0cl4_7151,5,11,37,37
3,7151,Cell1donor0condition0cloned0cl4,C,12.0,0,0,Cell1donor0condition0cloned0cl4_7151,6,6,37,37
2,7151,Cell3donor0condition0cloned0cl4,A,64.0,0,0,Cell3donor0condition0cloned0cl4_7151,37,27,37,37
3,7151,Cell3donor0condition0cloned0cl4,C,40.0,0,0,Cell3donor0condition0cloned0cl4_7151,21,19,37,37
2,7151,Cell4donor0condition0cloned0cl4,A,64.0,0,0,Cell4donor0condition0cloned0cl4_7151,28,36,37,37


In [21]:
pileups_df["cell_pos"] = pileups_df.apply(lambda x: f'{x["cell"]}_{x["pos"]}', axis=1)
pileups_filt_df = pileups_df.loc[~(pileups_df["cell_pos"].isin(new_pileups_df["cell_pos"].unique()))]
#pileups_filt_df = pileups_filt_df[["nt", "cond", "pos", "cell", "counts", "cell_pos"]]

In [22]:
pileups_filt_df = pileups_filt_df.rename({"cov":"counts"}, axis=1)

In [23]:
pileups_df_strand = pd.merge(new_pileups_df.rename({"condition":"cond"},axis=1), pileups_filt_df, how="outer",
                         on=["nt", "cond", "cell", "pos"],
                             suffixes=["_new","_old"])


pileups_df_strand

Unnamed: 0,pos,cell,nt,counts_new,donor,cond,cell_pos_new,Fw Count_new,Rev Count_new,Fw BQ_new,Rev BQ_new,level_2,Fw Count_old,Fw BQ_old,Rev Count_old,Rev BQ_old,counts_old,cell_pos_old
0,7151,Cell1donor0condition0cloned0cl4,A,16.0,0.0,0,Cell1donor0condition0cloned0cl4_7151,5.0,11.0,37.0,37.0,,,,,,,
1,7151,Cell1donor0condition0cloned0cl4,C,12.0,0.0,0,Cell1donor0condition0cloned0cl4_7151,6.0,6.0,37.0,37.0,,,,,,,
2,7151,Cell3donor0condition0cloned0cl4,A,64.0,0.0,0,Cell3donor0condition0cloned0cl4_7151,37.0,27.0,37.0,37.0,,,,,,,
3,7151,Cell3donor0condition0cloned0cl4,C,40.0,0.0,0,Cell3donor0condition0cloned0cl4_7151,21.0,19.0,37.0,37.0,,,,,,,
4,7151,Cell4donor0condition0cloned0cl4,A,64.0,0.0,0,Cell4donor0condition0cloned0cl4_7151,28.0,36.0,37.0,37.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33441371,16551,Cell9donor1condition2cloned1cl1,T,,,2,,,,,,2758387.0,17.0,37.0,15.0,37.0,32.0,Cell9donor1condition2cloned1cl1_16551
33441372,16555,Cell9donor1condition2cloned1cl1,T,,,2,,,,,,2758388.0,35.0,37.0,29.0,37.0,64.0,Cell9donor1condition2cloned1cl1_16555
33441373,16562,Cell9donor1condition2cloned1cl1,T,,,2,,,,,,2758389.0,10.0,37.0,6.0,37.0,16.0,Cell9donor1condition2cloned1cl1_16562
33441374,16568,Cell9donor1condition2cloned1cl1,T,,,2,,,,,,2758390.0,8.0,37.0,8.0,37.0,16.0,Cell9donor1condition2cloned1cl1_16568


In [24]:
pileups_df_strand["counts_new"] = pileups_df_strand["counts_new"].astype("Int64")
pileups_df_strand["counts_old"] = pileups_df_strand["counts_old"].astype("Int64")

In [25]:
pileups_df_strand["counts"] = pileups_df_strand["counts_new"].fillna(pileups_df_strand["counts_old"])

pileups_df_strand["Fw Count"] = pileups_df_strand["Fw Count_new"].fillna(pileups_df_strand["Fw Count_old"])
pileups_df_strand["Rev Count"] = pileups_df_strand["Rev Count_new"].fillna(pileups_df_strand["Rev Count_old"])
pileups_df_strand["Fw BQ"] = pileups_df_strand["Fw BQ_new"].fillna(pileups_df_strand["Fw BQ_old"])
pileups_df_strand["Rev BQ"] = pileups_df_strand["Rev BQ_new"].fillna(pileups_df_strand["Rev BQ_old"])


# pileups_df_strand["counts"] = pileups_df_strand.apply(lambda x: x["counts_old"] if pd.isnull(x["counts_new"]) else x["counts_new"],
#                                               axis=1)

pileups_df_strand

Unnamed: 0,pos,cell,nt,counts_new,donor,cond,cell_pos_new,Fw Count_new,Rev Count_new,Fw BQ_new,...,Fw BQ_old,Rev Count_old,Rev BQ_old,counts_old,cell_pos_old,counts,Fw Count,Rev Count,Fw BQ,Rev BQ
0,7151,Cell1donor0condition0cloned0cl4,A,16,0.0,0,Cell1donor0condition0cloned0cl4_7151,5.0,11.0,37.0,...,,,,,,16,5.0,11.0,37.0,37.0
1,7151,Cell1donor0condition0cloned0cl4,C,12,0.0,0,Cell1donor0condition0cloned0cl4_7151,6.0,6.0,37.0,...,,,,,,12,6.0,6.0,37.0,37.0
2,7151,Cell3donor0condition0cloned0cl4,A,64,0.0,0,Cell3donor0condition0cloned0cl4_7151,37.0,27.0,37.0,...,,,,,,64,37.0,27.0,37.0,37.0
3,7151,Cell3donor0condition0cloned0cl4,C,40,0.0,0,Cell3donor0condition0cloned0cl4_7151,21.0,19.0,37.0,...,,,,,,40,21.0,19.0,37.0,37.0
4,7151,Cell4donor0condition0cloned0cl4,A,64,0.0,0,Cell4donor0condition0cloned0cl4_7151,28.0,36.0,37.0,...,,,,,,64,28.0,36.0,37.0,37.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33441371,16551,Cell9donor1condition2cloned1cl1,T,,,2,,,,,...,37.0,15.0,37.0,32,Cell9donor1condition2cloned1cl1_16551,32,17.0,15.0,37.0,37.0
33441372,16555,Cell9donor1condition2cloned1cl1,T,,,2,,,,,...,37.0,29.0,37.0,64,Cell9donor1condition2cloned1cl1_16555,64,35.0,29.0,37.0,37.0
33441373,16562,Cell9donor1condition2cloned1cl1,T,,,2,,,,,...,37.0,6.0,37.0,16,Cell9donor1condition2cloned1cl1_16562,16,10.0,6.0,37.0,37.0
33441374,16568,Cell9donor1condition2cloned1cl1,T,,,2,,,,,...,37.0,8.0,37.0,16,Cell9donor1condition2cloned1cl1_16568,16,8.0,8.0,37.0,37.0


In [26]:
pileups_df_strand["cell_pos"] = pileups_df_strand["cell_pos_new"].fillna(pileups_df_strand["cell_pos_old"])



In [27]:
pileups_df_strand

Unnamed: 0,pos,cell,nt,counts_new,donor,cond,cell_pos_new,Fw Count_new,Rev Count_new,Fw BQ_new,...,Rev Count_old,Rev BQ_old,counts_old,cell_pos_old,counts,Fw Count,Rev Count,Fw BQ,Rev BQ,cell_pos
0,7151,Cell1donor0condition0cloned0cl4,A,16,0.0,0,Cell1donor0condition0cloned0cl4_7151,5.0,11.0,37.0,...,,,,,16,5.0,11.0,37.0,37.0,Cell1donor0condition0cloned0cl4_7151
1,7151,Cell1donor0condition0cloned0cl4,C,12,0.0,0,Cell1donor0condition0cloned0cl4_7151,6.0,6.0,37.0,...,,,,,12,6.0,6.0,37.0,37.0,Cell1donor0condition0cloned0cl4_7151
2,7151,Cell3donor0condition0cloned0cl4,A,64,0.0,0,Cell3donor0condition0cloned0cl4_7151,37.0,27.0,37.0,...,,,,,64,37.0,27.0,37.0,37.0,Cell3donor0condition0cloned0cl4_7151
3,7151,Cell3donor0condition0cloned0cl4,C,40,0.0,0,Cell3donor0condition0cloned0cl4_7151,21.0,19.0,37.0,...,,,,,40,21.0,19.0,37.0,37.0,Cell3donor0condition0cloned0cl4_7151
4,7151,Cell4donor0condition0cloned0cl4,A,64,0.0,0,Cell4donor0condition0cloned0cl4_7151,28.0,36.0,37.0,...,,,,,64,28.0,36.0,37.0,37.0,Cell4donor0condition0cloned0cl4_7151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33441371,16551,Cell9donor1condition2cloned1cl1,T,,,2,,,,,...,15.0,37.0,32,Cell9donor1condition2cloned1cl1_16551,32,17.0,15.0,37.0,37.0,Cell9donor1condition2cloned1cl1_16551
33441372,16555,Cell9donor1condition2cloned1cl1,T,,,2,,,,,...,29.0,37.0,64,Cell9donor1condition2cloned1cl1_16555,64,35.0,29.0,37.0,37.0,Cell9donor1condition2cloned1cl1_16555
33441373,16562,Cell9donor1condition2cloned1cl1,T,,,2,,,,,...,6.0,37.0,16,Cell9donor1condition2cloned1cl1_16562,16,10.0,6.0,37.0,37.0,Cell9donor1condition2cloned1cl1_16562
33441374,16568,Cell9donor1condition2cloned1cl1,T,,,2,,,,,...,8.0,37.0,16,Cell9donor1condition2cloned1cl1_16568,16,8.0,8.0,37.0,37.0,Cell9donor1condition2cloned1cl1_16568


In [28]:
del pileups_filt_df
del new_pileups_df

In [29]:
pileups_df_strand = pileups_df_strand[["nt", "cond", "pos", "cell", "counts", "cell_pos", "donor",
                                      "Fw Count", "Rev Count", "Fw BQ", "Rev BQ"]]


In [30]:
pileups_df_strand

Unnamed: 0,nt,cond,pos,cell,counts,cell_pos,donor,Fw Count,Rev Count,Fw BQ,Rev BQ
0,A,0,7151,Cell1donor0condition0cloned0cl4,16,Cell1donor0condition0cloned0cl4_7151,0.0,5.0,11.0,37.0,37.0
1,C,0,7151,Cell1donor0condition0cloned0cl4,12,Cell1donor0condition0cloned0cl4_7151,0.0,6.0,6.0,37.0,37.0
2,A,0,7151,Cell3donor0condition0cloned0cl4,64,Cell3donor0condition0cloned0cl4_7151,0.0,37.0,27.0,37.0,37.0
3,C,0,7151,Cell3donor0condition0cloned0cl4,40,Cell3donor0condition0cloned0cl4_7151,0.0,21.0,19.0,37.0,37.0
4,A,0,7151,Cell4donor0condition0cloned0cl4,64,Cell4donor0condition0cloned0cl4_7151,0.0,28.0,36.0,37.0,37.0
...,...,...,...,...,...,...,...,...,...,...,...
33441371,T,2,16551,Cell9donor1condition2cloned1cl1,32,Cell9donor1condition2cloned1cl1_16551,,17.0,15.0,37.0,37.0
33441372,T,2,16555,Cell9donor1condition2cloned1cl1,64,Cell9donor1condition2cloned1cl1_16555,,35.0,29.0,37.0,37.0
33441373,T,2,16562,Cell9donor1condition2cloned1cl1,16,Cell9donor1condition2cloned1cl1_16562,,10.0,6.0,37.0,37.0
33441374,T,2,16568,Cell9donor1condition2cloned1cl1,16,Cell9donor1condition2cloned1cl1_16568,,8.0,8.0,37.0,37.0


In [31]:
## assert if merge pileups new is not na then old is na
#pileups_df_strand.apply(lambda x: if pd.isnull(""))

In [32]:
new_cells_df["cell"] = new_cells_df.index
all_cells_df["cell"] = all_cells_df.index
new_cells_df = new_cells_df.astype(object)
out_cells_df = pd.concat([new_cells_df,all_cells_df],ignore_index=True).drop_duplicates(subset=['cell'], keep='first')
out_cells_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,donor,condition,clone,cell
0,0,0,d0cl4,Cell1donor0condition0cloned0cl4
1,0,0,d0cl4,Cell3donor0condition0cloned0cl4
2,0,0,d0cl4,Cell4donor0condition0cloned0cl4
3,0,0,d0cl4,Cell9donor0condition0cloned0cl4
4,0,0,d0cl4,Cell12donor0condition0cloned0cl4
...,...,...,...,...
1975,0,2,d0cl0,Cell294donor0condition2cloned0cl0
1977,0,2,d0cl0,Cell296donor0condition2cloned0cl0
1978,0,2,d0cl0,Cell297donor0condition2cloned0cl0
1979,0,2,d0cl0,Cell298donor0condition2cloned0cl0


In [33]:
#pileups_df_strand = pileups_df_strand.drop(["Fw Count","Fw BQ", "Rev Count", "Rev BQ", "counts_old", "counts_new"], axis=1)

---

## Drop initial BQ and strand and re-do

## Add BQ and strand. Pick strand from binomial

In [34]:
# def split_strand(ser):
#     counts = ser["counts"]
#     pos = np.random.binomial(counts, 0.5)
#     neg = counts-pos
#     ser["Fw Count"] = pos
#     ser["Rev Count"] = neg
#     return ser

# pileups_df_strand = pileups_df_strand.parallel_apply(split_strand, axis=1)
# pileups_df_strand["Fw BQ"] = 37
# pileups_df_strand["Rev BQ"] = 37
# pileups_df_strand["Fw BQ"] = pileups_df_strand["Fw BQ"].astype(int)
# pileups_df_strand["Rev BQ"] = pileups_df_strand["Rev BQ"].astype(int)

# pileups_df_strand["Rev Count"] = pileups_df_strand["Rev Count"].astype(int)
# pileups_df_strand["Fw Count"] = pileups_df_strand["Fw Count"].astype(int)

# pileups_df_strand["pos"] = pileups_df_strand["pos"].astype(int)

----

## Save pileups

In [35]:
pileups_df_strand.head()

Unnamed: 0,nt,cond,pos,cell,counts,cell_pos,donor,Fw Count,Rev Count,Fw BQ,Rev BQ
0,A,0,7151,Cell1donor0condition0cloned0cl4,16,Cell1donor0condition0cloned0cl4_7151,0.0,5.0,11.0,37.0,37.0
1,C,0,7151,Cell1donor0condition0cloned0cl4,12,Cell1donor0condition0cloned0cl4_7151,0.0,6.0,6.0,37.0,37.0
2,A,0,7151,Cell3donor0condition0cloned0cl4,64,Cell3donor0condition0cloned0cl4_7151,0.0,37.0,27.0,37.0,37.0
3,C,0,7151,Cell3donor0condition0cloned0cl4,40,Cell3donor0condition0cloned0cl4_7151,0.0,21.0,19.0,37.0,37.0
4,A,0,7151,Cell4donor0condition0cloned0cl4,64,Cell4donor0condition0cloned0cl4_7151,0.0,28.0,36.0,37.0,37.0


In [36]:
## Convert to nt's
for (nt, cond), curr_out_df in pileups_df_strand.groupby(["nt", "cond"]):
    cond_outdir = join(samePos_outdir, "data", f"cond{cond}", pileup_outdir)
    print(cond_outdir)
    if not exists(cond_outdir):
        os.makedirs(cond_outdir)
    print(nt,cond)
    curr_out_df[["pos","cell","Fw Count","Fw BQ","Rev Count","Rev BQ"]].sort_values(["cell", "pos", "Fw Count"]).to_csv(join(cond_outdir, f"cond{cond}.{nt}.strands.txt"), 
                                                                               header=None, index=None)
    #curr_out.to_csv(join(outdir, f"cond{cond}.{nt}.strands.txt.gz"), compression='gzip')

/data/Mito_Trace/output/clone_pileups_simulation/samePos/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10/data/cond0/MT/cellr_True/numread_200/
A 0
/data/Mito_Trace/output/clone_pileups_simulation/samePos/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10/data/cond1/MT/cellr_True/numread_200/
A 1
/data/Mito_Trace/output/clone_pileups_simulation/samePos/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10/data/cond2/MT/cellr_True/numread_200/
A 2
/data/Mito_Trace/output/clone_pileups_simulation/samePos/donors_2__conditions_3__cells_per_

## total coverage

In [37]:
for cond, curr_out_df in pileups_df_strand.groupby("cond"):
    print('cond', cond)
    coverage = curr_out_df.groupby(["cell", "pos"]).apply(lambda x: (x["Rev Count"]+x["Fw Count"]).sum()).reset_index()
    cond_outdir = join(samePos_outdir, "data", f"cond{cond}", pileup_outdir)
    print(cond_outdir)
    coverage["pos"] = coverage["pos"].astype(int)
    coverage[["pos","cell",0]].sort_values(["cell","pos"]).to_csv(join(cond_outdir, f"cond{cond}.coverage.strands.txt"), 
                                                                               header=None, index=None) #compression='gzip')


/data/Mito_Trace/output/clone_pileups_simulation/samePos/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10/data/cond0/MT/cellr_True/numread_200/
/data/Mito_Trace/output/clone_pileups_simulation/samePos/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10/data/cond1/MT/cellr_True/numread_200/
/data/Mito_Trace/output/clone_pileups_simulation/samePos/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10/data/cond2/MT/cellr_True/numread_200/


### Save cell and variant assignments

#### Append the new clone variants, create new_clones_meta_cond_df, and revert donor index back to ref_id

In [38]:
out_clone_vars_df = clone_vars_df.append(new_clone_vars_df)
print("Any dups?" , out_clone_vars_df.index.duplicated().any())
assert(not out_clone_vars_df.index.duplicated().any())
out_clone_vars_df

Any dups? False


Unnamed: 0,pos,ref,alt,ref_id,donor,clone
7369_C>T_0_d0cl0,7369,C,T,7369_C,0,d0cl0
6842_T>A_0_d0cl0,6842,T,A,6842_T,0,d0cl0
7151_C>T_0_d0cl1,7151,C,T,7151_C,0,d0cl1
15306_T>G_0_d0cl1,15306,T,G,15306_T,0,d0cl1
4225_A>C_0_d0cl2,4225,A,C,4225_A,0,d0cl2
2841_T>C_0_d0cl2,2841,T,C,2841_T,0,d0cl2
5663_C>G_0_d0cl2,5663,C,G,5663_C,0,d0cl2
2549_C>A_0_d0cl3,2549,C,A,2549_C,0,d0cl3
1972_A>G_0_d0cl3,1972,A,G,1972_A,0,d0cl3
15378_T>A_0_d0cl3,15378,T,A,15378_T,0,d0cl3


In [39]:
# Make clones_meta_cond_df and clones_meta_df
new_clones_meta_cond_df = new_cells_df.groupby(["clone", "condition", "donor"]).size().to_frame('ncells').reset_index()
new_clones_meta_cond_df

Unnamed: 0,clone,condition,donor,ncells
0,d0cl4,0,0,48
1,d0cl4,1,0,73
2,d0cl4,2,0,60
3,d1cl0,0,1,63
4,d1cl0,1,1,61
5,d1cl0,2,1,59
6,d1cl1,0,1,59
7,d1cl1,1,1,61
8,d1cl1,2,1,58
9,d1cl2,0,1,61


In [40]:
new_don_vars_df = don_vars_df.set_index("ref_id")

In [41]:
new_don_vars_df.to_csv(join(samePos_outdir, "donor_vars.csv"))
out_clone_vars_df.to_csv(join(samePos_outdir, "clone_vars.csv"))
out_cells_df.to_csv(join(samePos_outdir, "cells_meta.csv"))
new_clones_meta_cond_df.to_csv(join(samePos_outdir, "clones_meta.csv"))

In [42]:
out_cells_df

Unnamed: 0,donor,condition,clone,cell
0,0,0,d0cl4,Cell1donor0condition0cloned0cl4
1,0,0,d0cl4,Cell3donor0condition0cloned0cl4
2,0,0,d0cl4,Cell4donor0condition0cloned0cl4
3,0,0,d0cl4,Cell9donor0condition0cloned0cl4
4,0,0,d0cl4,Cell12donor0condition0cloned0cl4
...,...,...,...,...
1975,0,2,d0cl0,Cell294donor0condition2cloned0cl0
1977,0,2,d0cl0,Cell296donor0condition2cloned0cl0
1978,0,2,d0cl0,Cell297donor0condition2cloned0cl0
1979,0,2,d0cl0,Cell298donor0condition2cloned0cl0


In [43]:
new_don_vars_df

Unnamed: 0_level_0,alt,ref,pos,donor
ref_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11719_G,A,G,11719,0
14350_C,G,C,14350,0
9528_C,G,C,9528,0
12833_C,T,C,12833,0
5767_C,T,C,5767,0
12920_C,G,C,12920,0
1558_A,G,A,1558,0
11150_G,T,G,11150,0
4832_C,A,C,4832,0
8953_A,G,A,8953,0


In [44]:
print

<function print>