In [1]:
### Simulation parameters
n_donors = 2
n_conditions = 3
n_cells_per_donor_cond = 300
n_clones_in_don = 5
n_positions = 100
n_variants_per_clone_lambda = 1
n_donor_variants = 10

seq_error = 0.01
don_var_lim=(0.8,1)
clone_var_lim=(0.1,0.4)
depth_lim = (4,10)
chars = ["A", "G", "C", "T"]


outdir = "/data/Mito_Trace/output/clone_pileups_simulation/samePos/"
pileup_outdir = "MT/cellr_True/numread_200/"
ref_fa = "/data/Mito_Trace/data/processed/genomes/mtMasked/GRCh38_MT_blacklist_A2_2020/chrM.fasta"

In [2]:
from Bio import SeqIO

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from os.path import join, exists
import os
import matplotlib.pyplot as plt
np.random.seed(42)
%matplotlib inline



In [4]:
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=16)


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
if not exists(outdir):
    os.mkdir(outdir)

In [6]:
params = {"donors":n_donors,"conditions":n_conditions, "cells_per_donor_cond":n_cells_per_donor_cond,
         "clones_in_don":n_clones_in_don, "positions":n_positions, 
          "variants_per_clone_lambda":n_variants_per_clone_lambda, "donor_variants":n_donor_variants}


seq_params = {"seq_error": seq_error, "don_var_lim":"_".join(np.array(don_var_lim).astype(str)), 
              "clone_var_lim": "_".join(np.array(clone_var_lim).astype(str)),
             "depth_lim":"_".join(np.array(depth_lim).astype(str))}

curr_out_meta = ""
for p in params:
    curr_out_meta = f"{curr_out_meta}__{p}_{params[p]}"

curr_out_meta = curr_out_meta.strip("__")
print(curr_out_meta)


curr_out_seq = ""
for p in seq_params:
    curr_out_seq = f"{curr_out_seq}__{p}_{seq_params[p]}"
curr_out_seq = curr_out_seq.strip("__")
print(curr_out_seq)


curr_outdir = join(outdir, curr_out_meta, curr_out_seq)

if not exists(curr_outdir):
    os.makedirs(curr_outdir)

donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10
seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10


### Generate cells with donor and clone assignment

In [19]:
def generate_nonspec_vars(cell_ser, cell_nm, seq_err=0.001, chars=["A","C","G","T"]):
#    print(cell_ser)
    reads = cell_ser["counts"]
    ref = cell_ser["ref"]
    ref_id = cell_ser["ref_id"]
    seq_err_counts = np.random.binomial(reads,seq_err)
    oth_nts = list(set(chars)-set([ref]))
    out = {}
    if seq_err_counts>0:
        # Change the nt to the non alt allele
        out = Counter(np.random.choice(oth_nts, size=seq_err_counts,replace=True))
    else:
        out  = {x:0 for x in oth_nts}
    out[ref] = reads - seq_err_counts

    out["pos"] = cell_ser["pos"]
    out["cell"] = cell_nm
    
    return pd.Series(out)

def cell_nonspec_variants(curr_counts, cell_name):
    curr_cell_pile = curr_counts.apply(generate_nonspec_vars, args=(cell_name,), axis=1).reset_index().rename({"index":"ref_id"},axis=1)
    curr_cell_pile = curr_cell_pile.drop("ref_id",axis=1).melt(id_vars=["pos", "cell"], value_name="counts", var_name="nt")
    curr_cell_pile = (curr_cell_pile.loc[curr_cell_pile["counts"]!=0]).dropna()
    return curr_cell_pile

def generate_specific_vars(d_v_ser,  cell_nm, don_var_lim, seq_err=0.001, chars=["A","C","G","T"]):
    #out = {}  
    reads = d_v_ser["counts"]
    ref = d_v_ser["ref"]
    alt = d_v_ser["alt"]
    pos = d_v_ser["pos"]
    ref_id = d_v_ser["ref_id"] 
    
    curr_af = np.random.uniform(don_var_lim[0],don_var_lim[1]) # generate using uniform distribution
    curr_af_counts = int(np.floor(curr_af*reads))
    #curr_af_counts = int(np.floor(curr_af*counts.loc[ref_id, "counts" ]))
    
    seq_err_counts = np.random.binomial(reads,seq_err)
        
    oth_nts = list(set(chars)-set([alt, ref]))
    if seq_err_counts>0:
        # Change the nt to the non alt allele
        out = Counter(np.random.choice(oth_nts, size=seq_err_counts,replace=True))
    else:
        out  = {x:0 for x in oth_nts}
        
    # Add in the reference and alt counts!
    out[alt] = reads - seq_err_counts
    out[ref] = max(reads - curr_af_counts - seq_err_counts,0)
    out["pos"] = d_v_ser["pos"]
    out["cell"] = cell_nm
    
    # Add the reference counts
    
    return pd.Series(out)

def cell_donor_variants(curr_cell_counts, cell_name, don_var_lim):
    """ Generates cell's donor variant counts based on the donor variant limits.
    """
    curr_cell_pile = curr_cell_counts.apply(generate_specific_vars, args=(cell_name,don_var_lim), axis=1).reset_index().rename({"index":"ref_id"},axis=1)
#     print('curr_cell_pile')
#     print(curr_cell_pile.head())
   # print(curr_cell_pile)
    curr_cell_pile = curr_cell_pile.drop("ref_id",axis=1).melt(id_vars=["pos", "cell"], value_name="counts", var_name="nt")
    curr_cell_pile = (curr_cell_pile.loc[curr_cell_pile["counts"]!=0]).dropna()
    return curr_cell_pile

def cell_clone_variants(curr_cell_counts, cell_name, clone_var_lim):
    """ Generates cell's counts based on the clone limits.
    """
    curr_cell_pile = curr_cell_counts.apply(generate_specific_vars, args=(cell_name, clone_var_lim), axis=1).reset_index().rename({"index":"ref_id"},axis=1)
    curr_cell_pile = curr_cell_pile.drop("ref_id",axis=1).melt(id_vars=["pos", "cell"], value_name="counts", var_name="nt")
    curr_cell_pile = (curr_cell_pile.loc[curr_cell_pile["counts"]!=0]).dropna()
    return curr_cell_pile


In [20]:
def cell_variants(cell_ser, ref_df, don_vars_df, clone_vars_df, seq_err=0.001, depth_lim=(2,10), strand_bin=0.5,
                 don_var_lim=(0.8,1), clone_var_lim=(0.1,0.5)):
    #cell_pileups = {}
    curr_clone = cell_ser["clone"]
    curr_don = cell_ser["donor"]
    cell_name = cell_ser.name
    
    # Get the donor and clone variants
    curr_don_vars = don_vars_df.loc[don_vars_df["donor"]==curr_don].set_index("ref_id")
    curr_cl_vars = clone_vars_df.loc[clone_vars_df["clone"]==curr_clone].set_index("ref_id")
    
    # Generate counts at each position
    counts = pd.DataFrame({"counts":np.floor(2**(np.random.randint(2,10, size=ref_df.shape[0]))),
                      "ref":ref_df["ref"], "pos":ref_df["pos"],
                      "ref_id":ref_df.index, "cell": cell_name},
                      index=ref_df.index)
    counts["counts"] = counts["counts"].astype(int)
    
    # Non-donor and non-clone vars
    #print('non-spec counts')
#     print('don vars', curr_don_vars.index)
#     print('clone vars', curr_cl_vars.index)
    non_spec_counts = counts.drop(curr_don_vars.index).drop(curr_cl_vars.index)
    non_spec_pileup = cell_nonspec_variants(non_spec_counts, cell_name)

    #print('don spec')
    # First construct donor variants
    cell_donSpec_counts = counts.loc[curr_don_vars.index]
    cell_donSpec_counts["alt"] = curr_don_vars["alt"]
    cell_donSpec_pileup = cell_donor_variants(cell_donSpec_counts, cell_name, don_var_lim)

    #print('clone spec')
    # Construct clone variants
    cell_cloneSpec_counts = counts.loc[curr_cl_vars.index]
    cell_cloneSpec_counts["alt"] = curr_cl_vars["alt"]
    cell_cloneSpec_pileup = cell_clone_variants(cell_cloneSpec_counts, cell_name, clone_var_lim)
    
#     for d_v in curr_don_vars:
#         curr_don_pileup = {}
#         curr_ref = curr_don_vars.loc[d_v, "ref"]
#         curr_alt = curr_don_vars.loc[d_v, "alt"]
#         curr_pos = curr_don_vars.loc[d_v, "pos"]
#         curr_ref_id = curr_don_vars.loc[d_v, "ref_id"] 
#         curr_af = np.random.uniform(clone_var_lim[0],clone_var_lim[1])
#         curr_af_counts = int(np.floor(curr_af*counts.loc[curr_ref_id, "counts" ]))
#         seq_err_counts = np.random.binomial(counts.loc[curr_ref_id, "counts"],seq_err)
#         curr_af_counts -= seq_err_counts
#         if len(seq_error_counts)>0:
#             # Change the nt to the non alt allele
#             seq_err_counts = Counter(np.random.choice(set(chars)-set([curr_alt]), size=seq_err_counts,replace=True))
            
        

#     print(cell_donSpec_counts.head())
#     print(cell_cloneSpec_counts.head())
#     print(non_spec.head())

    # concat donor-, clone-, and non-specific pileup dfs
    out_df = pd.concat([cell_donSpec_pileup, cell_cloneSpec_pileup, non_spec_pileup], axis=0)
    out_df["donor"] = cell_ser["donor"]
    out_df["condition"] = cell_ser["condition"]
    return out_df


## For each cell, generate a 'pileup df', which contains read depth at each position
The columns are ["pos","cell","nt","counts","donor","condition"]

In [21]:
# pileups_all = all_cells_df.parallel_apply(cell_variants, 
#                                args=(ref_df, don_vars_df, clone_vars_df, seq_error, depth_lim, 0.5, 
#                                      don_var_lim, clone_var_lim), axis=1)

# pileups_df = pd.concat(pileups_all.values)

# pileups_df["clone"] = pileups_df["cell"].apply(lambda x: x.split("cl")[-1]).astype(int)

## Add variants with the same position

## Add variant that has same position but different reference

### a. Pick the variants

In [106]:
print(don_vars_df.shape)
don_vars_df.head()

new_don_vars = []

donors=list(set(don_vars_df["donor"].values))
n_same_pos_vars = 2
for i in range(n_same_pos_vars):
    curr = don_vars_df.iloc[i]
    new = curr.copy()
    new["alt"] = np.random.choice(list(set(chars)-set([curr["alt"], curr["ref"]])))
    new["donor"] = np.random.choice(list(set(donors)-set([curr["donor"]]))) # TODO: remove the same donor
    new.name = f"{new['pos']}_{new['ref']}_{new['alt']}"
    don_vars_df = don_vars_df.append(new)
    new_don_vars.append(new)

print(don_vars_df.shape)
don_vars_df.tail()

print(clone_vars_df.shape)
clone_vars_df.tail()

clone_vars_df.head()

n_same_pos_vars = 3
new_clone_vars = []
for i in range(n_same_pos_vars):
    curr = clone_vars_df.iloc[i]
    new = curr.copy()
    new["alt"] = np.random.choice(list(set(chars)-set([curr["alt"], curr["ref"]])))
    new["clone"] = np.random.choice(list(set(clone_vars_df["clone"].values)-set([curr["clone"]])), size=1)[0]
    new["donor"] = new["clone"].split("cl")[0][1:]
    new.name = new["ref_id"]+">"+new["alt"]+ "_" +new["clone"]
    #new.name = f"{new['pos']}_{new['ref']}_{new['alt']}"
    new_clone_vars.append(new)
new_clone_vars_df["donor"] = new_clone_vars_df["donor"].astype(int)
new_clone_vars


(30, 5)
(32, 5)
(19, 6)


[pos         7369
 ref            C
 alt            G
 ref_id    7369_C
 donor          0
 clone      d0cl4
 Name: 7369_C>G_d0cl4, dtype: object,
 pos         6842
 ref            T
 alt            C
 ref_id    6842_T
 donor          0
 clone      d0cl4
 Name: 6842_T>C_d0cl4, dtype: object,
 pos         7151
 ref            C
 alt            G
 ref_id    7151_C
 donor          1
 clone      d1cl4
 Name: 7151_C>G_d1cl4, dtype: object]

## b. For each new variant, get the pileup for cells in the new clone and with that position, and re-calculate the pileup with this af

In [107]:
new_clone_vars_df = pd.DataFrame(new_clone_vars)
new_don_vars_df = pd.DataFrame(new_don_vars)

new_cells_df=all_cells_df.loc[(all_cells_df["clone"].isin(new_clone_vars_df["clone"])) | 
                               (all_cells_df["donor"].isin(new_don_vars_df["donor"])) ]

new_ref_df = ref_df.loc[(ref_df["pos"].isin(new_don_vars_df["pos"])) | (ref_df["pos"].isin(new_clone_vars_df["pos"]))]


Unnamed: 0,pos,ref,alt,ref_id,donor,clone
7369_C>G_d0cl4,7369,C,G,7369_C,0,d0cl4
6842_T>C_d0cl4,6842,T,C,6842_T,0,d0cl4
7151_C>G_d1cl4,7151,C,G,7151_C,1,d1cl4


In [111]:
def new_cell_variants(cell_ser, ref_df, don_vars_df, clone_vars_df, seq_err=0.001, depth_lim=(2,10), strand_bin=0.5,
                 don_var_lim=(0.8,1), clone_var_lim=(0.1,0.5)):
    #cell_pileups = {}
    curr_clone = cell_ser["clone"]
    curr_don = cell_ser["donor"]
    cell_name = cell_ser.name
    print('subset vars')
    curr_don_vars = don_vars_df.loc[don_vars_df["donor"]==curr_don].set_index("ref_id")
    curr_cl_vars = clone_vars_df.loc[clone_vars_df["clone"]==curr_clone].set_index("ref_id")
    curr_don_vars.index.name = None
    curr_cl_vars.index.name = None
    print('after subset')
    # Generate counts at each position
    counts = pd.DataFrame({"counts":np.floor(2**(np.random.randint(2,10, size=ref_df.shape[0]))),
                      "ref":ref_df["ref"], "pos":ref_df["pos"],
                      "ref_id":ref_df.index, "cell": cell_name},
                      index=ref_df.index)
    counts["counts"] = counts["counts"].astype(int)
    
    # First construct donor variants
    print('donor')
    cell_donSpec_counts = counts.loc[curr_don_vars.index]
    cell_donSpec_counts["alt"] = curr_don_vars["alt"]
    cell_donSpec_pileup = cell_donor_variants(cell_donSpec_counts, cell_name, don_var_lim)
    
    # Construct clone variants
    print('clone')
    cell_cloneSpec_counts = counts.loc[curr_cl_vars.index]
    cell_cloneSpec_counts["alt"] = curr_cl_vars["alt"]
    cell_cloneSpec_pileup = cell_clone_variants(cell_cloneSpec_counts, cell_name, clone_var_lim)
    
    out_df = pd.concat([cell_donSpec_pileup, cell_cloneSpec_pileup], axis=0)
    out_df["donor"] = cell_ser["donor"]
    out_df["condition"] = cell_ser["condition"]
    return out_df



In [118]:
new_ref_df

Unnamed: 0,pos,ref
6842_T,6842,T
7151_C,7151,C
7369_C,7369,C
11719_G,11719,G
14350_C,14350,C


In [119]:
new_clone_vars_df

Unnamed: 0,pos,ref,alt,ref_id,donor,clone
7369_C>G_d0cl4,7369,C,G,7369_C,0,d0cl4
6842_T>C_d0cl4,6842,T,C,6842_T,0,d0cl4
7151_C>G_d1cl4,7151,C,G,7151_C,1,d1cl4


In [122]:
new_cells_df.shape

(1081, 3)

In [124]:
all_cells_df.shape

(1800, 3)

In [130]:
new_pileups_all = new_cells_df.parallel_apply(new_cell_variants, 
                               args=(new_ref_df, new_don_vars_df, new_clone_vars_df, seq_error, depth_lim, 0.5, 
                                     don_var_lim, clone_var_lim), axis=1)

new_pileups_df = pd.concat(new_pileups_all.values)

subset vars
after subsetsubset vars

donorafter subset

subset varsdonor



subset vars


clonesubset varsafter subsetsubset vars




subset vars
after subsetsubset vars
subset vars
subset varsclone


after subsetafter subsetsubset vars
subset vars
donorafter subsetafter subset
subset vars

subset varsdonordonor




after subsetdonor
after subsetafter subset

subset varsdonorsubset varsafter subset


donor


after subset
after subset

subset vars
subset varscloneafter subsetdonor
donorafter subsetdonordonorclone









donorafter subset
cloneclone



clonedonor





cloneafter subset
clonedonorclone
donorsubset varssubset vars


donor

clone
subset vars




clone




donorsubset varscloneafter subset



cloneafter subsetsubset varssubset vars






clone


cloneafter subsetsubset vars




subset varsafter subset



subset varsdonorafter subset
donorclone

subset vars






clone



after subset




after subsetafter subset
donor
subset varsdonorsubset varsdonorafter subsetsubset vars
clone
subset varsafter subset





donorclonesubset vars
donor





donorafter subsetclone
after subsetafter subsetcloneafter subset
subset vars

donor
after subsetclonedonorsubset vars
clone









clone

donorsubset varsdonordonordonorafter subset

donor
subset varsafter subsetclone



cloneafter subset

clone











subset varsdonorsubset vars
subset vars
clonecloneclone

subset varsafter subsetdonorcloneclone

subset varsafter subset

donor







after subset
subset vars

donorsubset vars
after subsetafter subset
cloneclone

donorsubset vars
subset vars
subset vars
after subsetafter subset
subset vars
clone


donorcloneafter subsetdonor

subset vars


donor
after subset
donor
after subsetsubset vars

after subset

subset varsclonedonorsubset varsafter subset


cloneafter subset

donorclonesubset vars

donor

clone


donordonor
after subset
donorafter subsetafter subset




donorsubset vars


clonesubset varsafter subset
donor
clone
clonedonorsubset varsclone

subset vars
clonecloneafter subset
donor


donor







after subsetafter subsetclone
clone

after subsetclone
donor
clone



subset varsclone
subset varssubset varssubset varssubset varsdonorsubset vars






donor
subset varsclonedonorsubset varsafter subset
subset vars
subset varsafter subsetafter subset



after subsetsubset varsafter subsetclone

after subsetafter subset




cloneclone

after subsetclone


subset varsdonor
after subsetdonordonordonorafter subset

donorsubset vars


after subsetdonor



donordonor
subset vars

after subset
donorafter subsetdonor
clonesubset vars


donor
donorclone


clone

clonesubset vars

cloneclone
donorafter subsetcloneafter subset
clone
cloneclone


subset vars

clone
clonesubset varsdonorafter subset


subset vars

subset vars



donorclone
donor
after subsetsubset vars



after subsetsubset varssubset vars
subset varsafter subsetclonesubset varssubset varssubset vars

clone

clone

donorafter subsetsubset vars
donor

after subset

subset vars

after subset
after subset
donor
after subset


af

after subsetdonorafter subsetdonor
subset varsafter subset
cloneclone
after subset
subset vars







cloneclonesubset varsdonordonorsubset varssubset vars

after subsetsubset vars
subset vars
donordonor




subset varssubset varsclone
after subsetcloneclone
donor

after subsetafter subsetafter subsetafter subset
clone


subset vars


after subsetdonor

clonecloneafter subset

after subset

subset vars
subset varsdonor
donor

donor
donor

donorclonesubset vars
donorsubset vars
after subset
donorafter subset
clonesubset vars
subset vars


after subsetclone


clone



subset varsclone
cloneafter subsetafter subset
clonedonordonorcloneafter subset



donor


subset varssubset vars


after subset
clone

subset vars
subset varsdonor
after subsetdonorclonesubset vars

donorcloneafter subsetsubset varssubset varsdonor
after subset


clone
after subset





clonecloneafter subset

after subsetdonorafter subset



donordonor



subset varssubset varsafter subset
clonedonordonordonorclone

subse




cloneafter subset
subset vars

donorafter subsetafter subset
subset varsdonor



after subsetdonor


clone

subset varsafter subsetdonor
donorsubset vars
donorafter subset

donordonor
after subsetsubset vars



donor
after subset
after subset
clone
donorsubset vars
clone
clone



clone
donorcloneclone
donorafter subset

after subsetdonor


cloneclonesubset vars


clone
donor


donorsubset vars


donorclonesubset varssubset varssubset varssubset varscloneafter subsetsubset varsclone

clone
subset vars

clone







after subsetafter subsetsubset vars
subset varsafter subsetsubset varscloneafter subsetdonorafter subsetafter subset



subset vars

clone




after subsetsubset vars
donor
after subsetdonor
after subset
donorafter subsetdonor

donordonor
donorsubset vars
after subsetclonesubset varsdonor




donordonorafter subset





subset vars
clone

clone
after subsetsubset varscloneclonedonor

after subsetclonedonorclone
clone
clone


clonecloneafter subset






donor
after subset
su

#### Check results

In [131]:
new_don_vars_df

Unnamed: 0,alt,ref_id,ref,pos,donor
11719_G_C,C,11719_G,G,11719,1
14350_C_A,A,14350_C,C,14350,1


In [132]:
len(new_pileups_df["pos"].unique())

5

In [137]:
new_pileups_df.sort_values(["cell", "pos", "counts"])

Unnamed: 0,pos,cell,nt,counts,donor,condition
5,6842,Cell0donor0condition1cloned0cl4,G,1,0,1
1,6842,Cell0donor0condition1cloned0cl4,A,2,0,1
7,6842,Cell0donor0condition1cloned0cl4,T,439,0,1
3,6842,Cell0donor0condition1cloned0cl4,C,509,0,1
2,7369,Cell0donor0condition1cloned0cl4,C,13,0,1
...,...,...,...,...,...,...
1,7151,Cell9donor1condition1cloned1cl4,G,63,1,1
4,11719,Cell9donor1condition2cloned1cl1,G,3,1,2
2,11719,Cell9donor1condition2cloned1cl1,C,512,1,2
3,14350,Cell9donor1condition2cloned1cl1,C,35,1,2


In [128]:
new_pileups_df["pos"].unique()

array(['7369', '6842', '14350', '11719', '7151'], dtype=object)

In [134]:
new_clone_vars_df

Unnamed: 0,pos,ref,alt,ref_id,donor,clone
7369_C>G_d0cl4,7369,C,G,7369_C,0,d0cl4
6842_T>C_d0cl4,6842,T,C,6842_T,0,d0cl4
7151_C>G_d1cl4,7151,C,G,7151_C,1,d1cl4


In [136]:
new_don_vars_df

Unnamed: 0,alt,ref_id,ref,pos,donor
11719_G_C,C,11719_G,G,11719,1
14350_C_A,A,14350_C,C,14350,1


## Replace the current (cell, pos, nt) with the updated ones

In [129]:
pileups_df

Unnamed: 0,pos,cell,nt,counts,donor,condition,clone
0,11719,Cell0donor0condition0cloned0cl3,A,16.0,0,0,3
3,12833,Cell0donor0condition0cloned0cl3,A,16.0,0,0,3
4,5767,Cell0donor0condition0cloned0cl3,A,16.0,0,0,3
6,1558,Cell0donor0condition0cloned0cl3,A,2.0,0,0,3
9,8953,Cell0donor0condition0cloned0cl3,A,3.0,0,0,3
...,...,...,...,...,...,...,...
82770,16550,Cell299donor1condition2cloned1cl0,T,16.0,1,2,0
82771,16551,Cell299donor1condition2cloned1cl0,T,31.0,1,2,0
82775,16555,Cell299donor1condition2cloned1cl0,T,4.0,1,2,0
82782,16562,Cell299donor1condition2cloned1cl0,T,128.0,1,2,0


---

## Add BQ and strand. Pick strand from binomial

In [None]:
def split_strand(ser):
    counts = ser["counts"]
     
    pos = np.random.binomial(counts, 0.5)
    neg = counts-pos
    
#     pos_ser = ser.copy()
#     pos_ser["counts"] = pos
#     pos_ser["strand"] = "+"
#     neg_ser = ser.copy()
#     neg_ser["counts"] = neg
#     neg_ser["strand"] = "-"
    
    ser["Count Fw"] = pos
    ser["Count Rev"] = neg
    return ser
    #return pd.concat([pd.DataFrame(pos_tmp),pd.DataFrame(neg_tmp)], axis=1, ignore_index=True).transpose()

In [None]:
# pileups_df_strand = pileups_df.parallel_apply(split_strand, axis=1)
# pileups_df_strand

# pileups_df_strand["BQ Fw"] = 37
# pileups_df_strand["BQ Rev"] = 37

# pileups_df_strand["BQ Fw"] = pileups_df_strand["BQ Fw"].astype(int)
# pileups_df_strand["BQ Rev"] = pileups_df_strand["BQ Rev"].astype(int)

# pileups_df[pileups_df["counts"]<=0]

# pileups_df_strand["Count Rev"] = pileups_df_strand["Count Rev"].astype(int)
# pileups_df_strand["Count Fw"] = pileups_df_strand["Count Fw"].astype(int)

# pileups_df_strand["pos"] = pileups_df_strand["pos"].astype(int)

# pileups_df_strand.head()

In [None]:
## Convert to nt's
for (nt, cond), curr_out_df in pileups_df_strand.groupby(["nt", "condition"]):
    cond_outdir = join(curr_outdir, f"cond{cond}", pileup_outdir)
    print(cond_outdir)
    if not exists(cond_outdir):
        os.makedirs(cond_outdir)
    print(nt,cond)
    curr_out_df[["pos","cell","Count Fw","BQ Fw","Count Rev","BQ Rev"]].sort_values(["cell", "pos", "Count Fw"]).to_csv(join(cond_outdir, f"cond{cond}.{nt}.strands.txt"), 
                                                                               header=None, index=None)
    #curr_out.to_csv(join(outdir, f"cond{cond}.{nt}.strands.txt.gz"), compression='gzip')

----

## Save pileups

In [38]:
## Convert to nt's
for (nt, cond), curr_out_df in pileups_df_strand.groupby(["nt", "condition"]):
    cond_outdir = join(curr_outdir, f"cond{cond}", pileup_outdir)
    print(cond_outdir)
    if not exists(cond_outdir):
        os.makedirs(cond_outdir)
    print(nt,cond)
    curr_out_df[["pos","cell","Count Fw","BQ Fw","Count Rev","BQ Rev"]].sort_values(["cell", "pos", "Count Fw"]).to_csv(join(cond_outdir, f"cond{cond}.{nt}.strands.txt"), 
                                                                               header=None, index=None)
    #curr_out.to_csv(join(outdir, f"cond{cond}.{nt}.strands.txt.gz"), compression='gzip')

/data/Mito_Trace/output/clone_pileups_simulation/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10/cond0/data/MT/cellr_True/numread_200/
A 0
/data/Mito_Trace/output/clone_pileups_simulation/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10/cond1/data/MT/cellr_True/numread_200/
A 1
/data/Mito_Trace/output/clone_pileups_simulation/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10/cond2/data/MT/cellr_True/numread_200/
A 2
/data/Mito_Trace/output/clone_pileups_simulation/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5_

## total coverage

In [39]:
for cond, curr_out_df in pileups_df_strand.groupby("condition"):
    coverage = curr_out_df.groupby(["cell", "pos"]).parallel_apply(lambda x: (x["Count Rev"]+x["Count Fw"]).sum()).reset_index()
    cond_outdir = join(curr_outdir, f"cond{cond}", pileup_outdir)
    print(cond_outdir)
    coverage["pos"] = coverage["pos"].astype(int)
    coverage[["pos","cell",0]].sort_values(["cell","pos"]).to_csv(join(cond_outdir, f"cond{cond}.coverage.strands.txt"), 
                                                                               header=None, index=None) #compression='gzip')


/data/Mito_Trace/output/clone_pileups_simulation/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10/cond0/data/MT/cellr_True/numread_200/
/data/Mito_Trace/output/clone_pileups_simulation/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10/cond1/data/MT/cellr_True/numread_200/
/data/Mito_Trace/output/clone_pileups_simulation/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10/cond2/data/MT/cellr_True/numread_200/


In [40]:
cond_outdir

'/data/Mito_Trace/output/clone_pileups_simulation/donors_2__conditions_3__cells_per_donor_cond_300__clones_in_don_5__positions_100__variants_per_clone_lambda_1__donor_variants_10/seq_error_0.01__don_var_lim_0.8_1.0__clone_var_lim_0.1_0.4__depth_lim_4_10/cond2/data/MT/cellr_True/numread_200/'

### Save cell and variant assignments

In [41]:
don_vars_df.to_csv(join(curr_outdir, "donor_vars.csv"))
clone_vars_df.to_csv(join(curr_outdir, "clone_vars.csv"))
all_cells_df.to_csv(join(curr_outdir, "cells_meta.csv"))
clones_meta_cond_df.to_csv(join(curr_outdir, "clones_meta.csv"))