In [1]:
import os
import pickle
from collections import defaultdict
from glob import glob
from itertools import product
from os.path import join, isdir

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from icecream import ic
from tqdm.notebook import tqdm

from src.utils import variant_utils as vu

In [2]:
# --- Locations ---------------------------------------------------------------
# Directory holding the per-donor threshold results and the "cells" pickles.
indir = "/data/Mito_Trace/output/pipeline/v02/CHIP_b1/MTBlacklist_A2/data/merged/MT/cellr_True/numread_200/filters/minC10_minR50_topN0_hetT0.001_hetC10_hetCount5_bq20/mgatk/vireoIn/multiplex/clones_simpleUnion/mt_clones_thresh"
# Everything this notebook writes goes into "best_params/" directly under indir.
outdir = join(indir, "best_params/")
N_DONORS = 2

# --- Parameter combination to select -----------------------------------------
# These values are matched against the columns of the per-donor results table
# (af, oth_af, cov, oth_cov, ncells, oth_ncells, mean_cov):
#   af_t / oth_af_t      -> af threshold / other_af threshold
#   cov_t / oth_cov_t    -> coverage threshold / other-coverage threshold
#   ncells               -> number of cells (or fraction of cells)
#   oth_ncells           -> number of other cells (or fraction of other cells)
#   mean_pos_cov         -> value compared with the "mean_cov" column
af_t = 0.1
oth_af_t = 0.1
cov_t = 10
oth_cov_t = 10
ncells = 10
oth_ncells = 0.25
mean_pos_cov = 0




In [3]:
# Sub-directory of `indir` from which the per-donor cell pickles are read.
cells_dir = join(
    indir,
    "cells",
)

In [4]:
def fill_mt_bin(curr_pos, curr_cells, verbose=True):
    """Fill one variant column of a cell-by-variant matrix with 0/1 calls.

    Intended to be used through ``DataFrame.apply(..., axis=0)``: ``curr_pos``
    is a single column whose ``name`` is the variant and whose index holds the
    cell labels.

    Parameters
    ----------
    curr_pos : pd.Series
        Column to fill. ``curr_pos.name`` is used as the key into
        ``curr_cells``.
    curr_cells : mapping
        Maps a variant name to a mapping with ``"other_cells"`` and
        ``"clone_cells"`` entries, each a collection of index labels that can
        be passed to ``Series.loc``.
    verbose : bool, default True
        When True, print the variant name as it is processed (the original
        behaviour). Pass False to silence the per-variant output.

    Returns
    -------
    pd.Series
        A copy of ``curr_pos`` in which the "other" cells are set to 0 and the
        clone cells are set to 1. Labels in neither group keep their value.

    Raises
    ------
    KeyError
        If the variant is not a key of ``curr_cells`` or a listed cell label
        is not present in the column's index.
    """
    variant = curr_pos.name
    if verbose:
        print(variant)

    cell_groups = curr_cells[variant]

    # Work on a copy: pandas does not support apply() callbacks that mutate
    # the object they are handed, so the input column is left untouched.
    filled = curr_pos.copy()
    filled.loc[cell_groups["other_cells"]] = 0
    # Clone cells are written last, so a label present in both groups ends as 1.
    filled.loc[cell_groups["clone_cells"]] = 1
    return filled

In [5]:
# Build one binary cell-by-variant matrix per donor, keyed by donor index.
bin_d = {}
for d in range(N_DONORS):
    # Pickle with the cell groups for this donor at the chosen af/coverage thresholds.
    curr_f = join(cells_dir, f"don.{d}_af.{af_t}_othaf.{oth_af_t}_cov.{cov_t}_othcov.{oth_cov_t}.p")
    # NOTE: unpickling can execute arbitrary code; only load trusted pipeline output.
    # The context manager closes the file handle, which the bare open() never did.
    with open(curr_f, "rb") as fh:
        curr_cells = pickle.load(fh)
    params_results = pd.read_csv(join(indir, f"donor_{d}_thresh_results.tsv"), sep="\t")

    # Select the single results row that matches every configured parameter.
    curr_p = params_results.loc[(params_results["af"] == af_t) &
                                (params_results["oth_af"] == oth_af_t) &
                                (params_results["cov"] == cov_t) &
                                (params_results["oth_cov"] == oth_cov_t) &
                                (params_results["ncells"] == ncells) &
                                (params_results["oth_ncells"] == oth_ncells) &
                                (params_results["mean_cov"] == mean_pos_cov)]
    assert len(curr_p) == 1, (
        f"donor {d}: expected exactly one matching parameter row, found {len(curr_p)}"
    )

    ## Construct a binary cell-by-variant matrix for the kept variants.
    ## Can add N/A if not in oth cells
    curr_vars = curr_p.iloc[0]["Variants"].split(";")

    # Collect every cell label listed for any entry of the pickle
    # (in-place update avoids re-copying the growing set on each pass).
    all_cells = set()
    for x in curr_cells:
        all_cells.update(curr_cells[x]["clone_cells"])
        all_cells.update(curr_cells[x]["other_cells"])

    # Sort the labels so the row order (and the CSVs written later) is the same
    # on every run; a set has no stable order.
    # Assumes the cell labels are mutually comparable (e.g. all strings).
    mt_bin = pd.DataFrame(index=sorted(all_cells), columns=curr_vars)
    # Entries that fill_mt_bin leaves unset are treated as 0.
    bin_d[d] = mt_bin.apply(fill_mt_bin, curr_cells=curr_cells, axis=0).fillna(0)


10463C
10559G
11251G
11453A
11719A
11812G
13368A
14233G
14674C
14905A
150T
152C
15452A
15607G
15928A
16129A
16294T
16296T
16304C
16519C
1888A
1949A
2623G
3109C
3244A
4117C
4216C
4917G
5147A
5580C
5581G
709A
8697A
8743A
930A
9899C
10397G
10589A
11761T
13188T
146C
15297C
15497A
16129A
16355T
16362C
1949A
196C
204C
2442C
2623G
3109C
3244A
3847C
5442C
5581G
58C
64T
7598A
827G
8292A
8461T
9899C


In [6]:
# Nothing else in the notebook creates the output folder, so make sure it
# exists before the first file is written.
os.makedirs(outdir, exist_ok=True)

for d in bin_d:
    print(d)
    print(bin_d[d].shape)
    # Compute the "entry is zero" mask once and reuse it for both axes.
    is_zero = bin_d[d] == 0
    # Drop cells (rows) and variants (columns) that are zero everywhere.
    bin_d[d] = bin_d[d].loc[~is_zero.all(axis=1), ~is_zero.all(axis=0)]
    print(bin_d[d].shape)

    bin_d[d].to_csv(join(outdir, f"donor_{d}_binary.csv"))

# Stack the donors row-wise; a variant absent from a donor's matrix becomes 0.
bin_comb = pd.concat(list(bin_d.values()), axis=0).fillna(0).astype(int)
bin_comb.to_csv(join(outdir, "combined_binary.csv"))

0
(9020, 36)
(6202, 36)
1
(8274, 27)
(6371, 27)


In [9]:
# Write the selected parameter row to the output folder.
# NOTE(review): `curr_p` is the value left over from the last iteration of the
# donor loop, so only that donor's row is saved here — confirm this is intended.
curr_p.to_csv(
    join(outdir, "best_params.csv")
)
