## Finding specific variants in each cluster
Using the clones, we want to find variants in each clone that are specific to them.
To define specificity, we will use two parameters: VAF frequency cutoff, and different %of population to have that variant.
These parameters will be used to compare each clone's variants to the population

In [1]:

indir = "/data/Mito_Trace/output/pipeline/v02/CHIP_b1/MTBlacklist_A2/data/merged/MT/cellr_True/numread_200/filters/minC10_minR50_topN0_hetT0.001_hetC10_hetCount5_bq20/mgatk/vireoIn/clones/variants_init/knn/kparam_30/"
outdir = "/data/Mito_Trace/output/pipeline/v02/CHIP_b1/MTBlacklist_A2/data/merged/MT/cellr_True/numread_200/filters/minC10_minR50_topN0_hetT0.001_hetC10_hetCount5_bq20/mgatk/vireoIn/clones/variants_init/knn/kparam_30/distinct_variants/donor0/scrap/"
donor = 0
anno_cells_meta_f = "/data/Mito_Trace/output/pipeline/v02/CHIP_b1/MTBlacklist_A2/data/merged/MT/cellr_True/numread_200/filters/minC10_minR50_topN0_hetT0.001_hetC10_hetCount5_bq20/mgatk/vireoIn/clones/variants_init/knn/kparam_30/gff_A2_black/annotation_clones/se_cells_meta_labels.tsv"
# pct_thresh = [0.01, 0.1, 0.25, 0.4, 0.5, 0.75, 0.95]
# other_pct_thresh = [0.01, 0.1, 0.25, 0.5]
# af_thresh = [0, 0.01, 0.1, 0.25, 0.4]

# Objective weights. order of the columns
weights = [1,0,0,1,-1, 1, 1] #[1,1,1,1,1] #np.ones([len(objectives),])
objectives_l = ["variants_with_clone_norm_by_1_over_nclones_with_variant", 
                "max_clone_ncells_over_nclones", "max_clone_ncells_over_ncells", 
                "pct_thresh","other_pct_thresh", "n_vars", "obj_nclones_more_than_one_unique"] #"nvars"
ncpus=8
topn=16


In [2]:
from os.path import join, exists, dirname
from glob import glob
import pickle
import mplh.cluster_help as ch
import mplh.fig_utils as fu

import os
import vireoSNP
import numpy as np
from scipy import sparse
from scipy.io import mmread
import matplotlib.pyplot as plt
from scipy.stats import hypergeom
print(vireoSNP.__version__)

import pandas as pd
import seaborn as sns
from mplh import cluster_help as ch
from vireoSNP import Vireo
np.set_printoptions(formatter={'float': lambda x: format(x, '.3f')})

from icecream import ic

fig_utils from mplh
0.5.6


In [3]:
objectives = {ind:x for ind,x in enumerate(objectives_l)}
weights = np.array(weights)

assert(len(weights)==len(objectives))

In [4]:
objectives

{0: 'variants_with_clone_norm_by_1_over_nclones_with_variant',
 1: 'max_clone_ncells_over_nclones',
 2: 'max_clone_ncells_over_ncells',
 3: 'pct_thresh',
 4: 'other_pct_thresh',
 5: 'n_vars',
 6: 'obj_nclones_more_than_one_unique'}

In [5]:
param_names = ["pct_thresh","af_thresh", "other_pct_thresh"]
n_params = 3

In [6]:
import multiprocessing


In [7]:
import pandas as pd
import numpy as np
import random
from deap import base, algorithms
from deap import creator
from deap import tools

In [8]:
if not exists(outdir):
    os.mkdir(outdir)

## Load & preprocess:
- AF df
- DP df
- cells_meta with clone labels. need to create name as donor_lineage

Remove donor variants (>0.9 in 90% of pop)

In [9]:
af_indir = join(indir, "sc_af", f"donor{donor}")

AF_df = pd.read_csv(join(af_indir, "af.tsv"), index_col=0, sep="\t")
DP_df = pd.read_csv(join(af_indir, "dp.tsv"), index_col=0, sep="\t")

print(AF_df.shape)
print(DP_df.shape)
print("Depth")
print(DP_df.head())
AF_df.head()

cells_meta = pd.read_csv(join(indir, "cells_meta.tsv"), sep='\t', index_col="ID")#.sort_values(["donor", "lineage"])
cells_meta["name"] = cells_meta["donor"].astype(str)+"_"+cells_meta["lineage"].astype(str)
# if "donor_index" in cells_meta.columns and "lineage_index" in cells_meta.columns:
#     cells_meta = cells_meta.sort_values(["donor_index", "lineage_index"])
#AD_df = pd.merge(AD_df, vcf[["#CHROM", "POS", "ALT"]], how="inner", left_index=True,right_index=True).set_index(["#CHROM", "POS", "ALT"])
curr_labels = cells_meta[cells_meta["donor"]==donor]
curr_labels

conditions = curr_labels["condition"].unique()
conditions

def rm_high(df, thresh, pct_thresh):
    return df.loc[~(((df>thresh).sum(axis=1)>pct_thresh*df.shape[0]))]

def rm_low(df, thresh, pct_thresh):
    return df.loc[~((df<thresh).sum(axis=1)>(pct_thresh*df.shape[1]))]

    #return df.loc[~(((df<=thresh).sum(axis=1)>pct_thresh*df.shape[0]))]
#df[(df<0.01).sum(axis=1)]

## Get donor inds

donor_inds = AF_df.index[((AF_df>0.9).sum(axis=1)>(0.9*AF_df.shape[1]))]
donor_inds

(68, 9023)
(68, 9023)
Depth
         AAACGAAAGAATCAAC-1_Control  AAACGAAAGCGAGAAA-1_Control  \
Variant                                                           
10397G                          2.0                        16.0   
10463C                         11.0                        16.0   
10559G                         15.0                        26.0   
10589A                         17.0                        21.0   
11251G                         13.0                        25.0   

         AAACGAAAGTACAGAT-1_Control  AAACGAAAGTTAGCAA-1_Control  \
Variant                                                           
10397G                          1.0                         5.0   
10463C                          3.0                         9.0   
10559G                          5.0                         5.0   
10589A                          6.0                         7.0   
11251G                          5.0                         7.0   

         AAACGAACACTCGCAG-1_Cont

Index(['10397G', '10589A', '11761T', '13188T', '1438G', '146C', '14766T',
       '15326G', '16126C', '16355T', '16362C', '196C', '204C', '2442C', '263G',
       '2706G', '3847C', '4769G', '58C', '64T', '7028T', '73G', '750G',
       '7598A', '827G', '8292A', '8461T', '8860G'],
      dtype='object', name='Variant')

## Plot all cells in each clone

## objective: 
1. Maximize: /sum_{v \in V}{max({c_{ncells}\in C}/|C|), where V is number of variants, c_{i,v,ncells} is number of cells in clone i with variant v, and C is the set of clones with the variant. We want to maximize this objective. Across all variants.
2. Same as 1, but denominator is not number of clones with variant but number of cells with variant
3. Maximize pct_thresh
4. Minimize other_pct_thresh (not high priority)

## Constraint:
1. pct_thresh>=other_pct_thresh
2. af_thresh*coverage_thresh>=2

## Bounds:
1. pct_thresh: 0.1-1
2. other_pct_thresh: 0.1-1
3. af_thresh: 0.01-0.4
4. coverage_thresh: 2-60


In [10]:
from icecream import ic

ic.disable()

In [11]:
def get_unique_variants(cln_af, other_af, pct_thresh, af_thresh, other_pct_thresh):
    """ gets the distinct variants in a clone.
    """
    n_thresh = pct_thresh*cln_af.shape[1]
    n_oth_thresh = other_pct_thresh*other_af.shape[1]
    bin_cln = cln_af>af_thresh
    bin_other = other_af>af_thresh
    cells_above = bin_cln.sum(axis=1)
    pct_cells_above = cells_above/bin_cln.shape[1]
    up_vars = bin_cln.loc[cells_above > n_thresh].index
    cells_other_above = bin_other.sum(axis=1)
    pct_cells_other_above = cells_other_above/bin_other.shape[1]
    up_oth_vars = bin_other.loc[cells_other_above > n_oth_thresh].index
    uniq_vars = list(set(up_vars) - set(up_oth_vars))
    out = pd.DataFrame(index=uniq_vars, data={"n_cells":cells_above.loc[uniq_vars].values, 
                                              "n_other_cells": cells_other_above.loc[uniq_vars].values,
                                              "pct_above": pct_cells_above,
                                              "pct_other_above": pct_cells_other_above})
    out["pct_thresh"] = pct_thresh
    out["af_thresh"] = af_thresh
    out["other_pct_thresh"] = other_pct_thresh
    return out


def get_clones_unique_variants(solution, data):
    all_unique_df = []
    pct_thresh, af_thresh, other_pct_thresh = solution["pct_thresh"], solution["af_thresh"], solution["other_pct_thresh"] #solution[0], solution[1], solution[2]
    curr_labels = data["curr_labels"]
    AF_df = data["AF_df"]
    DP_df = data["DP_df"]
    for cln, val in curr_labels.groupby("name"):
        #ic(cln)
        cln_af = AF_df.loc[:, val.index]
        other_af = AF_df.loc[:, curr_labels.drop(val.index).index]
        curr_dp = DP_df.loc[:, val.index]
        curr_labs = curr_labels[curr_labels.index.isin(cln_af.columns)]
        #ic(cln_af.shape)
        unique_df = get_unique_variants(cln_af, other_af, pct_thresh, af_thresh, other_pct_thresh)
        unique_df["clone"] = cln
        unique_df["id"] = unique_df["clone"] + "_" + unique_df["pct_thresh"].astype(str)+ "_" + unique_df["af_thresh"].astype(str)+ "_" + unique_df["other_pct_thresh"].astype(str)
        unique_df["variant"] = unique_df.index
        unique_df = unique_df.set_index("id")
        all_unique_df.append(unique_df)
    all_unique_df = pd.concat(all_unique_df)
    all_unique_df["log2_n_cells"] = np.log2(all_unique_df["n_cells"]+1)
    return all_unique_df



def _objective_two_unique_vars_in_clone(all_unique_df, to_pivot=True):
    if to_pivot:
        if "id" in all_unique_df.columns:
            df = all_unique_df.pivot(index="id", columns="variant", values="n_cells").fillna(0).astype(int)
        else:
            df = all_unique_df.reset_index().pivot(index="id", columns="variant", values="n_cells").fillna(0).astype(int)
    else:
        df = all_unique_df
    vars_to_keep = df.loc[:,(df>0).sum()==1].columns # Variants with just 1 clone
    clones_to_keep = df.loc[df.sum(axis=1)>1].index # Clones w >2 enriched variants
    obj = 0
    def cl_more_than_one(cl_ser):  
        curr = cl_ser[cl_ser > 0] # variants in a clone
        # check if more than one unique variant for this clone
        return sum([True if x in vars_to_keep else False for x in curr.index]) > 1

#     def cl_more_than_one(cl_ser, vars_to_keep):  
#         curr = cl_ser[cl_ser > 0] # variants in a clone
#         # check if more than one unique variant for this clone
#         return sum([True if x in vars_to_keep else False for x in curr.index]) > 1
    #obj = sum(df.loc[clones_to_keep].apply(lambda x: x(lambda y):, axis=0))
    obj = sum(df.loc[clones_to_keep].apply(cl_more_than_one, axis=1))
    return obj


def _objectives(data):
    all_unique_df = data["all_unique_df"]
    #print('all_unique_df', all_unique_df.head)
    #ic('all_unique_df', all_unique_df.shape)
    obj_max_nce_over_ncl = 0
    obj_max_nce_over_nce = 0
    obj_cl_over_ncl = 0
    obj_nvars = 0
    if len(all_unique_df) == 0:
        #print('all 0', all_unique_df.columns)
        #return {x:(-1*np.inf) for x in objectives_l} # return score of 0 since all positive values
        return {x:(0) for x in objectives_l} # return score of 0 since all positive values
    obj_d = all_unique_df.iloc[0]["pct_thresh"] 
    obj_e = all_unique_df.iloc[0]["other_pct_thresh"]
    for v, v_df in all_unique_df.groupby("variant"):
        #ic(v)
        max_ncells = max(v_df["n_cells"])
        n_clones = len(set(v_df["clone"].values))
        obj_max_nce_over_ncl += max_ncells/n_clones
        obj_max_nce_over_nce += max_ncells/v_df["n_cells"].sum()
        
        if n_clones != 0:
            obj_cl_over_ncl += 1/n_clones
            obj_nvars += 1
        
    # calculate objective number of clones with more than one unique variant
    obj_nclones_more_than_one_unique =  _objective_two_unique_vars_in_clone(all_unique_df, to_pivot=True)
    
    objectives = {"variants_with_clone_norm_by_1_over_nclones_with_variant":obj_cl_over_ncl,
                  "max_clone_ncells_over_nclones":obj_max_nce_over_ncl, 
                  "max_clone_ncells_over_ncells":obj_max_nce_over_nce, 
                  "pct_thresh":obj_d,"other_pct_thresh":obj_e,
                   "n_vars":obj_nvars, "obj_nclones_more_than_one_unique": obj_nclones_more_than_one_unique}
    return objectives

def _constraints(solution):
    #if solution["pct_thresh"] < solution["other_pct_thresh"]:
    if "coverage_thresh" not in solution:
        return None
    if solution["af_thresh"]*solution["coverage_thresh"] >= 2:
        return True
    else:
        return False


def evaluate_series(individual_ser, AF_df, DP_df, curr_labels, return_data=False):
    params = individual_ser.to_dict()
    #print('params', params)
    #solution = {"pct_thresh": individual[0], "af_thresh":individual[1],  "other_pct_thresh": individual[2]}
    data = {"AF_df": AF_df, "DP_df":DP_df, "curr_labels":curr_labels} 
    all_unique_df = get_clones_unique_variants(params, data)
    data["all_unique_df"] = all_unique_df
    eval_out = _objectives(data)
    if return_data:
        return pd.Series(eval_out), data
    else:
        return pd.Series(eval_out)



## GA or gridsearch on parameters and evaluate objectives 


## Grid search

In [12]:
# from itertools import product
# pct_thresh = np.arange(0.05, 1, 0.05)
# other_pct_thresh = np.arange(0.005, 1, 0.05)
# af_thresh = np.arange(0.005, 1, 0.05)


# # There are 7 params to use for calling the clone
# params = {"pct_thresh": pct_thresh,
#           "other_pct_thresh": other_pct_thresh,
#           "af_thresh": af_thresh,}
# full_params = list(product(*list(params.values())))
# full_params = pd.DataFrame(full_params, columns=params.keys())

# print(full_params.shape)
# full_params.head()


In [13]:
def set_multi_rank(results, weights):
    if "multi" in results.columns: #in case multi was added before
        rank_results = results.drop("multi",axis=1).rank(na_option='top')
    else:
        rank_results = results.rank(na_option='top')
    rank_results["multi"] = (weights*rank_results).sum(axis=1)
    return rank_results.sort_values(by="multi")[::-1]

def set_multi(results, weights):
    print(results.shape)
    # first normalize results for each column to sum to 1
    objs_total = results.replace([-np.inf, np.inf], np.nan).sum(axis=0)
    print('objs_total', objs_total.head())
    results_norm = results.apply(lambda x: x/objs_total.loc[x.name], axis=0)
    
    results_norm["multi"] = (weights*results_norm).sum(axis=1)
    return results_norm.sort_values(by="multi")[::-1]

---

# Using GA

In [14]:
ic.enable()

In [15]:
# the random initialization of the genetic algorithm is done here
# it gives a list of integers with for each products the number of times it is bought
lower_bounds = [0,0,0]
upper_bounds = [1,1,1]


def init_pop(n):
    return list(np.random.random(size=n))


# def checkBounds(min, max):
#     def decorator(func):
#         def wrapper(*args, **kargs):
#             offspring = func(*args, **kargs)
#             print('offspring', offspring)
#             print('offspring len', len(offspring))
#             print('offspring len 0', len(offspring[0]))
#             for child in offspring:
#                 for i in range(len(child)):
#                     print(child[i], 'max', max[i])
#                     for j, attr in enumerate(child[i]):
#                         if attr > max[j]:
#                             child[i][j] = max
#                         elif 
#                     if child[i] > max[i]:
#                         child[i] = max[i]
#                     elif child[i] < min[i]:
#                         child[i] = min[i]
#             return offspring
#         return wrapper
#     return decorator
def checkBounds(min, max):
    def decorator(func):
        def wrapper(*args, **kargs):
            offspring = func(*args, **kargs)
            for child in offspring:
                for i in range(len(child)):
                    if child[i] > max:
                        child[i] = max
                    elif child[i] < min:
                        child[i] = min
            return offspring
        return wrapper
    return decorator
#from scoop import futures

# this is the setup of the deap library: registering the different function into the toolbox
creator.create("FitnessMulti", base.Fitness, weights = (1,0,0,1,-1, 1, 1))
creator.create("Individual", list, fitness=creator.FitnessMulti)

toolbox = base.Toolbox()

#toolbox.register("map", futures.map)
toolbox.register("n_per_product", init_pop, n=n_params)

toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.n_per_product, n=n_params)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)



def evaluate(individual, AF_df, DP_df, curr_labels):
    ic('individual before', individual)
    ic(len(individual))
    individual = individual[0]
    ic('individual after', individual)
    ic(len(individual))
    #pct_thresh, af_thresh, other_pct_thresh = individual["pct_thresh"], individual["af_thresh"], individual["other_pct_thresh"] #solution[0], solution[1], solution[2]
    
    solution = {"pct_thresh": individual[0], "af_thresh":individual[1],  "other_pct_thresh": individual[2]}
    #data = 
    data = {"AF_df": AF_df, "DP_df":DP_df, "curr_labels":curr_labels} 
    all_unique_df = get_clones_unique_variants(solution, data)
    data["all_unique_df"] = all_unique_df
    eval_out = _objectives(data)
    ic(tuple(eval_out.values()))
    print(tuple(eval_out.values()))
    return tuple(eval_out.values())

toolbox.register("evaluate", evaluate, AF_df=AF_df, DP_df=DP_df, curr_labels=curr_labels)

toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# toolbox.register("mate", tools.cxBlend, alpha=0.2)
# toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=2, indp=0.05)

# LOWER AND UPPER BOUNDs
# toolbox.decorate("mate", checkBounds(lower_bounds, upper_bounds))
# toolbox.decorate("mutate", checkBounds(lower_bounds, upper_bounds))
toolbox.decorate("mate", checkBounds(0, 1))
toolbox.decorate("mutate", checkBounds(0, 1))
#toolbox.decorate("evaluate", tools.De)
# ## Initialize ga
pool = multiprocessing.Pool(processes=16)

toolbox.register("map", pool.map)


stats = tools.Statistics(key=lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("std", np.std)
stats.register("min", np.min)
stats.register("max", np.max)

# logbook = tools.Logbook()
# logbook.record(gen=0, evals=30, **record)
# import pickle
# pickle.dump(logbook, lb_file)


In [16]:
ic.disable()

In [17]:

## Run
## V02 - using main function taken from site

# this is the definition of the total genetic algorithm is executed, it is almost literally copied from the deap library
def main():
    pop = toolbox.population(n=8)
    
    # Evaluate the entire population
    fitnesses = list(map(toolbox.evaluate, pop))
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit

    # CXPB  is the probability with which two individuals
    #       are crossed
    #
    # MUTPB is the probability for mutating an individual
    CXPB, MUTPB = 0.5, 0.2
    
    # Extracting all the fitnesses of 
    print(len(pop))
    #print(len(pop[0].fitness.values))
    print(len(ind.fitness.values))
    fits = [ind.fitness.values[0] for ind in pop]
    
    # Variable keeping track of the number of generations
    g = 0
    
    # Begin the evolution
    while g < 2:
        # A new generation
        g = g + 1
        print("-- Generation %i --" % g)
        
        # Select the next generation individuals
        offspring = toolbox.select(pop, len(pop))
        # Clone the selected individuals
        offspring = list(map(toolbox.clone, offspring))
        
        # Apply crossover and mutation on the offspring
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < CXPB:
                toolbox.mate(child1[0], child2[0])
                del child1.fitness.values
                del child2.fitness.values

        for mutant in offspring:
            if random.random() < MUTPB:
                toolbox.mutate(mutant[0])
                del mutant.fitness.values
            
        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
            
        pop[:] = offspring
        
        # Gather all the fitnesses in one list and print the stats
        print(len(ind.fitness.values))
        fits = [ind.fitness.values[0] for ind in pop]
        
        length = len(pop)
        mean = sum(fits) / length
        sum2 = sum(x*x for x in fits)
        std = abs(sum2 / length - mean**2)**0.5
        
        ic(min(fits), max(fits), mean, std)
    print('len pop', len(pop))
    print('len toolbox', len(toolbox.evaluate(pop[0]) ))
    best = pop[np.argmax([toolbox.evaluate(x) for x in pop])]
    return best, pop

best_solution, pop = main()

#products_table['univariate_choice'] = pd.Series(best_solution[0])
#products_table.head()
pd.Series(best_solution[0])

(3.13125, 391.98125, 3.3998545287276465, 0.5370836996640017, 0.9294268879634274, 5, 0)
(2.590909090909091, 286.04545454545456, 3.1485515382887943, 0.046588613875601315, 0.5699834257647383, 4, 0)
(0, 0, 0, 0, 0, 0, 0)
(1.1111111111111112, 139.11111111111111, 1.3021442495126705, 0.705265235408423, 0.4353603264532637, 2, 0)
(10.0, 456.0, 10.313616071428571, 0.10579650394585671, 0.22666567688416972, 11, 1)
(3.0, 277.0, 3.0, 0.26591531074268937, 0.059651214269890174, 3, 0)
(3.0, 415.0, 3.0, 0.9320203462171699, 0.07306184353512313, 3, 0)
(3.125, 499.75, 3.2906940553999378, 0.5409150097160518, 0.3921148622028032, 4, 0)
8


ZeroDivisionError: float division by zero

In [None]:
best_solution

In [None]:
# ## Initialize ga
# pool = multiprocessing.Pool(processes=16)


In [None]:
# ## V01 - using pre-defined algorithms

# # this is the definition of the total genetic algorithm is executed, it is almost literally copied from the deap library
# def main():
#     pop = toolbox.population(n=10)
#     pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=3, 
#                                        stats=stats, verbose=True)

# best_solution = main()

# #products_table['univariate_choice'] = pd.Series(best_solution[0])
# #products_table.head()
# pd.Series(best_solution[0])
