## TreeHarmonizer CNA

This notebook serves to place CNAs called by Wakhan onto the given phylogenetic tree.

This notebook is being converted to serve as a element of a single TreeHarmonizer tool.

### Required Packages

* pandas
* importlib 
* intervaltree (loaded in th_utils, install via pip)
* functools (imported from th_utils)

## Load dependencies and utils

In [17]:
import importlib
import pandas as pd
import TreeHarmonizer_utils as th_utils
importlib.reload(th_utils)

pd.set_option('display.width', 5000)
pd.set_option("display.expand_frame_repr", True)
pd.set_option("display.max_colwidth", 1000)
pd.set_option('display.max_columns', None)

## Input CNA and SNV Paths

* Expects folder structure within SNV path to be the following: `[snv_path]/[sample_name]/[sample_name].vcf`
  * SNV vcfs are assumed to follow the standard VCF 4.2 format, output by DeepVariant.

  * Sample name within the VCF file is assumed to be the same as `[sample_name]`.

  * All folders within the path will be assumed to be a sample by default, except those that begin with an underscore `_`.

  * In order to limit folders further, add the `predefined_sample_list` argument to the call to `th_utils.generate_merged_df` in the Load SNVs and Tree data cell.

  * Argument value should be a list of directory names within the `snv_path`, VCF names are still expected to match the sample name.

* Expects folder structure wihtin the CNA path to be the following: `[cna_path]/[sample_name]/bed_output/[sample]_copynumber_segments.bed` (expected output structure for Wakhan)

In [18]:
#snv_path = "Insert Path to SNV Calls Here"
#cna_path = "Insert Path to CNA Calls Here"

snv_path = "/data/KolmogorovLab/agoretsky/TreeHarmonizer/snv_vcfs"
cna_path = "/data/KolmogorovLab/agoretsky/TreeHarmonizer/cna_beds"

## Load SNVs and Tree Data

In [19]:
# Load SNV data into a merged DataFrame
dv_merged, sample_list = th_utils.generate_merged_df(caller_path=snv_path)

####### In order to use a predefined sample list, use the following: #######
# predefined_sample_list = ['sample1', 'sample2', ...]
# dv_merged = th_utils.generate_merged_df(caller_path=snv_path, predefined_sample_list=predefined_sample_list)

# Load tree input via newick string and parse it into various components
imported_tree, non_terminals, terminals, non_terminal_paths, terminal_paths, non_terminal_leaves, terminal_paths_o_keys, non_terminal_paths_without_N1 = th_utils.get_tree_data()

# Add informative columns to the merged DataFrames that were lost from original merging.
dv_merged['CHROM'] = dv_merged['KEY'].str.split(":").str[0]
dv_merged['POS'] = dv_merged['KEY'].str.split(":").str[1]
dv_merged['REF'] = dv_merged['KEY'].str.split(":").str[2]
dv_merged['ALT'] = dv_merged['KEY'].str.split(":").str[3]

dv_merged['CHROM'] = dv_merged['CHROM'].astype(str)
dv_merged['POS'] = dv_merged['POS'].astype(int)

df_merged = dv_merged.copy()

In [20]:
print(df_merged)

                                      C1              KEY                               C10                               C11                               C12                               C13                               C14                               C15                             C16                             C17                              C18                               C19                               C20                              C21                              C22                              C23                               C24                                C3                              C4                                C5                                C6                                C7                                C8                               C9 CHROM        POS REF ALT
0          0/1:6:31:22,9:0.290323:5,0,40    1:3111314:T:C  0/1:23:29:11,17:0.586207:23,0,48  0/1:21:35:19,16:0.457143:20,0,51   0/1:22:19:9,10:0.526316:22,0,50   0/1:23:22:9,13:0.5

## Load in and process CNA data

In [None]:
# Create interval tree of all severus deletion ranges
wakhan_cna_trees_per_chromosome = {}
wakhan_cna_1_only_trees_per_chromosome = {}
wakhan_cna_0_only_trees_per_chromosome = {}

# Create blank interval trees per chromosome
for sub in sample_list:
#for sub in sample_list:#
    for chrom in th_utils.autosomes:
        wakhan_cna_trees_per_chromosome.update({sub + "-" + chrom: th_utils.it.IntervalTree()})
        wakhan_cna_1_only_trees_per_chromosome.update({sub + "-" + chrom: th_utils.it.IntervalTree()})
        wakhan_cna_0_only_trees_per_chromosome.update({sub + "-" + chrom: th_utils.it.IntervalTree()})

for subline in sample_list:
    # New CNA DATA (01/13/25)
    # For old data, use old .bed reading method
    wk_copy_num = th_utils.read_bed_updated(cna_path + "/" + str(subline) + "/bed_output/" + str(subline) + "_copynumbers_segments.bed")
    wk_copy_num['chr'] = wk_copy_num['chr'].astype(str)

    # Filter wakhan down to only autosomes
    wk_copy_num = th_utils.keep_rows_by_values(wk_copy_num, 'chr', th_utils.autosomes)
    wk_copy_num['copynumber_state'] = wk_copy_num['copynumber_state'].astype(int)

    # For each copy num entry, add to its respective interval tree, with the interval being the copy num range, data being the copy num metadata
    for index, row in wk_copy_num.iterrows():
        wakhan_cna_trees_per_chromosome[str(subline) + "-" + str(row['chr'])].addi(int(row['start']), int(row['end']) + 1, (str(subline), row['copynumber_state'], row['coverage'], row['confidence'], row['svs_breakpoints_ids']))

    for index, row in wk_copy_num.iterrows():
        if row['copynumber_state'] == 1:
            wakhan_cna_1_only_trees_per_chromosome[str(subline) + "-" + str(row['chr'])].addi(int(row['start']), int(row['end']) + 1, (str(subline), row['copynumber_state'], row['coverage'], row['confidence'], row['svs_breakpoints_ids']))
    
    for index, row in wk_copy_num.iterrows():
        if row['copynumber_state'] == 0:
            wakhan_cna_0_only_trees_per_chromosome[str(subline) + "-" + str(row['chr'])].addi(int(row['start']), int(row['end']) + 1, (str(subline), row['copynumber_state'], row['coverage'], row['confidence'], row['svs_breakpoints_ids']))

# Create a copy of the merged dataframe to modify
df_merged_copy_wakhan = df_merged.copy(deep=True)

In [24]:
wakhan_cna_trees_per_chromosome["C1-2"]

IntervalTree([Interval(0, 3000001, ('C1', 0, 0.0, 1.0, '[]')), Interval(3000001, 120592338, ('C1', 2, 23.03, 0.9834876, "['severus_DEL1222']")), Interval(120592338, 120611137, ('C1', 1, 15.7, 0.7747951, "['severus_DEL1222']")), Interval(120611137, 140323736, ('C1', 2, 23.35, 0.9579735, "['severus_DEL1227']")), Interval(140323736, 140334287, ('C1', 0, 0.0, 1.0, "['severus_DEL1227']")), Interval(140334287, 175000001, ('C1', 2, 22.28, 0.9566935, '[]')), Interval(175000001, 177500001, ('C1', 0, 0.01, 0.9992021000000001, '[]')), Interval(177500001, 182113224, ('C1', 2, 22.1, 0.9423606, '[]'))])

## Limit CNA trees to amplifications and losses only

In [26]:
wakhan_cna_trees_per_chromosome_amp_only = {}
wakhan_cna_trees_per_chromosome_loss_only = {}

for subline in sample_list:
    for chrom in th_utils.autosomes:
        wakhan_cna_trees_per_chromosome_amp_only.update({subline + "-" + chrom: th_utils.it.IntervalTree()})
        wakhan_cna_trees_per_chromosome_loss_only.update({subline + "-" + chrom: th_utils.it.IntervalTree()})

for subline in sample_list:
    for chrom in th_utils.autosomes:
        for interval in wakhan_cna_trees_per_chromosome[subline + "-" + chrom]:
            if interval.data[1] > 2.0:
                # Modify the tree currently to only include the subline as data
                wakhan_cna_trees_per_chromosome_amp_only[subline + "-" + chrom].addi(interval.begin, interval.end, interval.data[0])
                #wakhan_cna_trees_per_chromosome_amp_only[subline + "-" + chrom].addi(interval.begin, interval.end, interval.data)
            if interval.data[1] < 2.0:
                wakhan_cna_trees_per_chromosome_loss_only[subline + "-" + chrom].addi(interval.begin, interval.end, interval.data[0])
                #wakhan_cna_trees_per_chromosome_loss_only[subline + "-" + chrom].addi(interval.begin, interval.end, interval.data)

In [27]:
wakhan_cna_trees_per_chromosome_amp_only["C15-1"]

IntervalTree([Interval(57500001, 57998042, 'C15'), Interval(57998042, 195471971, 'C15')])

### Merge intervals that are tangential to each other (Separate segments in the bed file but are both amplifications. They may have different CN values. Amplifications are generalized to >2 and losses are generalized to <2.)

In [28]:
for subline in sample_list:
    for chrom in th_utils.autosomes:
        wakhan_cna_trees_per_chromosome_amp_only[subline + "-" + chrom].merge_overlaps(strict=False, data_reducer=lambda x, y: x)
        wakhan_cna_trees_per_chromosome_loss_only[subline + "-" + chrom].merge_overlaps(strict=False, data_reducer=lambda x, y: x)

In [29]:
wakhan_cna_trees_per_chromosome_amp_only["C15-1"]

IntervalTree([Interval(57500001, 195471971, 'C15')])

In [32]:
amp_only_dicts = {}
loss_only_dicts = {}
dict_list = [amp_only_dicts, loss_only_dicts]

# Go through every subline and chromosome, and create a dictionary of lists of intervals
for subline in sample_list:
    for chrom in th_utils.autosomes:
            final_amp_list = []
            final_loss_list = []
            #final_list = []
            amp_to_place = wakhan_cna_trees_per_chromosome_amp_only[subline + "-" + chrom].items()
            loss_to_place = wakhan_cna_trees_per_chromosome_loss_only[subline + "-" + chrom].items()
            # Place the intervals into a list of lists
            # Each interval is a list of [start, end, data]
            # data is a list of [subline, copy num, coverage, confidence, svs_breakpoints_ids]
            for interval in amp_to_place:
                final_amp_list.append([interval.begin, interval.end, set([ interval.data ])])
            for interval in loss_to_place:
                # Exclude blank centromere region intervals
                if interval.begin == 0 and (interval.end == 3000001  or interval.end == 3000000 or interval.end == 3150000):
                    continue
                final_loss_list.append([interval.begin, interval.end, set([ interval.data ])])

            amp_only_dicts.update({subline + "-" + chrom: final_amp_list})
            loss_only_dicts.update({subline + "-" + chrom: final_loss_list})

In [33]:
print(amp_only_dicts['C15-1'])
print(loss_only_dicts['C15-1'])

print(amp_only_dicts['C3-1'])
print(loss_only_dicts['C11-10'])
print(loss_only_dicts['C13-13'])

[[57500001, 195471971, {'C15'}]]
[]
[[54205853, 195471971, {'C3'}]]
[[56205027, 56271864, {'C11'}], [50536739, 50546376, {'C11'}], [117324713, 117329072, {'C11'}], [73705528, 73735821, {'C11'}], [110028101, 110041131, {'C11'}], [128736094, 128736355, {'C11'}]]
[[0, 44688633, {'C13'}]]


## Define overlap merge and fragment creation methods

In [34]:
import copy
DEBUG = False

# Function to make all overlapping intervals, where we only consider the overlaps that are not perfect supersets or subsets of the other - Aka w-------y-_-_-_-_x______z or y-----w-_-_-_-_z_____x
def make_all_partial_overlap_fragments(interval_set_a, interval_keys_a, interval_set_b, interval_keys_b):

    # Create a new array to store overlapping intervals
    overlapping_intervals = []
    overlapping_keys = set()

    for interval_a in interval_set_a:
        for interval_b in interval_set_b:
            
            if DEBUG:
                print("Interval A (w, x): ", interval_a)
                print("Interval B (y, z): ", interval_b)

            if interval_a[0] < interval_b[0] and interval_b[0] < interval_a[1] and interval_a[1] < interval_b[1]:
                
                if DEBUG:
                    print("Case 1, w-------y-_-_-_-_x______z")
                    print("interval_a[0]: ", interval_a[0], " interval_a[1]: ", interval_a[1])
                    print("interval_b[0]: ", interval_b[0], " interval_b[1]: ", interval_b[1])
                    print("The math: ", interval_a[0], "<=", interval_b[0], " and ", interval_a[1], ">=", interval_b[0])
                    print("New Interval: ", [interval_b[0], interval_a[1], set(interval_a[2]).union(set(interval_b[2]))])

                # If they overlap, create a new interval and add it to the list
                overlapping_intervals.append([interval_b[0], interval_a[1], set(interval_a[2]).union(set(interval_b[2]))])
                overlapping_keys.add((interval_a[0], interval_b[1], frozenset(set(interval_a[2]).union(set(interval_b[2])))))
            ## Consider the second type of overlap case: Aka y-----w-_-_-_-_z_____x
            elif interval_b[0] < interval_a[0] and interval_a[0] < interval_b[1] and interval_b[1] < interval_a[1]:
                
                if DEBUG:
                    print("Case 2, y-----w-_-_-_-_z_____x")
                    print("interval_a[0]: ", interval_a[0], " interval_a[1]: ", interval_a[1])
                    print("interval_b[0]: ", interval_b[0], " interval_b[1]: ", interval_b[1])
                    print("The math: ", interval_b[0], "<=", interval_a[0], " and ", interval_b[1], ">=", interval_a[0])
                    print("New Interval: ", [interval_a[0], interval_b[1], set(interval_a[2]).union(set(interval_b[2]))])

                # If they overlap, create a new interval and add it to the list
                overlapping_intervals.append([interval_a[0], interval_b[1], set(interval_a[2]).union(set(interval_b[2]))])
                overlapping_keys.add((interval_a[0], interval_b[1], frozenset(set(interval_a[2]).union(set(interval_b[2])))))

    # Make an union list of all intervals
    all_intervals = interval_set_a + interval_set_b + overlapping_intervals
    all_keys = interval_keys_a.union(interval_keys_b).union(overlapping_keys)
    
    # Return the list of overlapping intervals
    return overlapping_intervals, all_intervals, all_keys


# Function to merge overlapping intervals into singular intervals
# If an interval [a, b] overlaps perfectly with [c, d]:
    # If data is the same, we will delete [c, d] (arbitrary)   ac-_-_-_-_-_bd
    # If data is different, we will merge the two intervals into [a, d] with a union of data and delete the original intervals
# If an interval [a, b], is a subset of [c, d], they will be merged into [c, d] c----a____b-----d
    # If data is the same, we will delete [a, b].
    # If the data is different:
        # Create new interval [a, b] with a union of data, delete the original [a, b] (update [a,b] data)
# If an interval [a, b] is a superset of [c, d], they will be merged into [a, b]   a_____c-----d_____b
    # If the data is the same, we will delete [c, d].
    # If the data is different:
        # Create new interval [c, d] with a union of data, delete the original [c, d] (update [c,d] data)
def merge_superset_subset_fragments(interval_set, interval_keys):
    if (len(interval_set) == 1 or len(interval_set) == 0):
        return interval_set, interval_keys

    intervals_to_be_added = []
    intervals_to_be_removed = []
    interval_keys_to_be_added = set()
    interval_keys_to_be_removed = set()

    for x in range(0, len(interval_set)):
        for y in range(x + 1, len(interval_set)):

            if DEBUG:
                print("Current Interval Set at START: ", interval_set)
                print("Currently comparing intervals X and Y: ", interval_set[x], " and ", interval_set[y])

            # If interval overlaps perfectly with another interval:
            if interval_set[x][0] == interval_set[y][0] and interval_set[y][1] == interval_set[x][1]:
                if DEBUG:
                    print("Intervals overlap perfectly: ", interval_set[x], " and ", interval_set[y])
                # Check data, if data are the same, delete second interval from the set
                if interval_set[x][2] == interval_set[y][2]:
                    if DEBUG:
                        print("data is the same, deleting subsetted interval")
                        print("keeping", interval_set[x], "deleting: ", interval_set[y])
                    intervals_to_be_removed.append(interval_set[y])
                    interval_keys_to_be_removed.add((interval_set[y][0], interval_set[y][1], frozenset(set(interval_set[y][2]))))
                #If data is different, make new interval that is the data union of the two intervals, delete the two original intervals
                else:
                    new_interval = [interval_set[x][0], interval_set[x][1], set(interval_set[x][2]).union(set(interval_set[y][2]))]
                    # Before setting this interval to be added, check that this exact same interval hasn't already been created from another combination earlier
                    new_interval_key = (new_interval[0], new_interval[1], frozenset(set(new_interval[2])))
                    if new_interval_key not in interval_keys_to_be_added:
                        if DEBUG:
                            print("adding new interval to list to be added")
                            print("said new interval: ", new_interval)
                        intervals_to_be_added.append(new_interval)
                        interval_keys_to_be_added.add(new_interval_key)
                    intervals_to_be_removed.append(interval_set[x])
                    intervals_to_be_removed.append(interval_set[y])
                    interval_keys_to_be_removed.add((interval_set[x][0], interval_set[x][1], frozenset(set(interval_set[x][2]))))
                    interval_keys_to_be_removed.add((interval_set[y][0], interval_set[y][1], frozenset(set(interval_set[y][2]))))
                
                   
            # Elif first interval is a subset of the second interval:
            # Per comments above, [a,b] is interval_set[y], [c,d] is interval_set[x]
            elif interval_set[x][0] <= interval_set[y][0] and interval_set[y][1] <= interval_set[x][1]:
                if DEBUG:
                    print("Second interval is a subset of the first")
                # Check data
                # If data is the same, delete the subsetted interval
                if interval_set[x][2] == interval_set[y][2]:
                    if DEBUG:
                        print("data is the same, deleting subsetted interval")
                        print("keeping", interval_set[x], "deleting: ", interval_set[y])
                    intervals_to_be_removed.append(interval_set[y])
                    interval_keys_to_be_removed.add((interval_set[y][0], interval_set[y][1], frozenset(set(interval_set[y][2]))))
                # If the data is different, update data of [a, b] to be the union of data from the two intervals
                else:
                    if DEBUG:
                        print("doing in place modification for a subset")
                        print("modifying: ", interval_set[y], " data to ", set(interval_set[x][2]).union(set(interval_set[y][2])))
                    
                    interval_set[y][2] = set(interval_set[x][2]).union(set(interval_set[y][2]))
                    if DEBUG:
                        print("resulting interval: ", interval_set[y])
                
    
            # Elif the second interval is a subset of the first interval:
            # Per comments above, [a,b] is interval_set[y], [c,d] is interval_set[x]
            elif interval_set[y][0] <= interval_set[x][0] and interval_set[x][1] <= interval_set[y][1]:
                if DEBUG:
                    print("first interval is a subset of the second")
                # Check data
                # If data is the same, delete the subsetted interval
                if interval_set[x][2] == interval_set[y][2]:
                    if DEBUG:
                        print("data is the same, deleting subsetted interval")
                        print("keeping", interval_set[y], "deleting: ", interval_set[x])
                    intervals_to_be_removed.append(interval_set[x])
                    interval_keys_to_be_removed.add((interval_set[x][0], interval_set[x][1], frozenset(set(interval_set[x][2]))))
                    #interval_set.pop(x)
                # If the data is different, update data of [c, d] to be the union of data from the two intervals
                else:
                    if DEBUG:
                        print("doing in place modification for a subset")
                        print("modifying: ", interval_set[x], " data to ", set(interval_set[x][2]).union(set(interval_set[y][2])))
                    interval_set[x][2] = set(interval_set[x][2]).union(set(interval_set[y][2]))
                    if DEBUG:
                        print("resulting interval: ", interval_set[x])

            if DEBUG:
                print("One Iteration Complete")
                print("Current Interval Set at END: ", interval_set)
                print("X and Y are (as of end of iteration): ", interval_set[x], " and ", interval_set[y])
                

    if DEBUG:
        print("all to be removed:")
        print(intervals_to_be_removed)
    if DEBUG:
        print("all to be added:")
        print(intervals_to_be_added)

    # Remove all intervals to be removed
    for interval in intervals_to_be_removed:
        try:
            interval_set.remove(interval)
        except ValueError:
            continue
        if DEBUG:
            print("removed: " + str(interval))

        try:
            interval_keys.remove((interval[0], interval[1], frozenset(set(interval[2]))))
        except KeyError:
            continue
        if DEBUG:
            print("removed key: " + str((interval[0], interval[1], frozenset(set(interval[2])))))

    # Add all intervals to be added
    for interval in intervals_to_be_added:
        interval_set.append(interval)
        if DEBUG:
            print("added: " + str(interval))
        interval_keys.add((interval[0], interval[1], frozenset(set(interval[2]))))
        
    return interval_set, interval_keys

## Run the recursive procedure for merging superset / subsets, and creating new fragments split segments.

In [35]:
def run_procedure(dict_list, key_list, current_index, prev_part_all, prev_part_keys):
    #print("current index: ", current_index)
    if len(dict_list) == 1 or current_index == len(dict_list):
        # Is a single node leaf thing or we had 2 nodes and we are at the end
        return prev_part_all
    if current_index == len(dict_list) - 1:
        # Is the last node
        if DEBUG:
            print("last node")
            print(dict_list, current_index, prev_part_all, prev_part_keys)
        current_part_overlaps, current_part_all, current_part_keys = make_all_partial_overlap_fragments(dict_list[current_index], key_list[current_index], prev_part_all, prev_part_keys)
        current_part_reduced, current_part_reduced_keys = merge_superset_subset_fragments(current_part_all, current_part_keys)
        return current_part_reduced
    elif current_index == 0:
        # 0, 1
        # Is the first node
        if DEBUG:
            print("first node")
            print(dict_list[current_index], current_index, dict_list[current_index+1])
        current_part_overlaps, current_part_all, current_part_keys = make_all_partial_overlap_fragments(dict_list[current_index], key_list[current_index], dict_list[current_index + 1], key_list[current_index + 1])
        current_part_reduced, current_part_reduced_keys = merge_superset_subset_fragments(current_part_all, current_part_keys)
        return run_procedure(dict_list, key_list, current_index + 2, current_part_reduced, current_part_reduced_keys)
    else:
        
        # Is a middle node
        if DEBUG:
            print("middle node")
            print(dict_list[current_index], current_index, prev_part_all, prev_part_keys)
        
        current_part_overlaps, current_part_all, current_part_keys = make_all_partial_overlap_fragments(dict_list[current_index], key_list[current_index], prev_part_all, prev_part_keys)
        current_part_reduced, current_part_reduced_keys = merge_superset_subset_fragments(current_part_all, current_part_keys)
        return run_procedure(dict_list, key_list, current_index + 1, current_part_reduced, current_part_reduced_keys)

def generate_all_intersections_per_node_per_chrom(node, chrom, amp_or_loss="amp"):
    leaves = non_terminal_leaves[node]
    new_dict_list = []
    new_key_list = []
    for x in leaves:
        if amp_or_loss == "amp":
            temp_dict = copy.deepcopy(amp_only_dicts[str(x) + "-" + chrom])
        else: # loss
            temp_dict = copy.deepcopy(loss_only_dicts[str(x) + "-" + chrom])
        temp_dict.sort(key=lambda x: (x[0], x[1]))
        new_dict_list.append(temp_dict)
    for x in new_dict_list:
        temp_keys = set()
        for interval in x:
            temp_keys.add((interval[0], interval[1], frozenset(interval[2])))
        new_key_list.append(temp_keys)
    
    output = run_procedure(new_dict_list, new_key_list, 0, [], [])
    return output

In [37]:
## Some Testing

output_node = "N1"
output_chrom = '1'
current_output = generate_all_intersections_per_node_per_chrom(output_node, output_chrom, "amp")
print(output_node + "-" + output_chrom + " output")
print(current_output)

N1-1 output
[[3000001, 195471971, {'C1'}], [179520071, 179593483, {'C6', 'C15', 'C3', 'C14', 'C18', 'C1', 'C8'}], [57500001, 195471971, {'C14', 'C3', 'C15', 'C1'}], [45489721, 45569735, {'C18', 'C1'}], [57998042, 195471971, {'C18', 'C14', 'C3', 'C15', 'C1'}], [185203343, 195471971, {'C16', 'C15', 'C14', 'C3', 'C18', 'C1', 'C8'}], [109425893, 109451294, {'C18', 'C14', 'C3', 'C15', 'C1', 'C8'}], [122500001, 195471971, {'C18', 'C14', 'C3', 'C15', 'C1', 'C8'}], [54205853, 195471971, {'C3', 'C1', 'C14'}]]


## Generate Intersections for every node and chromosome

In [38]:
all_amp_intersections_per_node_per_chrom = {}
all_loss_intersections_per_node_per_chrom = {}

for node in non_terminal_leaves:
    if "O" in node:
        continue
    for chrom in th_utils.autosomes:
        all_amp_intersections_per_node_per_chrom[node + "-" + chrom] = generate_all_intersections_per_node_per_chrom(node, chrom, "amp")
        all_loss_intersections_per_node_per_chrom[node + "-" + chrom] = generate_all_intersections_per_node_per_chrom(node, chrom, "loss")
    print("Processed Node: ", node)

Processed Node:  N1
Processed Node:  N8
Processed Node:  N2
Processed Node:  N12
Processed Node:  N9
Processed Node:  N4
Processed Node:  N3
Processed Node:  N16
Processed Node:  N13
Processed Node:  N10
Processed Node:  N5
Processed Node:  N17
Processed Node:  N14
Processed Node:  N11
Processed Node:  N6
Processed Node:  N7
Processed Node:  N19
Processed Node:  N18
Processed Node:  N15
Processed Node:  N20
Processed Node:  N21
Processed Node:  N22


## Keep only ranges that meet the minimum subline threshold

In [39]:
minimum_subline_support_per_clade_size_requirement = {
            1: 1,
            2: 2,
            3: 2,
            4: 3,
            5: 4,
            7: 5,
            8: 6,
            12: 10,
            16: 13,
            23: 19
        }

removal_storage = {}

def reduce_intersections_to_min_clade_support(all_intersections):
    for node in non_terminal_leaves:
        if "O" in node:
            continue
        for chrom in th_utils.autosomes:
            to_be_removed = []
            #print("Node " + node + " Chrom " + chrom)
            for interval in all_intersections[node + "-" + chrom]:
                #print(interval)
                if len(interval[2]) >= minimum_subline_support_per_clade_size_requirement[len(non_terminal_leaves[node])]:
                    continue
                else:
                    to_be_removed.append(interval)
            for interval in to_be_removed:
                try:
                    all_intersections[node + "-" + chrom].remove(interval)
                except ValueError:
                    print("Debugging try, interval not found to remove: ", interval)
                    continue
            removal_storage.update({node + "-" + chrom: to_be_removed})

reduced_amp_intersections_per_node_per_chrom = copy.deepcopy(all_amp_intersections_per_node_per_chrom)
reduced_loss_intersections_per_node_per_chrom = copy.deepcopy(all_loss_intersections_per_node_per_chrom)
reduce_intersections_to_min_clade_support(reduced_amp_intersections_per_node_per_chrom)
reduce_intersections_to_min_clade_support(reduced_loss_intersections_per_node_per_chrom)

print("Reduced Amp Intersections")
for key, value in reduced_amp_intersections_per_node_per_chrom.items():
    print(key, ": ", value)

print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

print("Reduced Loss Intersections")
for key, value in reduced_loss_intersections_per_node_per_chrom.items():
    print(key, ": ", value)

Reduced Amp Intersections
N1-1 :  []
N1-2 :  []
N1-3 :  []
N1-4 :  []
N1-5 :  [[136082251, 142429291, {'C17', 'C6', 'C16', 'C9', 'C5', 'C18', 'C1', 'C4', 'C23', 'C11', 'C8', 'C7', 'C15', 'C12', 'C21', 'C13', 'C22', 'C19', 'C10', 'C24', 'C3', 'C20'}], [142429459, 145000001, {'C17', 'C7', 'C16', 'C4', 'C22', 'C15', 'C9', 'C8', 'C23', 'C5', 'C19', 'C10', 'C11', 'C21', 'C24', 'C3', 'C18', 'C20', 'C1', 'C13'}], [145037771, 145063695, {'C17', 'C16', 'C6', 'C9', 'C5', 'C18', 'C1', 'C4', 'C23', 'C11', 'C8', 'C7', 'C15', 'C12', 'C21', 'C13', 'C22', 'C19', 'C10', 'C24', 'C3', 'C20'}], [145067630, 150197866, {'C17', 'C7', 'C4', 'C8', 'C22', 'C15', 'C9', 'C23', 'C5', 'C19', 'C10', 'C11', 'C21', 'C24', 'C3', 'C18', 'C20', 'C1', 'C13'}], [150198707, 151182480, {'C17', 'C7', 'C6', 'C4', 'C22', 'C15', 'C9', 'C23', 'C5', 'C19', 'C10', 'C11', 'C21', 'C24', 'C3', 'C18', 'C20', 'C1', 'C13'}], [151195633, 151708600, {'C17', 'C7', 'C6', 'C4', 'C22', 'C15', 'C9', 'C23', 'C5', 'C19', 'C10', 'C11', 'C21', 'C24

In [40]:
print(removal_storage)

{'N1-1': [[128659415, 128667770, {'C17'}], [135105476, 135105612, {'C19'}], [157851541, 157892038, {'C19'}], [9769560, 9769901, {'C23'}], [179483946, 179491370, {'C6'}], [179500259, 179520071, {'C6'}], [108718745, 108750001, {'C8'}], [108970592, 108971956, {'C8'}], [109242284, 109242409, {'C8'}], [109424041, 109425893, {'C8'}]], 'N1-2': [[141439926, 141847450, {'C12'}], [154082119, 154393937, {'C12'}], [140323736, 140334287, {'C1'}], [116270479, 116277026, {'C24'}], [8538576, 8538674, {'C13'}], [112109239, 112109475, {'C16'}], [120592338, 120611137, {'C22', 'C15', 'C1'}]], 'N1-3': [[79905402, 79905484, {'C19'}], [149818100, 153505718, {'C19', 'C11', 'C16'}], [154899343, 157195261, {'C19', 'C11', 'C16'}], [157469647, 160039680, {'C19', 'C11', 'C16'}], [149638327, 149638794, {'C11', 'C12', 'C16'}], [159765647, 160039680, {'C16', 'C12', 'C19', 'C11', 'C8'}], [133982941, 133987368, {'C23'}], [65877484, 65877966, {'C9'}], [54511023, 54511115, {'C13'}], [155971345, 156229157, {'C19', 'C11', 

## Union-merge all acceptible ranges.

In [41]:
final_unionized_amp_ranges_per_node_per_chrom = {}
final_unionized_loss_ranges_per_node_per_chrom = {}

def merge_overlapping_intervals(interval_set):

    if len(interval_set) == 0 or len(interval_set) == 1:
        return interval_set
    
    for x in range(0, len(interval_set)):
        for y in range(x + 1, len(interval_set)):

            # If the two intervals being compared overlap in any way, merge them
            if interval_set[x][0] <= interval_set[y][1] and interval_set[y][0] <= interval_set[x][1]:
                # If they overlap, create a new interval and add it to the list
                new_interval = [min(interval_set[x][0], interval_set[y][0]), max(interval_set[x][1], interval_set[y][1]), set(interval_set[x][2]).union(set(interval_set[y][2]))]
                # Remove the two original intervals
                interval_set.pop(y)
                interval_set.pop(x)
                # Add the new interval
                interval_set.append(new_interval)
                # Restart the loop
                return merge_overlapping_intervals(interval_set)
            # If the two intervals do not overlap, continue to the next pair

    return interval_set

copy_of_reduced_amp_intersections = copy.deepcopy(reduced_amp_intersections_per_node_per_chrom)
copy_of_reduced_loss_intersections = copy.deepcopy(reduced_loss_intersections_per_node_per_chrom)

for node in non_terminal_leaves:
    if "O" in node:
        continue
    for chrom in th_utils.autosomes:
        final_unionized_amp_ranges_per_node_per_chrom[node + "-" + chrom] = merge_overlapping_intervals(copy_of_reduced_amp_intersections[node + "-" + chrom])
        final_unionized_loss_ranges_per_node_per_chrom[node + "-" + chrom] = merge_overlapping_intervals(copy_of_reduced_loss_intersections[node + "-" + chrom])
        print("Final Unionized Amp Ranges for Node: ", node, " Chrom: ", chrom)
        print(final_unionized_amp_ranges_per_node_per_chrom[node + "-" + chrom])
        print("Final Unionized Loss Ranges for Node: ", node, " Chrom: ", chrom)
        print(final_unionized_loss_ranges_per_node_per_chrom[node + "-" + chrom])

Final Unionized Amp Ranges for Node:  N1  Chrom:  1
[]
Final Unionized Loss Ranges for Node:  N1  Chrom:  1
[]
Final Unionized Amp Ranges for Node:  N1  Chrom:  2
[]
Final Unionized Loss Ranges for Node:  N1  Chrom:  2
[[175000001, 177500001, {'C17', 'C6', 'C16', 'C9', 'C5', 'C18', 'C1', 'C4', 'C23', 'C11', 'C14', 'C8', 'C7', 'C15', 'C12', 'C21', 'C13', 'C22', 'C19', 'C10', 'C24', 'C3', 'C20'}]]
Final Unionized Amp Ranges for Node:  N1  Chrom:  3
[]
Final Unionized Loss Ranges for Node:  N1  Chrom:  3
[]
Final Unionized Amp Ranges for Node:  N1  Chrom:  4
[]
Final Unionized Loss Ranges for Node:  N1  Chrom:  4
[[89225015, 89596445, {'C17', 'C6', 'C16', 'C9', 'C5', 'C18', 'C1', 'C4', 'C23', 'C11', 'C14', 'C8', 'C7', 'C15', 'C12', 'C21', 'C13', 'C22', 'C19', 'C10', 'C24', 'C3', 'C20'}]]
Final Unionized Amp Ranges for Node:  N1  Chrom:  5
[[111423074, 151708600, {'C17', 'C6', 'C16', 'C9', 'C5', 'C18', 'C1', 'C4', 'C23', 'C11', 'C14', 'C8', 'C7', 'C15', 'C12', 'C21', 'C13', 'C22', 'C10', '

## Build an exclusive top down placement of amp and losses (If parent has said range - don't include for child).

In [42]:
to_mod_copy_of_unionized_amp_intersections = copy.deepcopy(final_unionized_amp_ranges_per_node_per_chrom)
to_mod_copy_of_unionized_loss_intersections = copy.deepcopy(final_unionized_loss_ranges_per_node_per_chrom)

def remove_intervals_ranges_from_cur_node(intervals_to_exclude, target_node_interval_set, chrom):
    print("Intervals to exclude: ", intervals_to_exclude)
    print("Target node interval set: ", target_node_interval_set)

    for cur_to_exclude_interval in intervals_to_exclude:
        for cur_target_node_interval in target_node_interval_set:
            #print("cur_to_exclude_interval: ", cur_to_exclude_interval)
            #print("cur_target_node_interval: ", cur_target_node_interval)
            # If the two intervals being compared overlap in any way, subtract the overlap region from the target node
            
            # Check if there is an overlap
            if cur_to_exclude_interval[0] <= cur_target_node_interval[1] and cur_to_exclude_interval[1] >= cur_target_node_interval[0]:
                # If cur_to_exclude_interval starts after cur_target_node_interval starts
                if cur_to_exclude_interval[0] > cur_target_node_interval[0]:
                    new_interval = [cur_target_node_interval[0], cur_to_exclude_interval[0], set(cur_target_node_interval[2])]
                    target_node_interval_set.append(new_interval)
                # If cur_to_exclude_interval ends before cur_target_node_interval ends
                if cur_to_exclude_interval[1] < cur_target_node_interval[1]:
                    new_interval = [cur_to_exclude_interval[1], cur_target_node_interval[1], set(cur_target_node_interval[2])]
                    target_node_interval_set.append(new_interval)
                # Remove the original interval
                target_node_interval_set.remove(cur_target_node_interval)

    return target_node_interval_set

# Traverse to each node, using its non_terminal_paths to get the path to the root
def mod_option_exclusive_range_placement(amp_or_loss="amp"):
    for target_node, path_to_target in non_terminal_paths.items():
        for chrom in th_utils.autosomes:
            #print("target node: ", target_node)
            #print("chrom: ", chrom)
            #print("path to target: ", path_to_target)
            if target_node == "N1":
                if amp_or_loss == "amp":
                    final_mod_option_amp_intersections_per_node_per_chrom[target_node + "-" + chrom] = to_mod_copy_of_unionized_amp_intersections[target_node + "-" + chrom]
                else:
                    final_mod_option_loss_intersections_per_node_per_chrom[target_node + "-" + chrom] = to_mod_copy_of_unionized_loss_intersections[target_node + "-" + chrom]
                continue
            else:
                intervals_to_exclude = []
                target_node_interval_set = []

                # Get the existing approved intervals for the target node
                if amp_or_loss == "amp":
                    if "O" in target_node:
                        target_node_interval_set = amp_only_dicts["C" + target_node[1:] + "-" + chrom]
                    else:
                        target_node_interval_set = to_mod_copy_of_unionized_amp_intersections[target_node + "-" + chrom]
                else:
                    if "O" in target_node:
                        target_node_interval_set = loss_only_dicts["C" + target_node[1:] + "-" + chrom]
                    else:
                        target_node_interval_set = to_mod_copy_of_unionized_loss_intersections[target_node + "-" + chrom]

                # Go through the path to the target node, and get the intervals to exclude
                for node in path_to_target:
                    ## We can do this in this simple way because the non_terminal_paths are already a form of a level order traversal!
                    ## When converting to ete4 verify that this remains the case.
                    ## Only the target node should not be in mod_option_output_intersections_per_node_per_chrom
                    if amp_or_loss == "amp":
                        if node + "-" + chrom in final_mod_option_amp_intersections_per_node_per_chrom:
                            for interval in final_mod_option_amp_intersections_per_node_per_chrom[node + "-" + chrom]:
                                intervals_to_exclude.append(interval)
                    else:
                        if node + "-" + chrom in final_mod_option_loss_intersections_per_node_per_chrom:
                            for interval in final_mod_option_loss_intersections_per_node_per_chrom[node + "-" + chrom]:
                                intervals_to_exclude.append(interval)
                   
                # Go through the intervals to exclude, and remove them from the target node
                reduced_target_node_intervals = remove_intervals_ranges_from_cur_node(copy.deepcopy(intervals_to_exclude), copy.deepcopy(target_node_interval_set), chrom)
                    
                if amp_or_loss == "amp":
                    final_mod_option_amp_intersections_per_node_per_chrom[target_node + "-" + chrom] = reduced_target_node_intervals
                else:
                    final_mod_option_loss_intersections_per_node_per_chrom[target_node + "-" + chrom] = reduced_target_node_intervals

            print("Finished processing target node: ", target_node, " chrom: ", chrom)

final_mod_option_amp_intersections_per_node_per_chrom = {}
final_mod_option_loss_intersections_per_node_per_chrom = {}

# Might have to pass in these dicts are arguments to the function? Not sure.

mod_option_exclusive_range_placement("amp")
mod_option_exclusive_range_placement("loss")

Intervals to exclude:  []
Target node interval set:  []
Finished processing target node:  N8  chrom:  1
Intervals to exclude:  []
Target node interval set:  []
Finished processing target node:  N8  chrom:  2
Intervals to exclude:  []
Target node interval set:  []
Finished processing target node:  N8  chrom:  3
Intervals to exclude:  []
Target node interval set:  []
Finished processing target node:  N8  chrom:  4
Intervals to exclude:  [[111423074, 151708600, {'C17', 'C6', 'C16', 'C9', 'C5', 'C18', 'C1', 'C4', 'C23', 'C11', 'C14', 'C8', 'C7', 'C15', 'C12', 'C21', 'C13', 'C22', 'C10', 'C19', 'C24', 'C3', 'C20'}]]
Target node interval set:  [[111423074, 151834684, {'C7', 'C4', 'C6', 'C16', 'C15', 'C9', 'C22', 'C13', 'C5', 'C11', 'C21', 'C24', 'C18', 'C20', 'C1', 'C8'}]]
Finished processing target node:  N8  chrom:  5
Intervals to exclude:  [[3000001, 149736546, {'C17', 'C6', 'C16', 'C9', 'C5', 'C18', 'C1', 'C4', 'C23', 'C11', 'C14', 'C8', 'C7', 'C15', 'C12', 'C21', 'C13', 'C22', 'C19', 'C

## Debug Printing

In [43]:
for key, value in final_mod_option_amp_intersections_per_node_per_chrom.items():
    print(key, ": ", value)

print(final_mod_option_amp_intersections_per_node_per_chrom == final_unionized_amp_ranges_per_node_per_chrom)
print(final_mod_option_loss_intersections_per_node_per_chrom == final_unionized_loss_ranges_per_node_per_chrom)

N1-1 :  []
N1-2 :  []
N1-3 :  []
N1-4 :  []
N1-5 :  [[111423074, 151708600, {'C17', 'C16', 'C6', 'C9', 'C5', 'C18', 'C1', 'C4', 'C23', 'C11', 'C14', 'C8', 'C7', 'C15', 'C12', 'C21', 'C13', 'C22', 'C10', 'C19', 'C24', 'C3', 'C20'}]]
N1-6 :  [[3000001, 149736546, {'C17', 'C16', 'C6', 'C9', 'C5', 'C18', 'C1', 'C4', 'C23', 'C11', 'C14', 'C8', 'C7', 'C15', 'C12', 'C21', 'C13', 'C22', 'C19', 'C10', 'C24', 'C3', 'C20'}]]
N1-7 :  []
N1-8 :  [[3000001, 112209700, {'C17', 'C16', 'C6', 'C9', 'C5', 'C18', 'C1', 'C4', 'C23', 'C11', 'C14', 'C8', 'C7', 'C15', 'C12', 'C21', 'C13', 'C22', 'C10', 'C24', 'C3', 'C20'}]]
N1-9 :  []
N1-10 :  [[109892326, 110028101, {'C17', 'C7', 'C6', 'C4', 'C22', 'C15', 'C9', 'C13', 'C12', 'C23', 'C5', 'C11', 'C19', 'C21', 'C14', 'C3', 'C18', 'C20', 'C1', 'C8'}], [110041131, 117324713, {'C17', 'C16', 'C6', 'C9', 'C5', 'C18', 'C1', 'C4', 'C23', 'C11', 'C14', 'C8', 'C7', 'C15', 'C12', 'C21', 'C13', 'C22', 'C19', 'C10', 'C24', 'C3', 'C20'}], [117329072, 130694993, {'C17', 'C1

## Print final amplification percentages per chrom and total per node

* In order to write percentages to files, change `write_node_total` and/or `write_per_chrom` to `True`

In [None]:
mouse_chrom_sizes = {'1': 195471971, '2': 182113224, '3': 160039680, '4': 156508116, '5': 151834684, '6': 149736546, '7': 145441459, '8': 129401213, '9': 124595110, '10': 130694993, '11': 122082543, '12': 120129022, '13': 120421639, '14': 124902244, '15': 104043685, '16': 98207768, '17': 94987271, '18': 90702639, '19': 61431566}

debug_printing = True
debug_detailed_printing = True
write_node_total=False
write_per_chrom=False
write_output_directory_path = "/data/KolmogorovLab/agoretsky/Latest_Variant_Calls_01_13_25/simple_cna_placement_percentages/"

total_amplification_percentages_nodes = {}

for node in non_terminal_leaves:
    total_amplified = 0
    per_chrom_dict = {}
    for chrom in th_utils.autosomes:
        chrom_amplified = 0
        amplified_intervals = final_mod_option_amp_intersections_per_node_per_chrom[node + "-" + chrom]
        for amp_interval in amplified_intervals:
            chrom_amplified += amp_interval[1] - amp_interval[0]
            total_amplified += amp_interval[1] - amp_interval[0]
        if debug_printing and debug_detailed_printing:
            print("Node: ", node, "\tChrom: ", chrom, "\tPercentage of Chromosome: ", chrom_amplified / mouse_chrom_sizes[chrom] * 100, "%")
            per_chrom_dict.update({chrom: (str(chrom_amplified / mouse_chrom_sizes[chrom] * 100) + "%", str(chrom_amplified))})
    if debug_printing:
        print("~~~~~~~~~~~")
        print("Percentage of Mouse Genome Amplified for Node: ", node, "is: ", total_amplified / sum(mouse_chrom_sizes.values()) * 100, "%")
        # Update with percentage and with the exact amount amplified in a tuple
        total_amplification_percentages_nodes.update({str(node): (str(total_amplified / sum(mouse_chrom_sizes.values()) * 100) + "%", str(total_amplified))})
        print("Total Amplified: ", total_amplified)
        print("Total Mouse Genome Length: ", sum(mouse_chrom_sizes.values()))
        print("----------END OF NODE----------")
    if write_per_chrom:
        per_chrom_df = pd.DataFrame.from_dict(per_chrom_dict, orient='index', columns=['Chromosome Amplification Percentage', 'Chromosome Amplified (bp)'])
        per_chrom_df.index.name = 'Chromosome'
        per_chrom_df.to_csv(write_output_directory_path + str(node) + "_exclusive_per_chrom_amplification_percentages" + ".csv", index=True)

if write_node_total:
    total_amplification_percentages_nodes_df = pd.DataFrame.from_dict(total_amplification_percentages_nodes, orient='index', columns=['Total Amplification Percentage', 'Total Amplified (bp)'])
    total_amplification_percentages_nodes_df.index.name = 'Node'
    total_amplification_percentages_nodes_df.to_csv(write_output_directory_path + "ALL_NODE_exclusive_total_amplification_percentages.csv", index=True)

Node:  N1 	Chrom:  1 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  2 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  3 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  4 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  5 	Percentage of Chromosome:  26.532492404699838 %
Node:  N1 	Chrom:  6 	Percentage of Chromosome:  97.99648043170436 %
Node:  N1 	Chrom:  7 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  8 	Percentage of Chromosome:  84.39619418405297 %
Node:  N1 	Chrom:  9 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  10 	Percentage of Chromosome:  34.174200537276896 %
Node:  N1 	Chrom:  11 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  12 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  13 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  14 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  15 	Percentage of Chromosome:  0.19186556108619182 %
Node:  N1 	Chrom:  16 	Percentage of Chromosome:  0.013878739205232726 %
Node:  N1 	Chrom:  17 

## Print final loss percentages per chrom and total per node

* In order to write percentages to files, change `write_node_total` and/or `write_per_chrom` to `True`

In [45]:
mouse_chrom_sizes = {'1': 195471971, '2': 182113224, '3': 160039680, '4': 156508116, '5': 151834684, '6': 149736546, '7': 145441459, '8': 129401213, '9': 124595110, '10': 130694993, '11': 122082543, '12': 120129022, '13': 120421639, '14': 124902244, '15': 104043685, '16': 98207768, '17': 94987271, '18': 90702639, '19': 61431566}

debug_printing = True
debug_detailed_printing = True
write_node_total=False
write_per_chrom=False
write_output_directory_path = "/data/KolmogorovLab/agoretsky/Latest_Variant_Calls_01_13_25/simple_cna_placement_percentages/"

total_loss_percentages_nodes = {}

for node in non_terminal_leaves:
    total_loss = 0
    #if "O" in node:
    #    continue
    per_chrom_dict = {}
    for chrom in th_utils.autosomes:
        chrom_loss = 0
        loss_intervals = final_mod_option_loss_intersections_per_node_per_chrom[node + "-" + chrom]
        for loss_interval in loss_intervals:
            chrom_loss += loss_interval[1] - loss_interval[0]
            total_loss += loss_interval[1] - loss_interval[0]
        if debug_printing and debug_detailed_printing:
            print("Node: ", node, "\tChrom: ", chrom, "\tPercentage of Chromosome: ", chrom_loss / mouse_chrom_sizes[chrom] * 100, "%")
            per_chrom_dict.update({chrom: (str(chrom_loss / mouse_chrom_sizes[chrom] * 100) + "%", str(chrom_loss))})
    if debug_printing:
        print("~~~~~~~~~~~")
        print("Percentage of Mouse Genome loss for Node: ", node, "is: ", total_loss / sum(mouse_chrom_sizes.values()) * 100, "%")
        # Update with percentage and with the exact amount loss in a tuple
        total_loss_percentages_nodes.update({str(node): (str(total_loss / sum(mouse_chrom_sizes.values()) * 100) + "%", str(total_loss))})
        print("Total loss: ", total_loss)
        print("Total Mouse Genome Length: ", sum(mouse_chrom_sizes.values()))
        print("----------END OF NODE----------")
    if write_per_chrom:
        per_chrom_df = pd.DataFrame.from_dict(per_chrom_dict, orient='index', columns=['Chromosome loss Percentage', 'Chromosome Loss (bp)'])
        per_chrom_df.index.name = 'Chromosome'
        per_chrom_df.to_csv(write_output_directory_path + str(node) + "_exclusive_per_chrom_loss_percentages" + ".csv", index=True)

if write_node_total:
    total_loss_percentages_nodes_df = pd.DataFrame.from_dict(total_loss_percentages_nodes, orient='index', columns=['Total loss Percentage', 'Total Loss (bp)'])
    total_loss_percentages_nodes_df.index.name = 'Node'
    total_loss_percentages_nodes_df.to_csv(write_output_directory_path + "ALL_NODE_exclusive_total_loss_percentages.csv", index=True)

Node:  N1 	Chrom:  1 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  2 	Percentage of Chromosome:  1.372772358365365 %
Node:  N1 	Chrom:  3 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  4 	Percentage of Chromosome:  0.2373231558164051 %
Node:  N1 	Chrom:  5 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  6 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  7 	Percentage of Chromosome:  2.5783569731654024 %
Node:  N1 	Chrom:  8 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  9 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  10 	Percentage of Chromosome:  0.04385707415738566 %
Node:  N1 	Chrom:  11 	Percentage of Chromosome:  0.01653389543171623 %
Node:  N1 	Chrom:  12 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  13 	Percentage of Chromosome:  0.013102296340610347 %
Node:  N1 	Chrom:  14 	Percentage of Chromosome:  0.005417036382468837 %
Node:  N1 	Chrom:  15 	Percentage of Chromosome:  0.0 %
Node:  N1 	Chrom:  16 	Percentage of Chromosome:  0.0 %
Nod

## Get total span of unplaced CNA regions (Debug)

In [46]:
# Get total amplifications from original input:
total_amplified_by_subline = {}
total_amplified_original = 0
total_loss_by_subline = {}
total_loss_original = 0
for key, value in amp_only_dicts.items():
    subline = key.split("-")[0]
    if subline not in total_amplified_by_subline:
        total_amplified_by_subline.update({subline: 0})
    for interval in value:
        total_amplified_by_subline[subline] += interval[1] - interval[0]
        total_amplified_original += interval[1] - interval[0]

for key, value in loss_only_dicts.items():
    subline = key.split("-")[0]
    if subline not in total_loss_by_subline:
        total_loss_by_subline.update({subline: 0})
    for interval in value:
        total_loss_by_subline[subline] += interval[1] - interval[0]
        total_loss_original += interval[1] - interval[0]

# Get percentage amplified by subline
for key, value in total_amplified_by_subline.items():
    total_amplified_by_subline[key] = str(value / sum(mouse_chrom_sizes.values()) * 100) + "%"
for key, value in total_loss_by_subline.items():
    total_loss_by_subline[key] = str(value / sum(mouse_chrom_sizes.values()) * 100) + "%"

# Get average percentage amplified per subline
total_amplified_by_subline_avg = 0
total_loss_by_subline_avg = 0
for key, value in total_amplified_by_subline.items():
    total_amplified_by_subline_avg += float(value[:-1])
for key, value in total_loss_by_subline.items():
    total_loss_by_subline_avg += float(value[:-1])
total_amplified_by_subline_avg = str(total_amplified_by_subline_avg / len(total_amplified_by_subline)) + "%"
total_loss_by_subline_avg = str(total_loss_by_subline_avg / len(total_loss_by_subline)) + "%"
print("Total Amplified by Subline: ", total_amplified_by_subline)
print("Total Loss by Subline: ", total_loss_by_subline)
print("Total Amplified by Subline Average: ", total_amplified_by_subline_avg)
print("Total Loss by Subline Average: ", total_loss_by_subline_avg)
        
# Get total amplifications from final output:
total_amplified_final = 0
total_loss_final = 0
for x in final_mod_option_amp_intersections_per_node_per_chrom:
    for interval in final_mod_option_amp_intersections_per_node_per_chrom[x]:
        total_amplified_final += interval[1] - interval[0]
for x in final_mod_option_loss_intersections_per_node_per_chrom:
    for interval in final_mod_option_loss_intersections_per_node_per_chrom[x]:
        total_loss_final += interval[1] - interval[0]

print("Total Amplified Original: ", total_amplified_original)
print("Total Amplified Final: ", total_amplified_final)
print("Total Loss Original: ", total_loss_original)
print("Total Loss Final: ", total_loss_final)

Total Amplified by Subline:  {'C1': '40.05202311266306%', 'C10': '20.22126868899004%', 'C11': '19.204166341560313%', 'C12': '13.801996898523866%', 'C13': '24.6742103614136%', 'C14': '24.71611481533377%', 'C15': '41.14709572129201%', 'C16': '25.146979293502465%', 'C17': '20.237412704703516%', 'C18': '41.108735563950646%', 'C19': '9.141638330468197%', 'C20': '29.880120375725095%', 'C21': '23.072947379363526%', 'C22': '35.25676972208852%', 'C23': '24.238902996002096%', 'C24': '22.701883683530934%', 'C3': '36.42019072834128%', 'C4': '32.312447633619%', 'C5': '20.189736034071114%', 'C6': '20.597951845182493%', 'C7': '22.22268635645915%', 'C8': '31.55754299736126%', 'C9': '28.565322128411523%'}
Total Loss by Subline:  {'C1': '0.45823194406220896%', 'C10': '0.6279908661917523%', 'C11': '1.250503537135262%', 'C12': '0.9633250867141107%', 'C13': '2.550325530547652%', 'C14': '0.47485445016812133%', 'C15': '0.6119814563549685%', 'C16': '1.4135627410637714%', 'C17': '0.43460700880179864%', 'C18': 