In [1]:
import pandas as pd

In [2]:
sv_solve37 = pd.read_csv('Bionano_hg38_masks/hg38_DLE1_gap_common_segdup_min10_com10kb_seg50kb_solve3.7.bed', sep='\t', header=None)
cnv_mask = pd.read_csv('Bionano_hg38_masks/hg38_cnv_masks.bed', sep='\t')
superdup = pd.read_csv('Bionano_hg38_masks/genomicSuperDups.bed', sep='\t', header=None)
ATC = pd.read_csv('acrocentric_telo_cen.bed', sep='\t')

# preprocess SV masks
we drop the following columns
- seems like column 1 == 6, 2 == 7 are duplicate info 
- seems like col 5 == '.' is useless info
- seems like col 8 is a color info for display
- seems like col 4 is an ID



In [3]:
sv_solve37 = sv_solve37.drop(columns=[4, 5, 6, 7, 8])
sv_solve37.columns = ['Chr', 'StartPos', 'EndPos', 'Type']

In [4]:
sv_solve37

Unnamed: 0,Chr,StartPos,EndPos,Type
0,1,1,10000,gap
1,1,207667,257666,gap
2,1,297969,347968,gap
3,1,535989,585988,gap
4,1,2702782,2746290,gap
...,...,...,...,...
2315,24,25545548,26311622,segdupe
2316,24,25566691,25686389,segdupe
2317,24,25600000,25686389,segdupe
2318,24,25967291,26135950,segdupe


# preprocess CNV mask

In [5]:
cnv_mask['StartPos'] = cnv_mask['StartPos'].apply(int)
cnv_mask['EndPos'] = cnv_mask['EndPos'].apply(int)
cnv_mask['Type'] = 'cnv'

In [6]:
cnv_mask

Unnamed: 0,Chr,StartPos,EndPos,Type
0,1,14453,1674800,cnv
1,1,1899027,3864922,cnv
2,1,4085621,4285615,cnv
3,1,4920445,5092120,cnv
4,1,5330209,5493329,cnv
...,...,...,...,...
382,24,15800000,16122374,cnv
383,24,16155235,20893633,cnv
384,24,20916193,21144650,cnv
385,24,21180878,22737164,cnv


# preprocess Superdups
we drop the following columns
- seems like column 1 == 6, 2 == 7 are duplicate info 
- seems like col 4 == '+' is useless info
- seems like col 8 is a color info for display
- seems like col 3 is an ID


In [7]:
superdup = superdup.drop(columns=[3,4,5,6,7,8])
superdup.columns = ['Chr', 'StartPos', 'EndPos']
superdup['Type'] = 'superdup'

In [8]:
superdup

Unnamed: 0,Chr,StartPos,EndPos,Type
0,1,10000,87112,superdup
1,1,10000,20818,superdup
2,1,10000,19844,superdup
3,1,10169,37148,superdup
4,1,10464,40733,superdup
...,...,...,...,...
64211,24,56862674,56866758,superdup
64212,24,56862674,56866758,superdup
64213,24,56866349,56867465,superdup
64214,24,56878710,56879827,superdup


# Preprocess acrocentric-telo-cen
- Bionano's masking regions are 1-based indexed

In [9]:
type_mapping = {"telomere1": 'T1', 'telomere2': 'T2', 'centromere': 'CEN', 'acrocentric-telomere1': 'AT1', 'acrocentric-centromere': 'ACEN', 'acrocentric': 'AR'}
ATC['Type'] = ATC['Type'].map(type_mapping)
ATC['Chr'] = ATC['Chr'].apply(lambda x: x.replace('Chr', ''))
ATC['Chr'] = ATC['Chr'].apply(lambda x: x.replace('X', '23').replace('Y', '24'))
ATC['Chr'] = ATC['Chr'].apply(int)
ATC['StartPos'] = ATC['StartPos'] + 1
ATC['EndPos'] = ATC['EndPos'] + 1

In [10]:
ATC

Unnamed: 0,Chr,StartPos,EndPos,Type
0,1,1,10000,T1
1,1,122026461,125184588,CEN
2,1,248946423,248956422,T2
3,2,1,10000,T1
4,2,92188147,94090558,CEN
...,...,...,...,...
69,23,58605581,62412543,CEN
70,23,156030896,156040895,T2
71,24,1,10000,T1
72,24,10316946,10544040,CEN


# Find identical overlaps
- segdup is a subgroup of superdup

In [11]:
# Merge all DataFrames
merged_df = pd.concat([sv_solve37, cnv_mask, superdup, ATC], ignore_index=True)
merged_df = merged_df.sort_values(by=['Chr', 'StartPos', 'EndPos'])

In [12]:
# merge identical overlaps
temp_dict = {}
for idx, row in merged_df.iterrows():
    key = (row['Chr'], row['StartPos'], row['EndPos'])
    if key not in temp_dict:
        temp_dict[key] = {
                'Chr': row['Chr'],
                'StartPos': row['StartPos'],
                'EndPos': row['EndPos'],
                'Type': [row['Type']]
        }
    else:
        temp_dict[key]['Type'].append(row['Type'])
temp_dict2 = {'Chr': [], 'StartPos': [], 'EndPos': [], 'Type': []}
for key, val in temp_dict.items():
    temp_dict2['Chr'].append(val['Chr'])
    temp_dict2['StartPos'].append(val['StartPos'])
    temp_dict2['EndPos'].append(val['EndPos'])
    temp_dict2['Type'].append(val['Type'])
new_df = pd.DataFrame.from_dict(temp_dict2)
merged_df = new_df
merged_df['Type'] = merged_df['Type'].apply(set)

In [13]:
temp_df = merged_df.copy(deep=True)
temp_df['Type'] = temp_df['Type'].apply(str)
temp_df['Type'].unique()

array(["{'T1', 'gap'}", "{'common'}", "{'superdup'}",
       "{'superdup', 'segdupe'}", "{'cnv'}", "{'gap'}", "{'CEN'}",
       "{'T2', 'gap'}", "{'T2'}", "{'T1'}", "{'AT1'}", "{'ACEN'}",
       "{'AR'}"], dtype=object)

In [14]:
temp_df[(temp_df['Type'] == "{'T1', 'gap'}") | (temp_df['Type'] == "{'T1'}") | (temp_df['Type'] == "{'gap'}")]

Unnamed: 0,Chr,StartPos,EndPos,Type
0,1,1,10000,"{'T1', 'gap'}"
99,1,207667,257666,{'gap'}
133,1,297969,347968,{'gap'}
227,1,535989,585988,{'gap'}
401,1,2702782,2746290,{'gap'}
...,...,...,...,...
49456,24,11592903,11642902,{'gap'}
49567,24,20207794,20257793,{'gap'}
49614,24,21789282,21805281,{'gap'}
50043,24,26673215,56673214,{'gap'}


# Find all overlaps

In [15]:
merged_df = pd.concat([sv_solve37, cnv_mask, superdup, ATC], ignore_index=True)
merged_df = merged_df.sort_values(by=['Chr', 'StartPos', 'EndPos'])

In [16]:
starts = merged_df['StartPos'].unique()
ends = merged_df['EndPos'].unique()

In [17]:
def val_in_between(start_i, end_i):
    starts_in_between = []
    ends_in_between = []
    for start_itr in starts:
        if start_i < start_itr <= end_i:
            starts_in_between.append(start_itr)
    for end_itr in ends:
        if start_i <= end_itr < end_i:
            ends_in_between.append(end_itr)
    output = [(x, 's') for x in starts_in_between] + [(y, 't') for y in ends_in_between]
    output = sorted(output, key=lambda x: x[0])
    return output

In [18]:
temp_dict = {}
print(f"total rows: {merged_df.shape[0]}")
c = 0
for idx, row in merged_df.iterrows():
    if c % 1000 == 0:
        print(f"current row: {c}")
    c += 1
    c_chr = row['Chr']
    c_type = row['Type']
    c_ranges = []
    breakpoints = val_in_between(row['StartPos'], row['EndPos'])
    new_start = row['StartPos']
    for (bp_pos, bp_type) in breakpoints:
        if bp_type == 's':
            c_ranges.append((new_start, bp_pos - 1))
            new_start = bp_pos
        else:
            c_ranges.append((new_start, bp_pos))
            new_start = bp_pos + 1
    if new_start <= row['EndPos']:
        c_ranges.append((new_start, row['EndPos']))
    for range_itr in c_ranges:
        c_key = (c_chr, range_itr[0], range_itr[1])
        if c_key not in temp_dict:
            temp_dict[c_key] = {c_type}
        else:
            temp_dict[c_key].add(c_type)

temp_dict2 = {'Chr': [], 'StartPos': [], 'EndPos': [], 'Type': []}
for key, val in temp_dict.items():
    temp_dict2['Chr'].append(key[0])
    temp_dict2['StartPos'].append(key[1])
    temp_dict2['EndPos'].append(key[2])
    temp_dict2['Type'].append(val)
merged_df = pd.DataFrame.from_dict(temp_dict2)

total rows: 66997
current row: 0
current row: 1000
current row: 2000
current row: 3000
current row: 4000
current row: 5000
current row: 6000
current row: 7000
current row: 8000
current row: 9000
current row: 10000
current row: 11000
current row: 12000
current row: 13000
current row: 14000
current row: 15000
current row: 16000
current row: 17000
current row: 18000
current row: 19000
current row: 20000
current row: 21000
current row: 22000
current row: 23000
current row: 24000
current row: 25000
current row: 26000
current row: 27000
current row: 28000
current row: 29000
current row: 30000
current row: 31000
current row: 32000
current row: 33000
current row: 34000
current row: 35000
current row: 36000
current row: 37000
current row: 38000
current row: 39000
current row: 40000
current row: 41000
current row: 42000
current row: 43000
current row: 44000
current row: 45000
current row: 46000
current row: 47000
current row: 48000
current row: 49000
current row: 50000
current row: 51000
current

In [21]:
temp_df = merged_df.copy(deep=True)
temp_df['Type'] = temp_df['Type'].apply(str)
all_combinations = temp_df['Type'].unique()

In [22]:
[x for x in all_combinations if ('gap' in x) and (('T1' not in x) and ('T2' not in x))]

["{'cnv', 'common', 'gap'}",
 "{'cnv', 'gap'}",
 "{'superdup', 'cnv', 'gap'}",
 "{'superdup', 'cnv', 'gap', 'segdupe'}",
 "{'superdup', 'segdupe', 'cnv', 'common', 'gap'}",
 "{'CEN', 'cnv', 'gap'}",
 "{'gap'}",
 "{'superdup', 'cnv', 'common', 'gap'}",
 "{'cnv', 'ACEN', 'gap'}",
 "{'superdup', 'gap'}",
 "{'common', 'gap'}",
 "{'AR', 'cnv', 'gap'}",
 "{'AR', 'cnv', 'common', 'gap'}",
 "{'AR', 'superdup', 'cnv', 'gap'}",
 "{'superdup', 'AR', 'segdupe', 'cnv', 'common', 'gap'}",
 "{'superdup', 'AR', 'segdupe', 'cnv', 'gap'}",
 "{'superdup', 'AR', 'cnv', 'common', 'gap'}"]

In [27]:
merged_df['length'] = merged_df['EndPos'] - merged_df['StartPos'] + 1

In [33]:
merged_df

Unnamed: 0,Chr,StartPos,EndPos,Type,length
0,1,1,485,"{'T1', 'gap'}",485
1,1,486,4451,"{'T1', 'gap'}",3966
2,1,4452,9999,"{'common', 'T1', 'gap'}",5548
3,1,10000,10000,"{'superdup', 'segdupe', 'common', 'T1', 'gap'}",1
4,1,10001,10000,"{'superdup', 'common', 'segdupe'}",0
...,...,...,...,...,...
321393,24,57210167,57210183,{'cnv'},17
321394,24,57210184,57211291,{'cnv'},1108
321395,24,57211292,57212320,{'cnv'},1029
321396,24,57212321,57217415,{'cnv'},5095


In [32]:
merged_df['Type'] = merged_df['Type'].apply(str)

In [34]:
merged_df[merged_df['Type'].str.contains('common', na=False)]

Unnamed: 0,Chr,StartPos,EndPos,Type,length
2,1,4452,9999,"{'common', 'T1', 'gap'}",5548
3,1,10000,10000,"{'superdup', 'segdupe', 'common', 'T1', 'gap'}",1
4,1,10001,10000,"{'superdup', 'common', 'segdupe'}",0
5,1,10001,10026,"{'superdup', 'common', 'segdupe'}",26
6,1,10027,10032,"{'superdup', 'common', 'segdupe'}",6
...,...,...,...,...,...
301600,24,24991867,24993800,"{'superdup', 'cnv', 'common', 'segdupe'}",1934
301601,24,24993801,24994352,"{'superdup', 'cnv', 'common', 'segdupe'}",552
301602,24,24994353,24996258,"{'superdup', 'cnv', 'common', 'segdupe'}",1906
301603,24,24996259,24998567,"{'superdup', 'cnv', 'common', 'segdupe'}",2309


In [38]:
merged_df[merged_df['Type'].str.contains('common', na=False)]['length'].sum()

26158264

In [39]:
merged_df[merged_df['Type'].str.contains('segdupe', na=False)]['length'].sum()

88295249

In [41]:

mask1 = merged_df['Type'].str.contains('gap', na=False)
mask2 = merged_df['Type'].str.contains('common', na=False)
mask3 = merged_df['Type'].str.contains('segdupe', na=False)
merged_df[mask1 & mask2 & mask3]['length'].sum()

23

In [42]:
merged_df[mask1 & mask2]['length'].sum()

542039

In [43]:
merged_df[mask2 & mask3]['length'].sum()

21387054

In [44]:
merged_df[mask1 & mask3]['length'].sum()

54

In [45]:
merged_df[mask1]['length'].sum()

85960835

In [46]:
merged_df[mask2]['length'].sum()

26158264

In [47]:
merged_df[mask3]['length'].sum()

88295249

In [48]:
merged_df[mask1 | mask2 | mask3]['length'].sum()

178485224

In [35]:
sum_region = 0
for idx, row in merged_df.iterrows():
    if 'common' in merged_df['Type']:
        sum_region += row['length']

In [36]:
print(sum_region)

0


In [None]:
merged_df['length'] = merged_df['EndPos'] - merged_df['StartPos']
sum_region = 0
for idx, row in merged_df.iterrows():
    if 'gap' in merged_df['Type']:
        sum_region += row['length']