In [1]:
import pandas as pd

In [14]:
gap_df = pd.read_csv('hg19_gap.txt', sep='\t')

types_of_interest = ['centromere', 'telomere', 'short_arm']
gap_df = gap_df[gap_df['type'].isin(types_of_interest)]
gap_df['chromStart'] = gap_df['chromStart'].astype(int)
gap_df['chromEnd'] = gap_df['chromEnd'].astype(int)
gap_df['size'] = gap_df['size'].astype(int)

In [15]:
gap_df

Unnamed: 0,#bin,chrom,chromStart,chromEnd,ix,n,size,type,bridge
1,23,chr1,121535434,124535434,1270.0,N,3000000,centromere,no
19,585,chr1,0,10000,1.0,N,10000,telomere,no
40,2486,chr1,249240621,249250621,2302.0,N,10000,telomere,no
42,20,chr2,92326171,95326171,770.0,N,3000000,centromere,no
49,585,chr2,0,10000,1.0,N,10000,telomere,no
...,...,...,...,...,...,...,...,...,...
347,952,chr21,48119895,48129895,515.0,N,10000,telomere,no
348,1,chr22,10000,13000000,2.0,N,12990000,short_arm,no
350,10,chr22,13000000,16000000,3.0,N,3000000,centromere,no
353,585,chr22,0,10000,1.0,N,10000,telomere,no


In [25]:
## build indices
chroms = gap_df['chrom'].unique()
ixs = {}
for chrom in chroms:
    chrom_df = gap_df[gap_df['chrom'] == chrom]
    telo1_end = -1
    telo2_start = -1
    ref_len = -1
    cen_start = -1
    cen_end = -1
    for i, row in chrom_df.iterrows():
        if row['type'] == 'centromere':
            cen_start = row['chromStart']
            cen_end = row['chromEnd']
        elif row['type'] == 'telomere':
            if row['chromStart'] == 0:
                if telo1_end == -1:
                    ## do not overwrite if already set by short_arm
                    telo1_end = row['chromEnd']
            else:
                telo2_start = row['chromStart']
                ref_len = row['chromEnd'] + 1
        elif row['type'] == 'short_arm':
            ## overwrites telo1 if short_arm type exists
            telo1_end = row['chromEnd']
    ixs[chrom] = {'ref_len': ref_len,
                  'telo1_end': telo1_end,
                  'cen_start': cen_start,
                  'cen_end': cen_end,
                  'telo2_start': telo2_start}
parsed_index_df = pd.DataFrame.from_dict(ixs, orient='index')
parsed_index_df.index = parsed_index_df.index.map(str.capitalize)  # capitalize the first char in ref chr name

In [26]:
parsed_index_df

Unnamed: 0,ref_len,telo1_end,cen_start,cen_end,telo2_start
Chr1,249250622,10000,121535434,124535434,249240621
Chr2,243199374,10000,92326171,95326171,243189373
Chr3,198022431,10000,90504854,93504854,198012430
Chr4,191154277,10000,49660117,52660117,191144276
Chr5,180915261,10000,46405641,49405641,180905260
Chr6,171115068,10000,58830166,61830166,171105067
Chr7,159138664,10000,58054331,61054331,159128663
Chr8,146364023,10000,43838887,46838887,146354022
Chr9,141213432,10000,47367679,50367679,141203431
Chrx,155270561,10000,58632012,61632012,155260560


In [27]:
parsed_index_df.to_csv('hg19_index.txt', sep='\t', index=True, header=False)

For hg19, manually input the chr17 telo1 and telo2 coordinates