In [15]:
import pandas as pd
from collections import defaultdict

In [16]:
# Cytoband annotation in BED layout, read as tab-separated text with no header row.
bed_fp = "CHM13v2_cytoBandMapped.bed"
# Band intervals are half-open: [start, end)
column_names = ["chromosome", "start", "end", "gband", "stain"]
# header=None is what pandas already assumes when `names` is given; it is spelled
# out so the first line of the file is visibly treated as data.
bed_df = pd.read_csv(bed_fp, sep="\t", header=None, names=column_names)

In [17]:
## collect chrom length
# A chromosome's length is taken to be the largest band `end` recorded for it
# (assumes the last band of each chromosome runs up to the chromosome end).
# A vectorised groupby/max replaces the former row-by-row iterrows() scan;
# sort=False keeps chromosomes in order of first appearance, as the loop did.
max_end_by_chrom = bed_df.groupby('chromosome', sort=False)['end'].max()

# Still a defaultdict, so looking up an unseen chromosome yields -1 as before.
chrom_len = defaultdict(lambda: -1)
chrom_len.update(
    (chrom, int(max_end)) for chrom, max_end in max_end_by_chrom.items()
)

In [18]:
# Show the per-chromosome lengths collected from the band ends.
chrom_len

defaultdict(<function __main__.<lambda>()>,
            {'chr1': 248387328,
             'chr10': 134758134,
             'chr11': 135127769,
             'chr12': 133324548,
             'chr13': 113566686,
             'chr14': 101161492,
             'chr15': 99753195,
             'chr16': 96330374,
             'chr17': 84276897,
             'chr18': 80542538,
             'chr19': 61707364,
             'chr2': 242696752,
             'chr20': 66210255,
             'chr21': 45090682,
             'chr22': 51324926,
             'chr3': 201105948,
             'chr4': 193574945,
             'chr5': 182045439,
             'chr6': 172126628,
             'chr7': 160567428,
             'chr8': 146259331,
             'chr9': 150617247,
             'chrX': 154259566,
             'chrY': 62460029})

In [19]:
# Keep only the rows whose stain is 'acen'.
# NOTE: bed_df is rebound to the filtered frame here, so the chromosome-length
# cell must not be re-run after this one without reloading the BED file first.
is_acen = bed_df['stain'].eq('acen')
bed_df = bed_df.loc[is_acen]

In [20]:
## Merge p/q-arm ACEN
# Each chromosome's centromere is the single span covering all of its 'acen'
# bands: smallest start to largest end, still half-open [start, end).
# Taking min/max per chromosome gives the same result as a row-by-row merge of
# adjacent p-arm/q-arm bands, but does not depend on the row order or on a
# hard-coded base-pair tolerance.
# The 'acen' filter is applied here as well (a no-op on an already filtered
# frame) so this cell is correct whether or not bed_df was narrowed beforehand.
acen_df = bed_df[bed_df['stain'] == 'acen']
acen_by_chrom = acen_df.groupby('chromosome', sort=False)
cen_starts = acen_by_chrom['start'].min()
cen_ends = acen_by_chrom['end'].max()

# Chromosomes without any 'acen' band keep the {'start': -1, 'end': -1} default.
centromeres = defaultdict(lambda: {'start': -1, 'end': -1})
# Both series come from the same groupby, so they share one index order.
for chrom, cen_start, cen_end in zip(cen_starts.index, cen_starts, cen_ends):
    centromeres[chrom] = {'start': int(cen_start), 'end': int(cen_end)}

In [21]:
## form index
# One record per chromosome. The half-open ends (centromere end, chromosome
# length) are turned into inclusive last positions by subtracting one.
# NOTE(review): `centromeres` is a defaultdict, so a chromosome with no entry
# silently gets CEN_start = -1 and CEN_end = -2 here — confirm that is acceptable.
chm13_index = {}
for contig, contig_len in chrom_len.items():
    cen_span = centromeres[contig]
    chm13_index[contig] = {
        'len': contig_len,
        'telo1_end': 0,
        'CEN_start': cen_span['start'],
        'CEN_end': cen_span['end'] - 1,
        'telo2_start': contig_len - 1,
    }

## acrocentric chromosomes: move telo1_end up to the centromere start, so the
## whole p-arm falls inside the first-telomere interval
acro_chrom = ['chr13', 'chr14', 'chr15', 'chr21', 'chr22']
for acro in acro_chrom:
    entry = chm13_index[acro]
    entry['telo1_end'] = entry['CEN_start']

In [27]:
## output
def chr_sort(k):
    """Return an integer sort key for a chromosome name.

    The first three characters of ``k`` are dropped and the remainder is
    compared case-insensitively: ``x`` ranks 23, ``y`` ranks 24, and anything
    else is parsed with ``int`` (raising ``ValueError`` if it is not a number).
    """
    suffix = k[3:].lower()
    sex_rank = {'x': 23, 'y': 24}
    if suffix in sex_rank:
        return sex_rank[suffix]
    return int(suffix)

# Tabulate the index (one row per chromosome) and order the rows 1..22, X, Y.
index_table = pd.DataFrame.from_dict(chm13_index, orient='index')
karyotype_order = sorted(index_table.index, key=chr_sort)
# NOTE(review): capitalize upper-cases the first character and lower-cases the
# rest, so 'chrX'/'chrY' are written as 'Chrx'/'Chry' — confirm the consumer of
# this file expects that spelling.
df = index_table.loc[karyotype_order].rename(index=str.capitalize)
# Tab-separated, chromosome label in the first column, no header line.
df.to_csv('chm13v2_index.txt', header=False, sep='\t')

In [24]:
# Show the index table.
# NOTE(review): this cell's execution counter (24) is lower than the export
# cell's (27), and the stored output lists lowercase labels in lexicographic
# order, so the output appears to predate the sort/capitalise step — re-run
# the notebook top to bottom to refresh it.
df

Unnamed: 0,len,telo1_end,CEN_start,CEN_end,telo2_start
chr1,248387328,0,121796048,126300486,248387327
chr10,134758134,0,39633793,41664588,134758133
chr11,135127769,0,51035789,54450837,135127768
chr12,133324548,0,34620838,37202489,133324547
chr13,113566686,15547593,15547593,17498290,113566685
chr14,101161492,10092112,10092112,12708410,101161491
chr15,99753195,16678794,16678794,17694465,99753194
chr16,96330374,0,35848286,37829520,96330373
chr17,84276897,0,23892419,27486938,84276896
chr18,80542538,0,15965699,20933549,80542537
