In [None]:
# !pip install biopython

Defaulting to user installation because normal site-packages is not writeable
Collecting biopython
  Downloading biopython-1.85-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   -------------- ------------------------- 1.0/2.8 MB 8.5 MB/s eta 0:00:01
   -------------------------- ------------- 1.8/2.8 MB 3.5 MB/s eta 0:00:01
   ----------------------------- ---------- 2.1/2.8 MB 3.8 MB/s eta 0:00:01
   ---------------------------------------- 2.8/2.8 MB 3.2 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.85


In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO

In [2]:
def read_fasta(file_path, filter_header=None):
    sequences = ''
    for record in SeqIO.parse(file_path, "fasta"):
        if filter_header and filter_header not in record.id:
            continue
        sequences = record.seq
    return sequences

def read_centro_telo(file_path_centro, file_path_telo, file_path_cpg):
    centro = pd.read_csv(file_path_centro)
    centro.drop('#"bin"', axis=1, inplace=True)
    
    telo = pd.read_csv(file_path_telo)
    telo.drop('#bin', axis=1, inplace=True)
    telo = telo[telo['type'] == 'telomere']
    
    cpg = pd.read_csv(file_path_cpg)
    cpg.drop('#"bin"', axis=1, inplace=True)
    
    return centro, telo, cpg

def check_overlap(centro, telo, cpg):
    telo.sort_values('chromStart', inplace=True)
    # print("Starting Telomere")
    # print(f'({telo[['chromStart', 'chromEnd']].iloc[0, 0]},{telo[['chromStart', 'chromEnd']].iloc[0, 1]})')
    # print("Ending Telomere")
    # print(f'({telo[['chromStart', 'chromEnd']].iloc[1, 0]},{telo[['chromStart', 'chromEnd']].iloc[1, 1]})')
    
    start_telo = telo[['chromStart', 'chromEnd']].iloc[0, 1]
    end_telo = telo[['chromStart', 'chromEnd']].iloc[1, 0]

    if len(cpg[cpg['chromStart'] < start_telo]) > 0 or len(cpg[cpg['chromEnd'] > end_telo]) > 0:
        print("There is a CpG island overlapping with Telomeric regions.")
    else:
        print("There is NO overlapping CpG island with Telomeric regions.")
        
    centro.sort_values('chromStart', inplace=True)
    min_centro = centro[['chromStart','chromEnd']].iloc[0,0]
    max_centro = centro[['chromStart','chromEnd']].iloc[len(centro)-1, 1]

    if len(cpg[(cpg['chromStart'] > min_centro) & (cpg['chromEnd'] < max_centro)]) > 0:
        print("Do a manual inspection because there is a CpG island in between the Centromeric regions.")
    else:
        print("There is NO overlapping CpG island with Centromeric regions.")

In [3]:
seq = read_fasta('./GRCh38.p14.genome.fa', 'chr18')

In [4]:
centro18_df, telo18_df, cpg18_df = read_centro_telo('./chr18/centromeres.csv','./chr18/telomeres.csv', './chr18/cpg.csv')
centro19_df, telo19_df, cpg19_df = read_centro_telo('./chr19/centromeres.csv','./chr19/telomeres.csv', './chr19/cpg.csv')

In [5]:
check_overlap(centro18_df, telo18_df, cpg18_df)

There is NO overlapping CpG island with Telomeric regions.
There is NO overlapping CpG island with Centromeric regions.


In [6]:
check_overlap(centro19_df, telo19_df, cpg19_df)

There is NO overlapping CpG island with Telomeric regions.
There is NO overlapping CpG island with Centromeric regions.


In [7]:
def check_in_start_end(i, index, start_end, in_):
    if index < len(start_end):
        if not in_ and i >= start_end[index]:
            return True, index + 1
        if in_ and i >= start_end[index]:
            return False, index + 1
    return in_, index    

In [9]:
in_cpg = False
cpg_index = 0

in_centro = False
centro_index = 0

in_telo = False
telo_index = 0

telo_start_end = telo18_df.sort_values('chromStart')[['chromStart', 'chromEnd']].to_numpy().flatten()
centro_start_end = centro18_df.sort_values('chromStart')[['chromStart', 'chromEnd']].to_numpy().flatten()
cpg_start_end = cpg18_df.sort_values('chromStart')[['chromStart', 'chromEnd']].to_numpy().flatten()

# Format: A-, G-, C-, T-, A+, G+, C+, T+
init_probs = np.zeros(8)
emit_probs = np.concatenate([np.eye(4), np.eye(4)])
tran_probs = np.zeros((8,8))

init_dict = {
    'A-': 0,
    'G-': 1,
    'C-': 2,
    'T-': 3,
    'A+': 4,
    'G+': 5,
    'C+': 6,
    'T+': 7,
}

for i in range(len(seq)):
    # check if in telomeric region
    in_telo, telo_index = check_in_start_end(i, telo_index, telo_start_end, in_telo)
    if in_telo:
        continue

    # check if in centromeric region
    in_centro, centro_index = check_in_start_end(i, centro_index, centro_start_end, in_centro)
    if in_centro:
        continue

    # check if in CpG island
    in_cpg, cpg_index = check_in_start_end(i, cpg_index, cpg_start_end, in_cpg)

    # avoid ambiguous bases N
    if seq[i] == 'N':
        continue

    # counting the frequency of each Markov States
    base_i = seq[i] + ('+' if in_cpg else '-')
    init_probs[init_dict[base_i]] += 1

    # check if next index in telomeric region
    if i + 1 < len(seq):
        in_telo_next, telo_index = check_in_start_end(i + 1, telo_index, telo_start_end, in_telo)
        if in_telo_next:
            continue

        # check if next index in centromeric region
        in_centro_next, centro_index = check_in_start_end(i + 1, centro_index, centro_start_end, in_centro)
        if in_centro_next:
            continue

        # check if next index in CpG island
        in_cpg_next, cpg_index = check_in_start_end(i + 1, cpg_index, cpg_start_end, in_cpg)

        base_i_next = seq[i + 1] + ('+' if in_cpg_next else '-')
        emit_probs[init_dict[base_i], init_dict[base_i_next]] += 1


KeyboardInterrupt: 