In [4]:
!pip install biopython

Defaulting to user installation because normal site-packages is not writeable
Collecting biopython
  Downloading biopython-1.85-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   -------------- ------------------------- 1.0/2.8 MB 8.5 MB/s eta 0:00:01
   -------------------------- ------------- 1.8/2.8 MB 3.5 MB/s eta 0:00:01
   ----------------------------- ---------- 2.1/2.8 MB 3.8 MB/s eta 0:00:01
   ---------------------------------------- 2.8/2.8 MB 3.2 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.85


In [24]:
import pandas as pd
from Bio import SeqIO

In [150]:
def read_fasta(file_path, filter_header=None):
    """Return the sequence of a single record from a FASTA file.

    Parameters
    ----------
    file_path
        Path (or handle) passed straight to ``SeqIO.parse``.
    filter_header : str, optional
        Record ID to look for.

        * A record whose ID equals ``filter_header`` exactly is returned as
          soon as it is reached, so the rest of the file is not parsed.
        * If no ID matches exactly, the last record whose ID contains
          ``filter_header`` as a substring is returned.
        * If ``filter_header`` is omitted (or otherwise falsy), the last
          record in the file is returned.

    Returns
    -------
    The ``seq`` attribute of the selected record, or the empty string ``''``
    when no record was selected.
    """
    sequences = ''
    for record in SeqIO.parse(file_path, "fasta"):
        if not filter_header:
            # No filter: keep overwriting so the last record wins.
            sequences = record.seq
        elif record.id == filter_header:
            # Exact hit: a later ID that merely contains the filter
            # (e.g. 'chr18_alt' for 'chr18') must not replace it.
            return record.seq
        elif filter_header in record.id:
            # Substring hit, kept only as a fallback.
            sequences = record.seq
    return sequences

def read_centro_telo(file_path_centro, file_path_telo, file_path_cpg):
    """Load the centromere, telomere and CpG-island tables from CSV files.

    The bin column is removed from every table; it is named ``#"bin"`` in the
    centromere and CpG files and ``#bin`` in the telomere file. The telomere
    table is additionally reduced to the rows whose ``type`` column equals
    ``'telomere'``.

    Returns
    -------
    tuple of DataFrames ``(centro, telo, cpg)``.
    """
    centro = pd.read_csv(file_path_centro).drop(columns='#"bin"')

    telo_all = pd.read_csv(file_path_telo).drop(columns='#bin')
    is_telomere = telo_all['type'] == 'telomere'
    telo = telo_all[is_telomere]

    cpg = pd.read_csv(file_path_cpg).drop(columns='#"bin"')

    return centro, telo, cpg

def _cpg_overlaps(cpg, region_start, region_end):
    """Return True if any row of ``cpg`` overlaps ``[region_start, region_end)``."""
    hits = (cpg['chromStart'] < region_end) & (cpg['chromEnd'] > region_start)
    return bool(hits.any())


def check_overlap(centro, telo, cpg):
    """Print whether any CpG island overlaps telomeric or centromeric regions.

    Parameters
    ----------
    centro, telo, cpg : pandas.DataFrame
        Tables with ``chromStart`` and ``chromEnd`` columns. None of them is
        modified; the caller's row order is left untouched.

    Behaviour
    ---------
    * Telomeres: every row of ``telo`` is treated as its own interval and
      tested against all CpG islands, so any number of telomere rows
      (including none) is handled.
    * Centromeres: the rows of ``centro`` are collapsed into one span from the
      smallest ``chromStart`` to the largest ``chromEnd``; any island that
      overlaps this span, fully or partially, is reported.

    Comparisons are strict, so an island that only touches a region boundary
    is not counted. NOTE(review): this assumes 0-based half-open coordinates
    in the input tables; confirm against the data source.

    Returns
    -------
    None. The two findings are written to stdout.
    """
    telo_hit = any(
        _cpg_overlaps(cpg, telo_start, telo_end)
        for telo_start, telo_end in zip(telo['chromStart'], telo['chromEnd'])
    )
    if telo_hit:
        print("There is a CpG island overlapping with Telomeric regions.")
    else:
        print("There is NO overlapping CpG island with Telomeric regions.")

    if centro.empty:
        # No centromere rows means there is nothing to overlap with.
        centro_hit = False
    else:
        min_centro = centro['chromStart'].min()
        max_centro = centro['chromEnd'].max()
        centro_hit = _cpg_overlaps(cpg, min_centro, max_centro)

    if centro_hit:
        print("Do a manual inspection because there is a CpG island in between the Centromeric regions.")
    else:
        print("There is NO overlapping CpG island with Centromeric regions.")

In [8]:
# Pull the record selected by the 'chr18' filter out of the genome FASTA.
seq = read_fasta(
    file_path='./GRCh38.p14.genome.fa',
    filter_header='chr18',
)

In [87]:
# Load the centromere, telomere and CpG-island tables for chr18 and chr19.
centro18_df, telo18_df, cpg18_df = read_centro_telo(
    file_path_centro='./chr18/centromeres.csv',
    file_path_telo='./chr18/telomeres.csv',
    file_path_cpg='./chr18/cpg.csv',
)
centro19_df, telo19_df, cpg19_df = read_centro_telo(
    file_path_centro='./chr19/centromeres.csv',
    file_path_telo='./chr19/telomeres.csv',
    file_path_cpg='./chr19/cpg.csv',
)

In [151]:
# Report CpG-island overlaps with the chr18 telomere and centromere tables.
check_overlap(centro=centro18_df, telo=telo18_df, cpg=cpg18_df)

There is NO overlapping CpG island with Telomeric regions.
There is NO overlapping CpG island with Centromeric regions.


In [152]:
# Report CpG-island overlaps with the chr19 telomere and centromere tables.
check_overlap(centro=centro19_df, telo=telo19_df, cpg=cpg19_df)

There is NO overlapping CpG island with Telomeric regions.
There is NO overlapping CpG island with Centromeric regions.


In [160]:
cpg_islands = []

# Print the slice of `seq` covered by the CpG island with the smallest
# chromStart. Restricting the frame to its first row replaces the
# loop-and-break; nothing is printed when the table is empty.
first_island = cpg18_df.sort_values('chromStart')[['chromStart', 'chromEnd']].head(1)
for each in first_island.itertuples():
    _, start, end = each
    print(seq[start:end])

CGGGCCCCTGGGCCACGCCCACTCCCCCAAGCGGGGAAGGAGCTTCGCGCTGCCGCTTGGCTGGGGACTGGGCACCGCCCTCCCGCGGCTCCTGAGCCGGCTGCCACCAGGGGTGCGCGCCAGCGGTGTCCGGGAGCCTAGCGGCGCGTGTGCAGCGGCCAGTGCACCTGCTCTGGCCCTCGCCGCGGTCTCTGCCAGGACCCCGACGCCCAGCCTGACCCTGCCATTCAGCGGGGCTGCGGCTCCACGGCCTGCGACAGCAGCCCCACCTGGCATTCAGCGCGCTCCCGGGGGCAGAGGTCGCGGTGTCCTCACGCTGTGGTGCCGGCCTACAACCCCCACGCCGGGCTCGGGCCCGGCGGAGGAGGGCGATGCTCCCCGGGTAGGACAAACCGGTCACCTGGGCTGCGAGGGCGGCTTAGGGGCAGAAGCGGCGGTCCAGGGCCGCCTGGCGCAGCAGCCTGTCCCAGCCGCGGTCCCTGCAGTCCCTCCCTGGCGGCTGCGCAGCCGTCCCACGACAGGGGCCATAAACTCTCCAGAGCGGAAAGCCGCACCCTGGTGGCCCGGCCCCGCGCCCAGACCTGGCGGCCGCTGGCACCTGACCCGCTGCATGGGTCTCCAGGGAGCTCGCTGCCCACCCGGCGCTGCAGGCTCGGCTCCCTCG
