In [1]:
import re
import pandas as pd
from tqdm import tqdm

In [2]:
def parse_zdna_file(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    zdna_list = []
    current_scaffold = None

    for line in lines:
        line = line.strip()
        if line.startswith('NW_') or line.startswith('GCF_'):
            current_scaffold = line
        elif line and current_scaffold:
            try:
                start, end = map(int, line.split())
                zdna_list.append((current_scaffold, (start, end)))
            except ValueError:
                continue  # skip lines that do not have valid start and end positions
    
    return zdna_list

In [3]:
def parse_gff(file_path):
    # Read the GFF file into a pandas DataFrame
    gff_df = pd.read_csv(file_path, sep='\t', comment='#', header=None,
                         names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'])
    return gff_df

In [4]:
def categorize_zdna(zdna_list, gff_df):
    zdna_counts = {
        'exon': 0,
        'intron': 0,
        'promoter': 0,
        'downstream': 0,
        'intergenic': 0
    }

    for scaffold, (start, end) in zdna_list:
        # Check if ZDNA is in exon
        exons = gff_df[(gff_df['seqid'] == scaffold) & (gff_df['type'] == 'exon') & (gff_df['strand'] == '+')]
        if any((exons['start'] <= start) & (exons['end'] >= end)):
            zdna_counts['exon'] += 1
            continue

        # Check if ZDNA is in intron
        genes = gff_df[(gff_df['seqid'] == scaffold) & (gff_df['type'] == 'gene') & (gff_df['strand'] == '+')]
        if any((genes['start'] <= start) & (genes['end'] >= end)):
            zdna_counts['intron'] += 1
            continue
        
        # Check if ZDNA is in promoter region
        if any((genes['start'] - 1000 <= start) & (genes['start'] >= start)):
            zdna_counts['promoter'] += 1
            continue

        # Check if ZDNA is in downstream region
        if any((genes['end'] <= end) & (genes['end'] + 200 >= end)):
            zdna_counts['downstream'] += 1
            continue

        # If not in any other category, it's intergenic
        zdna_counts['intergenic'] += 1
    
    return zdna_counts

In [18]:
def categorize_zdna_minus_strand(zdna_list, gff_df):
    zdna_counts = {
        'exon': 0,
        'intron': 0,
        'promoter': 0,
        'downstream': 0,
        'intergenic': 0
    }

    for scaffold, (start, end) in zdna_list:
        # Check if ZDNA is in exon
        exons = gff_df[(gff_df['seqid'] == scaffold) & (gff_df['type'] == 'exon') & (gff_df['strand'] == '-')]
        if any((exons['start'] <= start) & (exons['end'] >= end)):
            zdna_counts['exon'] += 1
            continue

        # Check if ZDNA is in intron
        genes = gff_df[(gff_df['seqid'] == scaffold) & (gff_df['type'] == 'gene') & (gff_df['strand'] == '-')]
        if any((genes['start'] <= start) & (genes['end'] >= end)):
            zdna_counts['intron'] += 1
            continue
        
        # Check if ZDNA is in promoter region
        if any((genes['start'] - 1000 <= start) & (genes['start'] >= start)):
            zdna_counts['promoter'] += 1
            continue

        # Check if ZDNA is in downstream region
        if any((genes['end'] <= end) & (genes['end'] + 200 >= end)):
            zdna_counts['downstream'] += 1
            continue

        # If not in any other category, it's intergenic
        zdna_counts['intergenic'] += 1
    
    return zdna_counts

In [5]:
zdna_file_path = 'text_predictions.txt'
gff_file_path  = '../ncbi_dataset/data/GCF_000787575.1/genomic.gff'

In [6]:
gff_df = parse_gff(gff_file_path)

In [None]:
gff_df[(gff_df['seqid'] == "NW_012236532.1") & (gff_df['type'] == 'exon') & (gff_df['strand'] == '+')]

In [None]:
zdna_list = parse_zdna_file(zdna_file_path)

In [None]:
zdna_counts = categorize_zdna(zdna_list, gff_df)
print(zdna_counts)

In [None]:
len(zdna_list)

### Count using zhunt data

In [None]:
zhunt=pd.read_csv("zhunt.bed", sep="\t", names=['Scaffold', 'Start', 'End', 'Score'])
len(zhunt)

In [None]:
zhunt

In [None]:
# convert to list of tuples
zhunt_zdna_regions = [(scaffold, (start, end)) for scaffold, start, end, _ in zhunt.values]
zdna_counts = categorize_zdna(zhunt_zdna_regions, gff_df)

In [None]:
print(zdna_counts)

In [17]:
# count quadruplexes
import re
pattern = "(G{3,5}[ATGC]{1,7}){3,}G{3,5}"
pattern_C="(C{3,5}[ATGC]{1,7}){3,}C{3,5}"

In [15]:
file = "../ncbi_dataset/data/GCF_000787575.1/GCF_000787575.1_Asub_2.0_genomic.fna"
from Bio import SeqIO
quadruplex_counts = {
        'exon': 0,
        'intron': 0,
        'promoter': 0,
        'downstream': 0,
        'intergenic': 0
    }
for record in SeqIO.parse(file, "fasta"):
    sequence = str(record.seq)
    PQS = [[m.start(), m.end()] for m in re.finditer(pattern, sequence, re.IGNORECASE)]
    # convert to list of tuples (scaffold, (start, end))
    PQS_regions = [(record.id, (start, end)) for start, end in PQS]
    counts = categorize_zdna(PQS_regions, gff_df)
    print(counts)
    for key, value in counts.items():
        quadruplex_counts[key] += value
    

{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 1}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0

In [19]:
# count for minus strand
for record in SeqIO.parse(file, "fasta"):
    sequence = str(record.seq)
    PQS_minus = [[m.start(), m.end()] for m in re.finditer(pattern_C, sequence, re.IGNORECASE)]
    # convert to list of tuples (scaffold, (start, end))
    PQS_minus_regions = [(record.id, (start, end)) for start, end in PQS_minus]
    counts = categorize_zdna_minus_strand(PQS_minus_regions, gff_df)
    print(counts)
    for key, value in counts.items():
        quadruplex_counts[key] += value

{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0, 'downstream': 0, 'intergenic': 0}
{'exon': 0, 'intron': 0, 'promoter': 0

In [20]:
print(quadruplex_counts)

{'exon': 27, 'intron': 9, 'promoter': 232, 'downstream': 29, 'intergenic': 865}
