### Dyad Density Evaluation

In this notebook, we measure the density of short dyads in the peri/centromeric region using the tool Palindrome in the European Molecular Biology Open Software Suite (EMBOSS). Please download this tool from the following link - http://emboss.open-bio.org/html/adm/ch01s01.html 

To run this notebook we need:
1. A .bed file containing the annotation information of the region(s) of interest
2. The genetic information from the genome of interest as .fasta files, one file per chromosome, named `<chromosome>.fasta` and stored in the genome directory (`./ALLCHROMOSOMES` by default)
3. Palindrome from EMBOSS

In [4]:
# Importing necessary libraries
import os
import re
import shlex

import pandas as pd
from Bio import SeqIO

In [5]:
# Folder that holds one FASTA file per chromosome.
genome_dir = "./ALLCHROMOSOMES"
# Working directory at the moment this cell runs.
BASE_DIR = os.getcwd()


def chr_path(chromosome_number) -> str:
    """Build the path of the FASTA file for one chromosome.

    Args:
        chromosome_number: Chromosome label; it is used verbatim as the
            file name stem.

    Returns:
        ``<genome_dir>/<chromosome_number>.fasta``.
    """
    fasta_name = f'{chromosome_number}.fasta'
    return f'{genome_dir}/{fasta_name}'

#### Input fasta generation

In [7]:
# Load the tab-separated region annotations. Later cells index this frame by
# column name, so the header of this file decides which names are valid.
seqDF = pd.read_csv('CHM13_centric_transition.bed', sep = '\t') # This is an example. Change the path according to your needs.

In [None]:
# Function to extract sequence from a FASTA file based on start and end positions
def extract_sequence_from_fasta(fasta_file, chromosome, start, end):
    """Return bases ``start``..``end`` of the FASTA record named ``chromosome``.

    Args:
        fasta_file: Path (or handle) passed to ``SeqIO.parse``.
        chromosome: Identifier of the record to read. It is compared on its
            string form, so an integer such as ``19`` matches record id "19".
        start: 1-based position of the first base to return.
        end: 1-based position of the last base to return (inclusive).

    Returns:
        The requested bases as a ``str``, or ``""`` when no record in the file
        has a matching id.

    NOTE(review): positions are treated as 1-based inclusive. Standard BED
    files use 0-based, half-open coordinates; confirm which convention the
    annotation file follows.
    """
    # Table readers parse purely numeric chromosome names as integers, while
    # FASTA record ids are strings; without this an int would never match.
    target_id = str(chromosome)
    for record in SeqIO.parse(fasta_file, "fasta"):
        if record.id == target_id:
            # Shift the 1-based start to Python's 0-based slice start.
            return str(record.seq[start - 1:end])
    return ""

# Function to get the path of the FASTA file for a given chromosome
def get_fasta_path(chromosome, base_dir="./ALLCHROMOSOMES"):
    """Return the path of the FASTA file that holds one chromosome.

    Args:
        chromosome: Chromosome label; used verbatim as the file name stem.
        base_dir: Folder containing the per-chromosome FASTA files. Defaults
            to ``./ALLCHROMOSOMES``; pass another folder instead of editing
            this function.

    Returns:
        ``<base_dir>/<chromosome>.fasta``.
    """
    # Tolerate a trailing slash so the result never contains "//".
    folder = str(base_dir).rstrip("/")
    return f"{folder}/{chromosome}.fasta"

In [None]:
# Start from an empty-string column, then fill it in one region at a time.
seqDF['sequence'] = ''
for index, row in seqDF.iterrows():
    chromosome, start, end = row['Chromosome'], row['Start'], row['End']

    # Each chromosome lives in its own FASTA file; read the region from it.
    seqDF.at[index, 'sequence'] = extract_sequence_from_fasta(
        get_fasta_path(chromosome), chromosome, start, end
    )

In [None]:
def read_between_start_end(file_path, start_character, end_character):
    """Return the characters at positions start..end (1-based, inclusive) of a file.

    Positions index the raw file contents: nothing is skipped, so any header
    line and every newline in the file count as characters.

    Args:
        file_path: Path of the text file to read.
        start_character: 1-based position of the first character to return.
        end_character: 1-based position of the last character to return.

    Returns:
        The requested characters; shorter than requested when the file ends
        early, and ``""`` when ``end_character == start_character - 1``.

    Raises:
        ValueError: If ``start_character`` is below 1 or ``end_character``
            lies before ``start_character - 1``.
    """
    offset = start_character - 1
    length = end_character - offset
    if offset < 0:
        raise ValueError(
            f"start_character must be >= 1, got {start_character}"
        )
    if length < 0:
        # file.read() treats a negative size as "read to end of file", which
        # would silently return everything after the start position.
        raise ValueError(
            f"end_character ({end_character}) is before "
            f"start_character ({start_character})"
        )

    with open(file_path, 'r') as file:
        # Jump to the first requested character, then read the span.
        file.seek(offset)
        return file.read(length)

In [9]:
# Fill the 'sequence' column by reading each region straight from the
# chromosome file at the annotated character positions.
row_count = len(seqDF)
for index in range(row_count):
    start, end, chromosome = (
        seqDF['start'][index],
        seqDF['stop'][index],
        seqDF['chr'][index],
    )
    fasta_path = chr_path(chromosome)
    seqDF.loc[index, 'sequence'] = read_between_start_end(fasta_path, start, end)

In [None]:
def write_fasta(file_name, sequences):
    """Write a mapping of sequences to ``file_name`` in FASTA layout.

    Args:
        file_name: Destination path; an existing file is overwritten.
        sequences: Mapping of sequence id -> sequence string. Each entry
            becomes a ``>id`` header line followed by the sequence on a
            single line.
    """
    with open(file_name, "w") as handle:
        handle.writelines(
            f">{seq_id}\n{sequence}\n" for seq_id, sequence in sequences.items()
        )

In [11]:
file_path = "../input/sequences/"
# write_fasta() opens files inside this folder, so make sure it exists first.
os.makedirs(file_path, exist_ok=True)

# Collect one record per region and build the table once after the loop;
# growing a DataFrame with pd.concat on every iteration copies it each time.
status_records = []

# NOTE(review): this cell reads 'Chromosome'/'Start'/'End'/'Region', while the
# seek-based extraction cell reads 'chr'/'start'/'stop'. Confirm which header
# the annotation file actually has.
for index in range(0, len(seqDF)):
    start = seqDF["Start"][index]
    end = seqDF["End"][index]
    chromosome = seqDF["Chromosome"][index]
    region = seqDF["Region"][index]
    # The double underscore separates the region label from its coordinates.
    sequence_id = f"{chromosome}_{region}__{start}_{end}"
    # The extraction cells store the bases in the lower-case 'sequence' column.
    sequences = {sequence_id: seqDF["sequence"][index]}

    seq_filename = f"{sequence_id}.fasta"
    write_fasta(file_path+seq_filename, sequences)

    status_records.append(
        {
            "seq_id": sequence_id,
            "region": region,
            "file": seq_filename,
        }
    )

# "status" has no value yet and is left empty (NaN) for every row.
statusDF = pd.DataFrame(
    status_records, columns=["seq_id", "region", "file", "status"]
)

In [12]:
# Save the sequence-id / region / file table; the "Final Output" section
# reloads it from this CSV.
statusDF.to_csv("status_palindrome.csv", index=False)

#### Palindrome Generation

In [None]:
def palindrome_command(
    input_file: str,
    *,
    output_file=None,
    minpallen: int = 5,
    maxpallen: int = 100,
    gaplimit: int = 20,
    nummismatches: int = 0,
    overlap: bool = True,
) -> str:
    """Build the shell command that runs EMBOSS ``palindrome`` on one FASTA file.

    With only ``input_file`` given, the search parameters are the notebook's
    defaults (palindrome length 5-100, gap limit 20, no mismatches, overlaps
    reported).

    Args:
        input_file: Path of the input FASTA file, e.g. ``path/input.fasta``.
        output_file: Optional report path passed as ``-outfile``. When
            ``None`` no ``-outfile`` option is added.
        minpallen: Value for ``-minpallen``.
        maxpallen: Value for ``-maxpallen``.
        gaplimit: Value for ``-gaplimit``.
        nummismatches: Value for ``-nummismatches``.
        overlap: Emit ``-overlap`` when true, ``-nooverlap`` otherwise.

    Returns:
        The command line as a single string.
    """
    # Quote the paths so folders or file names containing spaces (or other
    # shell metacharacters) reach palindrome as a single argument.
    parts = [
        "palindrome",
        "-sequence", shlex.quote(str(input_file)),
        "-minpallen", str(minpallen),
        "-maxpallen", str(maxpallen),
        "-gaplimit", str(gaplimit),
        "-nummismatches", str(nummismatches),
        "-overlap" if overlap else "-nooverlap",
    ]
    if output_file is not None:
        parts += ["-outfile", shlex.quote(str(output_file))]
    command = " ".join(parts)
    return command

In [15]:
#os.chdir(f"/home/brook/Projects/automation/palindrome-reports/output")

In [33]:
def get_difference(input_string):
    """Return ``end - start`` parsed from an id shaped like ``<prefix>__<start>_<end>``.

    The text after the first double underscore is split at its first single
    underscore; both pieces must be integers.

    Args:
        input_string: Identifier such as ``19_centric_transition__100_250``.

    Returns:
        The second number minus the first, as an ``int``.

    Raises:
        ValueError: If a separator is missing or a piece is not an integer.
    """
    _prefix, double_sep, coordinates = input_string.partition("__")
    if not double_sep:
        raise ValueError("substring not found")

    first_number_str, single_sep, second_number_str = coordinates.partition("_")
    if not single_sep:
        raise ValueError("substring not found")

    # Convert in start-then-end order before subtracting.
    first_number = int(first_number_str)
    second_number = int(second_number_str)
    return second_number - first_number

In [69]:
# Function to extract numbers from the input string
def extract_numbers(input_str):
    """Return the leading integer of a line and the next separate integer after it.

    Args:
        input_str: Text that must begin with digits (callers strip leading
            whitespace first), e.g. ``"1        aaaactaaggc       11"``.

    Returns:
        ``(first, second)`` as ints, or ``None`` when the text does not start
        with a number or holds no second number.
    """
    # The \D forces at least one non-digit between the two groups. Without it
    # the regex engine can split a lone number in two ("123" -> 12 and 3).
    pattern = r'(\d+)\D.*?(\d+)'
    match = re.match(pattern, input_str)

    if match is None:
        return None
    return (int(match.group(1)), int(match.group(2)))

def extract_sequence_length(input_str):
    """Pull the integer out of a ``Sequence length is: <n>`` phrase.

    Args:
        input_str: Text to search; the phrase may appear anywhere in it.

    Returns:
        The length as an ``int``, or ``None`` when the phrase is absent.
    """
    found = re.search(r'Sequence length is: (\d+)', input_str)
    return int(found.group(1)) if found else None

#### Final Output

In [70]:
# Reload the bookkeeping table saved as status_palindrome.csv, and read a fresh
# copy of the region annotations that will receive the per-region results.
status = pd.read_csv("status_palindrome.csv")
master = pd.read_csv('CHM13_centric_transition.bed', sep = '\t') # This is an example. Please change the path according to your needs

In [75]:
# Add the four result columns, blank for now; the parsing loop fills them in.
for result_column in ('ds_occupancy', 'ds_nucleotides', 'sequence_length', 'palindrome_count'):
    master[result_column] = ''

Unnamed: 0,chr,region,start,stop,ds_occupancy,ds_nucleotides,sequence_length,palindrome_count
151,19,centric_transition,30396484,34768171,,,,
152,20,centric_transition,37114981,37969538,,,,
153,21,centric_transition,12418454,16341849,,,,
154,22,centric_transition,20037050,20711065,,,,
155,23,centric_transition,61247454,65927025,,,,


In [None]:
# Layout of the report files as assumed by the offsets used below: a header
# block, one fixed-size record per palindrome, and a short trailer.
PAL_HEADER_LINES = 12     # lines before the first palindrome record
PAL_TRAILER_LINES = 3     # lines after the last palindrome record
PAL_LINES_PER_RECORD = 4  # lines occupied by each palindrome record

# Row `ind` of `status` is written to row `ind` of `master`, so both tables
# must list the regions in the same order.
for ind in range(0, len(status)):
    filepath = f"../output/{status['seq_id'][ind]}.palindrome"
    with open(filepath, 'r') as file:
        lines = [line.strip() for line in file]

    if len(lines) < 2:
        raise ValueError(f"{filepath}: report has no 'Sequence length is:' line")

    # The second line of the report carries the length of the scanned sequence.
    sequence_length = extract_sequence_length(lines[1])
    if not sequence_length:
        raise ValueError(
            f"{filepath}: could not read a positive sequence length from {lines[1]!r}"
        )

    # Integer division keeps the count an int; a report too short to hold any
    # record counts as zero palindromes.
    record_line_count = len(lines) - PAL_HEADER_LINES - PAL_TRAILER_LINES
    total_palindromes = max(0, record_line_count // PAL_LINES_PER_RECORD)

    bp_count = 0
    for index in range(0, total_palindromes):
        # First line of each record: "<start> <bases> <end>".
        palstr = lines[PAL_HEADER_LINES + PAL_LINES_PER_RECORD * index]
        span = extract_numbers(palstr)
        if span is None:
            raise ValueError(f"{filepath}: cannot parse palindrome line {palstr!r}")
        # The span length is doubled, presumably to count both arms of the dyad.
        bp_count = bp_count + (span[1] - span[0] + 1) * 2

    # Despite its name this is a fraction of the sequence length, not a percentage.
    occupation_percent = bp_count / sequence_length

    # Use .loc for the writes: chained indexing (master[col][ind] = ...) may
    # modify a temporary copy and leave `master` unchanged.
    master.loc[ind, 'palindrome_count'] = total_palindromes
    master.loc[ind, 'ds_nucleotides'] = bp_count
    master.loc[ind, 'sequence_length'] = sequence_length
    master.loc[ind, 'ds_occupancy'] = occupation_percent

In [None]:
# Write the region table, including the four result columns, to disk.
master.to_csv("dyad_density.csv", index = False)