In [14]:
import ast
import numpy as np
tid = "GRCh38_ENST00000495576" #for debugging

In [None]:
np.range(15, 10 - 1, -1)

range(15, 9, -1)

In [37]:
def calculate_poly_x_regions(sequence, exon_regions = None,  strand = None, nucleotides = 'A', allow_mismatches_without_breaking_streak = 1, return_genomic_coordinates = False):
    """
    Calculate the regions of consecutive nucleotides in a given sequence.

    Parameters:
    sequence (str): The input nucleotide sequence.
    nucleotides (str): The nucleotide(s) to look for. Default is 'A'.
    allow_mismatches_without_breaking_streak (int): Number of allowed mismatches within a streak. Default is 1.

    Returns:
    list of tuples: Each tuple contains the start and end indices of a region.
    """
    

    regions = []
    regions_genomic = []
    streak_lengths = []
    start = None
    mismatch_count = 0
    streak_length = 0

    if return_genomic_coordinates:
        if exon_regions is None or strand is None:
            raise ValueError("exon_regions and strand must be provided when return_genomic_coordinates is True")
        genomic_index = []
        for exon_region in exon_regions:
            if strand == '+':
                genomic_index.extend(np.arange(exon_region[0], exon_region[1] + 1))
            else:
                genomic_index.extend(np.arange(exon_region[1], exon_region[0] - 1, -1))


    for i, nucleotide in enumerate(sequence):
        
        if nucleotide in nucleotides:
            if start is None:
                start = i
            mismatch_count = 0
            streak_length += 1
        else:
            if start is not None:
                mismatch_count += 1
                if mismatch_count > allow_mismatches_without_breaking_streak:
                    regions.append((start, i - mismatch_count))
                    if return_genomic_coordinates:
                        regions_genomic.append((int(genomic_index[start]), int(genomic_index[i - mismatch_count]))) # convert to genomic coordinates
                    streak_lengths.append(streak_length)
                    start = None
                    mismatch_count = 0
                    streak_length = 0

    if start is not None:
        regions.append((start, len(sequence) - 1))
        if return_genomic_coordinates:
            regions_genomic.append((int(genomic_index[start]), int(genomic_index[len(sequence) - 1]))) # convert to genomic coordinates
        streak_lengths.append(streak_length)
        
    if return_genomic_coordinates:
        return regions, regions_genomic, streak_lengths
    else:
        return regions, streak_lengths

In [35]:
transcript_intron_exon_sequences_path = ("/tmp/Mazutislab-out/Ignas/RT_comparison/genome_and_annotations/lncs_with_single_isoform_fasta_sequences_strand_exon_intron_positions.txt")
transcript_introns = {}
transcript_exons = {}
transcript_sequences = {}
transcript_strands = {}
for line in open(transcript_intron_exon_sequences_path):
    if line.startswith(">"):
        transcript_id = line[1:].strip()
    elif line.startswith('Strand: '):
        strand = line.strip().split(": ")[1]
        transcript_strands[transcript_id] = strand
    elif line.startswith('Exon positions:'):
        exon_positions = ast.literal_eval(line.strip().split(": ")[1])
        transcript_exons[transcript_id] = exon_positions
    elif line.startswith('Intron positions:'):
        intron_positions = ast.literal_eval(line.strip().split(": ")[1])
        transcript_introns[transcript_id] = intron_positions
    elif line.startswith('Sequence:'): 
        sequence = line.strip().split(": ")[1]
        transcript_sequences[transcript_id] = sequence
    

In [9]:
print(transcript_strands[tid])

-


In [12]:
print(transcript_sequences[tid])

TCAGCCTCCCAAGTAGCTGGGGCTACAGGCACCTGCCACCAAACCCGGCTAATTTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCGTGTTAGCCAGGATCGTCTTGATCTCCTGACCTTGTGATCCACCCGCCTCGGCCTCCCAAATTGCTGGGATTACAGATGTGAGCCACCGCACCTGGTCCAAGAACCCAAGTTTTAGATCTAGAGTGATGTCAGCATGACATTGATTTCCTGAGGCCCAGGGGCGAAGGAGCTGAGGACAGCAGAGGGGTGAAGGAACTCAGCTACAGACAGCAGCAGCTGATGCACAGGCCTCCCAGCGCCTGAAGTCACCCGGAATTGGGAAGTGCTCAGAAGCTTACAAAGCTGCCTCGAGGTGGGAACATGACATAAATCCAAGAGCAGATCCCTGATCCTATAAAAATGTACTAGATGCAGTGGGGGCATTTTAAATGAGCAGAGAAGGACAGACAGATAAACAGAAGGACAAACAGTATTGGGATTGGGATAAATGCTCAGCTTTTGCCCAAATCTTAGTGACTTAAGCATCACTTATTTGCTCACGATTCTGTGGCTGGACCATTTGGTTTGGCTCACAGGGCAGGGACTGTGCTGGTCTTACCTGAGCAGACCTGCATGTCTGCGGTCAACTGGGTTGGCAGAGACAGAGTGACTGTCTTCCTCCAGGAAGCAGCAGGTTAACTGGTTGGCAGAGACAGAGGGACAGAGGGACTGTCTTCCTCCAGGAAGCAGCAGGTTAACTGGTTGGCAGAGACAGAGGGACAGAGGGACTGTCTTCCTCCAGGAAGCAGCAGGTTGGCTCTGTTTCCTTCGTGGGGCAGCTGGTCTCCAGGGCAGCAAGAGAGACCAAGCCCCAGTGCACATTCTACAGCCTCTGTGCACATCAGACTTGTTAATATCCCATTGGCCAGTGTAAGTCACTTGGCCAAGCCCAGATTAAGGAGTGGAAAGATGGAGGCTATCTCCTCCTGGGAG

In [39]:
# Now, for each transcript, calculate poly-A regions
transcript_poly_A_regions = {}
for transcript_id, sequence in transcript_sequences.items():
    #if transcript_id == tid:#for debugging
    if 1==1:
        exon_regions = transcript_exons[transcript_id]
        strand = transcript_strands[transcript_id]
        poly_A_regions, regions_genomic, streak_lengths = calculate_poly_x_regions(sequence, exon_regions=exon_regions, strand=strand, nucleotides='A', allow_mismatches_without_breaking_streak=1, return_genomic_coordinates=True)
        #Now for each poly A region, adjust coordinates to genomic coordinates
        transcript_poly_A_regions[transcript_id] = {'regions': poly_A_regions, 'regions_genomic': regions_genomic, 'streak_lengths': streak_lengths}
#write output to file
output_path = ("/tmp/Mazutislab-out/Ignas/RT_comparison/genome_and_annotations/lncs_with_single_isoform_poly_A_streaks.txt")
with open(output_path, 'w') as f:
    for transcript_id, data in transcript_poly_A_regions.items():
        f.write(f">{transcript_id}\n")
        f.write(f"Poly-A regions (transcript coordinates): {data['regions']}\n")
        f.write(f"Poly-A regions (genomic coordinates): {data['regions_genomic']}\n")
        f.write(f"Streak lengths: {data['streak_lengths']}\n")