In [13]:
def find_shine_dalgarno(sequence, shine_dalgarno="AGGAGG"):
    """Find the Shine-Dalgarno sequence in the given DNA sequence."""
    index = sequence.find(shine_dalgarno)
    if index != -1:
        return index
    else:
        return None

def cut_sequence(sequence, shine_dalgarno="AGGAGG"):
    """Split a sequence into the sections that follow each motif occurrence.

    The sequence is scanned left to right for non-overlapping occurrences of
    ``shine_dalgarno``. Everything before the first occurrence is discarded;
    each returned section is the text between one occurrence and the next
    (or the end of the sequence). The motif itself is not part of any section.

    Args:
        sequence: String to cut.
        shine_dalgarno: Motif that delimits sections (defaults to "AGGAGG").

    Returns:
        A list of section strings, one per motif occurrence. The list is
        empty when the motif does not occur; a section may be the empty
        string when two occurrences are adjacent or the motif ends the
        sequence.

    Raises:
        ValueError: If ``shine_dalgarno`` is empty. An empty motif matches at
            every position, so there is no meaningful way to cut on it.
    """
    if not shine_dalgarno:
        raise ValueError("shine_dalgarno must be a non-empty motif")
    # str.split performs the same left-to-right, non-overlapping scan; the
    # first element is the text preceding the first motif, which is dropped.
    return sequence.split(shine_dalgarno)[1:]
    
def translate_to_uppercase(sequence):
    """Return an upper-cased copy of the given sequence.

    Args:
        sequence: Object providing an ``upper()`` method (typically a str).

    Returns:
        The result of ``sequence.upper()``.
    """
    uppercased = sequence.upper()
    return uppercased

def filter_dna_sequence(sequence):
    """Upper-case the input and keep only the nucleotide letters A, T, C, G.

    Args:
        sequence: Text that may contain digits, whitespace or other symbols
            mixed in with nucleotide letters of either case.

    Returns:
        A string containing, in their original order, only those characters
        of the upper-cased input that are 'A', 'T', 'C' or 'G'.
    """
    uppercased = sequence.upper()
    return ''.join(base for base in uppercased if base in 'ATCG')

def read_dna_sequence(filename):
    """Collect raw sequence text from a file whose data lines begin with a space.

    The file is scanned line by line in two modes:

    * Idle: lines are skipped until one starts with a space. That line only
      switches collection on; its own content is NOT added to the result.
    * Collecting: a line that is exactly "//" (ignoring surrounding
      whitespace) switches collection off again. Any other line contributes
      the text from its first space up to the next "//" (or the end of the
      line, newline included); if more spaces follow a "//", the scan repeats
      from the next space. Lines without any space contribute nothing.

    Args:
        filename: Path of the text file to read.

    Returns:
        The concatenation of all collected fragments, unfiltered (it still
        contains spaces, digits, newlines and any other characters that were
        inside the collected spans).
    """
    fragments = []
    collecting = False
    with open(filename, 'r') as handle:
        for text in handle:
            if not collecting:
                # A space-led line arms collection but is itself skipped.
                collecting = text.startswith(" ")
                continue

            if text.strip() == "//":
                # Terminator line: go back to idle so a later block can be read.
                collecting = False
                continue

            begin = text.find(' ')
            while begin != -1:
                end = text.find('//', begin)
                if end == -1:
                    # No "//" ahead: take everything to the end of the line.
                    end = len(text)
                fragments.append(text[begin:end])
                # Resume at the first space at or after the "//" just handled.
                begin = text.find(' ', end)

    return ''.join(fragments)

def main():
    """Read "input.txt", cut it on the default motif and print each section.

    The raw text returned by ``read_dna_sequence`` is reduced to nucleotide
    letters with ``filter_dna_sequence`` and then split by ``cut_sequence``.
    The section count is printed first, followed by every section (numbered
    from 1) and a separator line after each one.
    """
    # Path of the document to process; edit to point at your own file.
    filename = "input.txt"

    raw_text = read_dna_sequence(filename)
    cleaned = filter_dna_sequence(raw_text)
    sections = cut_sequence(cleaned)

    print("Number of sections:", len(sections))
    for number, section in enumerate(sections, start=1):
        print(f"Section {number}:", section)
        print("----------------------------------------------------------------------")

# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


Number of sections: 20
Section 1: CGATACAGAAGCTGGTGGCGGCGCGTCTGGCTGCTGATGTCATGGGCGTACCCACGATCCTGCTGGCGCGGACCGATGCGGATGCCGCAGACCTGATCACGTCTGACTGTGACGAATACGACCGCCCCTTTATACGCGGCGACCGCACGGCGGAAGGTTTTTTCCGCACTAACGCCGGCATTGAGCAGGCAATAAGCCGTGGACTGGCTTACGCCCCTTACGCCGATGTCCTTTGGTGTGAAACCTCCACCCCGGATTTGGCCATGGCCCAGCGTTTTGCCGATGCGATTCACGCCCGCTATCCAGGCAAGCTGCTGGCTTACAACTGTTCGCCTTCGTTTAACTGGAAAAAGAATCTGGATGACAAAACCATTGCGGCCTTCCAGCAGGCGCTCAGTGACATGGGCTATCGCTTTCAGTTCATCACGCTGGCGGGCATCCACAGCATGTGGTTCAACATGTTCGATCTTGCGCACGCTTACGCTCAGGGCGAAGGCATGCGCCACTATGTGGAAAAAGTTCAGCAGCCGGAGTTTGCTGCGCGAGAACGGGGTTACAGCTTTTCATCGCATCAGC
----------------------------------------------------------------------
Section 2: TCGGAACAGGCTATTTCGATCAGGTGACCAACACGATTCAGGGCGGCAAGTCATCGGTGACGGCCCTGACAGGGTCCACCGAGGAGCATCAGTTTTGAGTTGAGTCTGTGTCTTTCCCGCCCTGCATCTGCGGGCGGGCTTTTCAGGAGTGAATCATGCCGCCACGCGAATCACTGATTGCTCACACCATTCTGCAGGGTTTTGACGCGCAGTATGGCCGTTTTCTCGATATTACCGCAGGCGCACAACAGCGTTTTGAACAGGCCGAATGGCAGGCGGTACAGCAGGCGATGAAGGCGCGTATTCA