# Problems: Sequence Analysis

https://rosalind.info/problems/topics/sequence-analysis/

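__Various patterns for finding ORFs and introns:__<br>
- "(ATG(?:.{3})+?)(?:TAA|TAG|TGA)"  # ORF read codon by codon: its length is a multiple of 3 and the stop codon is in frame  <br>
- "ATG.*?(?:TAA|TAG|TGA)"  # ORF whose length need not be a multiple of 3 (stop codon may be out of frame)  <br>
- "GT.*?AG"  # intron finding  <br>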

## 1. GenBank Introduction
https://rosalind.info/problems/gbk/

In [1]:
from Bio import Entrez

In [5]:
# Count the Nucleotide database entries for one genus whose publication
# date falls inside a given window (Rosalind problem GBK).

# Search parameters: organism genus and the inclusive publication-date range.
genus = "Pectinaria"
start_date = "2006/05/07"
end_date = "2008/04/06"

# Contact address passed along with every Entrez request.
Entrez.email = "your.email@example.com"

# datetype="pdat" makes mindate/maxdate filter on the publication date.
handle = Entrez.esearch(db="nucleotide", term=f"{genus}[Organism]",
                        datetype="pdat", mindate=start_date, maxdate=end_date)
try:
    record = Entrez.read(handle)
finally:
    # Release the connection even when parsing the response fails.
    handle.close()

# "Count" holds the total number of matching entries as a string.
entry_count = int(record["Count"])
print(entry_count)


18


## 2. Data Formats

https://rosalind.info/problems/frmt/

To fetch the records for particular accession IDs, we can use the function __Bio.Entrez.efetch(db, id, rettype)__.

In [6]:
from Bio import Entrez
from Bio import SeqIO

In [11]:
# Load the whitespace-separated accession IDs from the Rosalind dataset.
with open("rosalind_frmt.txt", "r") as fi:
    raw_ids = fi.read()
# split() with no argument already discards leading/trailing whitespace.
accession_list = raw_ids.split()
print(accession_list)

['JX469991', 'JX462669', 'JX308821', 'NM_001197168', 'NM_001251956', 'NM_002124', 'JQ712982', 'JX462666', 'FJ817486', 'NM_001081821']


In [12]:
# Download every accession in FASTA format and write the shortest record
# to output.fasta (Rosalind problem FRMT).
Entrez.email = "your_name@your_mail_server.com"
handle = Entrez.efetch(db="nucleotide", id=accession_list, rettype="fasta")
try:
    # (length, (description, sequence)) for every record. A list keeps
    # records of equal length instead of letting them overwrite each other
    # as a length-keyed dict would.
    records = [
        (len(rec.seq), (rec.description, str(rec.seq)))
        for rec in SeqIO.parse(handle, "fasta")
    ]
finally:
    # Release the connection even when the download or parsing fails.
    handle.close()

# Stable sort by length: the first entry is the shortest record, and ties
# keep the order in which NCBI returned them.
records = sorted(records, key=lambda x: x[0])

if records:
    header, seq = records[0][1]
    with open("output.fasta", "w") as fw:
        fw.write(f">{header}\n{seq}")


## 3. Protein Translation

https://rosalind.info/problems/ptra/

In [55]:
from Bio.Seq import translate

In [68]:
# Find the NCBI genetic-code table that translates the DNA string into the
# given protein (Rosalind problem PTRA).

# Sample data; replaced below by the contents of the dataset file.
dna = "ATGGCCATGGCGCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA"
protein = "MAMAPRTEINSTRING"

# The dataset has the DNA string on the first line and the protein on the second.
with open("rosalind_ptra.txt", "r") as fi:
    dna = fi.readline().strip()
    protein = fi.readline().strip()

# Try the table ids 1..33 in order and report the first one that works.
for index in range(1, 34):
    try:
        translated_seq = translate(dna, table=index)
    except (ValueError, KeyError):
        # Not every id in the range is a usable table for this input;
        # skip the ones translate() rejects.
        continue
    # Substring test rather than equality, so any trailing stop symbol in
    # the translation does not prevent a match.
    if protein in translated_seq:
        print(index)
        break


13


## 4. Complementing a Strand of DNA

https://rosalind.info/problems/rvco/

In [69]:
from Bio import SeqIO

In [70]:
# Count the FASTA records whose sequence equals its own reverse complement.
count = sum(
    1
    for record in SeqIO.parse("rosalind_rvco.txt", "fasta")
    if record.seq == record.seq.reverse_complement()
)
print(count)


3


## 5. Finding Genes with ORFs

https://rosalind.info/problems/orfr/

In [79]:
import re
from Bio.Seq import translate, reverse_complement

In [81]:
# Print the protein translated from the longest ORF found on either strand
# of the DNA string (Rosalind problem ORFR).

# Sample data; replaced below by the contents of the dataset file.
dna = "AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG"

with open("rosalind_orfr.txt", "r") as fi:
    dna = fi.read().strip().replace("\n", "")

# ORFs can lie on either strand, so scan the reverse complement as well.
rev_dna = reverse_complement(dna)

# An ORF starts at ATG and is read codon by codon up to (not including) the
# first in-frame stop codon, or up to the end of the sequence.
#   * The outer lookahead makes every match zero-width, so ORFs that start
#     inside another ORF (overlapping, possibly in a different frame) are
#     found too; a plain findall would skip them.
#   * "*?" instead of "+?" allows a stop codon directly after ATG, so that
#     stop codon is never absorbed into the ORF body.
pattern = r"(?=(ATG(?:.{3})*?)(?:TAA|TAG|TGA|$))"
orfs1 = re.findall(pattern, dna, re.S|re.I)
orfs2 = re.findall(pattern, rev_dna, re.S|re.I)
orfs = orfs1 + orfs2

# Nothing is printed when neither strand contains a start codon in a
# readable frame.
if orfs:
    largest_orf = max(orfs, key=len)
    protein = translate(largest_orf)
    print(protein)


MLRSTVAQTSVSRSIRLYRRRGCLSESLYGRGGCLTRCHPALYLLGNR
