In [1]:
!pip install biopython
!pip install pymupdf

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# Raw nucleotide listing, pasted as plain text. Besides the bases it contains
# whitespace, periods, end-of-line position counters (e.g. "60", "12O") and
# stray letters that are not A/C/G/T (e.g. "i", "o", "l", "d"), so it is not
# directly usable as a sequence.
# NOTE(review): the stray characters look like OCR/text-extraction artefacts --
# confirm against the original document before trusting any result derived
# from this text.
nucleotide_sequence = """
gctgctagtt to ctgtc.ccc gttacaa.gct ttgaaag cat catggit cota cagggagag 60
acgggacct g aattittgggg agacittaga C gaagcttittg ctg.cgtgttc gcatggaaaa 12O
gag caat cac caatcaattt gttctittgaa agggagcaaa Caccaaagtg gaattgggcc 18O
ttitt catata gcgaagctgc tittittctgtt gaaaataatg gt catacgat tcaa.gctaat 24 O
gtggaaaacg atgatgc.cgg toggattagala attaatggcg aagctt atca gct cacacaa 3 OO
titc catttcc at act cogag tdaacatacg attgaagaag catcct tcc caatggaactic 360
Catcttgttc atgcaaacca toaggggat ttagcggtgc ticggcgttitt gatggaaatt 42O
ggalacagttc atgaaggcat taag.ccgtt toggaagttca toctgaaga agaagggact 48O
gctgaatatt coatttct ct agaccc.gagc ct attcc tigc ctdaaagtgt aactgcttac 54 O
Caatacgacg gttcattgac aaccc ct cot totagdgaag gggtgaaatg gacggtgctt 6 OO
aatgacacca titt.cgatttic agcaacgcaa citt catgcat ttagggacat citat cogcaa 660
aactatogt c cagtic caaga gottagg.cgac agagaaatcg gttitt catta to attaa 7
"""

# Clean the sequence: keep only the letters G, T, A and C (either case) and
# drop everything else (whitespace, digits, punctuation, other letters).
# NOTE(review): characters are discarded silently; if a discarded character
# stood for a real base, every codon after it is shifted -- verify the result
# against a trusted copy of the sequence.
cleaned_sequence = ''.join(base for base in nucleotide_sequence if base in 'GTACgtac')

# Translate the cleaned nucleotide sequence to a protein sequence.
nucleotide_seq = Seq(cleaned_sequence)

# Translate whole codons only. A trailing partial codon never contributes a
# residue, but passing it to translate() triggers a BiopythonWarning that is
# documented as possibly becoming an error, so it is trimmed explicitly.
complete_codon_length = len(nucleotide_seq) - len(nucleotide_seq) % 3

# to_stop=True ends the protein at the first in-frame stop codon (the stop
# symbol itself is not included in the result).
protein_seq = nucleotide_seq[:complete_codon_length].translate(to_stop=True)

# Format the protein sequence with 10 residue spacing and 100 residues per line
def format_protein_sequence(protein, line_length=100, spacing=10):
    """Lay out a residue string in space-separated groups, several per line.

    Args:
        protein: The sequence to format (a plain string of residues).
        line_length: Number of residues placed on each output line.
        spacing: Number of residues in each space-separated group.

    Returns:
        A single string in which lines are joined by newlines and the groups
        within a line are joined by single spaces. The final line and the
        final group of a line may be shorter. An empty input gives ''.
    """
    def _grouped(row):
        # Cut one row into consecutive groups of `spacing` residues.
        return ' '.join(row[k:k + spacing] for k in range(0, len(row), spacing))

    # Cut the input into rows of `line_length` residues, then group each row.
    rows = (protein[start:start + line_length]
            for start in range(0, len(protein), line_length))
    return '\n'.join(_grouped(row) for row in rows)

# Lay the translated residues out in space-separated groups, one row per line
# (group and row sizes are the defaults of format_protein_sequence).
formatted_protein_seq = format_protein_sequence(str(protein_seq))

# Wrap the unformatted protein in a SeqRecord.
# Note: this record is not used for the write below; the file content comes
# from formatted_protein_seq.
protein_record = SeqRecord(
    Seq(str(protein_seq)),
    id="Translated_Protein_Sequence",
    description="",
)

# Build the file content first: one header line followed by the formatted
# sequence lines, ending with a newline.
fasta_text = f">Translated_Protein_Sequence\n{formatted_protein_seq}\n"

# Write the content out, replacing any existing file of the same name.
with open("protein_sequence.fasta", "w") as output_handle:
    output_handle.write(fasta_text)

print("FASTA file 'protein_sequence.fasta' has been created successfully.")


Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83
Collecting pymupdf
  Downloading PyMuPDF-1.24.4-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.4
FASTA file 'protein_sequence.fasta' has been created successfully.
