In [1]:
import os

from Bio import Seq
from Bio import SeqIO
from Bio.SeqIO import parse

In [2]:
# Task 1: load the histone cDNA sequences, translate them to protein, and save the result as FASTA

In [3]:
# Read every histone cDNA record from the FASTA file and keep them in a list.
histones = [record for record in parse("../../histones.fa", "fasta")]

In [4]:
# Show the printed summary of each loaded record as a quick sanity check.
for histone_record in histones:
    print(histone_record)

ID: YDR225W
Name: YDR225W
Description: YDR225W cdna chromosome:R64-1-1:IV:915530:915928:1 gene:YDR225W gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:HTA1 description:Histone H2A; core histone protein required for chromatin assembly and chromosome function; one of two nearly identical subtypes (see also HTA2); DNA damage-dependent phosphorylation by Mec1p facilitates DNA repair; acetylated by Nat4p; N-terminally propionylated in vivo [Source:SGD;Acc:S000002633]
Number of features: 0
Seq('ATGTCCGGTGGTAAAGGTGGTAAAGCTGGTTCAGCTGCTAAAGCTTCTCAATCT...TAA', SingleLetterAlphabet())
ID: YBL003C
Name: YBL003C
Description: YBL003C cdna chromosome:R64-1-1:II:235394:235792:-1 gene:YBL003C gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:HTA2 description:Histone H2A; core histone protein required for chromatin assembly and chromosome function; one of two nearly identical (see also HTA1) subtypes; DNA damage-dependent phosphorylation by Mec1p facilit

In [5]:
# Translate each cDNA record to protein, carrying its identifiers over to the
# new record. stop_symbol="" is passed so stop codons are emitted as an empty
# string rather than a visible symbol.
translated_histones = [
    record.translate(
        id=record.id,
        name=record.name,
        description=record.description,
        stop_symbol="",
    )
    for record in histones
]

In [6]:
# Show the printed summary of each translated (protein) record.
for protein_record in translated_histones:
    print(protein_record)

ID: YDR225W
Name: YDR225W
Description: YDR225W cdna chromosome:R64-1-1:IV:915530:915928:1 gene:YDR225W gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:HTA1 description:Histone H2A; core histone protein required for chromatin assembly and chromosome function; one of two nearly identical subtypes (see also HTA2); DNA damage-dependent phosphorylation by Mec1p facilitates DNA repair; acetylated by Nat4p; N-terminally propionylated in vivo [Source:SGD;Acc:S000002633]
Number of features: 0
Seq('MSGGKGGKAGSAAKASQSRSAKAGLTFPVGRVHRLLRRGNYAQRIGSGAPVYLT...QEL', HasStopCodon(ExtendedIUPACProtein(), ''))
ID: YBL003C
Name: YBL003C
Description: YBL003C cdna chromosome:R64-1-1:II:235394:235792:-1 gene:YBL003C gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:HTA2 description:Histone H2A; core histone protein required for chromatin assembly and chromosome function; one of two nearly identical (see also HTA1) subtypes; DNA damage-dependent phosphorylatio

In [8]:
# Save the translated protein records as FASTA; later cells pass this file to
# the external alignment tools.
# Create the target directory first: open(..., "w") raises FileNotFoundError
# when "output/" is missing (e.g. on a fresh checkout), which would break a
# clean Restart & Run All.
os.makedirs("output", exist_ok=True)
with open("output/translated_histones.fa", "w") as output_handle:
    SeqIO.write(translated_histones, output_handle, "fasta")

In [9]:
# Task 2: align the translated histones with ClustalW and load the resulting alignment and tree

In [10]:
from Bio import AlignIO
from Bio import Phylo
from Bio.Align.Applications import ClustalwCommandline

In [13]:
# Build the ClustalW command wrapper for the translated protein FASTA.
# Nothing is executed here; the tool runs when the wrapper object is called.
clustalw = ClustalwCommandline(infile="output/translated_histones.fa")

In [14]:
# Run ClustalW. The call's return value (the tool's captured console output)
# is echoed as this cell's output. The following cells read
# output/translated_histones.aln and output/translated_histones.dnd, which
# this run is expected to produce next to the input file.
clustalw()

('\n\n\n CLUSTAL 2.1 Multiple Sequence Alignments\n\n\nSequence format is Pearson\nSequence 1: YDR225W      132 aa\nSequence 2: YBL003C      132 aa\nSequence 3: YNL031C      136 aa\nSequence 4: YBR010W      136 aa\nSequence 5: YDR224C      131 aa\nSequence 6: YBR009C      103 aa\nSequence 7: YBL002W      131 aa\nSequence 8: YNL030W      103 aa\nStart of Pairwise alignments\nAligning...\n\nSequences (1:2) Aligned. Score:  98\nSequences (1:3) Aligned. Score:  17\nSequences (1:4) Aligned. Score:  17\nSequences (1:5) Aligned. Score:  12\nSequences (1:6) Aligned. Score:  17\nSequences (1:7) Aligned. Score:  10\nSequences (1:8) Aligned. Score:  17\nSequences (2:3) Aligned. Score:  17\nSequences (2:4) Aligned. Score:  17\nSequences (2:5) Aligned. Score:  12\nSequences (2:6) Aligned. Score:  17\nSequences (2:7) Aligned. Score:  10\nSequences (2:8) Aligned. Score:  17\nSequences (3:4) Aligned. Score:  100\nSequences (3:5) Aligned. Score:  17\nSequences (3:6) Aligned. Score:  24\nSequences (3:7)

In [15]:
# Load the single multiple-sequence alignment stored in the Clustal-format file.
translated_histones_alignments = AlignIO.read(
    handle="output/translated_histones.aln",
    format="clustal",
)

In [16]:
# Print a text summary of the ClustalW alignment (dimensions plus one row per sequence).
print(translated_histones_alignments)

SingleLetterAlphabet() alignment with 8 rows and 148 columns
------MSG-GKGGKAGSAAKASQSRSAKAGLTFPVGRVHRLLR...QEL YDR225W
------MSG-GKGGKAGSAAKASQSRSAKAGLTFPVGRVHRLLR...QEL YBL003C
------MSGRGKGGKGLGKGGAKRHRKILRDNIQGITKP-----...GG- YBR009C
------MSGRGKGGKGLGKGGAKRHRKILRDNIQGITKP-----...GG- YNL030W
MARTKQTARKSTGGKAPRKQLASKAARKSAPSTGGVKKPHRYKP...--- YNL031C
MARTKQTARKSTGGKAPRKQLASKAARKSAPSTGGVKKPHRYKP...--- YBR010W
------MSAKAEKKPASKAPAEKKPAAKKTSTSTDGKKRSKARK...QA- YDR224C
------MSSAAEKKPASKAPAEKKPAAKKTSTSVDGKKRSKVRK...QA- YBL002W


In [17]:
# Load the Newick tree stored in the .dnd file that sits next to the alignment.
translated_histones_tree = Phylo.read(
    file="output/translated_histones.dnd",
    format="newick",
)

In [18]:
# Print the tree as an indented clade hierarchy with branch lengths.
print(translated_histones_tree)

Tree(rooted=False, weight=1.0)
    Clade()
        Clade(branch_length=0.03455)
            Clade(branch_length=0.40473)
                Clade(branch_length=0.00758, name='YDR225W')
                Clade(branch_length=0.00758, name='YBL003C')
            Clade(branch_length=0.45411)
                Clade(branch_length=0.01827, name='YDR224C')
                Clade(branch_length=0.01227, name='YBL002W')
        Clade(branch_length=0.35161)
            Clade(branch_length=0.0, name='YNL031C')
            Clade(branch_length=0.0, name='YBR010W')
        Clade(branch_length=0.40567)
            Clade(branch_length=0.0, name='YBR009C')
            Clade(branch_length=0.0, name='YNL030W')


In [19]:
# Task 3: align the translated histones with MUSCLE and load the resulting alignment

In [20]:
from Bio.Align.Applications import MuscleCommandline

In [21]:
# Build the MUSCLE command wrapper: read the translated protein FASTA and
# write the aligned sequences to output/histones_aligned.fa.
# Nothing is executed here; the tool runs when the wrapper object is called.
muscle = MuscleCommandline(input="output/translated_histones.fa", out="output/histones_aligned.fa")

In [22]:
# Run MUSCLE. The call returns a (stdout, stderr) tuple, which is echoed as
# this cell's output; MUSCLE's progress log arrives in the second element.
muscle()

('',
 '\nMUSCLE v3.8.31 by Robert C. Edgar\n\nhttp://www.drive5.com/muscle\nThis software is donated to the public domain.\nPlease cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.\n\ntranslated_histones 8 seqs, max length 136, avg  length 125\n00:00:00    23 MB(-3%)  Iter   1    2.78%  K-mer dist pass 1\n00:00:00    23 MB(-3%)  Iter   1  100.00%  K-mer dist pass 1\n00:00:00    23 MB(-3%)  Iter   1    2.78%  K-mer dist pass 2\n00:00:00    23 MB(-3%)  Iter   1  100.00%  K-mer dist pass 2\n00:00:00    23 MB(-3%)  Iter   1   14.29%  Align node       \n00:00:00    24 MB(-3%)  Iter   1   28.57%  Align node\n00:00:00    24 MB(-3%)  Iter   1   42.86%  Align node\n00:00:00    24 MB(-3%)  Iter   1   57.14%  Align node\n00:00:00    25 MB(-3%)  Iter   1   71.43%  Align node\n00:00:00    25 MB(-3%)  Iter   1   85.71%  Align node\n00:00:00    25 MB(-3%)  Iter   1  100.00%  Align node\n00:00:00    25 MB(-3%)  Iter   1  100.00%  Align node\n00:00:00    25 MB(-3%)  Iter   1   12.50%  Root alignment\

In [23]:
# Load the MUSCLE result; the aligned sequences are stored as FASTA.
muscle_translated_histones_alignments = AlignIO.read(
    handle="output/histones_aligned.fa",
    format="fasta",
)

In [24]:
# Task 4: build neighbor-joining trees (BLOSUM62 distances) from the ClustalW and MUSCLE alignments

In [25]:
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

In [26]:
# Pairwise distance calculator scored with the BLOSUM62 model.
calculator = DistanceCalculator(model="blosum62")

In [27]:
# Distance-based tree constructor that uses the calculator above with the
# "nj" (neighbor-joining) method.
dtc = DistanceTreeConstructor(
    distance_calculator=calculator,
    method="nj",
)

In [28]:
# Build a tree from the ClustalW alignment and print its clade hierarchy.
clustalw_tree = dtc.build_tree(msa=translated_histones_alignments)
print(str(clustalw_tree))

Tree(rooted=False)
    Clade(branch_length=0, name='Inner6')
        Clade(branch_length=0.4761876612205521, name='Inner2')
            Clade(branch_length=0.008833858944954143, name='YDR225W')
            Clade(branch_length=0.004927608944954143, name='YBL003C')
        Clade(branch_length=0.0859944482571462, name='Inner5')
            Clade(branch_length=0.4154594548522553, name='Inner3')
                Clade(branch_length=0.0, name='YBR010W')
                Clade(branch_length=0.0, name='YNL031C')
            Clade(branch_length=0.40758581263745663, name='Inner4')
                Clade(branch_length=0.0, name='YNL030W')
                Clade(branch_length=0.0, name='YBR009C')
        Clade(branch_length=0.5625149442430137, name='Inner1')
            Clade(branch_length=0.01631579517908066, name='YBL002W')
            Clade(branch_length=0.011897370965120002, name='YDR224C')


In [29]:
# Build a tree from the MUSCLE alignment with the same constructor, so the
# two printed trees can be compared directly.
muscle_tree = dtc.build_tree(msa=muscle_translated_histones_alignments)
print(str(muscle_tree))

Tree(rooted=False)
    Clade(branch_length=0, name='Inner6')
        Clade(branch_length=0.34686480787875923, name='Inner4')
            Clade(branch_length=0.0, name='YNL031C')
            Clade(branch_length=0.0, name='YBR010W')
        Clade(branch_length=0.048138805630008275, name='Inner5')
            Clade(branch_length=0.40101364203892065, name='Inner3')
                Clade(branch_length=0.006373634553473395, name='YBL003C')
                Clade(branch_length=0.00738783333643489, name='YDR225W')
            Clade(branch_length=0.43997580657190616, name='Inner1')
                Clade(branch_length=0.0, name='YNL030W')
                Clade(branch_length=0.0, name='YBR009C')
        Clade(branch_length=0.43469466097139897, name='Inner2')
            Clade(branch_length=0.013202179584099749, name='YBL002W')
            Clade(branch_length=0.015010986560100914, name='YDR224C')
