Extract scaffold 12 from full M. zebra genome assembly.

In [1]:
from Bio import SeqIO
import gzip

target = 'scaffold_12'

# Stream the gzipped assembly and write the record whose id equals `target`
# to '<target>.fasta' in the current working directory.
# Both handles are managed by `with`, so they are closed even if parsing or
# writing raises.
with gzip.open('../../data/REFERENCE_DATA/M_zebra_v0.assembly.fasta.gz') as assembly:
    for rec in SeqIO.parse(assembly, 'fasta'):
        if rec.id == target:
            with open(target + '.fasta', 'w') as out:
                SeqIO.write(rec, out, 'fasta')
            # Record ids are expected to be unique within an assembly, so
            # there is no need to scan the remaining scaffolds.
            break
    else:
        # Fail loudly instead of silently producing no output file.
        raise ValueError("Sequence '%s' not found in assembly" % target)

Convert the data in the VCF file to an alignment spanning the entire length of scaffold 12.

In [None]:
%%bash
# Inputs and output prefix for the VCF -> FASTA conversion.
# NOTE(review): the VCF file name suggests it only covers positions
# 3750000-3850000; presumably the rest of the scaffold is taken from the
# reference FASTA via --fill_from_ref_fasta -- confirm.
converter=../../../../Dropbox/Github/genomisc/popogeno/vcf_2_hap.py
vcf=../../data/VCF/Malawi2015_all_sc_12_3750000-3850000_beagle_annotated.vcf.gz
reference=scaffold_12.fasta
out_prefix=s12_vcf_full_consensus

python "$converter" "$vcf" \
    --fasta \
    --fill_from_ref_fasta "$reference" \
    --consensus \
    -o "$out_prefix"

Specify list of target taxa to be included in alignment.

Parse alignment and display all taxa in alignment file.

In [3]:
from Bio import AlignIO

# Read the consensus alignment; the context manager closes the file handle
# as soon as parsing is done.
with open("s12_vcf_full_consensus.fas", 'r') as handle:
    ali = AlignIO.read(handle, 'fasta')

# Number of alignment columns, obtained without slicing out a copy of the
# first record.
total_length = ali.get_alignment_length()
print("Total length: %s\n" % total_length)

# Ids of all records in the alignment, in file order.
sps = [rec.id for rec in ali]

# List the taxa alphabetically, one per line.
for sp in sorted(sps):
    print(sp)

Total length: 8758262

A_calliptera_Bua
A_calliptera_Chitimba
A_calliptera_Chizumulu
A_calliptera_Enukweni
A_calliptera_Itupi_1
A_calliptera_Itupi_2
A_calliptera_Itupi_3
A_calliptera_Itupi_4
A_calliptera_Kitai_Dam
A_calliptera_Lake_Chidya
A_calliptera_Lake_Chilwa
A_calliptera_Luwawa
A_calliptera_Mbaka_river_Female
A_calliptera_Near_Kyela
A_calliptera_North_Rukuru
A_calliptera_Rovuma_river_Female
A_calliptera_Salima_Father
A_calliptera_Salima_Mother
A_calliptera_Salima_Offspring
A_calliptera_Songwe_River
A_calliptera_South_Rukuru
A_calliptera_Upper_Rovuma
Alticorpus_geoffreyi
Alticorpus_macrocleithrum
Astatotilapia_Kingiri
Astatotilapia_rujewa
Astatotilapia_tweddlei
Aulonocara_minutus
Aulonocara_steveni
Aulonocara_stuartgranti_Father
Aulonocara_stuartgranti_Mother
Aulonocara_stuartgranti_Offspring
Aulonocara_yellow
Buccochromis_nototaenia
Buccochromis_rhoadesii
Champsochromis_caeruelus_1
Champsochromis_caeruelus_2
Chilotilapia_rhoadesii_1
Chilotilapia_rhoadesii_2
Chilotilapia_rhoadesii_

Select a subset of taxa for the test dataset.

In [4]:
# Record ids of the taxa to keep in the reduced test alignment.
# The names must match the alignment record ids exactly.
target_list = [
    'Alticorpus_geoffreyi',
    'Alticorpus_macrocleithrum',
    'Astatotilapia_tweddlei',
    'Aulonocara_minutus',
    'Aulonocara_steveni',
    'Aulonocara_yellow',
    'Buccochromis_nototaenia',
    'Buccochromis_rhoadesii',
    'Chilotilapia_rhoadesii_1',
    'Copadichromis_quadrimaculatus',
    'Copadichromis_trimaculatus',
    'Copadichromis_virginalis',
    'Ctenopharynx_nitidus',
    'Ctenopharynx_intermedius_1',
    'Cynotilapia_afra',
    'Cynotilapia_axelrodi',
    'Dimidiochromis_compressiceps',
    'Dimidiochromis_dimidiatus',
    'Dimidiochromis_strigatus',
    'Diplotaxodon_greenwoodi',
    'Diplotaxodon_limnothrissa',
    'Diplotaxodon_macrops',
    'Diplotaxodon_macrops_black_dorsal',
    'Diplotaxodon_ngulube',
    'Diplotaxodon_white_back_similis',
    'Iodotropheus_sprengerae',
    'Labeotropheus_trewavasae',
    'Lethrinops_albus',
    'Lethrinops_auritus',
    'Lethrinops_lethrinus',
    'Lethrinops_gossei',
    'Lethrinops_longimanus_redhead',
    'Lethrinops_sp_oliveri',
    'Metriaclima_zebra',
    'Mylochromis_anaphyrmus',
    'Nimbochromis_linni',
    'Nimbochromis_livingstoni',
    'Nimbochromis_polystigma',
    'Otopharynx_lithobates',
    'Otopharynx_speciosus_1',
    'Pallidochromis_tokolosh',
    'Genyochromis_mento',
    'Petrotilapia_genalutea',
    'Placidochromis_johnstoni',
    'Placidochromis_longimanus_1',
    'Stigmatochromis_guttatus',
    'Tropheops_tropheops',
    'Taeniolethrinops_furcicauda',
    'Tyrannochromis_nigriventer',
]

Check the number of target taxa. Some tests require fewer than 50 taxa (n < 50).

In [None]:
# Number of selected target taxa. The parenthesised form prints the same
# under Python 2 and also runs under Python 3.
print(len(target_list))

Reduce alignment to only target taxa.

In [6]:
# -p makes the cell safe to re-run: no error if the directory already exists.
!mkdir -p target_only_01

In [4]:
%cd target_only_01/

/home/chrishah/WORKING/Diplotaxodon/Phylogeny/s12/target_only_01


In [8]:
from Bio import AlignIO

from Bio.Align import MultipleSeqAlignment

# Keep only the records whose id is in `target_list`, preserving the order of
# the full alignment. A set gives constant-time membership tests.
wanted = set(target_list)
reduced_ali = MultipleSeqAlignment([rec for rec in ali if rec.id in wanted])

# Report requested taxa that are absent from the alignment instead of
# dropping them silently.
missing = sorted(wanted.difference(rec.id for rec in reduced_ali))
if missing:
    print("WARNING: target taxa not found in alignment: %s" % ", ".join(missing))

# Write the reduced alignment as FASTA; passing it directly avoids copying
# every sequence through a full-range slice first. The return value (number
# of alignments written) is the cell's displayed result.
AlignIO.write(reduced_ali, 's12_vcf_full_consensus_target_only_1.fas', 'fasta')

1

Infer Phylogeny using RAxML.

In [None]:
%%bash

n_threads=6
run_name='s12_consensus_target_only_1'
alignment='./s12_vcf_full_consensus_target_only_1.fas'
raxml=/home/chrishah/src/RAXML/RAxML.8.2.9/standard-RAxML/raxmlHPC-PTHREADS-SSE3

# Per the RAxML v8 manual: -f a runs rapid bootstrapping plus the search for
# the best-scoring ML tree in one go; -m is the substitution model; -p and -x
# are the parsimony and rapid-bootstrap random seeds; -N is the number of
# bootstrap replicates. Output files are suffixed with the run name given by -n.
"$raxml" \
    -f a -m GTRGAMMA \
    -T "$n_threads" -n "$run_name" -s "$alignment" \
    -p 1234 -x 1234 -N 100

Remove branch lengths from tree.

With a simple `sed` command on the command line.

In [9]:
# Strip every ':<number>' branch-length token from the Newick string. The
# decimal point is matched literally, and multi-digit integer parts, missing
# fractions and exponent notation are handled as well.
!sed -E 's/:[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?//g' RAxML_bipartitions.s12_consensus_target_only_1 > RAxML_bipartitions.s12_consensus_target_only_1.nobranchlenghts.nwk

Or using ETE.

In [5]:
from ete3 import Tree

# Path of the RAxML tree file to load; also the base name of the output files.
tree_file = "RAxML_bipartitions.s12_consensus_target_only_1"
t = Tree(tree_file)

# Show the tree as loaded, before rooting.
print("\nUnrooted\n")
print(t)

# Root the tree on Astatotilapia tweddlei.
t.set_outgroup('Astatotilapia_tweddlei')

print("\nRooted")
print(t)

# Write the rooted tree to file without any branch lengths (format=9).
t.write(format=9, outfile=tree_file + '.nobranchlenghts.nwk')

# Write the rooted tree to file with branch lengths (format=3).
t.write(format=3, outfile=tree_file + '.rooted_branchlenghts.nwk')


Unrooted


         /-Labeotropheus_trewavasae
      /-|
     |  |   /-Petrotilapia_genalutea
     |   \-|
     |      \-Metriaclima_zebra
   /-|
  |  |      /-Iodotropheus_sprengerae
  |  |   /-|
  |  |  |  |   /-Tropheops_tropheops
  |  |  |   \-|
  |   \-|      \-Genyochromis_mento
  |     |
  |     |   /-Cynotilapia_afra
  |      \-|
  |         \-Cynotilapia_axelrodi
  |
  |      /-Diplotaxodon_limnothrissa
  |     |
  |     |         /-Diplotaxodon_ngulube
  |   /-|      /-|
  |  |  |   /-|   \-Diplotaxodon_white_back_similis
  |  |  |  |  |
  |  |   \-|   \-Diplotaxodon_greenwoodi
  |  |     |
  |  |     |   /-Pallidochromis_tokolosh
  |  |      \-|
  |  |        |   /-Diplotaxodon_macrops_black_dorsal
  |  |         \-|
  |  |            \-Diplotaxodon_macrops
  |  |
  |  |         /-Tyrannochromis_nigriventer
  |  |        |
  |  |        |         /-Placidochromis_johnstoni
  |  |      /-|      /-|
  |  |     |  |   /-|   \-Lethrinops_auritus
  |  |     |  |  |  |
  |  |    

In [3]:
%cd ..

/home/chrishah/WORKING/Diplotaxodon/Phylogeny/s12
