Data generation
=====

Generate data for time-scaling simulations.

In [20]:
import toytree
import ipcoal
import numpy as np

In [21]:
# Base tree for all simulations: a random 50-tip tree with a total height of
# 25e6, generated with a fixed seed so the same topology is produced each run.
realtree = toytree.rtree.unittree(50, treeheight=25e6, seed=123)
realtree.draw(layout="d", scalebar=True);

In [22]:
# Make three independent copies of the base tree: one to vary Ne, one to vary
# g, and one to vary both.
realtree_vary_ne, realtree_vary_g, realtree_vary_both = (
    realtree.copy(),
    realtree.copy(),
    realtree.copy(),
)

In [23]:
# Draw one random integer Ne in [5e5, 5e6) per node (keyed by node name) with
# a fixed seed, then store the values on the tree as the "Ne" feature.
np.random.seed(0)
dict_ne = {}
for node in realtree_vary_ne.get_feature_dict():
    dict_ne[node.name] = np.random.randint(5e5, 5e6)
realtree_vary_ne = realtree_vary_ne.set_node_values("Ne", dict_ne)

In [24]:
# Draw one generation time g per node (keyed by node name) from a normal
# distribution with mean 1 and standard deviation 0.2, using a fixed seed,
# then store the values on the tree as the "g" feature.
np.random.seed(1)
dict_g = {}
for node in realtree_vary_g.get_feature_dict():
    dict_g[node.name] = np.random.normal(1, 0.2)
realtree_vary_g = realtree_vary_g.set_node_values("g", dict_g)

In [25]:
# The last tree reuses the Ne and g dictionaries generated above, so it varies
# in both features at once.
realtree_vary_both = (
    realtree_vary_both
    .set_node_values("Ne", dict_ne)
    .set_node_values("g", dict_g)
)

In [26]:
# For the trees where Ne does not vary, set a constant Ne = 1e6 on every node.
# Each dictionary is built from the node names of the tree it is applied to
# (the g-varying tree previously borrowed its keys from realtree_vary_ne).
dict_ne_for_real = {i.name : 1e6 for i in realtree.get_feature_dict()}
dict_ne_for_g = {i.name : 1e6 for i in realtree_vary_g.get_feature_dict()}
realtree = realtree.set_node_values("Ne", dict_ne_for_real)
realtree_vary_g = realtree_vary_g.set_node_values("Ne", dict_ne_for_g)

In [27]:
# Trees without generation-time variation get g = 1 on every node.
dict_g_for_real = dict.fromkeys(
    (node.name for node in realtree.get_feature_dict()), 1
)
dict_g_for_ne = dict.fromkeys(
    (node.name for node in realtree_vary_ne.get_feature_dict()), 1
)
realtree = realtree.set_node_values("g", dict_g_for_real)
realtree_vary_ne = realtree_vary_ne.set_node_values("g", dict_g_for_ne)

In [28]:
# Chloroplast tree: copy the base tree and set Ne to a quarter of the base
# value (1e6 / 4) on every node; g is set to 1 on every node.
realtree_chp = realtree.copy()
dict_ne_for_chp = dict.fromkeys(
    (node.name for node in realtree_chp.get_feature_dict()), (1e6 / 4)
)
dict_g_for_chp = dict.fromkeys(
    (node.name for node in realtree_chp.get_feature_dict()), 1
)
realtree_chp = realtree_chp.set_node_values("Ne", dict_ne_for_chp)
realtree_chp = realtree_chp.set_node_values("g", dict_g_for_chp)

In [29]:
# Convert edge lengths from absolute time to generations by dividing each
# node's dist by that node's generation time g, for all five trees.
def _dist_in_generations(tree):
    """Return the result of setting each node's "dist" to dist / g on `tree`."""
    rescaled = {}
    for node in tree.get_feature_dict():
        rescaled[node.name] = node.dist / node.g
    return tree.set_node_values("dist", rescaled)


trealtree = _dist_in_generations(realtree)
trealtree_vary_ne = _dist_in_generations(realtree_vary_ne)
trealtree_vary_g = _dist_in_generations(realtree_vary_g)
trealtree_vary_both = _dist_in_generations(realtree_vary_both)
trealtree_chp = _dist_in_generations(realtree_chp)

In [30]:
# Draw each rescaled tree that will be passed to ipcoal, starting with the
# base tree (constant Ne and g).
trealtree.draw(layout="d", scalebar=True);

In [31]:
# Rescaled tree with varying Ne.
trealtree_vary_ne.draw(layout="d", scalebar=True);

In [39]:
# Rescaled tree with varying g.
trealtree_vary_g.draw(layout="d", scalebar=True);

In [33]:
# Rescaled tree with varying Ne and g.
trealtree_vary_both.draw(layout="d", scalebar=True);

In [38]:
# Rescaled chloroplast tree.
trealtree_chp.draw(layout="d", scalebar=True);

In [35]:
# Save the five rescaled trees used in the ipcoal simulations as Newick files.
newick_dir = "/home/henry/phylo-timescale/newick"
newick_outputs = (
    (trealtree, "realtree.tre"),
    (trealtree_vary_ne, "realtree_vary_ne.tre"),
    (trealtree_vary_g, "realtree_vary_g.tre"),
    (trealtree_vary_both, "realtree_vary_both.tre"),
    (trealtree_chp, "realtree_chp.tre"),
)
for scaled_tree, newick_name in newick_outputs:
    scaled_tree.write(f"{newick_dir}/{newick_name}")

In [36]:
# Build one ipcoal model per rescaled tree, each with nsamples = 2.
# NOTE(review): no seed is passed here, so reruns presumably yield different
# simulated data -- confirm whether a seed should be set for reproducibility.
model_real = ipcoal.Model(trealtree, nsamples=2)
model_ne = ipcoal.Model(trealtree_vary_ne, nsamples=2)
model_g = ipcoal.Model(trealtree_vary_g, nsamples=2)
model_both = ipcoal.Model(trealtree_vary_both, nsamples=2)
model_chp = ipcoal.Model(trealtree_chp, nsamples=2)

In [42]:
# Simulate sequence data. The four nuclear models each get 2000 loci of
# 500 bp; the chloroplast model gets a single locus of 2000 bp instead.
for nuclear_model in (model_real, model_ne, model_g, model_both):
    nuclear_model.sim_loci(2000, 500)
model_chp.sim_loci(1, 2000)

In [43]:
# Save each model's result dataframe (its .df attribute) as a CSV file.
csv_dir = "/home/henry/phylo-timescale/ipcoal-df-csv"
csv_outputs = (
    (model_real, "realtree.csv"),
    (model_ne, "realtree_vary_ne.csv"),
    (model_g, "realtree_vary_g.csv"),
    (model_both, "realtree_vary_both.csv"),
    (model_chp, "realtree_chp.csv"),
)
for sim_model, csv_name in csv_outputs:
    sim_model.df.to_csv(f"{csv_dir}/{csv_name}")

In [45]:
# Output locations and the (tag, model) pairs, in the order files are written.
phylip_dir = "/home/henry/phylo-timescale/phylip-files"
alignment_dir = "/home/henry/phylo-timescale/alignments"
tagged_models = (
    ("real", model_real),
    ("ne", model_ne),
    ("g", model_g),
    ("both", model_both),
    ("chp", model_chp),
)


def _write_all_matrices(size_label):
    """Write phylip (haploid, diploid) and HDF5 (loci, snps) files for every model.

    `size_label` ("small" or "large") is embedded in each output file name.
    Files are written format by format, looping over the models within each
    format.
    """
    # Concatenated haploid phylip alignments.
    for tag, sim_model in tagged_models:
        sim_model.write_concat_to_phylip(
            outdir=phylip_dir,
            name=f"model_{tag}_{size_label}_haploid.phy",
        )
    # Concatenated diploid phylip alignments.
    for tag, sim_model in tagged_models:
        sim_model.write_concat_to_phylip(
            outdir=phylip_dir,
            diploid=True,
            name=f"model_{tag}_{size_label}_diploid.phy",
        )
    # Per-locus HDF5 databases.
    for tag, sim_model in tagged_models:
        sim_model.write_loci_to_hdf5(
            outdir=alignment_dir,
            name=f"model_{tag}_{size_label}_loci",
        )
    # SNP HDF5 databases.
    for tag, sim_model in tagged_models:
        sim_model.write_snps_to_hdf5(
            outdir=alignment_dir,
            name=f"model_{tag}_{size_label}_snps",
        )


# Write small matrices to files (before any missing-data mask is applied).
_write_all_matrices("small")

# Apply a locus-level missing mask with coverage 0.5 to every model, then
# write large matrices to files.
for _tag, sim_model in tagged_models:
    sim_model.apply_missing_mask(coverage=0.5, coverage_type="locus")

_write_all_matrices("large")

wrote concat locus (100 x 1000000bp) to /home/henry/phylo-timescale/phylip-files/model_real_small_haploid.phy
wrote concat locus (100 x 1000000bp) to /home/henry/phylo-timescale/phylip-files/model_ne_small_haploid.phy
wrote concat locus (100 x 1000000bp) to /home/henry/phylo-timescale/phylip-files/model_g_small_haploid.phy
wrote concat locus (100 x 1000000bp) to /home/henry/phylo-timescale/phylip-files/model_both_small_haploid.phy
wrote concat locus (100 x 2000bp) to /home/henry/phylo-timescale/phylip-files/model_chp_small_haploid.phy
wrote concat locus (50 x 1000000bp) to /home/henry/phylo-timescale/phylip-files/model_real_small_diploid.phy
wrote concat locus (50 x 1000000bp) to /home/henry/phylo-timescale/phylip-files/model_ne_small_diploid.phy
wrote concat locus (50 x 1000000bp) to /home/henry/phylo-timescale/phylip-files/model_g_small_diploid.phy
wrote concat locus (50 x 1000000bp) to /home/henry/phylo-timescale/phylip-files/model_both_small_diploid.phy
wrote concat locus (50 x 200