Data generation
=====

Generate data for time-scaling simulations.

In [15]:
import toytree
import ipcoal
import numpy as np
import ipyrad.analysis as ipa
import ipyparallel as ipp

In [16]:
# Build the 50-tip reference tree with toytree's unittree generator
# (tree height 25e6, fixed seed) and draw it with a scale bar.
realtree = toytree.rtree.unittree(
    50,
    treeheight=25e6,
    seed=123,
)
realtree.draw(layout="d", scalebar=True);

In [17]:
# Make three copies of the reference tree, one per scenario:
# variable Ne, variable g, and both variable at once.
realtree_vary_ne, realtree_vary_g, realtree_vary_both = (
    realtree.copy() for _ in range(3)
)

In [18]:
# Draw a random Ne for every node and store it on the tree as the "Ne" feature.
# A dedicated, seeded generator makes the draws repeatable across kernel
# restarts (the reference tree itself is already built with a fixed seed).
# NOTE(review): values are keyed by node name, so this assumes every node
# (internal nodes included) carries a unique name -- TODO confirm.
rng_ne = np.random.default_rng(123)
dict_ne = {
    # Integer bounds; low is inclusive and high is exclusive.
    node.name: int(rng_ne.integers(100_000, 1_000_000))
    for node in realtree_vary_ne.get_feature_dict()
}
realtree_vary_ne = realtree_vary_ne.set_node_values("Ne", dict_ne)

In [19]:
# Draw a random generation time g for every node and store it on the tree
# as the "g" feature. A dedicated, seeded generator keeps the draws
# repeatable across kernel restarts.
# NOTE(review): values are keyed by node name, so this assumes every node
# (internal nodes included) carries a unique name -- TODO confirm.
rng_g = np.random.default_rng(456)
dict_g = {
    # low is inclusive and high is exclusive, so g falls in 1..9.
    node.name: int(rng_g.integers(1, 10))
    for node in realtree_vary_g.get_feature_dict()
}
# Start from realtree_vary_g (not realtree_vary_ne): this scenario must vary
# g only, so it must not inherit the random Ne values set on the Ne tree.
realtree_vary_g = realtree_vary_g.set_node_values("g", dict_g)

In [20]:
# The last scenario reuses the Ne and g draws made above, so it varies both
# features on a single tree.
realtree_vary_both = (
    realtree_vary_both
    .set_node_values("Ne", dict_ne)
    .set_node_values("g", dict_g)
)

In [21]:
# Trees whose generation time is held constant get g = 1 on every node.
dict_g_for_real = dict.fromkeys(
    (node.name for node in realtree.get_feature_dict()), 1
)
realtree = realtree.set_node_values("g", dict_g_for_real)

dict_g_for_ne = dict.fromkeys(
    (node.name for node in realtree_vary_ne.get_feature_dict()), 1
)
realtree_vary_ne = realtree_vary_ne.set_node_values("g", dict_g_for_ne)

In [22]:
def _dist_in_generations(tree):
    """Return `tree` with each node's "dist" replaced by dist / g.

    Edge lengths are in absolute time; dividing by the node's generation
    time g expresses them in units of generations.
    """
    rescaled = {
        node.name: node.dist / node.g for node in tree.get_feature_dict()
    }
    return tree.set_node_values("dist", rescaled)


# Convert every scenario tree from absolute time to generations.
trealtree = _dist_in_generations(realtree)
trealtree_vary_ne = _dist_in_generations(realtree_vary_ne)
trealtree_vary_g = _dist_in_generations(realtree_vary_g)
trealtree_vary_both = _dist_in_generations(realtree_vary_both)

In [23]:
# Inspect the generation-scaled trees that feed the ipcoal simulations,
# starting with the constant-Ne, constant-g reference tree.
trealtree.draw(layout="d", scalebar=True);

In [24]:
# Generation-scaled tree for the variable-Ne scenario.
trealtree_vary_ne.draw(layout="d", scalebar=True);

In [25]:
# Generation-scaled tree for the variable-g scenario.
trealtree_vary_g.draw(layout="d", scalebar=True);

In [26]:
# Generation-scaled tree for the scenario where both Ne and g vary.
trealtree_vary_both.draw(layout="d", scalebar=True);

In [27]:
# One ipcoal model per scenario tree, each with nsamples = 2.
# NOTE(review): no seed is passed to the models here, so simulated data will
# presumably differ between runs -- confirm whether that is intended.
model_real, model_ne, model_g, model_both = (
    ipcoal.Model(scenario_tree, nsamples=2)
    for scenario_tree in (
        trealtree,
        trealtree_vary_ne,
        trealtree_vary_g,
        trealtree_vary_both,
    )
)

In [28]:
# Simulate 2000 loci of 500 bp on each of the four models.
for scenario_model in (model_real, model_ne, model_g, model_both):
    scenario_model.sim_loci(2000, 500)

In [30]:
# Write each model's concatenated alignment to a "_small" phylip file.
# This runs before any missing-data mask is applied to the models.
phylip_dir = "/moto/eaton/users/hnl2109/phylip-files"
small_outputs = [
    (model_real, "model_real_small.phy"),
    (model_ne, "model_ne_small.phy"),
    (model_g, "model_g_small.phy"),
    (model_both, "model_both_small.phy"),
]
for scenario_model, phylip_name in small_outputs:
    scenario_model.write_concat_to_phylip(outdir=phylip_dir, name=phylip_name)

wrote concat locus (100 x 1000000bp) to /moto/eaton/users/hnl2109/phylip-files/model_real.phy
wrote concat locus (100 x 1000000bp) to /moto/eaton/users/hnl2109/phylip-files/model_ne.phy
wrote concat locus (100 x 1000000bp) to /moto/eaton/users/hnl2109/phylip-files/model_g.phy
wrote concat locus (100 x 1000000bp) to /moto/eaton/users/hnl2109/phylip-files/model_bot.phy


In [41]:
# Apply a per-locus missing-data mask (coverage = 0.5) to every model, then
# write the masked concatenated alignments to "_large" phylip files.
# NOTE: ipcoal only allows the mask to be applied once per dataset, so
# re-running this cell on already-masked models raises ipcoalError.
phylip_dir = "/moto/eaton/users/hnl2109/phylip-files"
large_outputs = [
    (model_real, "model_real_large.phy"),
    (model_ne, "model_ne_large.phy"),
    (model_g, "model_g_large.phy"),
    (model_both, "model_both_large.phy"),
]

# Mask all four models first, then write, matching the original call order.
for scenario_model, _ in large_outputs:
    scenario_model.apply_missing_mask(coverage=0.5, coverage_type="locus")

for scenario_model, phylip_name in large_outputs:
    scenario_model.write_concat_to_phylip(outdir=phylip_dir, name=phylip_name)

ipcoalError: Missing data can only be applied to a dataset once.