Convert notebook to executable script with `jupyter nbconvert --to script seq-sims.ipynb`

In [1]:
using Distributed

# Guarantee at least two processes (this one plus one worker) before the
# distributed calls later in the notebook.
nprocs() >= 2 || addprocs(2 - nprocs())

# Report the process count and the thread count of this process.
(nprocs(), Threads.nthreads())

(2, 1)

In [2]:
# Environment setup. Pkg is loaded on every process, then the project is
# activated and instantiated on the process running this cell before any
# process loads the project's packages.
@everywhere using Pkg
Pkg.activate("/mnt/dv/wid/projects4/SolisLemus-network-merging/")
Pkg.instantiate()

# `@everywhere` runs on the calling process as well as on the workers, so the
# project is activated here a second time in addition to once per worker.
@everywhere Pkg.activate("/mnt/dv/wid/projects4/SolisLemus-network-merging/")
@everywhere using InPhyNet, PhyloNetworks
# Helper functions used by the later cells are presumably defined in this
# script (e.g. `simulate_gene_trees`, `estimate_nj_tree`) — TODO confirm.
@everywhere include("/mnt/dv/wid/projects4/SolisLemus-network-merging/simulation-study/simulation-scripts/helpers/helpers.jl")
# Every process works from the same directory; `checkpoint_dir` is later
# derived from `pwd()`.
@everywhere cd("/mnt/dv/wid/projects4/SolisLemus-network-merging/simulation-study/from-sequences")

[32m[1m  Activating[22m[39m project at `/mnt/dv/wid/projects4/SolisLemus-network-merging`
[32m[1m  Activating[22m[39m project at `/mnt/dv/wid/projects4/SolisLemus-network-merging`


      From worker 2:	[32m[1m  Activating[22m[39m project at `/mnt/dv/wid/projects4/SolisLemus-network-merging`


In [3]:
# Parameters that identify this simulation instance. They are embedded in the
# checkpoint file names and passed to the helper functions below.
ntaxa        = 500
replicatenum = 1
ngt          = 100     # number of gene trees
seq_len      = 500
ils_level    = "med"
m            = 20      # handed to `sateIdecomp` as the subset size parameter
dmethod      = "AGIC"

true_net = load_true_net_ils_adjusted_level1(ntaxa, replicatenum, ils_level)

# Give every edge whose length is -1.0 a fixed length of 0.473.
# NOTE(review): -1.0 presumably marks a missing branch length; the origin of
# 0.473 is not visible here — confirm.
for edge in true_net.edge
    edge.length == -1.0 || continue
    edge.length = 0.473
end

In [4]:
# Directories: fixed data directory, plus a checkpoint directory under the
# current working directory that is created on first use.
data_dir = "/mnt/dv/wid/projects4/SolisLemus-network-merging/simulation-study/from-sequences/data/"
checkpoint_dir = joinpath(pwd(), "checkpoint_files")
isdir(checkpoint_dir) || mkdir(checkpoint_dir)

# Checkpoint files; each name encodes the instance parameters it depends on.
truegt_file = joinpath(checkpoint_dir, "truegt_n$(ntaxa)_$(replicatenum)_$(ngt)_$(ils_level).treefile")
seq_file_prefix = joinpath(checkpoint_dir, "seqfile_n$(ntaxa)_$(replicatenum)_$(ngt)_$(seq_len)_$(ils_level).phy")
estgt_file = joinpath(checkpoint_dir, "estgt_n$(ntaxa)_$(replicatenum)_$(ngt)_$(seq_len)_$(ils_level).treefile")
net_file = joinpath(checkpoint_dir, "estnets_n$(ntaxa)_$(replicatenum)_$(ngt)_$(seq_len)_$(ils_level)_$(m)_$(dmethod).netfile")

# Make sure the base estimated-gene-tree file exists (created empty if absent).
isfile(estgt_file) || touch(estgt_file)

# Estimated gene trees

In [5]:
# Seed: the taxon count, hybrid count and replicate number concatenated with
# "42" as the separator, parsed as one integer.
seed = parse(Int64, "$(true_net.numTaxa)42$(true_net.numHybrids)42$(replicatenum)")

true_gts::Vector{HybridNetwork} = simulate_gene_trees(truegt_file, ngt, seed)

# One task per true gene tree, spread over the worker processes. Each task
# gets its own sequence file and estimated-gene-tree file, suffixed with the
# gene index.
pmap(eachindex(true_gts), true_gts) do i, gt
    est_gt_from_true_gt(gt, "$(seq_file_prefix)_$(i)", "$(estgt_file)_$(i)", data_dir, i)
end

# Drop the references to the true gene trees, then collect garbage on every
# process.
true_gts = HybridNetwork[]
@everywhere GC.gc()

[30m[TRUE GT] [39m[36mLoaded from file.[39m
      From worker 2:	[30m[SEQ] [39m[36mSequences already simulated for #1.[39m
      From worker 2:	[30m[IQ-TREE] [39m[36mResults already exist for #1.[39m
      From worker 2:	[30m[SEQ] [39m[36mSequences already simulated for #2.[39m
      From worker 2:	[30m[IQ-TREE] [39m[36mResults already exist for #2.[39m
      From worker 2:	[30m[SEQ] [39m[36mSequences already simulated for #3.[39m
      From worker 2:	[30m[IQ-TREE] [39m[36mResults already exist for #3.[39m
      From worker 2:	[30m[SEQ] [39m[36mSequences already simulated for #4.[39m
      From worker 2:	[30m[IQ-TREE] [39m[36mResults already exist for #4.[39m
      From worker 2:	[30m[SEQ] [39m[36mSequences already simulated for #5.[39m
      From worker 2:	[30m[IQ-TREE] [39m[36mResults already exist for #5.[39m
      From worker 2:	[30m[SEQ] [39m[36mSequences already simulated for #6.[39m
      From worker 2:	[30m[IQ-TREE] [39m[36mRes

In [6]:
# Read every estimated gene tree (one file per gene index) into one vector.
est_gts = HybridNetwork[readTopology("$(estgt_file)_$(i)") for i in 1:ngt]

@everywhere GC.gc()

# Subset decomposition

In [10]:
# Build a tree from the estimated gene trees; only the third value returned by
# the helper is kept (the captured output suggests it is a neighbor-joining
# tree computed from an AGIC distance matrix — confirm in helpers.jl).
_, _, nj_tre = estimate_nj_tree(est_gts)
# Split the taxa of that tree into subsets. `m` is passed as the size
# parameter (presumably the maximum subset size — TODO confirm against
# InPhyNet); the result is also this cell's displayed value.
subsets = sateIdecomp(nj_tre, m)

[30m[NJ] [39m[36mCalculating AGIC.[39m
[30m[NJ] [39m[36mEstimating NJ tree.[39m


40-element Vector{Vector{String}}:
 ["t329", "t334", "t341", "t342", "t333", "t336", "t337", "t338", "t349", "t350"]
 ["t326", "t327", "t328", "t335", "t345", "t346", "t339", "t340", "t347", "t348", "t330", "t331", "t332", "t343", "t344"]
 ["t402", "t403", "t404", "t412", "t413", "t418", "t419", "t422", "t423", "t411", "t420", "t421"]
 ["t401", "t405", "t406", "t407", "t408", "t416", "t417", "t409", "t410", "t414", "t415", "t424", "t425"]
 ["t176", "t183", "t184", "t187", "t188", "t180", "t181", "t191", "t192", "t195", "t196"]
 ["t178", "t193", "t194", "t177", "t179", "t182", "t197", "t198", "t185", "t189", "t190", "t199", "t200", "t186"]
 ["t226", "t234", "t239", "t249", "t250", "t245", "t246", "t238", "t247", "t248", "t236", "t237"]
 ["t229", "t233", "t235", "t243", "t244", "t242", "t241", "t232", "t240", "t227", "t228", "t230", "t231"]
 ["t252", "t268", "t269", "t270", "t271", "t255", "t256", "t262", "t263", "t259", "t266", "t272", "t273"]
 ["t251", "t261", "t267", "t274", "t275", "

# SNaQ inference

In [17]:
# Output directories for the per-subset quartet CF tables and starting trees.
df_dir = "/mnt/dv/wid/projects4/SolisLemus-network-merging/simulation-study/from-sequences/CFs/"
nj_dir = "/mnt/dv/wid/projects4/SolisLemus-network-merging/simulation-study/from-sequences/NJs/"
# NOTE(review): `nruns` is not used in this cell — presumably consumed by the
# SNaQ runs elsewhere; confirm.
nruns = 10
# Create the output directories when they are missing so the writes below do
# not fail on a fresh directory tree (same treatment as `checkpoint_dir`).
mkpath(df_dir)
mkpath(nj_dir)
GC.gc()

# For every subset that has not been inferred yet, write the inputs the
# external (Condor) SNaQ job needs: the quartet CF table, the starting tree,
# and one row in the Condor input table.
for (i, subset_taxa) in enumerate(subsets)
    output_file = "$(net_file)_$(i)"
    runtime_file = "$(output_file).runtime"
    output_net_file = "$(output_file).netfile"

    # Skip subsets whose inferred network and runtime file already exist.
    if isfile(output_net_file) && isfile(runtime_file)
        log("SNaQ $(i)", "Already inferred.")
        continue
    end

    # Restrict every estimated gene tree to the taxa of this subset. Iterating
    # over the trees directly avoids a second index that shadows `i`.
    temp_gts = HybridNetwork[pruneTruthFromDecomp(gt, subset_taxa) for gt in est_gts]

    # Shared suffix of the per-subset output file names.
    file_tag = "n$(ntaxa)_$(replicatenum)_$(ngt)_$(seq_len)_$(ils_level)_$(m)_$(dmethod)_sub$(i)"

    # 1. Quartets
    q, t = countquartetsintrees(temp_gts)
    df = silently() do
        readTableCF(writeTableCF(q, t))
    end
    # NOTE(review): `CSV` is not imported in any visible cell — presumably
    # brought in by helpers.jl; confirm.
    CSV.write(joinpath(df_dir, "df_$(file_tag).csv"), writeTableCF(df))

    # 2. Starting trees
    init_tree = pruneTruthFromDecomp(nj_tre, subset_taxa)
    tre_out = joinpath(nj_dir, "nj_$(file_tag).tre")
    open(tre_out, "w+") do f
        write(f, writeTopology(init_tree))
    end

    # 3. Write info to condor input table
    # The row is appended only when it is not already present, so re-running
    # this cell before the Condor jobs finish does not queue duplicates.
    tab_file = "/mnt/dv/wid/projects4/SolisLemus-network-merging/simulation-study/condor/inputs.tab"
    row = "$(ntaxa),$(replicatenum),$(ngt),$(seq_len),$(ils_level),$(m),$(i)"
    already_queued = isfile(tab_file) && open(tab_file) do io
        any(==(row), eachline(io))
    end
    if !already_queued
        open(tab_file, "a") do f
            write(f, row * "\n")
        end
    end

    # 4. Constraints inferred in Condor
end
@everywhere GC.gc()

Reading in trees, looking at 210 quartets in each...
0+--------------------------------------------------+100%
  **************************************************
Reading in trees, looking at 1365 quartets in each...
0+--------------------------------------------------+100%
  **************************************************
Reading in trees, looking at 495 quartets in each...
0+--------------------------------------------------+100%
  **************************************************
Reading in trees, looking at 715 quartets in each...
0+--------------------------------------------------+100%
  **************************************************
Reading in trees, looking at 330 quartets in each...
0+--------------------------------------------------+100%
  **************************************************
Reading in trees, looking at 1001 quartets in each...
0+--------------------------------------------------+100%
  **************************************************
Reading in tre

# InPhyNet

In [None]:
# Load estimated data
# Constraint networks and their runtimes, one pair of files per subset. These
# files are expected to come from the external Condor SNaQ jobs.
# Fix: the element type is `HybridNetwork` (the original `HybridNetworks` is
# not a defined type and raised an UndefVarError).
est_constraints = HybridNetwork[
    readTopology("$(net_file)_$(i).netfile") for i in eachindex(subsets)
]
# Each runtime file holds the runtime as a number on its first line.
est_constraint_runtimes = Float64[
    parse(Float64, readline("$(net_file)_$(i).runtime")) for i in eachindex(subsets)
]

# Reload the estimated gene trees so this cell does not depend on an earlier
# cell having populated `est_gts` in the current session.
est_gts = HybridNetwork[readTopology("$(estgt_file)_$(i)") for i in 1:ngt]
# NOTE(review): `dmethod` is "AGIC" but the distance matrix here comes from
# `calculateAGID` — confirm which distance is intended.
est_D, est_namelist = calculateAGID(est_gts)

# Merge the constraint networks with InPhyNet and time the call.
inphynet_time = @elapsed mnet = netnj(est_D, est_constraints, est_namelist)
# Fix: pass `est_constraints`; the original passed `constraints`, which is not
# defined anywhere in the visible notebook.
save_estimated_gts_results(
    "$(ntaxa)", true_net, replicatenum, ngt, ils_level,
    m, dmethod, seq_len, mnet, est_constraints,
    est_gts, est_constraint_runtimes, inphynet_time
)