In [1]:
# Parallel back end: the bplapply() calls further down run on the workers
# registered here.
library(BiocParallel)
register(MulticoreParam(8)) # Use 8 cores
library(treeman)	# for calcDstTrp
library(ape)
# Set R's print width from the column count reported by `tput cols`.
# NOTE(review): presumably needs a terminal where `tput` works -- confirm
# before running this non-interactively.
options(width = as.integer(system('tput cols', intern = TRUE)))
# Attaching DCLEAR also loads phangorn and masks stats::simulate (see the
# start-up messages printed by this cell).
library(DCLEAR)

Loading required package: phangorn


Attaching package: ‘DCLEAR’


The following object is masked from ‘package:stats’:

    simulate




## Set weights

In [2]:
# Raw per-state weights (23 values).
# NOTE(review): presumably empirical state frequencies -- confirm the source.
ws <- c(
  0.42823089537553294,
  0.20667366349622826,
  0.12241784191538209,
  0.08059298130534602,
  0.051816333224007874,
  0.03758215808461791,
  0.025931124959002953,
  0.01694326008527386,
  0.011021318465070516,
  0.006993768448671695,
  0.004587733683174811,
  0.0027235159068547065,
  0.0016595605116431617,
  0.0009590029517874714,
  0.0008658576582486061,
  0.0004801574286651361,
  0.0002007215480485405,
  0.00014168579862249917,
  8.920957691046244e-05,
  5.116431616923581e-05,
  2.7550016398819286e-05,
  2.623811085601837e-06,
  7.87143325680551e-06
)

# Weight vector passed to dist_weighted_hamming(): negative log of `ws`,
# with the first two entries replaced by hand-picked values.
InfoW <- -log(ws)
InfoW[1:2] <- c(1.5, 0.8)

# Entries 15 onwards share a flat value of 7. Assigning up to index 25 also
# grows InfoW from 23 to 25 entries.
InfoW[15:25] <- 7


## Compute the RF distance and triplet metric between predicted trees and ground-truth trees using the R packages ape, phangorn and treeman


In [4]:
# Fix the RNG seed and load the `lineages` data set into the workspace.
# NOTE(review): `lineages` is presumably shipped with DCLEAR -- confirm.
set.seed(1L)
data('lineages')

In [None]:
# Score every lineage in `lineages` in parallel and stack the per-sample
# results into one data frame `res` with columns:
#   sample  - index of the lineage within `lineages`
#   rf      - RF.dist() between the true and the inferred tree (normalize = TRUE)
#   triplet - calcDstTrp() between the two trees (nrmlsd = TRUE)
#
# seq_along() is used instead of 1:length() so an empty `lineages` produces
# no iterations rather than the bogus indices c(1, 0).
res <- do.call('rbind', bplapply(seq_along(lineages), function(i) {

  # futile.logger::flog.info(sprintf('%d/%d', i, length(lineages)))

  x <- lineages[[i]]$sequence
  y <- lineages[[i]]$tree

  # Pairwise distances between sequences, weighted by InfoW. Called directly
  # so the block does not depend on `%>%` being on the search path.
  d <- dist_weighted_hamming(x, InfoW, dropout = FALSE)

  # Inferred tree
  y_pred <- fastme.bal(d)

  # RF distance
  rf <- RF.dist(y, y_pred, normalize = TRUE)

  # Triplet metric (slow). Both trees are round-tripped through Newick text
  # so that treeman's readTree() can parse them.
  triplet <- calcDstTrp(
    readTree(text = write.tree(y)),
    readTree(text = write.tree(y_pred)),
    nrmlsd = TRUE
  )

  data.frame(sample = i, rf = rf, triplet = triplet)
}))


In [None]:
# Save the treeman-based scores as a tab-separated table: header row kept,
# no row names, no quoting.
res_file <- 'challenge=C2_group=Kwak_gong_weighted_hamming_treeman.tsv'
write.table(
  x = res,
  file = res_file,
  quote = FALSE,
  sep = '\t',
  row.names = FALSE
)

## Compute the RF distance and triplet metric of predicted trees and ground truth trees using TreeCmp

In [None]:
# Fix the RNG seed and reload the `lineages` data set before the TreeCmp run.
# NOTE(review): `lineages` is presumably shipped with DCLEAR -- confirm.
set.seed(1L)
data('lineages')

In [None]:
# Score every lineage with the external TreeCmp tool.
#
# For each sample the true and the inferred tree are written as Newick files,
# TreeCmp is run on the pair, and the first row of its tab-separated report
# is turned into one row of `res` with columns:
#   sample  - index of the lineage within `lineages`
#   rf      - TreeCmp's `R-F` value divided by (n - 3)
#   triplet - 3 * Triples / (2 * choose(n, 3)), capped at 1
# where n is the `Common_taxa` value reported by TreeCmp.
treecmp_dir <- 'data/C2'
true_tree_files <- sprintf(
  file.path(treecmp_dir, 'sample=%d.true.newick'), seq_along(lineages)
)
pred_tree_files <- sprintf(
  file.path(treecmp_dir, 'sample=%d.pred.newick'), seq_along(lineages)
)
treecmp_output_files <- sprintf(
  file.path(treecmp_dir, 'sample=%d.treecmp.txt'), seq_along(lineages)
)

# The tree files are written into this directory; create it up front so the
# first write does not fail on a fresh checkout.
dir.create(treecmp_dir, recursive = TRUE, showWarnings = FALSE)

res <- do.call('rbind', bplapply(seq_along(lineages), function(i) {
  # Namespace-qualified because futile.logger is not attached in this file.
  futile.logger::flog.info(sprintf('%d/%d', i, length(lineages)))

  x <- lineages[[i]]$sequence
  y <- lineages[[i]]$tree
  d <- dist_weighted_hamming(x, InfoW, dropout = FALSE)
  y_pred <- fastme.bal(d)  # inferred tree

  write.tree(y, true_tree_files[i])
  write.tree(y_pred, pred_tree_files[i])

  # Paths are shell-quoted so unusual characters cannot break the command.
  command <- sprintf(
    'java -jar TreeCmp/bin/TreeCmp.jar -P -N -I -r %s -i %s -o %s -d rc rf tt',
    shQuote(true_tree_files[i]),
    shQuote(pred_tree_files[i]),
    shQuote(treecmp_output_files[i])
  )

  # Fail loudly if TreeCmp did not run; otherwise a report left over from an
  # earlier run could be read without any warning.
  status <- system(command)
  if (status != 0) {
    stop(
      sprintf('TreeCmp failed for sample %d (exit status %d)', i, status),
      call. = FALSE
    )
  }

  # read.table() rewrites the `R-F` header to the syntactic name `R.F`.
  row <- read.table(
    treecmp_output_files[i],
    header = TRUE,
    sep = '\t',
    nrows = 1
  )

  # `[[` extracts bare scalars, so no element names leak into the row names
  # of the resulting data frame.
  n <- row[['Common_taxa']]
  rf_score <- row[['R.F']] / (n - 3)
  triples <- row[['Triples']]
  triples_score <- min(1.0, 3 * triples / (2 * choose(n, 3)))

  data.frame(sample = i, rf = rf_score, triplet = triples_score)
}))


In [None]:
# Save the TreeCmp-based scores as a tab-separated table: header row kept,
# no row names, no quoting.
res_file <- 'challenge=C2_group=Kwak_gong_weighted_hamming_TreeCmp.tsv'
write.table(
  x = res,
  file = res_file,
  quote = FALSE,
  sep = '\t',
  row.names = FALSE
)