# Phylogenetic signal of cell size - attraction towards an optimum value over time

In [1]:
library(ape)
library(phytools)
library(caper)
library(geiger)
library(OUwie)

Loading required package: maps

Loading required package: MASS

Loading required package: mvtnorm

Loading required package: corpcor

Loading required package: nloptr

Loading required package: RColorBrewer



## Tree and data

Load tree

In [2]:
tree <- read.tree('../phylogeny/place/fine_all.nwk')
tree


Phylogenetic tree with 5380 tips and 1961 internal nodes.

Tip labels:
  taxid71518, taxid83984, taxid2193, taxid83985, taxid71152, taxid2203, ...
Node labels:
  N1, N5, N18, N51, N79, N119, ...

Rooted; includes branch lengths.

Load data

In [3]:
data <- read.table('../phylogeny/place/fine_all.tsv', header = TRUE, sep = '\t', quote = '')
head(data, 3)

Unnamed: 0_level_0,taxid,length,width,volume,surface,shape,species,genus,family,order,⋯,rank,node,genome,gc,proteins,coding,rrnas,MILC,ENCprime,hash
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,taxid11,2.371708,1.0606602,1.783187,7.902917,rod-shaped,Cellulomonas gilvus,Cellulomonas,Cellulomonadaceae,Micrococcales,⋯,species,G000218545,3526441,73.81,3206,91.77278,2,-0.262005,0.10083562,1.15
2,taxid14,10.0,0.4898979,1.8541744,15.390598,rod-shaped,Dictyoglomus thermophilum,Dictyoglomus,Dictyoglomaceae,Dictyoglomales,⋯,species,G000020965,1959987,33.74,1890,93.77725,2,-0.0644294,0.03020484,1.13
3,taxid23,1.5,0.7,0.4874705,3.298672,rod-shaped,Shewanella colwelliana,Shewanella,Shewanellaceae,Alteromonadales,⋯,species,G000518705,4575622,45.39,4094,87.38314,0,-0.6533632,0.24898652,1.16


Log transform some metrics

In [4]:
data[[paste("svratio")]] = (data[['volume']] / data[['surface']])
data[[paste("survol")]] = (data[['surface']] / data[['volume']])

In [5]:
cols = c("length", "width", "volume", "surface", "svratio", "survol")

In [6]:
for (col in cols) {
    data[[paste("log", col, sep="_")]] = log10(data[[col]])
}

Set seed

In [7]:
# set.seed(42)

Binarize tree - required for geiger package

In [8]:
tree2 <- multi2di(tree)
is.ultrametric(tree2)

# Comparing models of evolution

In [9]:
cols = c("log_length", "log_width", "log_volume", "log_surface", "log_survol", "log_svratio")
# cols = c("log_length", "log_width")

In [10]:
# Dataframe to save outputs
df_models <- data.frame(matrix(ncol=6, nrow=0))
colnames(df_models) <- c('sigma', 'zo', 'parameter', 'lnl', 'aic', 'aic_weight')
df_models

sigma,zo,parameter,lnl,aic,aic_weight
<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>


In [11]:
startTime <- Sys.time()
for (col in cols) {
    print(col)
    datum <- setNames(data[[col]], data$taxid)
    # Models of evolution
    bm <- fitContinuous(tree2, datum, model = 'BM', control = list(method = c("subplex","L-BFGS-B"),
    niter = 100, FAIL = 1e+200, hessian = FALSE, CI = 0.95))
    eb <- fitContinuous(tree2, datum, model = 'EB', control = list(method = c("subplex","L-BFGS-B"),
    niter = 100, FAIL = 1e+200, hessian = FALSE, CI = 0.95))
    wh <- fitContinuous(tree2, datum, model = 'white', control = list(method = c("subplex","L-BFGS-B"),
    niter = 100, FAIL = 1e+200, hessian = FALSE, CI = 0.95))
    ou <- fitContinuous(tree2, datum, model = 'OU', ncores = 24, bounds = list(alpha = c(0, 500)))
    # Akaike weigths
    aic_cs <- setNames(c(AIC(bm), AIC(eb), AIC(wh)), c('BM', 'EB', 'WH'))
    aic_cs.w <- aic.w(aic_cs)
    # Add to table
    # BM
    df_models[nrow(df_models) + 1,] <- c(bm$opt$sigsq, bm$opt$z0, '', bm$opt$lnL, bm$opt$aic, aic_cs.w[1])
    rownames(df_models)[nrow(df_models)] <- paste(c(col, 'bm'), collapse = '_')
    # EB
    df_models[nrow(df_models) + 1,] <- c(eb$opt$sigsq, eb$opt$z0, eb$opt$a, eb$opt$lnL, eb$opt$aic, aic_cs.w[2])
    rownames(df_models)[nrow(df_models)] <- paste(c(col, 'eb'), collapse = '_')
    # WH
    df_models[nrow(df_models) + 1,] <- c(wh$opt$sigsq, wh$opt$z0, '', wh$opt$lnL, wh$opt$aic, aic_cs.w[3])
    rownames(df_models)[nrow(df_models)] <- paste(c(col, 'wh'), collapse = '_')
    # OU
    df_models[nrow(df_models) + 1,] <- c(ou$opt$sigsq, ou$opt$z0, ou$opt$alpha, ou$opt$lnL, ou$opt$aic, aic_cs.w[4])
    rownames(df_models)[nrow(df_models)] <- paste(c(col, 'ou'), collapse = '_')
}
endTime <- Sys.time()
print(endTime - startTime)

[1] "log_length"


“Non-ultrametric tree with OU model, using VCV method.”
“Recycling array of length 1 in vector-array arithmetic is deprecated.
  Use c() or as.vector() instead.
”
“Recycling array of length 1 in vector-array arithmetic is deprecated.
  Use c() or as.vector() instead.
”


[1] "log_width"


“Non-ultrametric tree with OU model, using VCV method.”
“Recycling array of length 1 in vector-array arithmetic is deprecated.
  Use c() or as.vector() instead.
”
“Recycling array of length 1 in vector-array arithmetic is deprecated.
  Use c() or as.vector() instead.
”


[1] "log_volume"


“Non-ultrametric tree with OU model, using VCV method.”
“Recycling array of length 1 in vector-array arithmetic is deprecated.
  Use c() or as.vector() instead.
”
“Recycling array of length 1 in vector-array arithmetic is deprecated.
  Use c() or as.vector() instead.
”


[1] "log_surface"


“Non-ultrametric tree with OU model, using VCV method.”
“Recycling array of length 1 in vector-array arithmetic is deprecated.
  Use c() or as.vector() instead.
”
“Recycling array of length 1 in vector-array arithmetic is deprecated.
  Use c() or as.vector() instead.
”


[1] "log_survol"


“
Parameter estimates appear at bounds:
	a”
“Non-ultrametric tree with OU model, using VCV method.”
“Recycling array of length 1 in vector-array arithmetic is deprecated.
  Use c() or as.vector() instead.
”
“Recycling array of length 1 in vector-array arithmetic is deprecated.
  Use c() or as.vector() instead.
”


[1] "log_svratio"


“
Parameter estimates appear at bounds:
	a”
“Non-ultrametric tree with OU model, using VCV method.”
“Recycling array of length 1 in vector-array arithmetic is deprecated.
  Use c() or as.vector() instead.
”
“Recycling array of length 1 in vector-array arithmetic is deprecated.
  Use c() or as.vector() instead.
”


Time difference of 6.95917 days


In [12]:
df_models

Unnamed: 0_level_0,sigma,zo,parameter,lnl,aic,aic_weight
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
log_length_bm,0.305313717880153,0.35471168966103,,-1.48220509665269,6.96441019330538,0.731071243060858
log_length_eb,0.305314484353685,0.354711683713506,-1.00408806311008e-06,-1.48226951094911,8.96453902189823,0.268928756939142
log_length_wh,0.0990999600304162,0.29493902520672,,-1415.61472013192,2835.22944026384,0.0
log_length_ou,0.394364632172533,0.176034629300114,1.45927524787313,80.5258456301381,-155.051691260276,
log_width_bm,0.186301213377839,-0.2086246621032,,1327.31064236703,-2650.62128473405,0.731062966304206
log_width_eb,0.186301091785185,-0.208624663518205,-1.0116737184211e-06,1327.31062005049,-2648.62124010098,0.268937033695794
log_width_wh,0.0648370632202481,-0.222797344533588,,-274.377824726946,552.755649453892,0.0
log_width_ou,0.253096155438443,-0.243735791333744,1.84716845682168,1462.61341183399,-2919.22682366797,
log_volume_bm,1.39119160951155,-0.246864642544473,,-4081.0719010667,8166.14380213341,0.731065023403925
log_volume_eb,1.39118088100865,-0.246864652135057,-1.00403078402921e-06,-4081.07193384611,8168.14386769222,0.268934976596075


I forgot to add the OU model for the Akaike weight calculation. This can be easily fixed. **The OU model, for all metrics, has the highest log-likelihood and lowest AIC, so if Akaike weights were calculated (again easily fixable), it will assign the highest weight to OU!**