## Fit Rooted GNC to Randomly Sampled Alignments
Load some libraries

In [20]:
from __future__ import division

import numpy
from pymongo import MongoClient
from cogent import LoadTree, LoadSeqs
from cogent.maths.stats import chisqprob

import lib
import ml
import nest

Define the model fitters

In [27]:
def fit_one(aln, tree, rooted_edges):
    """Fit GNC to one alignment, seeding it from an MG94GTR fit.

    Two likelihood functions are optimised in sequence on the same ``tree``
    and ``aln``:

    1. ``ml.MG94GTR``: every parameter whose name contains ``'/'`` gets the
       bounds/independence rule below, and ``'length'`` is set with
       ``is_independent=False`` over ``rooted_edges``.
    2. ``ml.GNC``: initialised from the deflated stage-1 fit through
       ``ml._populate_parameters``; every parameter whose name contains
       ``'>'``, plus ``'omega'``, is set with ``is_independent=False`` over
       ``rooted_edges``.

    NOTE(review): in PyCogent ``is_independent=False`` with ``edges=...``
    presumably ties the parameter to one shared value across those edges;
    confirm against the ``setParamRule`` documentation.

    Parameters
    ----------
    aln : alignment object accepted by ``setAlignment``.
    tree : tree object accepted by ``makeLikelihoodFunction``.
    rooted_edges : list of edge names passed as ``edges=`` to ``setParamRule``.

    Returns
    -------
    The result of ``nest.deflate_likelihood_function`` applied to the
    optimised GNC likelihood function (``save_jsd=False``).

    Raises
    ------
    Whatever ``optimise`` raises; it is called with ``limit_action='raise'``.
    """
    # Rule shared by the stage-1 rate parameters and the stage-2 seeding.
    rate_rule = {'upper': 20., 'lower': 0.05, 'is_independent': False}
    # Both stages are optimised with identical settings.
    optimiser_args = {
        'local': True,
        'show_progress': False,
        'limit_action': 'raise',
    }

    # ---- Stage 1: MG94GTR --------------------------------------------------
    mg94_model = ml.MG94GTR(optimise_motif_probs=True)
    mg94_lf = mg94_model.makeLikelihoodFunction(tree)
    mg94_lf.setAlignment(aln)
    with mg94_lf.updatesPostponed():
        for name in mg94_lf.getParamNames():
            if '/' not in name:
                continue
            mg94_lf.setParamRule(name, **rate_rule)
    mg94_lf.setParamRule('length', edges=rooted_edges, is_independent=False)
    mg94_lf.optimise(**optimiser_args)
    mg94_flat = nest.deflate_likelihood_function(mg94_lf, save_jsd=False)

    # ---- Stage 2: GNC, started from the stage-1 estimates -------------------
    gnc_model = ml.GNC(optimise_motif_probs=True)
    gnc_lf = gnc_model.makeLikelihoodFunction(tree)
    gnc_lf.setAlignment(aln)
    ml._populate_parameters(gnc_lf, mg94_flat, **rate_rule)
    for name in gnc_lf.getParamNames():
        if name == 'omega' or '>' in name:
            gnc_lf.setParamRule(name, edges=rooted_edges, is_independent=False)
    gnc_lf.optimise(**optimiser_args)
    return nest.deflate_likelihood_function(gnc_lf, save_jsd=False)

def fit_n(output, data, n, treestring, rooted_edges, seed=None):
    """Fit the rooted model to ``n`` randomly chosen alignments and store the fits.

    Parameters
    ----------
    output : collection-like object; ``insert_one`` is called once per fit
        with ``{'_id': <alignment _id>, 'lf': <result of fit_one>}``.
    data : collection-like object; ``find`` is used to list every ``_id`` and
        then to fetch the sampled documents. Each document must carry the
        alignment text under ``'aln'``.
    n : number of alignments to draw, without replacement.
    treestring : tree description handed to ``LoadTree(treestring=...)``.
    rooted_edges : edge names forwarded unchanged to ``fit_one``.
    seed : optional seed for the draw. With the default ``None`` the global
        ``numpy.random`` state is used exactly as before. With a value, the
        ids are sorted and drawn from a private ``RandomState(seed)``, so the
        same seed selects the same alignments regardless of the order in
        which the database returns them and without touching global state.

    Raises
    ------
    ValueError
        From ``choice`` when ``n`` exceeds the number of documents in ``data``.
    """
    ids = [d['_id'] for d in data.find({}, {'_id': True})]
    if seed is None:
        rng = numpy.random
    else:
        # Cursor order is not under our control; fix it so that the seed
        # alone determines the sample.
        ids.sort()
        rng = numpy.random.RandomState(seed)
    ids = rng.choice(ids, n, replace=False).tolist()

    tree = LoadTree(treestring=treestring)
    for doc in data.find({'_id': {'$in': ids}}):
        aln = LoadSeqs(data=doc['aln'].encode('utf-8'), moltype=ml.DNA)
        aln = aln.withoutTerminalStopCodons(ml.get_genetic_code(None))
        # Keep only codon columns made up entirely of characters in ml.DNA.
        aln = aln.filtered(lambda x: set(''.join(x)) <= set(ml.DNA),
                           motif_length=3)
        flat_lf = fit_one(aln, tree, rooted_edges)
        output.insert_one({'_id': doc['_id'], 'lf': flat_lf})
        
def lrt(rooted, unrooted):
    """Return the chi-square p-value for the rooted-vs-unrooted likelihood ratio test.

    The unrooted fit is treated as the null model and the rooted fit as the
    alternative. Both arguments are mappings with an ``'lf'`` entry that holds
    ``'ll'`` (log-likelihood) and ``'df'`` (number of free parameters).

    The statistic is twice the log-likelihood difference, floored at zero, and
    the degrees of freedom are the difference in free-parameter counts; both
    are handed to ``chisqprob``.
    """
    null_fit = unrooted['lf']
    null_ll, null_df = null_fit['ll'], null_fit['df']
    alt_fit = rooted['lf']
    alt_ll, alt_df = alt_fit['ll'], alt_fit['df']

    # A negative statistic (alternative fitted worse than the null) is
    # clamped to zero before the chi-square lookup.
    statistic = max(2 * (alt_ll - null_ll), 0.)
    dof = alt_df - null_df
    return chisqprob(statistic, dof)

Connect to the database

In [18]:
# No host or port is passed, so the connection uses pymongo's defaults.
client = MongoClient()

Ants first

In [19]:
# Three-taxon tree with Hsal_3_3 outside the (Cflo_3_3, Lhum_1_0) pair; the
# inner node is named 'internal' so that its edge can be referred to by name.
treestring = '(Hsal_3_3,(Cflo_3_3,Lhum_1_0)internal);'
# The two edges that fit_one sets with is_independent=False.
rooted_edges = ['Hsal_3_3', 'internal']
# Fit 10 randomly drawn alignments from ants.data; one document per fit is
# inserted into ants.GNC_rooted.
fit_n(client.ants.GNC_rooted, client.ants.data, 10, treestring, rooted_edges)



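Run likelihood ratio tests (LRTs): each rooted fit is compared against the unrooted GNC fit of the same alignment.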

In [26]:
# For each rooted fit, fetch the unrooted GNC fit stored under the same _id
# and print two columns: the stored 'EN' value for the 'internal' edge and the
# LRT p-value.
# NOTE(review): 'EN' is presumably the expected number of substitutions on the
# edge — confirm against nest.deflate_likelihood_function.
# NOTE(review): find_one returns None when ants.GNC has no document with this
# _id, in which case lrt fails on unrooted['lf'].
for rooted in client.ants.GNC_rooted.find():
    unrooted = client.ants.GNC.find_one({'_id' : rooted['_id']})
    print rooted['lf']['EN']['internal'], lrt(rooted, unrooted) 

0.218130812783 1
0.0774824906615 0.073481277987
4.70411219134e-12 0.473152467128
0.0191533204928 0.674512874478
1.08826187972e-11 1
1.12044233366e-09 1
0.191255641714 0.00363635449812
0.260516945508 0.000152926348527
1.82488001191e-13 1
0.165872608846 1


In [32]:
# Same three taxa, now with Cflo_3_3 outside the (Hsal_3_3, Lhum_1_0) pair.
treestring = '(Cflo_3_3,(Hsal_3_3,Lhum_1_0)internal);'
# The two edges that fit_one sets with is_independent=False.
rooted_edges = ['Cflo_3_3', 'internal']
# Fit 10 randomly drawn alignments from ants.data; one document per fit is
# inserted into ants.GNC_rooted_on_cflo.
fit_n(client.ants.GNC_rooted_on_cflo, client.ants.data, 10, treestring, rooted_edges)

In [33]:
# For each fit rooted on Cflo_3_3, fetch the unrooted GNC fit stored under the
# same _id and print the stored 'EN' value for the 'internal' edge followed by
# the LRT p-value.
# NOTE(review): find_one returns None when ants.GNC has no document with this
# _id, in which case lrt fails on unrooted['lf'].
for rooted in client.ants.GNC_rooted_on_cflo.find():
    unrooted = client.ants.GNC.find_one({'_id' : rooted['_id']})
    print rooted['lf']['EN']['internal'], lrt(rooted, unrooted) 

3.66314717141e-12 1
0.0654302168538 0.197226518041
0.166271402869 0.00415652200066
7.52701676199e-12 1
0.211292695681 1
0.0186063027579 1
0.0578547279989 0.0904571384649
6.75195028797e-12 1
0.2068137563 2.36807779418e-06
0.00101765090912 0.924642100873


In [34]:
# Same three taxa, now with Lhum_1_0 outside the (Hsal_3_3, Cflo_3_3) pair.
treestring = '(Lhum_1_0,(Hsal_3_3,Cflo_3_3)internal);'
# The two edges that fit_one sets with is_independent=False.
rooted_edges = ['Lhum_1_0', 'internal']
# Fit 10 randomly drawn alignments from ants.data; one document per fit is
# inserted into ants.GNC_rooted_on_lhum.
fit_n(client.ants.GNC_rooted_on_lhum, client.ants.data, 10, treestring, rooted_edges)

In [35]:
# For each fit rooted on Lhum_1_0, fetch the unrooted GNC fit stored under the
# same _id and print the stored 'EN' value for the 'internal' edge followed by
# the LRT p-value.
# NOTE(review): find_one returns None when ants.GNC has no document with this
# _id, in which case lrt fails on unrooted['lf'].
for rooted in client.ants.GNC_rooted_on_lhum.find():
    unrooted = client.ants.GNC.find_one({'_id' : rooted['_id']})
    print rooted['lf']['EN']['internal'], lrt(rooted, unrooted) 

0.0659998537439 0.0921104735199
0.182372871175 0.000184505081369
8.9061537885e-12 0.99590355365
0.0836769783917 0.285347543077
1.71480205492e-12 1
0.016742992236 0.419702710271
0.111957157955 1
5.52207712526e-12 1
0.0913902837079 0.184098291937
0.20362292773 1.08165134747e-05


Mammals second

In [30]:
# Three-taxon tree with Opossum outside the (Human, Mouse) pair; the inner
# node is named 'internal' so that its edge can be referred to by name.
treestring = '(Opossum,(Human,Mouse)internal);'
# The two edges that fit_one sets with is_independent=False.
rooted_edges = ['Opossum', 'internal']
# Fit 10 randomly drawn alignments from mammals.data; one document per fit is
# inserted into mammals.GNC_rooted.
fit_n(client.mammals.GNC_rooted, client.mammals.data, 10, treestring, rooted_edges)

In [36]:
# For each rooted mammal fit, fetch the unrooted GNC fit stored under the same
# _id and print three columns: the _id, the stored 'EN' value for the
# 'internal' edge, and the LRT p-value.
# NOTE(review): the recorded output has 11 rows although the fit cell requests
# 10 alignments, so mammals.GNC_rooted appears to hold documents from more
# than one run of that cell.
# NOTE(review): find_one returns None when mammals.GNC has no document with
# this _id, in which case lrt fails on unrooted['lf'].
for rooted in client.mammals.GNC_rooted.find():
    unrooted = client.mammals.GNC.find_one({'_id' : rooted['_id']})
    print rooted['_id'], rooted['lf']['EN']['internal'], lrt(rooted, unrooted) 

ENSG00000090924_mammals 0.228314880016 0.00180756361448
ENSG00000074276_mammals 0.150420976176 6.43570447639e-07
ENSG00000074621_mammals 0.082408159895 0.272779453783
ENSG00000077147_mammals 9.03184467837e-12 1
ENSG00000100426_mammals 0.201219244447 0.00199047375544
ENSG00000111641_mammals 0.0978675551755 0.0244891970521
ENSG00000138658_mammals 0.136963970999 0.00392545984241
ENSG00000161647_mammals 0.102204360148 0.0153267921655
ENSG00000164867_mammals 0.0941493247698 3.10822334983e-05
ENSG00000165813_mammals 0.224333961111 8.43477196665e-05
ENSG00000240303_mammals 0.384159359721 4.60150475952e-06


In [38]:
# Number of alignments simulated from each stored rooted fit.
sims_per_aln = 100
for doc in client.mammals.GNC_rooted.find():
    # Zero-argument factory handed to nest.inflate_likelihood_function so it
    # can build the GNC model for the stored fit.
    model = lambda: ml.GNC(optimise_motif_probs=True)
    lf = nest.inflate_likelihood_function(doc['lf'], model=model)
    tree = doc['lf']['tree']
    for i in range(sims_per_aln):
        # Simulate at the alignment length recorded with the fit.
        aln = lf.simulateAlignment(sequence_length=doc['lf']['aln_length'])
        # Keep the alignment's string form for storage.
        aln = aln.__str__()
        # _id is '<source _id>_<replicate index>'.
        # NOTE(review): the _id is deterministic and insert_one is used, so
        # re-running this cell against a populated collection will presumably
        # fail with a duplicate-key error — confirm before re-running.
        _id = doc['_id'] + '_' + str(i)
        client.mammals.GNC_rooted_sims.insert_one({'_id' : _id, 'aln': aln, 'tree' : tree})

  is_independent)
