# Generate Comparative AIC Table
### Setup 

In [1]:
from __future__ import division

from collections import defaultdict, Counter
from json import dumps, load

from pymongo import MongoClient
from pandas import DataFrame
from IPython.display import Markdown, display
from numpy import median, array, exp

### Utility

In [6]:
def get_blacklist():
    """Return the parsed contents of ``blacklist.json``.

    The file is opened relative to the current working directory and
    decoded with ``json.load``.
    """
    with open('blacklist.json') as handle:
        blacklisted = load(handle)
    return blacklisted

def get_saturated(col):
    """Return the ``_id`` of every non-blacklisted document that looks saturated.

    A document counts as saturated when the largest value in
    ``lf.params.length`` exceeds ``10 - 1e-4``.
    NOTE(review): presumably these are branch lengths pinned at an optimiser
    upper bound of 10 -- confirm against the fitting code.
    """
    excluded = get_blacklist()
    threshold = 10.-1e-4
    saturated = []
    for doc in col.find({'_id': {'$nin': excluded}}):
        if max(doc['lf']['params']['length'].values()) > threshold:
            saturated.append(doc['_id'])
    return saturated

def get_hard_up(col):
    """Return the ``_id`` of every non-blacklisted document whose ``lf.hard_up`` is truthy."""
    excluded = get_blacklist()
    flagged = []
    for doc in col.find({'_id': {'$nin': excluded}}):
        if doc['lf']['hard_up']:
            flagged.append(doc['_id'])
    return flagged

def get_aic(col):
    """Map each non-blacklisted document ``_id`` to its AIC score.

    The score is ``2*df - 2*ll``, where ``df`` and ``ll`` are read from the
    document's own ``lf`` sub-document.

    ``df`` used to be taken once from an unfiltered ``col.find_one()``. That
    could read it from a blacklisted document, and it raised ``TypeError``
    on an empty collection because ``find_one()`` returns ``None``. Reading
    it per document avoids both; an empty collection now yields ``{}``.

    Returns:
        dict of ``_id`` -> AIC.
    """
    bl = get_blacklist()
    return {d['_id']: 2*d['lf']['df'] - 2*d['lf']['ll']
            for d in col.find({'_id': {'$nin': bl}})}

def confidence_set(aics, p=0.95):
    """Build the Akaike-weight confidence set for a group of models.

    Args:
        aics: iterable of ``(model, aic)`` pairs.
        p: cumulative-weight threshold; accumulation stops as soon as the
            running total is strictly greater than ``p``.

    Returns:
        dict mapping each included model to the cumulative Akaike weight
        reached when it was added. Models are added from the largest
        weight down.
    """
    names, scores = zip(*aics)
    scores = array(scores)
    # Relative likelihood of each model with respect to the best (lowest) AIC.
    relative_likelihoods = exp(-(scores - scores.min())/2.)
    weights = relative_likelihoods / relative_likelihoods.sum()

    members = {}
    running_total = 0.
    # Ascending sort then reversal: ties on weight fall back to the model name.
    for weight, name in reversed(sorted(zip(weights, names))):
        running_total += weight
        members[name] = running_total
        if running_total > p:
            return members
    return members

def print_logs(log,  level='INFO'):
    """Render a log collection as Markdown.

    Emits the collection name as a level-5 heading, then the JSON-encoded
    ``message`` of every entry whose ``level`` field equals ``level``.
    """
    heading = '##### ' + log.name
    printmd(heading)
    for entry in log.find({'level':level}):
        encoded = dumps(entry['message'])
        printmd(encoded)
        
def printmd(string):
    """Display ``string`` as rendered Markdown in the notebook output.

    Approach taken from
    http://stackoverflow.com/questions/32026727/format-output-of-code-cell-with-markdown
    """
    rendered = Markdown(string)
    display(rendered)

### Load the Data

In [7]:
# Open the MongoDB connection shared by every cell below; no host or port is
# passed, so pymongo's defaults apply.
client = MongoClient()

In [8]:
# Model collections present in each dataset's database.
cols = ['GNC', 'Y98', 'CNFGTR']
# Database names, in the order they are reported.
datasets = ['mammals', 'ants', 'hum_xen_fug', 'introns']
# Display label for each database name.
ds_names = {
    'mammals': 'Mammals',
    'ants': 'Ants',
    'hum_xen_fug': 'Vertebrates',
    'introns': 'Primate Introns',
}

# Show the INFO-level log entries of the data import and of every model fit.
for dataset in datasets:
    printmd('#### ' + ds_names[dataset])
    database = getattr(client, dataset)
    for col in ['data'] + cols:
        log_collection = getattr(database, col + '.log')
        print_logs(log_collection)

#### Mammals

##### data.log

{"input_directory": "/short/xe9/bdk248/data/mammals", "log_level": "DEBUG", "output_collection": "mammals.data", "tree_file": "../config/mammals.nwk", "db_host": "r2081", "aln_length": 1500, "log_name": "log", "codon_position": -1}

{"mong": "0.0.10-dev", "cogent": "1.5.3-dev", "masterslave": "0.0.10-dev", "consume": "0.0.5-dev", "monglog": "0.0.1-dev"}

##### GNC.log

{"function": "ml.ml", "start_over": true, "log_level": "DEBUG", "no_mpi_main_loop": false, "kwargs_file": "../config/GNC.json", "input_collection": "mammals.data", "output_collection": "mammals.GNC", "output_collections_file": null, "output_collections": ["mammals.GNC"], "db_host": "r2081", "input_collections_file": null, "kwargs": {"model": "GNC"}, "log_name": "log", "input_collections": ["mammals.data"]}

{"mong": "0.0.10-dev", "monglog": "0.0.1-dev", "map_collection": "0.0.8-dev", "masterslave": "0.0.10-dev", "ml": "0.0.11-dev"}

##### Y98.log

{"function": "ml.ml", "start_over": true, "log_level": "DEBUG", "no_mpi_main_loop": false, "kwargs_file": "../config/Y98.json", "input_collection": "mammals.data", "output_collection": "mammals.Y98", "output_collections_file": null, "output_collections": ["mammals.Y98"], "db_host": "r2081", "input_collections_file": null, "kwargs": {"model": "Y98"}, "log_name": "log", "input_collections": ["mammals.data"]}

{"mong": "0.0.10-dev", "monglog": "0.0.1-dev", "map_collection": "0.0.8-dev", "masterslave": "0.0.10-dev", "ml": "0.0.11-dev"}

##### CNFGTR.log

{"function": "ml.ml", "start_over": false, "log_level": "DEBUG", "no_mpi_main_loop": false, "kwargs_file": "../config/CNFGTR.json", "input_collection": "mammals.data", "output_collection": "mammals.CNFGTR", "output_collections_file": null, "output_collections": ["mammals.CNFGTR"], "db_host": "r2081", "input_collections_file": null, "kwargs": {"model": "CNFGTR"}, "log_name": "log", "input_collections": ["mammals.data"]}

{"mong": "0.0.10-dev", "monglog": "0.0.1-dev", "map_collection": "0.0.8-dev", "masterslave": "0.0.10-dev", "ml": "0.0.11-dev"}

#### Ants

##### data.log

{"input_directory": "/short/xe9/bdk248/../data/ants", "log_level": "DEBUG", "output_collection": "ants.data", "tree_file": "../config/ants.nwk", "db_host": "r2081", "aln_length": 1500, "log_name": "log", "codon_position": -1}

{"mong": "0.0.10-dev", "cogent": "1.5.3-dev", "masterslave": "0.0.10-dev", "consume": "0.0.5-dev", "monglog": "0.0.1-dev"}

##### GNC.log

{"function": "ml.ml", "start_over": true, "log_level": "DEBUG", "no_mpi_main_loop": false, "kwargs_file": "../config/GNC.json", "input_collection": "ants.data", "output_collection": "ants.GNC", "output_collections_file": null, "output_collections": ["ants.GNC"], "db_host": "r2081", "input_collections_file": null, "kwargs": {"model": "GNC"}, "log_name": "log", "input_collections": ["ants.data"]}

{"mong": "0.0.10-dev", "monglog": "0.0.1-dev", "map_collection": "0.0.8-dev", "masterslave": "0.0.10-dev", "ml": "0.0.11-dev"}

##### Y98.log

{"function": "ml.ml", "start_over": false, "log_level": "DEBUG", "no_mpi_main_loop": false, "kwargs_file": "../config/Y98.json", "input_collection": "ants.data", "output_collection": "ants.Y98", "output_collections_file": null, "output_collections": ["ants.Y98"], "db_host": "r2081", "input_collections_file": null, "kwargs": {"model": "Y98"}, "log_name": "log", "input_collections": ["ants.data"]}

{"mong": "0.0.10-dev", "monglog": "0.0.1-dev", "map_collection": "0.0.8-dev", "masterslave": "0.0.10-dev", "ml": "0.0.11-dev"}

##### CNFGTR.log

{"function": "ml.ml", "start_over": true, "log_level": "DEBUG", "no_mpi_main_loop": false, "kwargs_file": "../config/CNFGTR.json", "input_collection": "ants.data", "output_collection": "ants.CNFGTR", "output_collections_file": null, "output_collections": ["ants.CNFGTR"], "db_host": "r2081", "input_collections_file": null, "kwargs": {"model": "CNFGTR"}, "log_name": "log", "input_collections": ["ants.data"]}

{"mong": "0.0.10-dev", "monglog": "0.0.1-dev", "map_collection": "0.0.8-dev", "masterslave": "0.0.10-dev", "ml": "0.0.11-dev"}

#### Vertebrates

##### data.log

{"input_directory": "/short/xe9/bdk248/../data/hum_xen_fug", "log_level": "DEBUG", "output_collection": "hum_xen_fug.data", "tree_file": "../config/hum_xen_fug.nwk", "db_host": "r2081", "aln_length": 1500, "log_name": "log", "codon_position": -1}

{"mong": "0.0.10-dev", "cogent": "1.5.3-dev", "masterslave": "0.0.10-dev", "consume": "0.0.5-dev", "monglog": "0.0.1-dev"}

##### GNC.log

{"function": "ml.ml", "start_over": true, "log_level": "DEBUG", "no_mpi_main_loop": false, "kwargs_file": "../config/GNC.json", "input_collection": "hum_xen_fug.data", "output_collection": "hum_xen_fug.GNC", "output_collections_file": null, "output_collections": ["hum_xen_fug.GNC"], "db_host": "r2081", "input_collections_file": null, "kwargs": {"model": "GNC"}, "log_name": "log", "input_collections": ["hum_xen_fug.data"]}

{"mong": "0.0.10-dev", "monglog": "0.0.1-dev", "map_collection": "0.0.8-dev", "masterslave": "0.0.10-dev", "ml": "0.0.11-dev"}

##### Y98.log

{"function": "ml.ml", "start_over": true, "log_level": "DEBUG", "no_mpi_main_loop": false, "kwargs_file": "../config/Y98.json", "input_collection": "hum_xen_fug.data", "output_collection": "hum_xen_fug.Y98", "output_collections_file": null, "output_collections": ["hum_xen_fug.Y98"], "db_host": "r2081", "input_collections_file": null, "kwargs": {"model": "Y98"}, "log_name": "log", "input_collections": ["hum_xen_fug.data"]}

{"mong": "0.0.10-dev", "monglog": "0.0.1-dev", "map_collection": "0.0.8-dev", "masterslave": "0.0.10-dev", "ml": "0.0.11-dev"}

##### CNFGTR.log

{"function": "ml.ml", "start_over": true, "log_level": "DEBUG", "no_mpi_main_loop": false, "kwargs_file": "../config/CNFGTR.json", "input_collection": "hum_xen_fug.data", "output_collection": "hum_xen_fug.CNFGTR", "output_collections_file": null, "output_collections": ["hum_xen_fug.CNFGTR"], "db_host": "r2081", "input_collections_file": null, "kwargs": {"model": "CNFGTR"}, "log_name": "log", "input_collections": ["hum_xen_fug.data"]}

{"mong": "0.0.10-dev", "monglog": "0.0.1-dev", "map_collection": "0.0.8-dev", "masterslave": "0.0.10-dev", "ml": "0.0.11-dev"}

#### Primate Introns

##### data.log

{"input_directory": "/short/xe9/bdk248/data/human_macaque_marmoset_introns/data/introns", "log_level": "DEBUG", "output_collection": "introns.data", "tree_file": "../config/introns.nwk", "db_host": "r2081", "aln_length": 1500, "log_name": "log", "codon_position": -1}

{"mong": "0.0.10-dev", "cogent": "1.5.3-dev", "masterslave": "0.0.10-dev", "consume": "0.0.5-dev", "monglog": "0.0.1-dev"}

##### GNC.log

{"function": "ml.ml", "start_over": true, "log_level": "DEBUG", "no_mpi_main_loop": false, "kwargs_file": "../config/GNC_no_stop.json", "input_collection": "introns.data", "output_collection": "introns.GNC", "output_collections_file": null, "output_collections": ["introns.GNC"], "db_host": "r2081", "input_collections_file": null, "kwargs": {"model": "GNC", "gc": "FFLLSSSSYYZOCCUWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"}, "log_name": "log", "input_collections": ["introns.data"]}

{"mong": "0.0.10-dev", "monglog": "0.0.1-dev", "map_collection": "0.0.8-dev", "masterslave": "0.0.10-dev", "ml": "0.0.11-dev"}

##### Y98.log

{"function": "ml.ml", "start_over": false, "log_level": "DEBUG", "no_mpi_main_loop": false, "kwargs_file": "../config/Y98_no_stop.json", "input_collection": "introns.data", "output_collection": "introns.Y98", "output_collections_file": null, "output_collections": ["introns.Y98"], "db_host": "r2081", "input_collections_file": null, "kwargs": {"model": "Y98", "gc": "FFLLSSSSYYZOCCUWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"}, "log_name": "log", "input_collections": ["introns.data"]}

{"mong": "0.0.10-dev", "monglog": "0.0.1-dev", "map_collection": "0.0.8-dev", "masterslave": "0.0.10-dev", "ml": "0.0.11-dev"}

##### CNFGTR.log

{"function": "ml.ml", "start_over": true, "log_level": "DEBUG", "no_mpi_main_loop": false, "kwargs_file": "../config/CNFGTR_no_stop.json", "input_collection": "introns.data", "output_collection": "introns.CNFGTR", "output_collections_file": null, "output_collections": ["introns.CNFGTR"], "db_host": "r2081", "input_collections_file": null, "kwargs": {"model": "CNFGTR", "gc": "FFLLSSSSYYZOCCUWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"}, "log_name": "log", "input_collections": ["introns.data"]}

{"mong": "0.0.10-dev", "monglog": "0.0.1-dev", "map_collection": "0.0.8-dev", "masterslave": "0.0.10-dev", "ml": "0.0.11-dev"}

{"function": "ml.ml", "start_over": false, "log_level": "DEBUG", "no_mpi_main_loop": false, "kwargs_file": "../config/CNFGTR_no_stop.json", "input_collection": "introns.data", "output_collection": "introns.CNFGTR", "output_collections_file": null, "output_collections": ["introns.CNFGTR"], "db_host": "r2081", "input_collections_file": null, "kwargs": {"model": "CNFGTR", "gc": "FFLLSSSSYYZOCCUWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"}, "log_name": "log", "input_collections": ["introns.data"]}

{"mong": "0.0.10-dev", "monglog": "0.0.1-dev", "map_collection": "0.0.8-dev", "masterslave": "0.0.10-dev", "ml": "0.0.11-dev"}

In [9]:
# Per-dataset, per-model results, keyed first by display label then by model:
#   aic -> {_id: AIC}, sat -> set of saturated _ids, hup -> set of hard_up _ids
aic, sat, hup = defaultdict(dict), defaultdict(dict), defaultdict(dict)
for dataset in datasets:
    label = ds_names[dataset]
    database = getattr(client, dataset)
    for col in cols:
        actual_col = getattr(database, col)
        aic[label][col] = get_aic(actual_col)
        sat[label][col] = set(get_saturated(actual_col))
        hup[label][col] = set(get_hard_up(actual_col))

### Display the AIC Winners Table

In [10]:
# Count, for every dataset, how many alignments each model wins on AIC.
rows = defaultdict(list)
labels = []
for dataset in datasets:
    labels.append(ds_names[dataset])
    models = aic[labels[-1]]

    score = Counter()
    # Alignment ids are taken from the first model's results; every other
    # model is indexed by the same ids below.
    # next(iter(...)) replaces the Python-2-only ``models.values()[0]``.
    for tag in next(iter(models.values())):
        scores = [(models[m][tag], m) for m in models]
        # Lowest AIC wins; ties fall back to the model name.
        score[min(scores)[1]] += 1
    # Append one entry per model, including models that never won (Counter
    # gives 0), so each list in ``rows`` stays aligned with ``labels``.
    # Iterating over ``score.items()`` silently skipped zero-win models.
    for model in models:
        rows[model].append(score[model])

In [11]:
# Raw win counts: one row per dataset, one column per model.
win_counts = DataFrame(rows, index=labels, columns=['GNC', 'CNFGTR', 'Y98'])
# Divide each row by its total and scale to percentages.
df = win_counts.div(win_counts.sum(axis=1), axis=0) * 100
df

Unnamed: 0,GNC,CNFGTR,Y98
Mammals,59.841545,37.732112,2.426343
Ants,79.183267,16.384462,4.432271
Vertebrates,52.739044,37.749004,9.511952
Primate Introns,90.739892,8.700834,0.559274


In [12]:
# Emit the table as LaTeX with one decimal place and a literal percent sign.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(df.to_latex(float_format=lambda x: '%.1f%%' % x))

\begin{tabular}{lrrr}
\toprule
{} &   GNC &  CNFGTR &  Y98 \\
\midrule
Mammals         & 59.8\% &   37.7\% & 2.4\% \\
Ants            & 79.2\% &   16.4\% & 4.4\% \\
Vertebrates     & 52.7\% &   37.7\% & 9.5\% \\
Primate Introns & 90.7\% &    8.7\% & 0.6\% \\
\bottomrule
\end{tabular}



### Display the AIC CI Membership Table

In [13]:
# For every dataset, the percentage of alignments whose Akaike confidence set
# (default p of confidence_set) contains each model.
rows = defaultdict(list)
labels = []
for dataset in datasets:
    labels.append(ds_names[dataset])
    models = aic[labels[-1]]

    # Alignment ids are taken from the first model's results.
    # next(iter(...)) replaces the Python-2-only ``models.values()[0]``.
    tags = next(iter(models.values()))
    n_tags = len(tags)

    score = Counter()
    for tag in tags:
        scores = [(m, models[m][tag]) for m in models]
        for model in confidence_set(scores):
            score[model] += 1
    # Append one entry per model, including models that were never in a
    # confidence set (Counter gives 0), so each list in ``rows`` stays
    # aligned with ``labels``.
    for model in models:
        if n_tags:
            rows[model].append(score[model]/float(n_tags)*100)
        else:
            # No alignments at all: the percentage is undefined.
            rows[model].append(float('nan'))

In [14]:
# Confidence-set membership percentages: one row per dataset, one column per model.
column_order = ['GNC', 'CNFGTR', 'Y98']
df = DataFrame(rows, index=labels, columns=column_order)
df

Unnamed: 0,GNC,CNFGTR,Y98
Mammals,63.82768,43.550384,4.704135
Ants,82.619522,21.364542,7.071713
Vertebrates,55.179283,42.280876,13.944223
Primate Introns,93.42624,11.698909,1.347758


In [15]:
# Emit the membership table as LaTeX with one decimal place and a percent sign.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(df.to_latex(float_format=lambda x: '%.1f%%' % x))

\begin{tabular}{lrrr}
\toprule
{} &   GNC &  CNFGTR &   Y98 \\
\midrule
Mammals         & 63.8\% &   43.6\% &  4.7\% \\
Ants            & 82.6\% &   21.4\% &  7.1\% \\
Vertebrates     & 55.2\% &   42.3\% & 13.9\% \\
Primate Introns & 93.4\% &   11.7\% &  1.3\% \\
\bottomrule
\end{tabular}



## Get Some Handy Stats

In [17]:
# Per-dataset summary: number of fitted alignments and median alignment length.
rows = defaultdict(list)
labels = []
bl = get_blacklist()
for dataset in datasets:
    labels.append(ds_names[dataset])
    database = getattr(client, dataset)
    # Use the first model explicitly as the reference count. This line used to
    # read a stale ``col`` left over from an earlier loop, so the result
    # depended on execution order and the first model was never checked.
    fit_count = getattr(database, cols[0]).count({'_id': {'$nin': bl}})
    rows['Number of Alignments'].append(fit_count)
    # Every other model must have fitted the same number of alignments.
    for col in cols[1:]:
        other_count = getattr(database, col).count({'_id': {'$nin': bl}})
        assert fit_count == other_count, (
            '%s: %s has %d fits but %s has %d'
            % (dataset, cols[0], fit_count, col, other_count))
    aln_length = median([d['lf']['aln_length']
                         for d in getattr(database, cols[0]).find({'_id': {'$nin': bl}})])
    rows['Median Alignment Length (nt)'].append(aln_length)

In [18]:
# Summary statistics table: one row per dataset.
df = DataFrame(data=rows, index=labels)
df

Unnamed: 0,Median Alignment Length (nt),Number of Alignments
Mammals,2256.0,4039
Ants,2257.5,2008
Vertebrates,2376.0,2008
Primate Introns,10113.0,10907


In [19]:
# Emit the summary statistics table as LaTeX.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(df.to_latex())

\begin{tabular}{lrr}
\toprule
{} &  Median Alignment Length (nt) &  Number of Alignments \\
\midrule
Mammals         &                        2256.0 &                  4039 \\
Ants            &                        2257.5 &                  2008 \\
Vertebrates     &                        2376.0 &                  2008 \\
Primate Introns &                       10113.0 &                 10907 \\
\bottomrule
\end{tabular}



In [20]:
# Number of saturated fits per model, for every dataset except the last label.
rows = defaultdict(list)
shown_labels = labels[:-1]
for label in shown_labels:
    saturated_by_model = sat[label]
    for col in cols:
        rows[col].append(len(saturated_by_model[col]))
df = DataFrame(rows, index=shown_labels, columns=['GNC', 'CNFGTR', 'Y98'])
df

Unnamed: 0,GNC,CNFGTR,Y98
Mammals,0,1,9
Ants,0,0,0
Vertebrates,0,668,1109


In [21]:
# Emit the saturation counts table as LaTeX.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(df.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  GNC &  CNFGTR &   Y98 \\
\midrule
Mammals     &    0 &       1 &     9 \\
Ants        &    0 &       0 &     0 \\
Vertebrates &    0 &     668 &  1109 \\
\bottomrule
\end{tabular}



In [22]:
# Number of fits flagged ``hard_up`` per model, for every dataset.
rows = defaultdict(list)
for label in labels:
    hard_up_by_model = hup[label]
    for col in cols:
        rows[col].append(len(hard_up_by_model[col]))
df = DataFrame(rows, index=labels, columns=['GNC', 'CNFGTR', 'Y98'])
df

Unnamed: 0,GNC,CNFGTR,Y98
Mammals,3924,218,206
Ants,1898,64,59
Vertebrates,1995,21,22
Primate Introns,3901,271,264


In [24]:
# For each dataset, count the non-blacklisted input documents whose ``aln``
# text contains exactly three '>' characters.
# NOTE(review): presumably one '>' per FASTA sequence header -- confirm.
for dataset in datasets:
    three_seq_count = sum(d['aln'].count('>') == 3
                          for d in getattr(client, dataset).data.find({'_id': {'$nin': bl}}))
    # One formatted string keeps the output identical on Python 2 and 3.
    print('%s %s' % (ds_names[dataset], three_seq_count))

Mammals 4039
Ants 2023
Vertebrates 2008
Primate Introns 10907


In [26]:
# Number of WARNING-level entries in the ants Y98 log, skipping blacklisted ids.
warning_query = {'level':'WARNING', '_id': {'$nin': bl}}
warning_entries = list(client.ants.Y98.log.find(warning_query))
len(warning_entries)

15

In [28]:
# Sanity check: the 2023 ant alignments counted above minus the 15 Y98 WARNING
# entries gives 2008, the number of fitted ant alignments reported earlier.
2023-15

2008