In [2]:
import os
import sys
import glob
import scipy
import pickle
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline
# Plot styling and DataFrame display limits for the whole notebook
sns.set_style('whitegrid')
pd.set_option('display.max_rows', 100)
# fonttype 42 = TrueType fonts in PS/PDF output (matplotlib rcParam)
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['pdf.fonttype'] = 42
pd.set_option('display.max_columns', 100)

# Put a local script directory on the import path.
# NOTE(review): no visible cell imports anything from it - confirm it is still needed
sys.path.append('/home/mattolm/Bio_scripts/')

from IPython.display import display, HTML

# List locations and names of fastANI outputs

In [3]:
# `names` and `flocs` are parallel lists: names[i] is the dataset label for the
# fastANI output at flocs[i]. They are zipped together when the tables are loaded,
# so the two lists must stay the same length and in the same order.
names = ['refseq', 'infant', 'ocean', 'soil']
flocs = [
'/data1/bio_db/refseq/analysis/SubsetClustering/ListTheta/clusterTesting/fastANI/RefSeqTheta.FastANI.out',
'/data8/Human/NIH_4/MethodDevelopment/fastANI/delta.FastANI.out',
'/data1/bio_db/refseq/analysis/fastANI/oceanGenomeListKappa.FastANI.out',
'/data1/bio_db/refseq/analysis/fastANI/sigmaSoilList.FastANI.out'
]

# Load FastANI outputs in a standardized way

In [4]:
def load_fastani(loc):
    '''
    Load one fastANI output table and collapse it to a single row per genome pair

    loc = location of fastANI output (whitespace-delimited, no header row; the five
          columns are read as genome1, genome2, ANI, j1, j2)

    Returns a DataFrame with the columns genome1, genome2, fast_ani and
    fastani_fracAligned, after passing through standardize()
    '''
    # Read from the `loc` argument itself rather than from a notebook-level variable,
    # so the function gives the right answer no matter which cell calls it
    fdb = pd.read_csv(loc, names=['genome1', 'genome2', 'ANI', 'j1', 'j2'], sep=r'\s+')

    # Keep only the file name of each genome path
    for c in ['genome1', 'genome2']:
        fdb[c] = fdb[c].map(os.path.basename)

    # NOTE(review): j1 and j2 are presumably fastANI's matched-fragment and
    # total-fragment counts - confirm against the fastANI output description
    fdb['fastani_fracAligned'] = fdb['j1'] / fdb['j2']

    # Rescale the reported ANI by 1/100 (percent -> fraction)
    fdb['fast_ani'] = fdb['ANI'] / 100

    fdb = fdb[['genome1', 'genome2', 'fast_ani', 'fastani_fracAligned']]

    return standardize(fdb, vals=['fast_ani', 'fastani_fracAligned'])

def standardize(db, g1='genome1', g2='genome2', vals=['fast_ani', 'fastani_fracAligned']):
    '''
    Standardize such that only one row per genome combination remains

    First, merge / concatenate everything together such that both fwd and reverse are present
    Second, subset so that only comparisons where g1 >= g2 are retained (self-comparisons are kept)

    db   = DataFrame with one row per directed comparison
    g1   = name of the column holding the first genome
    g2   = name of the column holding the second genome
    vals = names of the value columns to average over the two directions

    Returns a new DataFrame (db itself is not modified) in which each value column holds
    the mean of the forward and reverse measurement, or the single available one when
    only one direction was measured.

    Raises AssertionError if a value is missing in both directions for some pair
    '''
    # The default list is only read, never mutated; work on a private copy anyway
    vals = list(vals)
    rev_names = ['rev_{0}'.format(v) for v in vals]

    # Step 1- merge

    # Reverse database: swap the genome columns and mark the values as "reverse"
    rdb = db.rename(columns={g1: g2, g2: g1})
    rdb = rdb.rename(columns=dict(zip(vals, rev_names)))

    # After the outer merge every (g1, g2) key carries whatever was measured in each direction
    cdb = pd.merge(db, rdb, on=[g1, g2], how='outer')

    # Average the two directions column-wise; skipna makes a lone value stand for the pair
    averaged = {}
    for v, rev in zip(vals, rev_names):
        both = cdb[[v, rev]]
        if both.isna().all(axis=1).any():
            raise AssertionError('{0} is missing in both directions for at least one pair'.format(v))
        averaged[v] = both.mean(axis=1, skipna=True)

    # Step 2- subset

    # Keep one orientation of every pair
    keep = cdb[g1] >= cdb[g2]
    out = cdb.loc[keep].drop(columns=vals + rev_names)

    # Put the averaged values back under the original column names (appended after the keys)
    for v in vals:
        out[v] = averaged[v][keep]

    return out.drop_duplicates(subset=[g1, g2])

def make_av(v1, v2):
    '''
    Combine two measurements of the same quantity.

    Both present -> their mean. Exactly one present -> that one, unchanged.
    Neither present (both NA) -> the assertion fails.
    '''
    if not np.isnan(v1):
        # v1 is usable; it stands alone when v2 is missing
        if np.isnan(v2):
            return v1
        return np.mean([v1, v2])

    # v1 is missing, so v2 has to carry the value
    assert not np.isnan(v2)
    return v2

In [5]:
# Load every fastANI table, tag its rows with the dataset name, and stack them once.
# Collecting the frames in a list and concatenating at the end avoids re-copying the
# growing table on every iteration (and DataFrame.append no longer exists in pandas 2).
loaded = []
for name, floc in zip(names, flocs):
    print("Loading {0}".format(name))
    db = load_fastani(floc)
    db['method'] = name
    loaded.append(db)

# Each table keeps its own row index, so index labels repeat across datasets
Fdb = pd.concat(loaded)

Fdb['method'] = Fdb['method'].astype('category')

Loading refseq
Loading infant
Loading ocean
Loading soil


## Print an overview of how many comparisons were successfully loaded for each dataset (the `method` column)

In [6]:
# Number of genome pairs per dataset ('method' holds the dataset name set at load time)
Fdb['method'].value_counts()

soil      926259
refseq    485506
infant    284017
ocean      25393
Name: method, dtype: int64

## Load information on which RefSeq genomes belong to the same species

In [11]:
# RefSeq metadata table; its 'assembly_accession' and 'species' columns are used below
GLdb = pd.read_csv('/data1/bio_db/refseq/parsed/data_tables/refseq_info_v1.2.csv')

# Build the genome file name for every assembly, then a file name -> species lookup
GLdb['genome'] = GLdb['assembly_accession'].map(lambda acc: acc + '.fasta')
g2s = dict(zip(GLdb['genome'], GLdb['species']))

# Look up the species on both sides of each comparison without adding helper columns
species1 = Fdb['genome1'].map(g2s)
species2 = Fdb['genome2'].map(g2s)

# Element-by-element Python equality: genomes missing from the lookup come back as
# NaN, and NaN never equals NaN, so such pairs are recorded as False
Fdb['same_species'] = [sp1 == sp2 for sp1, sp2 in zip(species1, species2)]

## Load completeness and contamination information from dRep runs

In [12]:
dlocs = ['/data1/bio_db/refseq/analysis/MAGlists_2/goANI_soilList/data_tables/genomeInfo.csv',
        '/data1/bio_db/refseq/analysis/MAGlists_2/goANI_oceanList/data_tables/genomeInfo.csv',
        '/data1/bio_db/refseq/analysis/MAGlists_2/goANI_infantList/data_tables/genomeInfo.csv']

# Gather genome -> contamination and genome -> completeness across all dRep tables;
# a genome listed in more than one table takes the value from the last table read
G2con = {}
G2comp = {}
for floc in dlocs:
    gdb = pd.read_csv(floc)
    by_genome = gdb.set_index('genome')
    g2con = by_genome['contamination'].to_dict()
    g2comp = by_genome['completeness'].to_dict()

    G2con.update(g2con)
    G2comp.update(g2comp)

# Attach both quality values to both sides of every comparison
# (column order: g1_con, g2_con, g1_comp, g2_comp)
for suffix, lookup in (('con', G2con), ('comp', G2comp)):
    for side in ('1', '2'):
        Fdb['g{0}_{1}'.format(side, suffix)] = Fdb['genome{0}'.format(side)].map(lookup)

# Every soil / ocean / infant comparison must have all four quality values
quality_cols = ['g1_con', 'g2_con', 'g1_comp', 'g2_comp']
from_mag_set = Fdb['method'].isin(['soil', 'ocean', 'infant'])
lacks_quality = Fdb[quality_cols].isna().any(axis=1)
assert not (from_mag_set & lacks_quality).any()

# Make RefSeq just 100% complete: whatever is still missing gets 0 contamination
# and 100 completeness
for t, fill in (('g1_con', 0), ('g2_con', 0), ('g1_comp', 100), ('g2_comp', 100)):
    Fdb[t] = Fdb[t].fillna(fill)


## Filter genome set

In [17]:
# Quality thresholds applied to BOTH genomes of a comparison
COMP = 70   # minimum completeness
CON = 5     # maximum contamination

complete_enough = (Fdb['g1_comp'] >= COMP) & (Fdb['g2_comp'] >= COMP)
clean_enough = (Fdb['g1_con'] <= CON) & (Fdb['g2_con'] <= CON)

# Keep the passing comparisons and drop the quality columns from the result
FFdb = Fdb[complete_enough & clean_enough].drop(
    columns=['g1_comp', 'g1_con', 'g2_comp', 'g2_con'])
    

## Save

In [19]:
# Preview the first rows of the filtered table before it is written out
FFdb.head()

Unnamed: 0,genome1,genome2,fast_ani,fastani_fracAligned,method,same_species
0,GCF_001280225.1.fasta,GCF_001280225.1.fasta,1.0,1.0,refseq,True
1,GCF_001280225.1.fasta,GCF_000521585.1.fasta,0.794963,0.607486,refseq,True
2,GCF_001280225.1.fasta,GCF_000225465.1.fasta,0.794084,0.571662,refseq,True
3,GCF_001280225.1.fasta,GCF_000009605.1.fasta,0.79386,0.547941,refseq,True
4,GCF_001280225.1.fasta,GCF_000183285.1.fasta,0.791693,0.564687,refseq,True


In [20]:
# Write the filtered comparisons to disk. The row index is written too (no index=False).
# NOTE(review): no compression argument is passed, so whether the file is really gzipped
# depends on how the installed pandas treats the '.gz' suffix - confirm
FFdb.to_csv('/data1/bio_db/refseq/analysis/Manuscript/github_methods/bacterialEvolutionMetrics/DataTables/FastANI_comps.csv.gz')
