In [1]:
import os
import sys
import glob
import scipy
import pickle
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline
# Session-wide plotting and display configuration.
# seaborn: draw figures on a white background with grid lines.
sns.set_style('whitegrid')
# pandas: show up to 100 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
# matplotlib: font type 42 for PostScript and PDF output (TrueType embedding
# per matplotlib's rcParams documentation), which keeps text editable in
# exported figures.
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['pdf.fonttype'] = 42
# pandas: show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

sys.path.append('/home/mattolm/Bio_scripts/')
import StatsTools

from IPython.display import display, HTML

from Bio import SeqIO

## Load FastANI data

In [3]:
# Read the FastANI comparison table and key it by the (genome1, genome2) pair
# so that a pair of genomes can be looked up directly with .loc.
FAdb = (
    pd.read_csv('/data1/bio_db/refseq/analysis/Manuscript/github_methods/bacterialEvolutionMetrics/DataTables/FastANI_comps.csv.gz')
    .set_index(['genome1', 'genome2'])
)


In [5]:
def _get_ani(FAdb, g1, g2):
    try:
        db = FAdb.loc[g1, g2]
    except:
        db = FAdb.loc[g2, g1]
    
    return db['fast_ani']

## Load MAG data with HGT

This is generated by first running dRep in "goANI" mode, and then using the python script "dnds_from_drep.py" (located in this directory) on the resulting dRep folder. 

In [6]:
from tqdm import tqdm

def summarize_dnds_HGT(db, noFilt=False, fastANI=None):
    """Summarize the per-gene comparisons of one genome pair as a one-row table.

    Args:
        db: DataFrame holding the comparisons for a single reference/querry
            pair. Columns read: 'reference', 'querry', 'N_sites', 'N_changed',
            'S_sites', 'S_changed', 'al_len', 'qry_len' and 'p_inden'.
        noFilt: when True, every alignment longer than 500 is counted; when
            False, the alignment must also be shorter than 1500.
        fastANI: FastANI value of the pair; required, and must be below 1.

    Returns:
        One-row DataFrame with the comparison counts, the pooled dN/dS, the
        observed and expected numbers of identical alignments, the
        length-weighted identity ('filtered_ani') and the aligned fraction
        ('filtered_af'). When no alignment passes the length filter,
        'percent_enriched' is 0 and the two 'filtered_*' values are 0
        (noFilt=False) or NaN (noFilt=True).

    Raises:
        AssertionError: if fastANI is None or is not below 1.
    """
    table = defaultdict(list)

    # Basic info
    table['reference'].append(db['reference'].tolist()[0])
    table['querry'].append(db['querry'].tolist()[0])

    # Validate fast_ani. The None check must come first: `None < 1` raises
    # TypeError, which would hide the intended assertion.
    assert fastANI is not None
    assert fastANI < 1, table

    # Number of comparisons
    successful = db[db['N_sites'] > 0]
    table['total_comps'].append(len(db))
    table['failed_comps'].append(len(db[db['N_sites'] == 0]))
    table['successful_comps'].append(len(successful))
    table['considered_bases'].append(successful['al_len'].sum())
    table['percet_successful'].append((len(successful) / len(db)) * 100)

    # Everything below only looks at successful comparisons
    db = successful

    # dN/dS, pooled over all successful comparisons
    N = db['N_sites'].sum()
    table['N_sites'].append(N)
    dN = db['N_changed'].sum()
    table['N_changed'].append(dN)
    S = db['S_sites'].sum()
    table['S_sites'].append(S)
    dS = db['S_changed'].sum()
    table['S_changed'].append(dS)
    if N > 0 and S > 0 and dS > 0:
        table['dN/dS'].append((dN / N) / (dS / S))
    else:
        # The ratio is undefined when any denominator would be zero.
        table['dN/dS'].append(np.nan)

    table['fast_ani'].append(fastANI)

    # Filter to only ones with a full alignment
    if noFilt:
        hdb = db[(db['al_len'] > 500)]
    else:
        hdb = db[(db['al_len'] > 500) & (db['al_len'] < 1500)]

    n_identical = len(hdb[hdb['p_inden'] > 99.99999999])
    ex_iden = _calc_expected(hdb, fastANI)
    table['counted_comps'].append(len(hdb))
    table['identical_comps'].append(n_identical)
    table['expected_identicle'].append(ex_iden)

    # Compute all three values before appending any of them, so a failure can
    # never leave the columns with different lengths.
    if len(hdb) > 0:
        percent_enriched = ((n_identical - int(ex_iden)) / len(hdb)) * 100
        # Identity averaged over alignments, weighted by alignment length.
        filtered_ani = sum(p * l for p, l in zip(hdb['p_inden'], hdb['al_len'])) \
                / hdb['al_len'].sum()
        # NOTE(review): this uses every successful comparison (db), not the
        # length-filtered subset (hdb) -- confirm that is intended.
        filtered_af = db['al_len'].sum() / db['qry_len'].sum()
    else:
        # Nothing passed the length filter, so the ratios are undefined.
        percent_enriched = 0
        # Emit NaN rather than omitting the columns when noFilt is set, so
        # every summary row has the same schema.
        filtered_ani = filtered_af = np.nan if noFilt else 0

    table['percent_enriched'].append(percent_enriched)
    table['filtered_ani'].append(filtered_ani)
    table['filtered_af'].append(filtered_af)

    return pd.DataFrame(table)

def _calc_expected(db, ani):
    expected = 0
    prob_diff = (1-ani) / 1
    prob_same = ani# / 100
    for i, row in db.iterrows():
        # probability of this row being 0
        p = prob_same ** int(row['al_len'])
        if p > 1:
            expected += 1
        else:
            expected += p
    return expected

def parse_to_genomeLevel_HGT(Ddb, noFilt=False, min_comps=500):
    """Build one summary row per (querry, reference) genome pair.

    Args:
        Ddb: DataFrame of per-gene comparisons with 'querry' and 'reference'
            columns, plus the columns summarize_dnds_HGT reads.
        noFilt: forwarded to summarize_dnds_HGT.
        min_comps: pairs with fewer comparison rows than this are skipped.

    Returns:
        The concatenated one-row summaries, or an empty DataFrame when no
        pair has at least `min_comps` rows.

    Note:
        FastANI values are read from the notebook-level `FAdb` table. A pair
        missing from that table gets fastANI = 0.
    """
    Tdbs = []
    for query, qdb in tqdm(Ddb.groupby('querry')):
        for ref, db in qdb.groupby('reference'):

            if len(db) < min_comps:
                continue

            # get the FastANI
            try:
                fastANI = _get_ani(FAdb, ref, query)
            except KeyError:
                # Pair absent from the FastANI table: fall back to 0 rather
                # than dropping the pair. Other errors are not masked.
                fastANI = 0

            Tdbs.append(summarize_dnds_HGT(db, noFilt=noFilt, fastANI=fastANI))

    # pd.concat raises on an empty list; return an empty table instead.
    if not Tdbs:
        return pd.DataFrame()
    return pd.concat(Tdbs)

In [7]:
def load_dnds(wd_loc):
    """Load the detailed dN/dS table from a work directory and summarize it.

    Args:
        wd_loc: work directory containing
            'data/dNdS_data/detailed_dNdS_info.csv' (with or without a
            trailing slash).

    Returns:
        The genome-pair summary produced by
        parse_to_genomeLevel_HGT(..., noFilt=True).

    Raises:
        FileNotFoundError: if the detailed dN/dS CSV does not exist.
    """
    baseloc = os.path.join(wd_loc, 'data/dNdS_data/detailed_dNdS_info.csv')
    if not os.path.exists(baseloc):
        # Fail with the real cause; previously this printed "ERROR" and then
        # crashed on an unbound local variable.
        raise FileNotFoundError(
            "detailed dN/dS table not found: {0}".format(baseloc))

    detailed = pd.read_csv(baseloc)
    return parse_to_genomeLevel_HGT(detailed, noFilt=True)

# Summarize every genome-set work directory and stack the results in Tdb.
# The per-directory tables are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0.
_method_tables = []
for floc in ['/data1/bio_db/refseq/analysis/MAGlists_2/goANI_oceanList/']:
    tdb = load_dnds(floc)
    # Label rows with the part of the directory name after the first
    # underscore ('goANI_oceanList' -> 'oceanList'), taken from the last path
    # component so it does not depend on how deep the path is.
    tdb['method'] = os.path.basename(os.path.normpath(floc)).split('_')[1]
    _method_tables.append(tdb)
Tdb = pd.concat(_method_tables)

100%|██████████| 307/307 [02:18<00:00,  2.22it/s]


# Save to run popCOGent

In [15]:
# Write the genome-pair summary table (Tdb) to disk as the input for the
# popCOGent run. The DataFrame index is written too, since index=False is
# not passed.
Tdb.to_csv('/data1/bio_db/refseq/analysis/Manuscript/github_methods/bacterialEvolutionMetrics/DataTables/OceanList_for_popCOGent.csv.gz')


# Load data from popCOGent

See notebook 4 for details on how popCOGent was run

In [16]:
# Read the popCOGent results table.
# NOTE(review): the output recorded for this cell is a FileNotFoundError for
# this path, and the later merge cell needs POPdb -- confirm where the results
# file lives before re-running.
POPdb = pd.read_csv('/data1/bio_db/refseq/analysis/Manuscript/github_methods/bacterialEvolutionMetrics/DataTables/oceanList.RESULTS.csv')
# Rename the strain columns to the 'reference' / 'querry' names that the merge
# with Tdb joins on.
POPdb = POPdb.rename(columns={'Strain 1':'reference', 'Strain 2':'querry'})
# Bare expression: shows the table as the cell output.
POPdb


FileNotFoundError: [Errno 2] File b'/data1/bio_db/refseq/analysis/Manuscript/github_methods/bacterialEvolutionMetrics/DataTables/oceanList.RESULTS.csv' does not exist: b'/data1/bio_db/refseq/analysis/Manuscript/github_methods/bacterialEvolutionMetrics/DataTables/oceanList.RESULTS.csv'

## Merge

In [12]:
# Attach the popCOGent columns to each genome-pair summary row. The left join
# keeps every row of Tdb, whether or not popCOGent reported that pair.
Mdb = Tdb.merge(POPdb, on=['reference', 'querry'], how='left')
# Cast the SSD column to float and rescale fast_ani by 100.
Mdb = Mdb.assign(**{
    'Observed SSD': Mdb['Observed SSD'].astype(float),
    'fast_ani': Mdb['fast_ani'] * 100,
})

In [13]:
# Express the expected / observed / enriched identical-alignment counts as a
# percentage of the alignments that were counted.
# A pair with no counted alignments gets NaN: the previous row-by-row Python
# division raised ZeroDivisionError for such a row and aborted the cell.
counted = Mdb['counted_comps'].where(Mdb['counted_comps'] != 0)
Mdb['percent_expected'] = (Mdb['expected_identicle'] / counted) * 100
Mdb['percent_identicle'] = (Mdb['identical_comps'] / counted) * 100
# Overwrites the 'percent_enriched' column from the summary step with a value
# based on the untruncated expected count.
Mdb['percent_enriched'] = ((Mdb['identical_comps'] - Mdb['expected_identicle']) / counted) * 100
# Keep only genome pairs with more than 1000 total comparisons.
Mdb = Mdb[Mdb['total_comps'] > 1000]

## Save

NOTE! To generate the file "GenomeEvoMetrics.csv.gz" that accompanies this notebook, this procedure was run on each of the 4 genome sets and the results were combined.

In [14]:
#Mdb.to_csv('Output.csv.gz')

Unnamed: 0,reference,querry,total_comps,failed_comps,successful_comps,considered_bases,percet_successful,N_sites,N_changed,S_sites,S_changed,dN/dS,fast_ani,counted_comps,identical_comps,expected_identicle,percent_enriched,filtered_ani,filtered_af,method,Initial divergence,Alignment size,Genome 1 size,Genome 2 size,Observed SSD,SSD 95 CI low,SSD 95 CI high,percent_expected,percent_identicle
0,TOBG_CPC-14.fna,TOBG_ARS-115.fna,1636,264,1372,1349279,83.863081,1.045152e+06,36790.166667,3.064563e+05,142979.833333,0.075448,89.17415,1071,6,1.127393e-24,5.602241e-01,86.672973,0.988352,oceanList,0.107042,1488349,2765177,2466433,5.973097,4.593316,7.743177,1.052655e-25,0.560224
1,TOBG_SP-259.fna,TOBG_ARS-115.fna,1662,799,863,804116,51.925391,6.210367e+05,2159.666667,1.822073e+05,8941.333333,0.070865,98.19005,618,48,4.098642e-03,7.766327e+00,98.611531,0.986317,oceanList,0.017903,1765574,2338586,2466433,52.239519,39.715062,68.240510,6.632107e-04,7.766990
2,TOBG_EAC-65.fna,TOBG_ARS-123.fna,1508,456,1052,876420,69.761273,6.563537e+05,7494.000000,2.182993e+05,22129.000000,0.112633,96.10665,768,6,4.494968e-08,7.812500e-01,96.732806,0.973083,oceanList,0.034797,1497540,2147286,1860870,6.806949,5.660173,8.516977,5.852823e-09,0.781250
3,TOBG_IN-1213.fna,TOBG_ARS-123.fna,1438,901,537,422142,37.343533,3.158287e+05,2759.500000,1.049903e+05,8002.500000,0.114631,96.45370,369,4,1.990122e-07,1.084011e+00,97.520601,0.928505,oceanList,0.030861,1423846,1920553,1860870,14.079823,12.274124,16.309080,5.393285e-08,1.084011
4,TOBG_RS-817.fna,TOBG_ARS-123.fna,1467,799,668,523867,45.535106,3.918850e+05,1827.500000,1.302320e+05,5344.500000,0.113634,98.06145,463,10,1.668181e-03,2.159467e+00,98.651356,0.970577,oceanList,0.016599,1455364,1985135,1860870,39.956461,34.873688,44.237461,3.602983e-04,2.159827
5,TOBG_SAT-195.fna,TOBG_ARS-13.fna,2991,23,2968,2892841,99.231026,2.276166e+06,531.000000,6.078122e+05,244.000000,0.581126,99.94110,2283,1832,1.213927e+03,2.707286e+01,99.976015,0.988533,oceanList,0.000261,3332031,3385183,3792666,1079.720870,253.345810,3203.225475,5.317243e+01,80.245291
6,TOBG_EAC-45.fna,TOBG_ARS-18.fna,2845,919,1926,1856800,67.697715,1.453765e+06,2354.666667,3.990680e+05,5324.333333,0.121399,98.96305,1441,1170,7.765538e-01,8.113973e+01,99.650262,0.996977,oceanList,0.010255,3231396,4388938,3711261,2221.030051,1867.519852,2772.125143,5.388992e-02,81.193616
7,TOBG_SAT-101.fna,TOBG_ARS-18.fna,2732,1002,1730,1660086,63.323572,1.299552e+06,2034.833333,3.567840e+05,4656.166667,0.119981,98.71830,1285,992,1.592059e-01,7.718605e+01,99.652619,0.996286,oceanList,0.012622,3120588,4047777,3711261,1690.260838,1342.720375,2060.245530,1.238957e-02,77.198444
8,TOBG_SP-45.fna,TOBG_ARS-18.fna,1949,1155,794,668882,40.738840,5.286442e+05,29044.833333,1.453208e+05,81255.166667,0.098261,84.81285,555,0,9.564986e-36,-1.723421e-36,83.821431,0.961112,oceanList,0.133279,1405556,3674042,3711261,0.575558,0.507158,0.649519,1.723421e-36,0.000000
9,TOBG_MED-599.fna,TOBG_ARS-28.fna,2875,1055,1820,1738652,63.304348,1.353695e+06,40641.666667,3.882762e+05,87194.333333,0.133691,91.53700,1454,1,9.072892e-19,6.877579e-02,92.810448,0.941281,oceanList,0.075421,3189775,4811953,4406224,0.412521,0.345620,0.475656,6.239953e-20,0.068776
