<b>fastANI</b> library counts ANI metric without align phase

https://github.com/ParBLiSS/FastANI

NOTE: No ANI output is reported for a genome pair if ANI value is much below 80%.

In [1]:
import os
import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

### Sort aligned 92 genes to 12 organisms

In [2]:
path = '../data/fastANI_data/pre_aligned_genes_by_orgs/'
gene_names = [f.split('.')[0] for f in os.listdir(path)]
full_paths = [path + f for f in os.listdir(path)]

In [3]:
gene_names[:10]

['alaS',
 'argS',
 'aspS',
 'cgtA',
 'coaE',
 'cysS',
 'dnaA',
 'dnaG',
 'dnaX',
 'engA']

In [4]:
full_paths[:5]

['fastANI_data/pre_aligned_genes_by_orgs/alaS.zZ.fasta',
 'fastANI_data/pre_aligned_genes_by_orgs/argS.zZ.fasta',
 'fastANI_data/pre_aligned_genes_by_orgs/aspS.zZ.fasta',
 'fastANI_data/pre_aligned_genes_by_orgs/cgtA.zZ.fasta',
 'fastANI_data/pre_aligned_genes_by_orgs/coaE.zZ.fasta']

In [5]:
org_ids = []
orgs = {}

for gene_file, gene_name in zip(full_paths, gene_names):
    genes = SeqIO.parse(gene_file, 'fasta')
    for gene in genes:
        org_ids.append(gene.id)
        if orgs.get(gene.id) is None:
            orgs[gene.id] = []
        orgs[gene.id].append((gene_name, gene.seq))

In [6]:
org_ids = list(set(org_ids))

In [7]:
path_orgs = '../data/fastANI_data/orgs_gene_aligned_1066/'

for org_id in org_ids:
    write_path = path_orgs + org_id + '.fasta'
    records = []
    
    for (gene_id, gene_seq) in orgs[org_id]:
        records.append(SeqRecord(gene_seq, id=gene_id, description='| '+org_id))
    
    with open(write_path, 'w') as output_file:
        SeqIO.write(records, output_file, 'fasta')

In [8]:
full_paths_orgs = [path_orgs + f for f in os.listdir(path_orgs) if f.endswith('.fasta')]

In [9]:
full_paths_orgs

['fastANI_data/orgs_gene_aligned_1066/Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta',
 'fastANI_data/orgs_gene_aligned_1066/Bacillus_halotolerans_GCF_001517105.1_ASM151710v1_genomic.fasta',
 'fastANI_data/orgs_gene_aligned_1066/Bacillus_licheniformis_GCF_000011645.1_ASM1164v1_genomic.fasta',
 'fastANI_data/orgs_gene_aligned_1066/Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta',
 'fastANI_data/orgs_gene_aligned_1066/Bacillus_tequilensis_GCF_000507145.1_KCTC_13622_01_genomic.fasta',
 'fastANI_data/orgs_gene_aligned_1066/Bacillus_amyloliquefaciens_GCF_000196735.1_ASM19673v1_genomic.fasta',
 'fastANI_data/orgs_gene_aligned_1066/Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta',
 'fastANI_data/orgs_gene_aligned_1066/Bacillus_subtilis_GCF_000009045.1_ASM904v1_genomic.fasta',
 'fastANI_data/orgs_gene_aligned_1066/Bacillus_velezensis_GCF_002117165.1_ASM211716v1_genomic.fasta',
 'fastANI_data/orgs_gene_aligned_1066/Bacillus_vallismortis_GCF_000245

### Count ANI metric with fastANI module

In [10]:
path_output = '../data/fastANI_data/orgs_gene_aligned_1066_fastANI/'

frag_len = 100
kmers = 8
min_frag = 5

for (p1, path_org1) in enumerate(full_paths_orgs):
    for (p2, path_org2) in enumerate(full_paths_orgs):
        output_file = path_output + 'output_' + str(p1+1) + '_' + str(p2+1) + '.txt'
        !fastANI -q {path_org1} -r {path_org2} -o {output_file} --fragLen {frag_len} -k {kmers} --minFrag {min_frag}

>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_1_1.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56888
INFO [thread 0], skch::Sketch::index, unique minimizers = 19781
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7203) ... (42, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0158777 sec
INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.296648 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000102814 sec
>>>>>>>>>>>>>>>>>>
Ref

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.268414 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000100401 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_vallismortis_GCF_000245315.1_ASM24531v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_1_10.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 43233
INFO [thread 0], skch::Sketch::index, unique minimizers = 18130
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7855) ... (29, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0290319 sec
INFO [thread 0], skc

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.301836 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000103109 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_halotolerans_GCF_001517105.1_ASM151710v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_2_7.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56994
INFO [thread 0], skch::Sketch::index, unique minimizers = 19866
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7196) ... (35, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0202436 sec
INFO [thread 0

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.291484 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000107734 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_licheniformis_GCF_000011645.1_ASM1164v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_3_4.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 54015
INFO [thread 0], skch::Sketch::index, unique minimizers = 20025
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7598) ... (37, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0195274 sec
INFO [thread 0], skch:

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.240821 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000103273 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_4_1.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56888
INFO [thread 0], skch::Sketch::index, unique minimizers = 19781
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7203) ... (42, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0303822 sec
INFO [thread 0], skch::

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.365079 sec
INFO [thread 0], skch::main, Time spent post mapping : 9.5745e-05 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_vallismortis_GCF_000245315.1_ASM24531v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_4_10.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 43233
INFO [thread 0], skch::Sketch::index, unique minimizers = 18130
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7855) ... (29, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0181531 sec
INFO [thread 0], skch:

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.311247 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000100929 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_tequilensis_GCF_000507145.1_KCTC_13622_01_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_5_7.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56994
INFO [thread 0], skch::Sketch::index, unique minimizers = 19866
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7196) ... (35, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0167955 sec
INFO [thread 

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.262762 sec
INFO [thread 0], skch::main, Time spent post mapping : 9.7524e-05 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_amyloliquefaciens_GCF_000196735.1_ASM19673v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_6_4.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 54015
INFO [thread 0], skch::Sketch::index, unique minimizers = 20025
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7598) ... (37, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0161976 sec
INFO [thread 0], s

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.305464 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000107873 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_7_1.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56888
INFO [thread 0], skch::Sketch::index, unique minimizers = 19781
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7203) ... (42, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0200039 sec
INFO [thread 0],

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.272775 sec
INFO [thread 0], skch::main, Time spent post mapping : 9.8221e-05 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_vallismortis_GCF_000245315.1_ASM24531v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_7_10.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 43233
INFO [thread 0], skch::Sketch::index, unique minimizers = 18130
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7855) ... (29, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0144358 sec
INFO [thread 0]

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.258294 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000100587 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_subtilis_GCF_000009045.1_ASM904v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_8_7.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56994
INFO [thread 0], skch::Sketch::index, unique minimizers = 19866
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7196) ... (35, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0200963 sec
INFO [thread 0], skch

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.316132 sec
INFO [thread 0], skch::main, Time spent post mapping : 9.8975e-05 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_velezensis_GCF_002117165.1_ASM211716v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_9_4.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 54015
INFO [thread 0], skch::Sketch::index, unique minimizers = 20025
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7598) ... (37, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.018432 sec
INFO [thread 0], skch::ma

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.259687 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000105093 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_vallismortis_GCF_000245315.1_ASM24531v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_10_1.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56888
INFO [thread 0], skch::Sketch::index, unique minimizers = 19781
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7203) ... (42, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0191022 sec
INFO [thread 0], skc

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.214213 sec
INFO [thread 0], skch::main, Time spent post mapping : 8.6287e-05 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_vallismortis_GCF_000245315.1_ASM24531v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_vallismortis_GCF_000245315.1_ASM24531v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_10_10.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 43233
INFO [thread 0], skch::Sketch::index, unique minimizers = 18130
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7855) ... (29, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0160907 sec
INFO [thread 0], sk

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.26838 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000101572 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_siamensis_GCF_000262045.1_KCTC_13613_01_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_11_7.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56994
INFO [thread 0], skch::Sketch::index, unique minimizers = 19866
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7196) ... (35, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0185222 sec
INFO [thread 0]

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.244293 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000133415 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_atrophaeus_GCF_000742675.1_ASM74267v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI/output_12_4.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 54015
INFO [thread 0], skch::Sketch::index, unique minimizers = 20025
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7598) ... (37, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0137635 sec
INFO [thread 0], skch::

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.276888 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000118145 sec


Optionally, users can also get a second .matrix file with identity values arranged in a phylip-formatted lower triangular matrix by supplying --matrix parameter. NOTE: No ANI output is reported for a genome pair if ANI value is much below 80%. Such case should be computed at amino acid level.

In [16]:
path_output_m = '../data/fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/'

frag_len = 100
kmers = 8
min_frag = 5

for (p1, path_org1) in enumerate(full_paths_orgs):
    for (p2, path_org2) in enumerate(full_paths_orgs):
        output_file = path_output_m + 'output_' + str(p1+1) + '_' + str(p2+1) + '.txt'
        !fastANI -q {path_org1} -r {path_org2} -o {output_file} --fragLen {frag_len} -k {kmers} --minFrag {min_frag} --matrix

>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_1_1.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56888
INFO [thread 0], skch::Sketch::index, unique minimizers = 19781
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7203) ... (42, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0181145 sec
INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.295849 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000103283 sec
>>>>>>>>>>>>>>>

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.277901 sec
INFO [thread 0], skch::main, Time spent post mapping : 9.9941e-05 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_vallismortis_GCF_000245315.1_ASM24531v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_1_10.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 43233
INFO [thread 0], skch::Sketch::index, unique minimizers = 18130
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7855) ... (29, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0161336 sec
INFO [thread 0

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.256613 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000100751 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_halotolerans_GCF_001517105.1_ASM151710v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_2_7.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56994
INFO [thread 0], skch::Sketch::index, unique minimizers = 19866
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7196) ... (35, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0147817 sec
INFO [t

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.296236 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000102692 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_licheniformis_GCF_000011645.1_ASM1164v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_3_4.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 54015
INFO [thread 0], skch::Sketch::index, unique minimizers = 20025
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7598) ... (37, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0162312 sec
INFO [thread 0]

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.253782 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000103782 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_4_1.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56888
INFO [thread 0], skch::Sketch::index, unique minimizers = 19781
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7203) ... (42, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0196088 sec
INFO [thread 0],

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.258021 sec
INFO [thread 0], skch::main, Time spent post mapping : 9.6435e-05 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_vallismortis_GCF_000245315.1_ASM24531v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_4_10.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 43233
INFO [thread 0], skch::Sketch::index, unique minimizers = 18130
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7855) ... (29, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0177733 sec
INFO [thread 0]

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.268096 sec
INFO [thread 0], skch::main, Time spent post mapping : 9.7447e-05 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_tequilensis_GCF_000507145.1_KCTC_13622_01_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_5_7.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56994
INFO [thread 0], skch::Sketch::index, unique minimizers = 19866
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7196) ... (35, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0197009 sec
INFO [t

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.265149 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000106452 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_amyloliquefaciens_GCF_000196735.1_ASM19673v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_6_4.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 54015
INFO [thread 0], skch::Sketch::index, unique minimizers = 20025
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7598) ... (37, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0227036 sec
INFO [thre

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.267621 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000104871 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_7_1.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56888
INFO [thread 0], skch::Sketch::index, unique minimizers = 19781
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7203) ... (42, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0203065 sec
INFO [thr

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.267245 sec
INFO [thread 0], skch::main, Time spent post mapping : 9.5674e-05 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_vallismortis_GCF_000245315.1_ASM24531v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_7_10.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 43233
INFO [thread 0], skch::Sketch::index, unique minimizers = 18130
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7855) ... (29, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0162493 sec
INFO [th

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.277817 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000100818 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_subtilis_GCF_000009045.1_ASM904v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_8_7.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56994
INFO [thread 0], skch::Sketch::index, unique minimizers = 19866
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7196) ... (35, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0165874 sec
INFO [thread 0

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.258447 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.00010073 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_velezensis_GCF_002117165.1_ASM211716v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_9_4.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 54015
INFO [thread 0], skch::Sketch::index, unique minimizers = 20025
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7598) ... (37, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0136511 sec
INFO [thread 0], 

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.258536 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.00010307 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_vallismortis_GCF_000245315.1_ASM24531v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_10_1.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56888
INFO [thread 0], skch::Sketch::index, unique minimizers = 19781
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7203) ... (42, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0199958 sec
INFO [thread 0

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.231034 sec
INFO [thread 0], skch::main, Time spent post mapping : 8.906e-05 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_vallismortis_GCF_000245315.1_ASM24531v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_vallismortis_GCF_000245315.1_ASM24531v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_10_10.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 43233
INFO [thread 0], skch::Sketch::index, unique minimizers = 18130
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7855) ... (29, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0288334 sec
INFO [thread 

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.273781 sec
INFO [thread 0], skch::main, Time spent post mapping : 9.8925e-05 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_siamensis_GCF_000262045.1_KCTC_13613_01_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_11_7.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 56994
INFO [thread 0], skch::Sketch::index, unique minimizers = 19866
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7196) ... (35, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0189866 sec
INFO [th

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.280332 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.00010617 sec
>>>>>>>>>>>>>>>>>>
Reference = [fastANI_data/orgs_gene_aligned_1066/Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta]
Query = [fastANI_data/orgs_gene_aligned_1066/Bacillus_atrophaeus_GCF_000742675.1_ASM74267v1_genomic.fasta]
Kmer size = 8
Fragment length = 100
Threads = 1
ANI output file = fastANI_data/orgs_gene_aligned_1066_fastANI_matrix/output_12_4.txt
>>>>>>>>>>>>>>>>>>
INFO [thread 0], skch::Sketch::build, minimizers picked from reference = 54015
INFO [thread 0], skch::Sketch::index, unique minimizers = 20025
INFO [thread 0], skch::Sketch::computeFreqHist, Frequency histogram of minimizers = (1, 7598) ... (37, 1)
INFO [thread 0], skch::Sketch::computeFreqHist, With threshold 0.001%, consider all minimizers during lookup.
INFO [thread 0], skch::main, Time spent sketching the reference : 0.0173973 sec
INFO [thread 0], 

INFO [thread 0], skch::main, Time spent mapping fragments in query #1 : 0.266102 sec
INFO [thread 0], skch::main, Time spent post mapping : 0.000100549 sec


In [None]:
# result example in dir data/fastANI_data/orgs_gene_aligned_1066_fastANI_matrix

### Show results

In [11]:
full_paths_output = [path_output + f for f in os.listdir(path_output) if f.endswith('.txt')]
len(full_paths_output), full_paths_output[:5]

(144,
 ['fastANI_data/orgs_gene_aligned_1066_fastANI/output_1_1.txt',
  'fastANI_data/orgs_gene_aligned_1066_fastANI/output_1_2.txt',
  'fastANI_data/orgs_gene_aligned_1066_fastANI/output_1_3.txt',
  'fastANI_data/orgs_gene_aligned_1066_fastANI/output_1_4.txt',
  'fastANI_data/orgs_gene_aligned_1066_fastANI/output_1_5.txt'])

In [13]:
df = pd.concat([pd.read_csv(output_file, sep='\t', header=None) for output_file in full_paths_output])

In [14]:
df.shape

(144, 5)

In [15]:
df.head()

Unnamed: 0,0,1,2,3,4
0,fastANI_data/orgs_gene_aligned_1066/Bacillus_s...,fastANI_data/orgs_gene_aligned_1066/Bacillus_s...,99.9865,817,817
0,fastANI_data/orgs_gene_aligned_1066/Bacillus_s...,fastANI_data/orgs_gene_aligned_1066/Bacillus_h...,83.3769,573,817
0,fastANI_data/orgs_gene_aligned_1066/Bacillus_s...,fastANI_data/orgs_gene_aligned_1066/Bacillus_l...,88.3056,734,817
0,fastANI_data/orgs_gene_aligned_1066/Bacillus_s...,fastANI_data/orgs_gene_aligned_1066/Bacillus_m...,83.1645,553,817
0,fastANI_data/orgs_gene_aligned_1066/Bacillus_s...,fastANI_data/orgs_gene_aligned_1066/Bacillus_t...,83.2091,572,817


fastANI OUTPUT FORMAT: In all above use cases, OUTPUT_FILE will contain tab delimited row(s) with query genome, reference genome, ANI value, count of bidirectional fragment mappings, and total query fragments. Alignment fraction (wrt. the query genome) is simply the ratio of mappings and total fragments.

In [17]:
df['organism_1'] = df[0].apply(lambda x: '_'.join((x.split('/',2)[2].split('_',2)[:2])))
df['organism_2'] = df[1].apply(lambda x: '_'.join((x.split('/',2)[2].split('_',2)[:2])))
df.drop([0,1], axis=1, inplace=True)
df.columns = ['ani', 'count_bidir_frags', 'total_frags', 'organism_1', 'organism_2']
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,ani,count_bidir_frags,total_frags,organism_1,organism_2
0,99.9865,817,817,Bacillus_sonorensis,Bacillus_sonorensis
1,83.3769,573,817,Bacillus_sonorensis,Bacillus_halotolerans
2,88.3056,734,817,Bacillus_sonorensis,Bacillus_licheniformis
3,83.1645,553,817,Bacillus_sonorensis,Bacillus_mojavensis
4,83.2091,572,817,Bacillus_sonorensis,Bacillus_tequilensis


#### Most close organisms

In [18]:
df.sort_values(by=['ani'], ascending=False)[:10]

Unnamed: 0,ani,count_bidir_frags,total_frags,organism_1,organism_2
13,99.993,818,820,Bacillus_halotolerans,Bacillus_halotolerans
130,99.9926,818,820,Bacillus_siamensis,Bacillus_siamensis
104,99.9925,818,820,Bacillus_velezensis,Bacillus_velezensis
52,99.9921,818,820,Bacillus_tequilensis,Bacillus_tequilensis
78,99.9909,819,820,Bacillus_paralicheniformis,Bacillus_paralicheniformis
91,99.9905,818,820,Bacillus_subtilis,Bacillus_subtilis
26,99.9888,819,820,Bacillus_licheniformis,Bacillus_licheniformis
143,99.9886,818,820,Bacillus_atrophaeus,Bacillus_atrophaeus
0,99.9865,817,817,Bacillus_sonorensis,Bacillus_sonorensis
65,99.9795,818,820,Bacillus_amyloliquefaciens,Bacillus_amyloliquefaciens


#### Not very close organisms

In [19]:
df.sort_values(by=['ani'])[:10]

Unnamed: 0,ani,count_bidir_frags,total_frags,organism_1,organism_2
9,80.8714,438,817,Bacillus_sonorensis,Bacillus_vallismortis
33,80.915,426,820,Bacillus_licheniformis,Bacillus_vallismortis
110,81.0444,428,645,Bacillus_vallismortis,Bacillus_licheniformis
108,81.2449,418,645,Bacillus_vallismortis,Bacillus_sonorensis
81,81.2667,418,820,Bacillus_paralicheniformis,Bacillus_vallismortis
114,81.3444,415,645,Bacillus_vallismortis,Bacillus_paralicheniformis
50,82.9806,590,820,Bacillus_tequilensis,Bacillus_licheniformis
28,83.0327,585,820,Bacillus_licheniformis,Bacillus_tequilensis
38,83.0471,552,805,Bacillus_mojavensis,Bacillus_licheniformis
35,83.0541,558,820,Bacillus_licheniformis,Bacillus_atrophaeus
