In [1]:
import argparse
import subprocess
from Bio import SeqIO
import pandas as pd
import numpy as np
import sys
import os

In [None]:
def blastp_command(db, query, out, threads=8):
    """Build the argument list for one BLASTP search with tabular output.

    Parameters
    ----------
    db : str
        BLAST database name passed to ``-db`` (the subject genome).
    query : str
        Protein FASTA passed to ``-query``.
    out : str
        Path of the result table passed to ``-out``.
    threads : int, optional
        Value passed to ``-num_threads``.

    Returns
    -------
    list of str
        Arguments ready to hand to ``subprocess.run``.
    """
    return [
        "blastp",
        "-db", str(db),
        "-query", str(query),
        "-max_target_seqs", "1",
        "-num_threads", str(threads),
        "-outfmt", "6",
        "-out", str(out),
    ]


def run_reciprocal_blastp(seq1, seq2, out_1_vs_2="1_vs_2.tsv", out_2_vs_1="2_vs_1.tsv",
                          threads=8, quiet=False, runner=subprocess.run):
    """Format both protein sets as BLAST databases and search each against the other.

    Parameters
    ----------
    seq1, seq2 : str
        Protein FASTA files of genome 1 and genome 2. ``makeblastdb`` is called
        without ``-out``, so each FASTA path is also used as its database name.
    out_1_vs_2 : str, optional
        Result table for seq1 queries searched against the seq2 database.
    out_2_vs_1 : str, optional
        Result table for seq2 queries searched against the seq1 database.
    threads : int, optional
        Thread count given to each BLASTP run.
    quiet : bool, optional
        When true, the progress message is not printed.
    runner : callable, optional
        Called as ``runner(argv, check=True)`` for every external command;
        defaults to ``subprocess.run``.

    Raises
    ------
    subprocess.CalledProcessError
        With the default runner, when any command exits with a non-zero status.
    """
    for fasta in (seq1, seq2):
        runner(["makeblastdb", "-in", fasta, "-dbtype", "prot"], check=True)

    if not quiet:
        print("Running BLASTP...")

    # The query genome comes first in each file name; the database is the other genome.
    runner(blastp_command(seq2, seq1, out_1_vs_2, threads), check=True)
    runner(blastp_command(seq1, seq2, out_2_vs_1, threads), check=True)


# BLAST can be run through run_reciprocal_blastp(...) as defined above, or directly
# with `!` shell commands as done in the cells below. This cell only defines the
# helpers; it does not start any search.

In [20]:
# -----------------------------------------------------
# Run configuration (values mirror the Ruby defaults;
# the flag in each comment is the matching Ruby option)
# -----------------------------------------------------

# Thread count (-t) intended for BLAST's -num_threads.
THREADS = 8

# Minimum number of reciprocal hits (-n) required before the two-way value is reported.
MIN_HITS = 50

# Per-hit minimums applied to the BLAST tables.
# -s: bit score
MIN_BITSCORE = 50
# -i: percent identity
MIN_ID = 20
# -l: alignment length
MIN_LEN = 0
# -L: alignment length divided by the shorter of the two protein lengths
LEN_FRAC = 0.0

# Protein FASTA files of the two genomes being compared.
SEQ2 = "proteins/Streptomyces_rubrogriseus_NBRC_15455.proteins.faa"
SEQ1 = "proteins/Streptomyces_sp._MMCC_100.proteins.faa"

In [5]:
# =====================================================
# MAKE BLAST DATABASES
# =====================================================
!makeblastdb -in proteins/Streptomyces_sp._MMCC_100.proteins.faa -dbtype  prot -out Streptomyces_sp_MMCC_100_db
!makeblastdb -in proteins/Streptomyces_rubrogriseus_NBRC_15455.proteins.faa -dbtype  prot -out Streptomyces_rubrogriseus_NBRC_15455_db




Building a new DB, current time: 02/18/2026 00:03:01
New DB name:   /Users/mdumar/Desktop/md10/Journal_of_Antibiotics/AAI/Streptomyces_sp_MMCC_100_db
New DB title:  proteins/Streptomyces_sp._MMCC_100.proteins.faa
Sequence type: Protein
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 7620 sequences in 0.300972 seconds.




Building a new DB, current time: 02/18/2026 00:03:01
New DB name:   /Users/mdumar/Desktop/md10/Journal_of_Antibiotics/AAI/Streptomyces_rubrogriseus_NBRC_15455_db
New DB title:  proteins/Streptomyces_rubrogriseus_NBRC_15455.proteins.faa
Sequence type: Protein
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 7577 sequences in 0.291061 seconds.




In [2]:
# =====================================================
# RUN BLASTP (Ruby blast+ behavior)
# =====================================================
!blastp -db Streptomyces_sp_MMCC_100_db -query  proteins/Streptomyces_sp._MMCC_100.proteins.faa \
-max_target_seqs 1 -num_threads 8 -outfmt 6  -out 1_vs_2.tsv

!blastp -db Streptomyces_rubrogriseus_NBRC_15455_db -query  proteins/Streptomyces_sp._MMCC_100.proteins.faa \
-max_target_seqs 1 -num_threads 8 -outfmt 6  -out 2_vs_1.tsv




In [40]:
# =====================================================
# READ BOTH BLAST RESULT TABLES
# =====================================================

# The result files carry no header row, so the twelve column labels are supplied here.
cols = [
    "qseqid", "sseqid", "pident", "length",
    "mismatch", "gapopen", "qstart", "qend",
    "sstart", "send", "evalue", "bitscore",
]

# Both tab-separated files are parsed the same way; df1 comes from the first path.
df1, df2 = (
    pd.read_csv(tsv_path, sep="\t", names=cols)
    for tsv_path in ("1_vs_2.tsv", "2_vs_1.tsv")
)
df1.head()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,JBGORK010000001.1_1,JBGORK010000001.1_1,100.0,437,0,0,1,437,1,437,0.0,846.0
1,JBGORK010000001.1_2,JBGORK010000001.1_2,100.0,331,0,0,1,331,1,331,0.0,655.0
2,JBGORK010000001.1_3,JBGORK010000001.1_3,100.0,406,0,0,1,406,1,406,0.0,784.0
3,JBGORK010000001.1_4,JBGORK010000001.1_4,100.0,264,0,0,1,264,1,264,0.0,516.0
4,JBGORK010000001.1_5,JBGORK010000001.1_5,100.0,461,0,0,1,461,1,461,0.0,870.0


In [51]:
# =====================================================
# LOAD PROTEIN LENGTHS AND APPLY THE LENGTH-FRACTION FILTER
# =====================================================

def _attach_lengths_and_filter(hits, len_query, len_subject, len_frac):
    """Annotate a hit table with protein lengths and apply the length-fraction filter.

    Parameters
    ----------
    hits : pandas.DataFrame
        Hit table with ``qseqid``, ``sseqid`` and ``length`` columns.
    len_query, len_subject : dict
        Sequence ID -> protein length for the query and subject genomes.
    len_frac : float
        Minimum value of ``length / min_len`` a row must reach to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``len_q``, ``len_s`` and ``min_len`` columns added.
        Rows whose ``min_len`` is not greater than 0 are removed; this includes
        rows whose ID is missing from a length dictionary, because the lookup
        yields NaN. The input frame is not modified.
    """
    # .assign returns a new frame, so re-running the cell on an already filtered
    # frame does not write columns onto a slice.
    sized = hits.assign(
        len_q=hits["qseqid"].map(len_query),
        len_s=hits["sseqid"].map(len_subject),
    )
    sized = sized.assign(min_len=np.minimum(sized["len_q"], sized["len_s"]))

    # Drop invalid lengths first so the division below never sees a zero.
    sized = sized[sized["min_len"] > 0]
    return sized[(sized["length"] / sized["min_len"]) >= len_frac]


# Sequence ID -> protein length for each genome's FASTA file.
lenA = {r.id: len(r.seq) for r in SeqIO.parse(SEQ1, "fasta")}
lenB = {r.id: len(r.seq) for r in SeqIO.parse(SEQ2, "fasta")}

# df1 has SEQ1 proteins as queries and SEQ2 proteins as subjects; df2 is the reverse.
df1 = _attach_lengths_and_filter(df1, lenA, lenB, LEN_FRAC)
df2 = _attach_lengths_and_filter(df2, lenB, lenA, LEN_FRAC)
df1.head()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,len_q,len_s,min_len
0,JBGORK010000001.1_1,BEWD01000007.1_1,94.966,437,22,0,1,437,1,437,0.0,778.0,437,437,437
1,JBGORK010000001.1_2,BEWD01000007.1_2,97.281,331,9,0,1,331,1,331,0.0,640.0,331,331,331
2,JBGORK010000001.1_3,BEWD01000007.1_3,94.335,406,23,0,1,406,1,406,0.0,683.0,406,406,406
3,JBGORK010000001.1_4,BEWD01000007.1_4,86.415,265,21,2,1,264,1,251,8.239999999999999e-132,370.0,264,251,251
4,JBGORK010000001.1_5,BEWD01000007.1_5,73.374,492,69,6,1,461,1,461,0.0,561.0,461,461,461


In [52]:
# ---- Retain only the first-listed row of every query ID (Ruby qry_seen logic) ----
# NOTE(review): this de-duplication runs before the identity / length / bit-score
# thresholds in the next cell; confirm that this order is the intended one.
df1, df2 = (
    hits.drop_duplicates(subset="qseqid", keep="first")
    for hits in (df1, df2)
)

In [53]:
# ---- Keep rows that reach the identity, alignment-length and bit-score minimums ----
def _meets_minimums(hits):
    """Return a boolean mask, True where a row satisfies MIN_ID, MIN_LEN and MIN_BITSCORE."""
    return (
        hits["pident"].ge(MIN_ID)
        & hits["length"].ge(MIN_LEN)
        & hits["bitscore"].ge(MIN_BITSCORE)
    )


# .copy() gives each filtered table its own data, independent of the unfiltered frame.
df1 = df1.loc[_meets_minimums(df1)].copy()
df2 = df2.loc[_meets_minimums(df2)].copy()
df1.head()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,len_q,len_s,min_len
0,JBGORK010000001.1_1,BEWD01000007.1_1,94.966,437,22,0,1,437,1,437,0.0,778.0,437,437,437
1,JBGORK010000001.1_2,BEWD01000007.1_2,97.281,331,9,0,1,331,1,331,0.0,640.0,331,331,331
2,JBGORK010000001.1_3,BEWD01000007.1_3,94.335,406,23,0,1,406,1,406,0.0,683.0,406,406,406
3,JBGORK010000001.1_4,BEWD01000007.1_4,86.415,265,21,2,1,264,1,251,8.239999999999999e-132,370.0,264,251,251
4,JBGORK010000001.1_5,BEWD01000007.1_5,73.374,492,69,6,1,461,1,461,0.0,561.0,461,461,461


In [47]:
# =====================================================
# 6️⃣ ONE-WAY AAI
# =====================================================

def compute_stats(df):
    """Summarise the percent-identity column of a hit table.

    Parameters
    ----------
    df : pandas.DataFrame
        Hit table with a ``pident`` column.

    Returns
    -------
    tuple
        ``(mean, sd, n)``: the mean of ``pident``, its population standard
        deviation (divisor ``n``), and the number of rows. An empty table
        yields ``(0, 0, 0)``.
    """
    n = len(df)
    if n == 0:
        return 0, 0, 0
    pident = df["pident"]
    mean = pident.mean()
    # ddof=0 keeps the population definition of sqrt(E[x^2] - mean^2), but works from
    # deviations around the mean. The subtraction form can come out slightly negative
    # through floating-point cancellation when values are nearly equal, which makes
    # the square root NaN.
    sd = pident.std(ddof=0)
    return mean, sd, n

# Summary statistics for each search direction.
mean1, sd1, n1 = compute_stats(df1)
mean2, sd2, n2 = compute_stats(df2)

# Section banner: a blank line, then the title framed by two rules.
print(
    "\n==============================",
    "ONE-WAY AAI",
    "==============================",
    sep="\n",
)
# One line per direction: mean identity, standard deviation, number of hits.
print(
    f"Genome1 → Genome2: {mean1:.2f}%  (SD: {sd1:.2f})  n={n1}",
    f"Genome2 → Genome1: {mean2:.2f}%  (SD: {sd2:.2f})  n={n2}",
    sep="\n",
)



ONE-WAY AAI
Genome1 → Genome2: 85.61%  (SD: 19.56)  n=6636
Genome2 → Genome1: 84.88%  (SD: 20.25)  n=6678


In [55]:
# =====================================================
# 7️⃣ RECIPROCAL BEST HITS (RBH)
# =====================================================

# Query ID -> subject ID of the retained hit, for each search direction.
map1 = dict(zip(df1["qseqid"], df1["sseqid"]))
map2 = dict(zip(df2["qseqid"], df2["sseqid"]))

# A df1 row (q -> s) is reciprocal when the reverse table maps s back to q.
# The lookup is vectorised: subjects absent from map2 become NaN, which never equals
# a query ID, and the result stays a boolean Series even when df1 has no rows.
rbh_mask = df1["sseqid"].map(map2) == df1["qseqid"]

# Reciprocal rows keep the df1 columns, so pident is the genome1 -> genome2 value.
rbh = df1[rbh_mask]


In [56]:
# =====================================================
# 8️⃣ TWO-WAY AAI
# =====================================================

# Number of reciprocal best-hit pairs available for the two-way estimate.
n_rbh = len(rbh)

print("\n==============================")
print("TWO-WAY AAI (RBH)")
print("==============================")

if n_rbh < MIN_HITS:
    # Too few reciprocal pairs: report the count and no estimate.
    print(f"Insufficient reciprocal hits: {n_rbh}")
else:
    # Reuse compute_stats so the one-way and two-way figures share a single
    # definition of mean and standard deviation.
    mean_rbh, sd_rbh = compute_stats(rbh)[:2]

    print(f"Two-way AAI : {mean_rbh:.2f}%")
    print(f"SD          : {sd_rbh:.2f}")
    print(f"Proteins    : {n_rbh}")
    print("==============================")



TWO-WAY AAI (RBH)
Two-way AAI : 91.80%
SD          : 10.45
Proteins    : 5722
