# Microbial habitability and IDP propensity of ARFs


The ProkAtlas database (Mise & Iwasaki, *iScience*, 2020) was downloaded on May 24, 2021, with the following commands:

```{bash}
# at data/pubdata/prokatlas
wget http://msk33.github.io/ProkAtlas.fa.zip
unzip ProkAtlas.fa.zip ProkAtlas.fa

md5sum ProkAtlas.fa
#=> 5073763f565fb8def4dcd3b7b052245a  ProkAtlas.fa
```

BLAST search was performed under the following configurations:

```{bash}
docker-compose run --rm blast /bin/bash /scripts/210527_prokatlas_blastn.sh
```

- docker-compose.yml (partial)
    ```
    blast:
      image: ncbi/blast:2.11.0
      volumes:
        - ./data:/data
        - ./scripts:/scripts
    ```


- docker/blast/Dockerfile
    ```{Dockerfile}
    FROM continuumio/miniconda3:4.9.2
    RUN conda install -c conda-forge -y mamba==0.8.2
    RUN mamba install -c bioconda    -y blast==2.11.0  
    ```


- scripts/210527_prokatlas_blastn.sh:

    ```{bash}
    #!/bin/bash -eu

    run_blastn () {
        local infile=$1
        local outfile=${infile/16S/prokatlas}
        outfile=${outfile/.fna/.blasttab}
        blastn -db /data/prokatlas/prokatlas -query $infile -out $outfile -outfmt "6 qseqid sseqid qlen qstart qend slen sstart send nident length" -evalue 1e-10 -max_target_seqs 361474
    }

    export -f run_blastn

    # DB compilation
    makeblastdb -dbtype nucl -in /data/pubdata/prokatlas/ProkAtlas.fa -hash_index -out /data/prokatlas/prokatlas

    # BLAST search
    ls /data/16S/*.fna | xargs -t -P100 -L1 -I{} /bin/bash -c 'run_blastn {}'
    ```

In [1]:
import re
import numpy as np
import pandas as pd
from Bio import SeqIO
from collections import Counter
from tqdm.notebook import tqdm
from pyscripts.config import path2
from pyscripts.datasets import Metadata
# Project metadata object; `metadata.acc['refseq']` is iterated further down as the
# list of genome accessions to score.
metadata = Metadata()

In [2]:
# Count ProkAtlas reference sequences per environment label. The label is the text
# between `barcodelabel=` and the last `;` of each FASTA record ID.
# The pattern is a raw string so that `\d` is not treated as an (invalid) string escape.
db_counts = pd.Series(Counter(
    re.match(r'seq_\d+;barcodelabel=(.+);', rec.id).group(1)
    for rec in SeqIO.parse(path2.pubdata/'prokatlas'/'ProkAtlas.fa', 'fasta')
))

# Column 0 of the TSV becomes the index and column 2 the value. `db_counts` is grouped
# by this Series below, so its index is expected to hold the same environment labels.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the one-column frame afterwards is the documented replacement.
env_clusters = pd.read_csv(
    path2.metadata/'ProkAtlas_env_clusters.tsv', sep='\t', index_col=0, usecols=[0,2]
).squeeze('columns').rename('cluster')

# IDF-style weight per cluster: log(total reference count / reference count in the cluster).
idf_wights = np.log(db_counts.sum() / db_counts.groupby(env_clusters).sum())

In [3]:
def filter_hits(hits, len_thresh=150, id_thresh=0.97): # as recommended
    """Keep confident, end-anchored BLAST hits and reduce them to (rrs_id, environment) pairs.

    Parameters
    ----------
    hits : pandas.DataFrame
        BLAST tabular hits with the columns ``qseqid``, ``sseqid``, ``qlen``,
        ``qstart``, ``qend``, ``slen``, ``sstartp``, ``sendp``, ``nident`` and ``length``.
    len_thresh : int
        Minimum alignment length (inclusive).
    id_thresh : float
        Minimum identity, computed as ``nident / length`` (inclusive).

    Returns
    -------
    pandas.DataFrame
        One row per retained hit with the columns ``rrs_id`` (the query ID) and
        ``environment`` (the text following ``barcodelabel=`` in the subject ID),
        re-indexed from 0.
    """
    # Stage 1: alignment length and identity thresholds.
    confident = hits[
        (hits['length'] >= len_thresh) &
        ((hits['nident'] / hits['length']) >= id_thresh)
    ]

    # Subject coordinates may come in either order, so normalise them to
    # (smaller, larger) before testing how close the alignment gets to the subject ends.
    subject_span = confident[['sstartp', 'sendp']]
    subject_lo = subject_span.min(axis=1)
    subject_hi = subject_span.max(axis=1)

    # Stage 2: on each side, the alignment must reach the end of the query OR of the
    # subject (at most 2 unaligned trailing bases; start within the first 3 positions).
    reaches_tail = (confident['qlen'] - confident['qend'] <= 2) | (confident['slen'] - subject_hi <= 2)
    reaches_head = (confident['qstart'] <= 3) | (subject_lo <= 3)
    anchored = confident[reaches_tail & reaches_head][['qseqid', 'sseqid']].reset_index(drop=True)

    # Raw string: `\d` is not a valid escape in a normal string literal.
    # expand=False returns a Series, so a plain column (not a one-column frame) is assigned.
    # NOTE(review): this pattern has no trailing ';', so a subject ID ending in ';' keeps
    # that ';' in the label -- confirm this matches the index of `env_clusters`.
    environment = anchored['sseqid'].str.extract(r'seq_\d+;barcodelabel=(.+)', expand=False)
    anchored = anchored.assign(sseqid=environment)
    return anchored.rename(columns={'qseqid': 'rrs_id', 'sseqid': 'environment'})

def calc_custom_scores(gcf):
    """Compute IDF-weighted, cluster-level environment scores for one genome.

    Reads ``<gcf>.blasttab`` from ``path2.data/'prokatlas'``, filters the hits with
    ``filter_hits``, averages the per-environment hit counts over the query sequences
    that still have at least one hit, sums those averages within each environment
    cluster (``env_clusters``) and multiplies the result by ``idf_wights``.

    Returns
    -------
    tuple
        ``(gcf, scores)``; the accession is returned alongside the scores so that
        results can be keyed by genome.
    """
    blasttab_columns = [
        'qseqid', 'sseqid', 'qlen', 'qstart', 'qend',
        'slen', 'sstartp', 'sendp', 'nident', 'length',
    ]
    raw_hits = pd.read_csv(
        path2.data/'prokatlas'/f'{gcf}.blasttab', sep='\t', names=blasttab_columns
    )
    kept = filter_hits(raw_hits)

    # Rows: query sequences; columns: environments; cells: number of retained hits.
    per_query_counts = pd.crosstab(kept['rrs_id'], kept['environment'])
    mean_counts = per_query_counts.mean()
    cluster_totals = mean_counts.groupby(env_clusters).sum()
    return gcf, (cluster_totals * idf_wights)

from multiprocessing import Pool

# Score every accession in `metadata.acc['refseq']` across 100 worker processes.
# Results arrive in arbitrary order, so they are collected into a dict keyed by the
# accession that `calc_custom_scores` returns with each result.
with Pool(100) as pool:
    score_stream = pool.imap_unordered(calc_custom_scores, metadata.acc['refseq'])
    scores_by_gcf = dict(tqdm(score_stream, total=len(metadata.acc)))

# Rows: environment clusters (index of `idf_wights`); columns: accessions in metadata order.
habitablity_scores = pd.DataFrame(
    scores_by_gcf, index=idf_wights.index, columns=metadata.acc['refseq'], dtype=float
)


  0%|          | 0/2624 [00:00<?, ?it/s]

In [4]:
# Persist the score table as a pickle under the prokatlas data directory.
summary_path = path2.data/'prokatlas'/'summary.pkl.bz2'
habitablity_scores.to_pickle(summary_path)