<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Extract-V-regions" data-toc-modified-id="Extract-V-regions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Extract V regions</a></span></li><li><span><a href="#Dereplicate" data-toc-modified-id="Dereplicate-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Dereplicate</a></span></li><li><span><a href="#Check-PATRIC-db" data-toc-modified-id="Check-PATRIC-db-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Check PATRIC db</a></span></li></ul></div>

In [8]:
from qiime2 import Artifact

from os.path import join, dirname, exists
import os

import networkx as nx

import utils
import new_approach
import parse_cdhit
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Directory holding the source databases and directory receiving generated artifacts.
or_dir = 'original_db'
new_dir = 'created_db'

# Variable regions of the 16S gene that are extracted from every database.
# (single-region alternative: v_regions = ['V4'])
v_regions = ['V4', 'V3-V4', 'V1-V3', 'V3-V5']

# Full 16S databases: name -> {'seq': sequence artifact, 'taxa': taxonomy artifact}.
# Each row below is (name, directory, sequence file, taxonomy file); insertion
# order is preserved, so downstream loops visit the databases in this order.
# The "metasquare" entry (original_db/metasquare_full-16S_{seqs,taxa}.qza) is
# currently disabled and therefore not listed.
dbs = {
    name: {'seq': join(folder, seqs_qza), 'taxa': join(folder, taxa_qza)}
    for name, folder, seqs_qza, taxa_qza in (
        ('gsrv', new_dir, 'gsrv_full-16S_seqs.qza', 'gsrv_full-16S_taxa.qza'),
        ('rdp', or_dir, 'rdp_16S_seqs.qza', 'rdp_16S_taxa_spformat.qza'),
        ('silva', or_dir, 'silva_138_NR99_seqs.qza', 'silva_138_NR99_taxa.qza'),
        ('gg', or_dir, 'gg_13_8_NR99_seqs.qza', 'gg_13_8_NR99_taxa_spformat.qza'),
        ('gtdb', or_dir, 'gtdb_full-16S_seqs.qza', 'gtdb_full-16S_taxa.qza'),
    )
}

# Output path prefix for every (database, region) pair: created_db/<db>_<region>.
db_out_prefixes = {
    name: {region: join(new_dir, f'{name}_{region}') for region in v_regions}
    for name in dbs
}
db_out_prefixes


{'gsrv': {'V4': 'created_db/gsrv_V4',
  'V3-V4': 'created_db/gsrv_V3-V4',
  'V1-V3': 'created_db/gsrv_V1-V3',
  'V3-V5': 'created_db/gsrv_V3-V5'},
 'rdp': {'V4': 'created_db/rdp_V4',
  'V3-V4': 'created_db/rdp_V3-V4',
  'V1-V3': 'created_db/rdp_V1-V3',
  'V3-V5': 'created_db/rdp_V3-V5'},
 'silva': {'V4': 'created_db/silva_V4',
  'V3-V4': 'created_db/silva_V3-V4',
  'V1-V3': 'created_db/silva_V1-V3',
  'V3-V5': 'created_db/silva_V3-V5'},
 'gg': {'V4': 'created_db/gg_V4',
  'V3-V4': 'created_db/gg_V3-V4',
  'V1-V3': 'created_db/gg_V1-V3',
  'V3-V5': 'created_db/gg_V3-V5'},
 'gtdb': {'V4': 'created_db/gtdb_V4',
  'V3-V4': 'created_db/gtdb_V3-V4',
  'V1-V3': 'created_db/gtdb_V1-V3',
  'V3-V5': 'created_db/gtdb_V3-V5'}}

# Extract V regions

To obtain the primers used to extract each region, we used Table 1 from this [article](https://doi.org/10.1128/msphere.01202-20).

In [3]:
%autoreload
# The bare %autoreload above asks IPython's autoreload extension to reload modules now,
# so edits to the project-local helper modules are picked up before they are called.
#
# Per-region lookup tables supplied by the project-local `utils` module. The extraction
# cell below indexes them as v_primers[region]['f' | 'r'] and
# v_lengths[region]['min' | 'max'].
# NOTE(review): presumably 'f'/'r' are the forward/reverse primer sequences and
# 'min'/'max' the accepted amplicon lengths — confirm in utils.
v_primers = utils.get_vregion_primers()
v_lengths = utils.get_vregion_lengths()

In [10]:
%autoreload
out_seqs_files = {}
for db, prefixes in db_out_prefixes.items():
    out_seqs_files[db] = {}
    for v_region, prefix in prefixes.items():
        print(v_region)
        out_seqs_file = new_approach.extract_region(
        seqs= dbs[db]['seq'], 
        prefix_db = prefix,  
        f_primer = v_primers[v_region]['f'], 
        r_primer = v_primers[v_region]['r'], 
        min_length = v_lengths[v_region]['min'], 
        max_length = v_lengths[v_region]['max'], 
        threads = 35, force=False, output_formats = ['qza', 'fasta'])

        out_seqs_files[db][v_region] = out_seqs_file


V4
V3-V4
V1-V3
V3-V5
V4
V3-V4
V1-V3
V3-V5
V4
V3-V4
V1-V3
V3-V5
V4
V3-V4
V1-V3
V3-V5
V4
V3-V4
V1-V3
V3-V5


In [11]:
# Show the nested mapping built by the extraction cell: db -> region -> output files.
out_seqs_files

{'gsrv': {'V4': {'qza': 'created_db/gsrv_V4_seqs.qza',
   'fasta': 'created_db/gsrv_V4_seqs.fasta'},
  'V3-V4': {'qza': 'created_db/gsrv_V3-V4_seqs.qza',
   'fasta': 'created_db/gsrv_V3-V4_seqs.fasta'},
  'V1-V3': {'qza': 'created_db/gsrv_V1-V3_seqs.qza',
   'fasta': 'created_db/gsrv_V1-V3_seqs.fasta'},
  'V3-V5': {'qza': 'created_db/gsrv_V3-V5_seqs.qza',
   'fasta': 'created_db/gsrv_V3-V5_seqs.fasta'}},
 'rdp': {'V4': {'qza': 'created_db/rdp_V4_seqs.qza',
   'fasta': 'created_db/rdp_V4_seqs.fasta'},
  'V3-V4': {'qza': 'created_db/rdp_V3-V4_seqs.qza',
   'fasta': 'created_db/rdp_V3-V4_seqs.fasta'},
  'V1-V3': {'qza': 'created_db/rdp_V1-V3_seqs.qza',
   'fasta': 'created_db/rdp_V1-V3_seqs.fasta'},
  'V3-V5': {'qza': 'created_db/rdp_V3-V5_seqs.qza',
   'fasta': 'created_db/rdp_V3-V5_seqs.fasta'}},
 'silva': {'V4': {'qza': 'created_db/silva_V4_seqs.qza',
   'fasta': 'created_db/silva_V4_seqs.fasta'},
  'V3-V4': {'qza': 'created_db/silva_V3-V4_seqs.qza',
   'fasta': 'created_db/silva_V3-V4

# Dereplicate

Because we have extracted only a single region from each sequence, it is more likely that some of the resulting sequences are redundant. We will remove entries that share both the same taxonomy and the same sequence.

In [14]:
# Dereplicate every extracted region and record, per (database, region), the
# paths of the plain-text companions of the resulting artifacts:
#   dbs_extracted[db][v_region] = {'seq': <...>.fasta, 'taxa': <...>.txt}
dbs_extracted = {}

for db, region_files in out_seqs_files.items():
    dbs_extracted[db] = {}
    for v_region, produced in region_files.items():

        print(f"Dereplicating {db} {v_region}...")

        # Taxonomy is read from the full-length database and written next to
        # the extracted sequences as <prefix>_taxa.qza.
        full_taxa_qza = dbs[db]['taxa']
        region_taxa_qza = f"{db_out_prefixes[db][v_region]}_taxa.qza"

        # The extracted sequence artifact is used as both input and output,
        # i.e. the same path is passed for seqs_in and seqs_out.
        region_seqs_qza = produced['qza']

        new_approach.dereplicate(
            taxa_in=full_taxa_qza,
            taxa_out=region_taxa_qza,
            seqs_in=region_seqs_qza,
            seqs_out=region_seqs_qza,
            force=False,
        )

        # Same stems with the extension swapped.
        # NOTE(review): assumes these .fasta/.txt files are exported alongside
        # the .qza artifacts — confirm in new_approach.
        dbs_extracted[db][v_region] = {
            'seq': os.path.splitext(region_seqs_qza)[0] + ".fasta",
            'taxa': os.path.splitext(region_taxa_qza)[0] + ".txt",
        }

Dereplicating gsrv V4...
Dereplicating gsrv V3-V4...
Dereplicating gsrv V1-V3...
Dereplicating gsrv V3-V5...
Dereplicating rdp V4...
Dereplicating rdp V3-V4...
Dereplicating rdp V1-V3...
Dereplicating rdp V3-V5...
Dereplicating silva V4...
Dereplicating silva V3-V4...
Dereplicating silva V1-V3...
Dereplicating silva V3-V5...
Dereplicating gg V4...
Dereplicating gg V3-V4...
Dereplicating gg V1-V3...
Dereplicating gg V3-V5...
Dereplicating gtdb V4...
Saved FeatureData[Sequence] to: created_db/gtdb_V4_seqs.qza
Saved FeatureData[Taxonomy] to: created_db/gtdb_V4_taxa.qza
Dereplicating gtdb V3-V4...
Saved FeatureData[Sequence] to: created_db/gtdb_V3-V4_seqs.qza
Saved FeatureData[Taxonomy] to: created_db/gtdb_V3-V4_taxa.qza
Dereplicating gtdb V1-V3...
Saved FeatureData[Sequence] to: created_db/gtdb_V1-V3_seqs.qza
Saved FeatureData[Taxonomy] to: created_db/gtdb_V1-V3_taxa.qza
Dereplicating gtdb V3-V5...
Saved FeatureData[Sequence] to: created_db/gtdb_V3-V5_seqs.qza
Saved FeatureData[Taxonomy] 

Check numbers

In [15]:
for db, items in dbs_extracted.items():
    print(f"{db} full-16S")
    taxa = f"{os.path.splitext(dbs[db]['taxa'])[0]}.txt"
    !wc -l {taxa}
    for v_region, files in items.items():
        print(f"{db} {v_region}")
        !wc -l {files['taxa']}

gsrv full-16S
90409 created_db/gsrv_full-16S_taxa.txt
gsrv V4
48532 created_db/gsrv_V4_taxa.txt
gsrv V3-V4
56617 created_db/gsrv_V3-V4_taxa.txt
gsrv V1-V3
28567 created_db/gsrv_V1-V3_taxa.txt
gsrv V3-V5
62582 created_db/gsrv_V3-V5_taxa.txt
rdp full-16S
21196 original_db/rdp_16S_taxa_spformat.txt
rdp V4
18247 created_db/rdp_V4_taxa.txt
rdp V3-V4
18352 created_db/rdp_V3-V4_taxa.txt
rdp V1-V3
6921 created_db/rdp_V1-V3_taxa.txt
rdp V3-V5
19219 created_db/rdp_V3-V5_taxa.txt
silva full-16S
436681 original_db/silva_138_NR99_taxa.txt
silva V4
278462 created_db/silva_V4_taxa.txt
silva V3-V4
297008 created_db/silva_V3-V4_taxa.txt
silva V1-V3
152013 created_db/silva_V1-V3_taxa.txt
silva V3-V5
332547 created_db/silva_V3-V5_taxa.txt
gg full-16S
203453 original_db/gg_13_8_NR99_taxa_spformat.txt
gg V4
156746 created_db/gg_V4_taxa.txt
gg V3-V4
174922 created_db/gg_V3-V4_taxa.txt
gg V1-V3
94396 created_db/gg_V1-V3_taxa.txt
gg V3-V5
190252 created_db/gg_V3-V5_taxa.txt
gtdb full-16S
569539 original_db/gt

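We can see that, after dereplication, we have lost a large fraction of the database entries, up to about half in some cases (e.g. gsrv V4: 90,409 → 48,532 lines), because many entries share the same sequence and taxonomy once they are restricted to a single region such as V4.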