Skip to content

Commit

Permalink
Merge pull request #1 from horkko/taxid_0
Browse files Browse the repository at this point in the history
Support for non mapped accession with taxid
  • Loading branch information
HadrienG committed Jan 31, 2017
2 parents 2f6a6c6 + 2b0e471 commit fcb37e6
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 49 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
ftputil
peewee==2.8.1
psycopg2
PyMySQL
nose
99 changes: 72 additions & 27 deletions taxadb/accession.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
# -*- coding: utf-8 -*-

from taxadb.schema import *
import sys


def taxid(acc_number_list, db_name, table):
"""given a list of acession numbers, yield
"""given a list of accession numbers, yield
the accession number and their associated taxids as tuples
Arguments:
Expand All @@ -16,11 +17,14 @@ def taxid(acc_number_list, db_name, table):
database = pw.SqliteDatabase(db_name)
db.initialize(database)
db.connect()
_check_table_exists(table)
with db.atomic():
query = table.select().where(table.accession << acc_number_list)
# taxid = table.get(table.accession == acc_number)
for i in query:
yield (i.accession, i.taxid.ncbi_taxid)
try:
yield (i.accession, i.taxid.ncbi_taxid)
except Taxa.DoesNotExist:
_unmapped_taxid(i.accession)
db.close()


Expand All @@ -36,11 +40,14 @@ def sci_name(acc_number_list, db_name, table):
database = pw.SqliteDatabase(db_name)
db.initialize(database)
db.connect()
_check_table_exists(table)
with db.atomic():
query = table.select().where(table.accession << acc_number_list)
# taxid = table.get(table.accession == acc_number)
for i in query:
yield (i.accession, i.taxid.tax_name)
try:
yield (i.accession, i.taxid.tax_name)
except Taxa.DoesNotExist:
_unmapped_taxid(i.accession)
db.close()


Expand All @@ -56,21 +63,25 @@ def lineage_id(acc_number_list, db_name, table):
database = pw.SqliteDatabase(db_name)
db.initialize(database)
db.connect()
_check_table_exists(table)
with db.atomic():
query = table.select().where(table.accession << acc_number_list)
for i in query:
lineage_list = []
current_lineage = i.taxid.tax_name
current_lineage_id = i.taxid.ncbi_taxid
parent = i.taxid.parent_taxid
while current_lineage != 'root':
lineage_list.append(current_lineage_id)
new_query = Taxa.get(Taxa.ncbi_taxid == parent)

current_lineage = new_query.tax_name
current_lineage_id = new_query.ncbi_taxid
parent = new_query.parent_taxid
yield (i.accession, lineage_list)
try:
lineage_list = []
current_lineage = i.taxid.tax_name
current_lineage_id = i.taxid.ncbi_taxid
parent = i.taxid.parent_taxid
while current_lineage != 'root':
lineage_list.append(current_lineage_id)
new_query = Taxa.get(Taxa.ncbi_taxid == parent)

current_lineage = new_query.tax_name
current_lineage_id = new_query.ncbi_taxid
parent = new_query.parent_taxid
yield (i.accession, lineage_list)
except Taxa.DoesNotExist:
_unmapped_taxid(i.accession)
db.close()


Expand All @@ -86,17 +97,51 @@ def lineage_name(acc_number_list, db_name, table):
database = pw.SqliteDatabase(db_name)
db.initialize(database)
db.connect()
_check_table_exists(table)
with db.atomic():
query = table.select().where(table.accession << acc_number_list)
for i in query:
lineage_list = []
current_lineage = i.taxid.tax_name
parent = i.taxid.parent_taxid
while current_lineage != 'root':
print(current_lineage)
lineage_list.append(current_lineage)
new_query = Taxa.get(Taxa.ncbi_taxid == parent)
current_lineage = new_query.tax_name
parent = new_query.parent_taxid
yield (i.accession, lineage_list)
try:
lineage_list = []
current_lineage = i.taxid.tax_name
parent = i.taxid.parent_taxid
while current_lineage != 'root':
print(current_lineage)
lineage_list.append(current_lineage)
new_query = Taxa.get(Taxa.ncbi_taxid == parent)
current_lineage = new_query.tax_name
parent = new_query.parent_taxid
yield (i.accession, lineage_list)
except Taxa.DoesNotExist:
_unmapped_taxid(i.accession)
db.close()


def _check_table_exists(table):
"""Check a table exists in the database
Arguments:
table -- table name
Throws `SystemExit` if table does not exist
"""
if not table.table_exists():
print("Table %s does not exist" % (str(table._meta.db_table)), file=sys.stderr)
sys.exit(1)
return True

def _unmapped_taxid(acc, exit=False):
"""Prints an error message on stderr an accession number is not mapped with a taxid
Source ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/README
>> If for some reason the source organism cannot be mapped to the taxonomy database,
the column will contain 0.<<
Arguments:
acc -- Accession number not mapped with taxid
exit -- Exit with code 1, default False
"""
print("No taxid mapped for accession %s" % str(acc), file=sys.stderr)
if exit:
sys.exit(1)
return True

40 changes: 28 additions & 12 deletions taxadb/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,13 @@
import peewee as pw

import os
import gzip
import tarfile
import ftputil
import argparse

from taxadb import util
from taxadb import parse

from taxadb import accession
from taxadb import taxid

from taxadb.schema import *


Expand Down Expand Up @@ -87,6 +83,15 @@ def create_db(args):
user=args.username,
password=args.password
)
elif args.dbtype == 'postgres':
if args.username is None or args.password is None:
print('--dbtype postgres requires --username and --password.\n')
database = pw.PostgresqlDatabase(
args.dbname,
user=args.username,
password=args.password
)

div = args.division # am lazy at typing
db.initialize(database)

Expand Down Expand Up @@ -120,16 +125,19 @@ def create_db(args):
)
# insert in database
with db.atomic():
for i in range(0, len(taxa_info_list), 500):
Taxa.insert_many(taxa_info_list[i:i+500]).execute()
for i in range(0, len(taxa_info_list), args.chunk):
Taxa.insert_many(taxa_info_list[i:i+args.chunk]).execute()
print('Taxa: completed')

with db.atomic():
for table, acc_file in acc_dl_dict.items():
for data_dict in parse.accession2taxid(
args.input + '/' + acc_file):
table.create(**data_dict)
args.input + '/' + acc_file, args.chunk):
table.insert_many(data_dict[0:args.chunk]).execute()
print('%s: %s added to database' % (table, acc_file))
print('Creating index for field accession ... ', end="")
db.create_index(table, ['accession'], unique=True)
print('created.')
print('Sequence: completed')
db.close()

Expand Down Expand Up @@ -171,6 +179,14 @@ def main():
description='build the database',
help='build the database'
)
parser_create.add_argument(
'--chunk',
'-c',
metavar='<#chunk>',
type=int,
help='Number of sequences to insert in bulk',
default=500
)
parser_create.add_argument(
'--input',
'-i',
Expand All @@ -188,9 +204,9 @@ def main():
parser_create.add_argument(
'--dbtype',
'-t',
choices=['sqlite', 'mysql'],
choices=['sqlite', 'mysql', 'postgres'],
default='sqlite',
metavar='[sqlite|mysql]',
metavar='[sqlite|mysql|postgres]',
help='type of the database (default: %(default)s))'
)
parser_create.add_argument(
Expand All @@ -204,12 +220,12 @@ def main():
parser_create.add_argument(
'--username',
'-u',
help='Username to login as (required for MySQLdatabase)'
help='Username to login as (required for MySQLdatabase and PostgreSQLdatabase)'
)
parser_create.add_argument(
'--password',
'-p',
help='Password to use (required for MySQLdatabase)'
help='Password to use (required for MySQLdatabase and PostgreSQLdatabase)'
)
parser_create.set_defaults(func=create_db)

Expand Down
58 changes: 48 additions & 10 deletions taxadb/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-

import gzip

from taxadb.schema import Taxa

def taxdump(nodes_file, names_file):
"""Parse the nodes.dmp and names.dmp files (from taxdump.tgz) and insert
Expand Down Expand Up @@ -41,27 +41,65 @@ def taxdump(nodes_file, names_file):

# merge the two dictionaries
taxa_info_list = list()
taxa_info = {}
for nodes, names in zip(nodes_data, names_data):
taxa_info = {**nodes, **names} # PEP 448, requires python 3.5
taxa_info_list.append(taxa_info)
print('merge successful')
return taxa_info_list


def accession2taxid(acc2taxid):
"""Parse the accession2taxid files. and insert
squences in the Sequence table.
#def accession2taxid(acc2taxid):
# """Parse the accession2taxid files. and insert
# squences in the Sequence table.
#
# Arguments:
# acc2taxid -- input file (gzipped)
# """
# with gzip.open(acc2taxid, 'rb') as f:
# f.readline() # discard the header
# for line in f:
# line_list = line.decode().rstrip('\n').split('\t')
# data_dict = {
# 'accession': line_list[0],
# 'taxid': line_list[2]
# }
# yield(data_dict)


def accession2taxid(acc2taxid, chunk):
"""Parses the accession2taxid files and insert sequences in Sequences table(s).
Arguments:
acc2taxid -- input file (gzipped)
chunk -- Chunk size of entries to gather before yielding, default 500
"""
# Some accessions (e.g.: AAA22826) have a taxid = 0
entries = []
counter = 0
taxids = {}
if not chunk:
chunk = 500
with gzip.open(acc2taxid, 'rb') as f:
f.readline() # discard the header
for line in f:
line_list = line.decode().rstrip('\n').split('\t')
data_dict = {
'accession': line_list[0],
'taxid': line_list[2]
}
yield(data_dict)
if not line_list[2] in taxids:
try:
Taxa.get(Taxa.ncbi_taxid == int(line_list[2]))
taxids[line_list[2]] = True
except Taxa.DoesNotExist:
taxids[line_list[2]] = False
continue
if taxids[line_list[2]]:
data_dict = {
'accession': line_list[0],
'taxid': line_list[2]
}
entries.append(data_dict)
counter += 1
if counter == chunk:
yield(entries)
entries = []
counter = 0
if len(entries):
yield(entries)

0 comments on commit fcb37e6

Please sign in to comment.