Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for non mapped accession with taxid #1

Merged
merged 7 commits into from
Jan 31, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
ftputil
peewee==2.8.1
psycopg2
PyMySQL
nose
99 changes: 72 additions & 27 deletions taxadb/accession.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
# -*- coding: utf-8 -*-

from taxadb.schema import *
import sys


def taxid(acc_number_list, db_name, table):
"""given a list of acession numbers, yield
"""given a list of accession numbers, yield
the accession number and their associated taxids as tuples

Arguments:
Expand All @@ -16,11 +17,14 @@ def taxid(acc_number_list, db_name, table):
database = pw.SqliteDatabase(db_name)
db.initialize(database)
db.connect()
_check_table_exists(table)
with db.atomic():
query = table.select().where(table.accession << acc_number_list)
# taxid = table.get(table.accession == acc_number)
for i in query:
yield (i.accession, i.taxid.ncbi_taxid)
try:
yield (i.accession, i.taxid.ncbi_taxid)
except Taxa.DoesNotExist:
_unmapped_taxid(i.accession)
db.close()


Expand All @@ -36,11 +40,14 @@ def sci_name(acc_number_list, db_name, table):
database = pw.SqliteDatabase(db_name)
db.initialize(database)
db.connect()
_check_table_exists(table)
with db.atomic():
query = table.select().where(table.accession << acc_number_list)
# taxid = table.get(table.accession == acc_number)
for i in query:
yield (i.accession, i.taxid.tax_name)
try:
yield (i.accession, i.taxid.tax_name)
except Taxa.DoesNotExist:
_unmapped_taxid(i.accession)
db.close()


Expand All @@ -56,21 +63,25 @@ def lineage_id(acc_number_list, db_name, table):
database = pw.SqliteDatabase(db_name)
db.initialize(database)
db.connect()
_check_table_exists(table)
with db.atomic():
query = table.select().where(table.accession << acc_number_list)
for i in query:
lineage_list = []
current_lineage = i.taxid.tax_name
current_lineage_id = i.taxid.ncbi_taxid
parent = i.taxid.parent_taxid
while current_lineage != 'root':
lineage_list.append(current_lineage_id)
new_query = Taxa.get(Taxa.ncbi_taxid == parent)

current_lineage = new_query.tax_name
current_lineage_id = new_query.ncbi_taxid
parent = new_query.parent_taxid
yield (i.accession, lineage_list)
try:
lineage_list = []
current_lineage = i.taxid.tax_name
current_lineage_id = i.taxid.ncbi_taxid
parent = i.taxid.parent_taxid
while current_lineage != 'root':
lineage_list.append(current_lineage_id)
new_query = Taxa.get(Taxa.ncbi_taxid == parent)

current_lineage = new_query.tax_name
current_lineage_id = new_query.ncbi_taxid
parent = new_query.parent_taxid
yield (i.accession, lineage_list)
except Taxa.DoesNotExist:
_unmapped_taxid(i.accession)
db.close()


Expand All @@ -86,17 +97,51 @@ def lineage_name(acc_number_list, db_name, table):
database = pw.SqliteDatabase(db_name)
db.initialize(database)
db.connect()
_check_table_exists(table)
with db.atomic():
query = table.select().where(table.accession << acc_number_list)
for i in query:
lineage_list = []
current_lineage = i.taxid.tax_name
parent = i.taxid.parent_taxid
while current_lineage != 'root':
print(current_lineage)
lineage_list.append(current_lineage)
new_query = Taxa.get(Taxa.ncbi_taxid == parent)
current_lineage = new_query.tax_name
parent = new_query.parent_taxid
yield (i.accession, lineage_list)
try:
lineage_list = []
current_lineage = i.taxid.tax_name
parent = i.taxid.parent_taxid
while current_lineage != 'root':
print(current_lineage)
lineage_list.append(current_lineage)
new_query = Taxa.get(Taxa.ncbi_taxid == parent)
current_lineage = new_query.tax_name
parent = new_query.parent_taxid
yield (i.accession, lineage_list)
except Taxa.DoesNotExist:
_unmapped_taxid(i.accession)
db.close()


def _check_table_exists(table):
"""Check a table exists in the database

Arguments:
table -- table name
Throws `SystemExit` if table does not exist
"""
if not table.table_exists():
print("Table %s does not exist" % (str(table._meta.db_table)), file=sys.stderr)
sys.exit(1)
return True

def _unmapped_taxid(acc, exit=False):
"""Prints an error message on stderr an accession number is not mapped with a taxid

Source ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/README
>> If for some reason the source organism cannot be mapped to the taxonomy database,
the column will contain 0.<<

Arguments:
acc -- Accession number not mapped with taxid
exit -- Exit with code 1, default False
"""
print("No taxid mapped for accession %s" % str(acc), file=sys.stderr)
if exit:
sys.exit(1)
return True

40 changes: 28 additions & 12 deletions taxadb/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,13 @@
import peewee as pw

import os
import gzip
import tarfile
import ftputil
import argparse

from taxadb import util
from taxadb import parse

from taxadb import accession
from taxadb import taxid

from taxadb.schema import *


Expand Down Expand Up @@ -87,6 +83,15 @@ def create_db(args):
user=args.username,
password=args.password
)
elif args.dbtype == 'postgres':
if args.username is None or args.password is None:
print('--dbtype postgres requires --username and --password.\n')
database = pw.PostgresqlDatabase(
args.dbname,
user=args.username,
password=args.password
)

div = args.division # am lazy at typing
db.initialize(database)

Expand Down Expand Up @@ -120,16 +125,19 @@ def create_db(args):
)
# insert in database
with db.atomic():
for i in range(0, len(taxa_info_list), 500):
Taxa.insert_many(taxa_info_list[i:i+500]).execute()
for i in range(0, len(taxa_info_list), args.chunk):
Taxa.insert_many(taxa_info_list[i:i+args.chunk]).execute()
print('Taxa: completed')

with db.atomic():
for table, acc_file in acc_dl_dict.items():
for data_dict in parse.accession2taxid(
args.input + '/' + acc_file):
table.create(**data_dict)
args.input + '/' + acc_file, args.chunk):
table.insert_many(data_dict[0:args.chunk]).execute()
print('%s: %s added to database' % (table, acc_file))
print('Creating index for field accession ... ', end="")
db.create_index(table, ['accession'], unique=True)
print('created.')
print('Sequence: completed')
db.close()

Expand Down Expand Up @@ -171,6 +179,14 @@ def main():
description='build the database',
help='build the database'
)
parser_create.add_argument(
'--chunk',
'-c',
metavar='<#chunk>',
type=int,
help='Number of sequences to insert in bulk',
default=500
)
parser_create.add_argument(
'--input',
'-i',
Expand All @@ -188,9 +204,9 @@ def main():
parser_create.add_argument(
'--dbtype',
'-t',
choices=['sqlite', 'mysql'],
choices=['sqlite', 'mysql', 'postgres'],
default='sqlite',
metavar='[sqlite|mysql]',
metavar='[sqlite|mysql|postgres]',
help='type of the database (default: %(default)s))'
)
parser_create.add_argument(
Expand All @@ -204,12 +220,12 @@ def main():
parser_create.add_argument(
'--username',
'-u',
help='Username to login as (required for MySQLdatabase)'
help='Username to login as (required for MySQLdatabase and PostgreSQLdatabase)'
)
parser_create.add_argument(
'--password',
'-p',
help='Password to use (required for MySQLdatabase)'
help='Password to use (required for MySQLdatabase and PostgreSQLdatabase)'
)
parser_create.set_defaults(func=create_db)

Expand Down
58 changes: 48 additions & 10 deletions taxadb/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-

import gzip

from taxadb.schema import Taxa

def taxdump(nodes_file, names_file):
"""Parse the nodes.dmp and names.dmp files (from taxdump.tgz) and insert
Expand Down Expand Up @@ -41,27 +41,65 @@ def taxdump(nodes_file, names_file):

# merge the two dictionaries
taxa_info_list = list()
taxa_info = {}
for nodes, names in zip(nodes_data, names_data):
taxa_info = {**nodes, **names} # PEP 448, requires python 3.5
taxa_info_list.append(taxa_info)
print('merge successful')
return taxa_info_list


def accession2taxid(acc2taxid):
"""Parse the accession2taxid files. and insert
squences in the Sequence table.
#def accession2taxid(acc2taxid):
# """Parse the accession2taxid files. and insert
# squences in the Sequence table.
#
# Arguments:
# acc2taxid -- input file (gzipped)
# """
# with gzip.open(acc2taxid, 'rb') as f:
# f.readline() # discard the header
# for line in f:
# line_list = line.decode().rstrip('\n').split('\t')
# data_dict = {
# 'accession': line_list[0],
# 'taxid': line_list[2]
# }
# yield(data_dict)


def accession2taxid(acc2taxid, chunk):
"""Parses the accession2taxid files and insert sequences in Sequences table(s).

Arguments:
acc2taxid -- input file (gzipped)
chunk -- Chunk size of entries to gather before yielding, default 500
"""
# Some accessions (e.g.: AAA22826) have a taxid = 0
entries = []
counter = 0
taxids = {}
if not chunk:
chunk = 500
with gzip.open(acc2taxid, 'rb') as f:
f.readline() # discard the header
for line in f:
line_list = line.decode().rstrip('\n').split('\t')
data_dict = {
'accession': line_list[0],
'taxid': line_list[2]
}
yield(data_dict)
if not line_list[2] in taxids:
try:
Taxa.get(Taxa.ncbi_taxid == int(line_list[2]))
taxids[line_list[2]] = True
except Taxa.DoesNotExist:
taxids[line_list[2]] = False
continue
if taxids[line_list[2]]:
data_dict = {
'accession': line_list[0],
'taxid': line_list[2]
}
entries.append(data_dict)
counter += 1
if counter == chunk:
yield(entries)
entries = []
counter = 0
if len(entries):
yield(entries)