In [6]:
#imports
import pandas as pd
import wget
import os
import gzip
from os.path import isfile
import time
import json
#re.compile and re.findall
import re
#different package for downloading files from ncbi's ftp server
import ftplib

#ALL FUNCTIONS
#summary_file_path = filepath to local assembly_summary_refseq.txt file --> STATIC FILE
#refseq_level_checklist = list of assembly_levels e.g. ['Complete Genome'] --> USER INPUT
def read_current_assembly_summary_with_pandas(summary_file_path, refseq_level_checklist):
    """Parse a RefSeq assembly summary file and keep the requested assembly levels.

    The ``ftp_path`` column of the result does not hold the assembly directory
    any more; it is rewritten to point at the ``<assembly>_protein.faa.gz`` file
    inside that directory.

    Parameters
    ----------
    summary_file_path : str
        Path to a local ``assembly_summary_refseq.txt`` file.
    refseq_level_checklist : list of str
        Assembly levels to keep, e.g. ``['Complete Genome', 'Chromosome']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``assembly_accession``, ``organism_name``, ``taxid``,
        ``species_taxid``, ``assembly_level`` and ``ftp_path``. Rows are grouped
        by assembly level, in the order given by ``refseq_level_checklist``.

    Raises
    ------
    Exception
        If the file cannot be parsed, or if filtering fails (``pd.concat``
        rejects an empty ``refseq_level_checklist``, for example).
    """
    #turn an assembly directory url into the url of its protein fasta archive
    def set_protein_assembly_file(ftp_path):
        assembly_name = ftp_path.split('/')[-1]
        return ftp_path + '/' + str(assembly_name) + '_protein.faa.gz'

    #init parsing refseq table with pandas
    try:
        #the first two rows only describe the file, so they are skipped
        refseq_table = pd.read_table(summary_file_path, skiprows=[0, 1], header=None)
        header = ["assembly_accession", "bioproject", "biosample", "wgs_master", "refseq_category", "taxid",
                  "species_taxid", "organism_name", "infraspecific_name", "isolate", "version_status", "assembly_level",
                  "release_type", "genome_rep", "seq_rel_date", "asm_name", "submitter", "gbrs_paired_asm",
                  "paired_asm_comp", "ftp_path", "excluded_from_refseq", "relation_to_type_material"]
        refseq_table.columns = header
    except Exception as e:
        raise Exception("[-] Exception during pandas parsing of assembly_summary_refseq.txt file ...\n\tException: {}".format(e)) from e

    #extract necessary data fields: assembly number, names, taxids and the correct ftp_filepath for downloading with gzip
    try:
        #explicit copy: the column assignment below must not act on a slice of the parsed table
        refseq_table = refseq_table[['assembly_accession', 'organism_name', 'taxid', 'species_taxid',
                                     'assembly_level', 'ftp_path']].copy()
        refseq_table['ftp_path'] = refseq_table['ftp_path'].apply(set_protein_assembly_file)

        #one sub-table per requested level, concatenated in checklist order
        pandas_genome_level_dataframes = [
            refseq_table[refseq_table['assembly_level'] == genome_level]
            for genome_level in refseq_level_checklist
        ]
        desired_refseq_genomes_dataframe = pd.concat(pandas_genome_level_dataframes)
    except Exception as e:
        raise Exception("[-] Exception during filtering for assembly levels from refseq_table dataframe ...\n\tException: {}".format(e)) from e
    return desired_refseq_genomes_dataframe

#filter refseq table with taxids obtained by the get_species_taxids.sh script
def read_taxonomy_table(filepath):
    """Load a taxonomy id file into a DataFrame with one ``species_taxid`` column.

    The file is read without a header row.

    Raises
    ------
    Exception
        If ``filepath`` does not point to an existing file.
    """
    if not isfile(filepath):
        raise Exception("[-] There is no taxonomy file called: {}".format(filepath))

    species_taxids = pd.read_table(filepath, header=None)
    #the column is named species_taxid (not taxid) because that is the key the refseq table is merged on;
    #see the README of the refseq summary file for the difference between the two ids
    species_taxids.columns = ['species_taxid']
    return species_taxids

def filter_table_by_taxonomy(refseq_table,taxonomy_table):
    """Keep only the refseq rows whose ``species_taxid`` occurs in ``taxonomy_table``.

    Both tables need a ``species_taxid`` column; the result is their inner join
    on that column.
    """
    return pd.merge(refseq_table, taxonomy_table, how='inner', on='species_taxid')

#returns the number of genomes that were removed by the filtering step
def reduction_amount(refseq_table,filtered_table):
    """Return how many entries were removed when ``refseq_table`` was filtered.

    Parameters
    ----------
    refseq_table : sized
        The table before filtering.
    filtered_table : sized
        The table after filtering.

    Returns
    -------
    int
        ``len(refseq_table) - len(filtered_table)``. Zero means the filter
        removed nothing. The value is negative if the filtered table has more
        rows than the input, which an inner merge can produce when the
        taxonomy table contains repeated ids.

    Raises
    ------
    Exception
        If ``filtered_table`` is empty, i.e. no data remains after filtering.
    """
    #the error is about data remaining, so test the filtered table itself
    #instead of the size of the reduction
    if len(filtered_table) == 0:
        raise Exception("[-] After filtering, there is no data remaining!")
    return len(refseq_table) - len(filtered_table)

#downloading genomes with wget 
def download_genome_from_ftp_path(ftp_path):
    """Download ``ftp_path`` with wget and return the value ``wget.download`` returns.

    Returns ``None`` when the download fails; the reason is printed so that a
    failed download does not go unnoticed.
    """
    try:
        return wget.download(ftp_path)
    except Exception as e:
        #catch Exception instead of using a bare except, so KeyboardInterrupt and SystemExit still stop the run
        print("[-] download failed for {}: {}".format(ftp_path, e))
        return None

#function with two responsibilities in order to save memory usage
def extract_downloaded_file_and_write_taxid_file(genome_downloaded_file,taxid):
    """Decompress a downloaded protein fasta archive and record its accession ids.

    The function does two things so that the decompressed text is read only once:

    1. decompress ``genome_downloaded_file`` into
       ``<genome_downloaded_file>.decompressed.faa`` and delete the archive;
    2. append one ``taxid<TAB>accession`` line per fasta header to
       ``acc_taxid_map.table`` in the current working directory.

    Parameters
    ----------
    genome_downloaded_file : str
        Path to the gzip compressed fasta file.
    taxid
        Taxonomic id written in front of every accession id.

    Returns
    -------
    bool
        ``True`` on success.

    Raises
    ------
    Exception
        If decompression or writing the taxmap file fails.
    """
    decompressed_file = genome_downloaded_file + ".decompressed.faa"

    #decompression to fasta file
    try:
        with gzip.open(genome_downloaded_file, "rb") as compressed:
            fasta_text = compressed.read().decode("utf-8")
        #write with the same encoding that was used for decoding
        with open(decompressed_file, "w", encoding="utf-8") as output:
            output.write(fasta_text)
        #remove gz file
        os.remove(genome_downloaded_file)
    except Exception as e:
        raise Exception("[-] Exception during decompressing: {}".format(e)) from e

    #creation of taxmap file | taxid \t acc_id
    try:
        #anchored at line start: a '>' inside a description must not count as a header
        accession_id_pattern = re.compile(r'^>(\S*)', re.MULTILINE)
        with open('acc_taxid_map.table', 'a') as taxmap:
            for acc_id in accession_id_pattern.findall(fasta_text):
                taxmap.write(str(taxid) + "\t" + str(acc_id) + "\n")
        return True
    except Exception as e:
        raise Exception("[-] Exception during writing taxmap file: {}".format(e)) from e
        
def download_assemblies(filtered_table):
    """Download and unpack every assembly listed in ``filtered_table`` using wget.

    For each pair of ``ftp_path`` / ``species_taxid`` the archive is downloaded,
    decompressed and its accession ids are appended to the taxmap file.
    A failed download is reported and skipped.

    Raises
    ------
    Exception
        If anything other than a failed download goes wrong while processing
        the table.
    """
    try:
        for genome_url, taxid in zip(filtered_table['ftp_path'], filtered_table['species_taxid']):
            genome_file = download_genome_from_ftp_path(genome_url)
            #one second pause between two requests
            time.sleep(1)
            if genome_file:
                #only report success when a file actually arrived
                print("[+] downloaded genome: {}\n[+] taxonomic node: {}".format(genome_url, taxid))
                extract_downloaded_file_and_write_taxid_file(genome_file, taxid)
            else:
                print("[-] could not download genome: {}\n[-] taxonomic node: {}".format(genome_url, taxid))
    except Exception as e:
        print("[-] ERROR during download_assemblies")
        raise Exception("[-] Error during downloading assemblies with Exception: {}".format(e)) from e
            
def download_assemblies_with_ftplib(filtered_table):
    """Download and unpack every assembly listed in ``filtered_table`` using ftplib.

    One anonymous FTP session to ``ftp.ncbi.nlm.nih.gov`` is used for all
    downloads. Each archive is stored in the current working directory under
    its remote file name, then decompressed and registered in the taxmap file.

    Raises
    ------
    Exception
        If the server rejects a request with a permanent error
        (``ftplib.error_perm``). Other errors propagate unchanged.
    """
    def download_genome_assembly_ftplib(genome_url):
        #genome_url = entry in filtered_table['ftp_path']
        url_parts = genome_url.split('/')
        filename = url_parts[-1]
        #everything between the host name and the file name is the remote directory
        directory = '/' + '/'.join(url_parts[3:-1])
        try:
            ftp.cwd(directory)
            #the with block closes the local file even if the transfer fails
            with open(filename, 'wb') as localfile:
                ftp.retrbinary('RETR ' + filename, localfile.write, blocksize=1024)
        except ftplib.error_perm as e:
            raise Exception("[-] Error during downloading refseq file with Exception: {}".format(e)) from e
        #the file name is what extract_downloaded_file_and_write_taxid_file expects
        return filename

    #the with block ends the ftp session on success and on every kind of error
    with ftplib.FTP('ftp.ncbi.nlm.nih.gov') as ftp:
        ftp.login()

        for genome_url, taxid in zip(filtered_table['ftp_path'], filtered_table['species_taxid']):
            genome_file = download_genome_assembly_ftplib(genome_url)
            #one second pause between two requests
            time.sleep(1)
            print("[+] downloaded genome: {}\n[+] taxonomic node: {}".format(genome_url, taxid))
            if genome_file:
                extract_downloaded_file_and_write_taxid_file(genome_file, taxid)

# Test run NCBI refseq genome assembly download

In [2]:
#end-to-end test run: parse the summary, filter it by taxonomy, download the remaining assemblies
summary_file = "../refseq_summary/assembly_summary_refseq.txt"
if isfile(summary_file):
    start = time.time()
    print("[START]")
    refseq_table = read_current_assembly_summary_with_pandas(summary_file, ['Complete Genome', 'Chromosome'])
    taxonomy_table = read_taxonomy_table('../taxonomic_nodes/apes.taxid')
    filtered_table = filter_table_by_taxonomy(refseq_table, taxonomy_table)
    #table size minus the number of removed rows = number of rows that remain
    print("[*] reduced genome table to: {}".format(len(refseq_table) - reduction_amount(refseq_table, filtered_table)))
    print("[*] start to download assemblies ...")
    #download once only: also calling download_assemblies(filtered_table) (the wget variant)
    #would fetch every genome a second time and append duplicate rows to acc_taxid_map.table
    download_assemblies_with_ftplib(filtered_table)
    end = time.time()
    #the precision belongs inside round(), not in the format() arguments
    print("[+] this took {} seconds".format(round(end - start, 2)))
    print("[DONE]")
else:
    print("[ERROR]")
    print("\t[-] There is no assembly_summary_refseq.txt file available!")
    print("[*] Downloading refseq summary file:")
    #store the file where this cell looks for it, so that the next run takes the first branch
    os.makedirs(os.path.dirname(summary_file), exist_ok=True)
    wget.download('ftp://ftp.ncbi.nih.gov/genomes/refseq/assembly_summary_refseq.txt', out=summary_file)

[START]
[*] reduced genome table to: 6
[*] start to download assemblies ...
[+] downloaded genome: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_protein.faa.gz
[+] taxonomic node: 9606
[+] downloaded genome: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/880/755/GCF_002880755.1_Clint_PTRv2/GCF_002880755.1_Clint_PTRv2_protein.faa.gz
[+] taxonomic node: 9598
[+] downloaded genome: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/880/775/GCF_002880775.1_Susie_PABv2/GCF_002880775.1_Susie_PABv2_protein.faa.gz
[+] taxonomic node: 9601
[+] downloaded genome: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/006/542/625/GCF_006542625.1_Asia_NLE_v1/GCF_006542625.1_Asia_NLE_v1_protein.faa.gz
[+] taxonomic node: 61853
[+] downloaded genome: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/008/122/165/GCF_008122165.1_Kamilah_GGO_v0/GCF_008122165.1_Kamilah_GGO_v0_protein.faa.gz
[+] taxonomic node: 9593
[+] downloaded genome: ftp://ftp.ncbi.nlm.nih.g

In [5]:
os.listdir('../')

['.git',
 '.gitignore',
 'Dockerfile',
 'README.md',
 'refseq_summary',
 'requirements.txt',
 'scripts',
 'taxonomic_nodes']

# Looping over pandas dataframe

In [None]:
#row-wise variant: build the lookup structures with an explicit loop (timed for comparison)
start = time.time()
summary_dict = {}
accession = []
organism_name = []
for index, row in refseq_table[refseq_table['assembly_level'] == 'Complete Genome'].iterrows():
    #the last url segment is the assembly name, which prefixes the protein fasta file name
    protein_genome = row['ftp_path'].split('/')[-1]
    protein_genome = row['ftp_path'] + '/' + str(protein_genome) + '_protein.faa.gz'
    accession.append(row['assembly_accession'])
    organism_name.append(str(row['assembly_accession']) + " " + str(row['organism_name']))
    summary_dict[row['assembly_accession']] = [protein_genome, row['taxid'], row['species_taxid'], row['organism_name']]
html_input_list = tuple(zip(accession, organism_name))
end = time.time()
#the precision belongs inside round(), not in the format() arguments
print("[+] function took {} seconds".format(round(end - start, 2)))

# Pandas vectorized apply function

Transforming refseq summary table.

In [None]:
def set_protein_assembly_file(ftp_path):
    """Return the url of the ``*_protein.faa.gz`` file inside an assembly directory url.

    The file name is the last segment of ``ftp_path`` followed by
    ``_protein.faa.gz``.
    """
    assembly_name = ftp_path.rsplit('/', 1)[-1]
    return '{}/{}_protein.faa.gz'.format(ftp_path, assembly_name)

In [None]:
#vectorized variant: rewrite the whole ftp_path column with apply (timed for comparison)
#NOTE: this cell is not idempotent, every run appends the protein file name to ftp_path again;
#a table returned by read_current_assembly_summary_with_pandas already has the rewritten ftp_path
start = time.time()
#explicit copy, so the column assignment below does not act on a slice of the previous table
refseq_table = refseq_table[['assembly_accession', 'organism_name', 'taxid', 'species_taxid', 'assembly_level', 'ftp_path']].copy()
refseq_table['ftp_path'] = refseq_table['ftp_path'].apply(set_protein_assembly_file)
end = time.time()
#the precision belongs inside round(), not in the format() arguments
print("[+] function took {} seconds".format(round(end - start, 2)))

In [None]:
refseq_table[refseq_table['assembly_level'] == 'Complete Genome']

In [None]:
len(refseq_table[refseq_table['assembly_level'] == 'Contig'])

In [None]:
html_input_list = tuple(zip(refseq_table['assembly_accession'], refseq_table['organism_name']))

In [None]:
len(summary_dict.keys()) == len(refseq_table['assembly_accession'])

In [None]:
len(refseq_table['assembly_accession'])

Function that is used in the reciprocal BLAST software:

In [None]:
refseq_table_function_test = read_current_assembly_summary_with_pandas('./assembly_summary_refseq (1).txt',['Complete Genome','Chromosome'])
refseq_table_function_test.head()

## extract desired genome level

In [None]:
#split the table into one sub-table per assembly level and glue them back together,
#which yields the table grouped by level in order of first appearance
level_column = refseq_table['assembly_level']
refseq_level_checklist = list(level_column.unique())

pandas_genome_level_dataframes = []
for genome_level in refseq_level_checklist:
    level_mask = level_column == genome_level
    pandas_genome_level_dataframes.append(refseq_table[level_mask])

desired_refseq_genomes_dataframe = pd.concat(pandas_genome_level_dataframes)

In [None]:
len(desired_refseq_genomes_dataframe['assembly_level'])

In [None]:
desired_refseq_genomes_dataframe.head()

In [None]:
#serialize the table row by row (the former index becomes a regular field)
#and parse it back into a list of dictionaries
df = desired_refseq_genomes_dataframe
json_records = df.reset_index().to_json(orient='records')
data = json.loads(json_records)

In [None]:
desired_refseq_genomes_dataframe.head().to_json()

In [None]:
list(desired_refseq_genomes_dataframe.columns)

## Download genome assembly

In [None]:
#download and unpack the second 'Chromosome' level assembly as a single test case
chromosome_assemblies = refseq_table[refseq_table['assembly_level'] == "Chromosome"]
#.iloc selects by position; a plain [1] looks up the index label 1,
#which a filtered table only contains by coincidence
genome_url = chromosome_assemblies['ftp_path'].iloc[1]
taxid = chromosome_assemblies['species_taxid'].iloc[1]
genome_file = download_genome_from_ftp_path(genome_url)

if genome_file:
    print("[+] downloaded genome: {}\n[+] taxonomic node: {}".format(genome_url, taxid))
    extract_downloaded_file_and_write_taxid_file(genome_file, taxid)
else:
    #download_genome_from_ftp_path returns None on failure
    print("[-] could not download genome: {}\n[-] taxonomic node: {}".format(genome_url, taxid))

In [None]:
#manual decompression of a downloaded archive (same steps as in extract_downloaded_file_and_write_taxid_file)
#NOTE(review): genome_downloaded_file is not assigned in any visible cell, it has to point
#to an existing *_protein.faa.gz file before this cell is run
with gzip.open(genome_downloaded_file, "rb") as bytes_out:
    bytes_from_file = bytes_out.read()
line = bytes_from_file.decode("utf-8")

#both files are managed by with blocks, so they are closed even if reading or writing fails
with open(genome_downloaded_file + ".decompressed.faa", "w", encoding="utf-8") as output:
    output.write(line)

print("[*] bytes length: {} type: {}".format(len(bytes_from_file), type(bytes_from_file)))

In [None]:
#write one "taxid <TAB> accession id" row per fasta header of the decompressed text in `line`
#raw string: \S is a regex class, not a string escape; anchored at line start so that
#a '>' inside a description is not mistaken for a header
accession_id_pattern = re.compile(r'^>(\S*)', re.MULTILINE)
with open('acc_taxid_map.table', 'w') as taxmap:
    for acc_id in accession_id_pattern.findall(line):
        taxmap.write(str(taxid) + "\t" + str(acc_id) + "\n")

In [None]:
genome_file = genome_downloaded_file+'.decompressed.faa'

In [None]:
#os.remove(genome_downloaded_file)

# Regex for accession IDs

In [None]:
import re
#captures the accession id: everything between a leading '>' and the first whitespace
#raw string, because \S is a regex class and not a valid string escape
accession_id_pattern = re.compile(r'^>(\S*)')

In [None]:
#a header line matches, group 1 is the accession id ('test')
#uses accession_id_pattern, the only pattern this notebook defines (the name regex is never assigned)
res = re.match(accession_id_pattern, '>test hello')
res.group(1)

In [None]:
#a match object is truthy, so the pattern can be used directly as a header test
#uses accession_id_pattern, the only pattern this notebook defines (the name regex is never assigned)
if re.match(accession_id_pattern, '>test hello'):
    print("yes")

In [None]:
#a line without a leading '>' does not match, re.match returns None
#uses accession_id_pattern, the only pattern this notebook defines (the name regex is never assigned)
res = re.match(accession_id_pattern, 'test hello')
res is None

# read process information

In [1]:
import subprocess
import time
import psutil

In [8]:
import subprocess

#start get_species_taxids.sh in the background and collect its output in eubacteria.taxid
try:
    #'>' only redirects inside a shell; in an argument list it is handed to the
    #program as a literal argument, so the target file is passed as stdout instead
    with open('eubacteria.taxid', 'w') as taxid_output:
        pid = subprocess.Popen(['get_species_taxids.sh', '-t', 'Eubacteria'], stdout=taxid_output)
    #the child keeps its own handle to the file, the process is not waited for here
    print("[+] spawned new process: {}".format(pid.pid))
except Exception as e:
    print("[-] Exception occured: {}".format(e))

[-] Exception occured: [WinError 193] %1 ist keine zulässige Win32-Anwendung


In [5]:
pid.poll()

NameError: name 'pid' is not defined

In [66]:
pid.returncode

# limit by taxonomy

In [None]:
refseq_table.head()

In [None]:
taxonomy_file = pd.read_table('apes.taxid',header=None)

In [None]:
taxonomy_file.columns = ['species_taxid']

In [None]:
taxonomy_file.head()

In [None]:
filtered_table = refseq_table.merge(taxonomy_file,how='inner', on=['species_taxid'])

In [None]:
print("[+] filtered the original refseq table with {} entries against {} taxonomic nodes ...\n\t {} entries remain".format(len(refseq_table),len(taxonomy_file),len(filtered_table)))

In [None]:
refseq_table.head()

In [None]:
#same pipeline as a function based run: parse, read the taxonomy ids, filter
refseq_table = read_current_assembly_summary_with_pandas('./assembly_summary_refseq (1).txt', ['Complete Genome', 'Chromosome'])
taxonomy_table = read_taxonomy_table('cyanobacteria.taxid')
filtered_table = filter_table_by_taxonomy(refseq_table, taxonomy_table)

#reduction_amount is the number of removed rows; the message announces the number of
#remaining rows, so the reduction is subtracted from the original table size
print("[+] reduced genome table to: {}".format(len(refseq_table) - reduction_amount(refseq_table, filtered_table)))

# Create taxid file with the get_species_taxids.sh BLAST C++ script

In [32]:
import subprocess

#resolve the taxonomic node of 'eubacteria' and write all of its species level nodes to eubacteria.taxid
#NOTE: get_species_taxid is defined in a later cell of this notebook and has to be run first
try:
    taxid = get_species_taxid('lukas.becker@hhu.de', 'eubacteria')[0]
    print("[*] Trying to get species level taxonomic nodes for {}".format(taxid))
    #the with block closes the output file even if the script cannot be started
    with open('eubacteria.taxid', 'w') as output:
        #argument list without a shell: the taxid is never interpreted as shell syntax
        pid = subprocess.Popen(['get_species_taxids.sh', '-t', str(taxid)], stdout=output)
        print("[+] spawned new process: {}".format(pid.pid))
        #Popen.wait() is the portable way to wait for the child process
        pid.wait()
except Exception as e:
    print("[-] Exception occured: {}".format(e))

[*] Trying to get species level taxonomic nodes for 2
[+] spawned new process: 2616


In [8]:
import subprocess
from Bio import Entrez

def get_species_taxid(user_email,scientific_name):
    """Look up a scientific name in the NCBI taxonomy database via Entrez.

    Parameters
    ----------
    user_email : str
        Assigned to ``Entrez.email`` before the query is sent.
    scientific_name : str
        Search term for the taxonomy database.

    Returns
    -------
    tuple
        ``(taxid, translation)``: the first entry of the result's ``IdList``
        and the result's ``QueryTranslation``.

    Raises
    ------
    ValueError
        If the query fails or returns no id.
    """
    try:
        Entrez.email = user_email
        search = Entrez.esearch(term=scientific_name, db="taxonomy", retmode="xml")
        try:
            record = Entrez.read(search)
        finally:
            #release the result handle, also when parsing fails
            search.close()
        taxid = record['IdList'][0]
        translation = record['QueryTranslation']
    except Exception as e:
        raise ValueError("[-] There is no taxonomic node defined by your specified scientific name: {} Exception: {}".format(scientific_name,e)) from e
    return taxid, translation

def write_species_taxid_file(user_email,scientific_name):
    """Write the species level taxonomic nodes of ``scientific_name`` to a ``.taxid`` file.

    The taxonomic node is resolved with ``get_species_taxid``; the output of
    ``get_species_taxids.sh -t <taxid>`` is then stored in
    ``../taxonomic_nodes/<scientific_name>.taxid``. The function returns after
    the script has finished.

    Errors while creating the file or running the script are printed, not
    raised. A failing lookup of ``scientific_name`` raises the ``ValueError``
    of ``get_species_taxid``.
    """
    #a single lookup is enough; it stays outside the try block so an unknown name is still raised to the caller
    taxid = get_species_taxid(user_email, scientific_name)[0]
    try:
        output_filename = '../taxonomic_nodes/' + scientific_name + '.taxid'
        print("[*] Trying to get species level taxonomic nodes for {}".format(taxid))
        #the with block closes the output file even if the script cannot be started
        with open(output_filename, 'w') as output:
            #argument list without a shell: the taxid is never interpreted as shell syntax
            pid = subprocess.Popen(['get_species_taxids.sh', '-t', str(taxid)], stdout=output)
            print("\t[+] spawned new process: {}".format(pid.pid))
            #Popen.wait() is the portable way to wait for the child process
            pid.wait()
    except Exception as e:
        print("[-] Exception occured: {}".format(e))

In [9]:
write_species_taxid_file('lukas.becker@hhu.de','bees')

[*] Trying to get species level taxonomic nodes for 34735
	[+] spawned new process: 606


# Django transactions

In [13]:
database_description = "Cyanobacteria Complete Genome"

In [14]:
path_to_database_file = 'media/' + 'databases/' + 'refseq_databases/' + database_description.replace(' ','_').upper() + '.database.faa'
path_to_database_file

'media/databases/refseq_databases/CYANOBACTERIA_COMPLETE_GENOME.database.faa'