# Comparison of genomic databases for P. putida

In this project I will gather the genomic annotation information available in common databases (NCBI, UniProt, etc.) in order to assess the proportion of well-characterized and poorly characterized genes found in the genome of P. putida, a synthetic biology workhorse.

## Retrieval of data

### Prepare file structure

In [None]:
# imports
from ftplib import FTP
import os
import requests
import re

# preparing the directory tree (paths are relative to the current working
# directory)
data_path = "./data/"
out_path = "./output/"

# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking os.path.exists() and creating the directory; it also matches how
# the download cells below create their own sub-directories.
os.makedirs(data_path, exist_ok=True)
os.makedirs(out_path, exist_ok=True)

#### Download data from NCBI

Genome annotation (AE015451.2) - GenBank

In [None]:
# Download every entry of the GenBank assembly directory whose name ends in
# "756v2" from the NCBI FTP server into data/ncbi.

# creates the local directory where files will be downloaded
os.makedirs("data/ncbi", exist_ok=True)

# remote directory listing all assembly versions
directory = "/genomes/genbank/bacteria/Pseudomonas_putida/all_assembly_versions/"

# The context manager sends QUIT and closes the socket even when a command or
# a download raises, so a failed run no longer leaks the connection.
with FTP('ftp.ncbi.nlm.nih.gov') as ncbi:
    ncbi.login()  # no credentials given: ftplib logs in anonymously
    ncbi.cwd(directory)

    # Select the assembly directory explicitly. Stopping at the first match
    # and failing loudly when there is none prevents the download loop below
    # from running against the parent directory by accident.
    assembly = next(
        (name for name in ncbi.nlst() if name.endswith("756v2")),
        None,
    )
    if assembly is None:
        raise FileNotFoundError(
            f"no assembly directory ending in '756v2' found in {directory}"
        )
    ncbi.cwd(assembly)

    # downloading the reference files
    files = ncbi.nlst()

    for file in files:
        local_filename = f"data/ncbi/{file}"

        # binary mode: the bytes are written exactly as received
        with open(local_filename, "wb") as f:
            ncbi.retrbinary(f"RETR {file}", f.write)

'221 Goodbye.'

RefSeq (NC_002947.4)

In [None]:
# Download every entry of the RefSeq assembly directory
# GCF_000007565.2_ASM756v2 from the NCBI FTP server into data/refseq.

# creates the local directory where files will be downloaded
os.makedirs("data/refseq", exist_ok=True)

# remote directory where the reference genome is located
directory = "/genomes/all/GCF/000/007/565/GCF_000007565.2_ASM756v2/"

# The context manager sends QUIT and closes the socket even when a command or
# a download raises, so a failed run no longer leaks the connection.
with FTP('ftp.ncbi.nlm.nih.gov') as ncbi:
    ncbi.login()  # no credentials given: ftplib logs in anonymously
    ncbi.cwd(directory)

    # downloading the reference files
    files = ncbi.nlst()

    for file in files:
        local_filename = f"data/refseq/{file}"

        # binary mode: the bytes are written exactly as received
        with open(local_filename, "wb") as f:
            ncbi.retrbinary(f"RETR {file}", f.write)

'221 Goodbye.'