# Comparison of genomic databases for P.putida

In this project I will gather genomic annotation information available in common databases (NCBI, Uniprot, etc.) in order to provide an assessment of the proportion of well- and poor- characterized genes found in the genome of P. putida, a synthetic biology workhorse.

This work is partly inspired by the original y-ome paper, published in 2020 by the Palsson group.

## 1. Retrieval of data

### Prepare file structure

In [2]:
# imports
from ftplib import FTP
import os
import requests
import re

# preparing the directory tree
data_path = "./data/"
out_path = "./output/"

if not os.path.exists(data_path):
  os.makedirs(data_path)

if not os.path.exists(out_path):
  os.makedirs(out_path)

#### Download data from NCBI

Genome annotation (ASM756v2) - TIGR

In [6]:
# creates the local directory where files will be downloaded
os.makedirs("data/ncbi", exist_ok=True)

# starting the ftp connection
ncbi = FTP('ftp.ncbi.nlm.nih.gov')
ncbi.login()

# changing the directory to where the reference genome is located 
directory = "/genomes/genbank/bacteria/Pseudomonas_putida/all_assembly_versions/"
ncbi.cwd(directory)

# downloading the reference files
files = ncbi.nlst()

for file in files:
  if file[-5:] == "756v2":
    ncbi.cwd(file)

files = ncbi.nlst()

for file in files:
    remote_filename = file
    local_filename = f"data/ncbi/{file}"

    with open(local_filename, "wb") as f:
        ncbi.retrbinary(f"RETR {remote_filename}", f.write)

# closing the connection
ncbi.quit()

'221 Goodbye.'

RefSeq (NC_002947.4)

In [5]:
# creates the local directory where files will be downloaded
os.makedirs("data/refseq", exist_ok=True)

# starting the ftp connection
ncbi = FTP('ftp.ncbi.nlm.nih.gov')
ncbi.login()

# changing the directory to where the reference genome is located 
directory = "/genomes/all/GCF/000/007/565/GCF_000007565.2_ASM756v2/"
ncbi.cwd(directory)

# downloading the reference files
files = ncbi.nlst()

for file in files:
    remote_filename = file
    local_filename = f"data/refseq/{file}"

    with open(local_filename, "wb") as f:
        ncbi.retrbinary(f"RETR {remote_filename}", f.write)

# closing the connection
ncbi.quit()

'221 Goodbye.'

Uniprot - Current release

In [7]:
# This codeblock downloads data from Uniprot

# creates the local directory where files will be downloaded
os.makedirs("data/uniprot", exist_ok=True)

# starting the ftp connection
uniprot = FTP('ftp.uniprot.org')
uniprot.login()

# changing the directory to where the reference genome is located 
directory = "pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Bacteria/UP000000556"
uniprot.cwd(directory)

files = uniprot.nlst()
for file in files:
    remote_file = file
    local_file = f"data/uniprot/{file}"

    #sets up the download
    with open(local_file, "wb") as f:
        uniprot.retrbinary(f"RETR {remote_file}", f.write)


# closing the connection
uniprot.quit()

'221 Goodbye.'

PseudomonasDB

In [9]:
# This codeblock downloads data from PseudomonasDB

# creates the local directory where files will be downloaded
os.makedirs("data/pseudomonasDB", exist_ok=True)

# This database does not have a FTP server, but they provide a direct link for downloads
urls = ["https://pseudomonas.com/downloads/pseudomonas/pgd_r_22_1/Pseudomonas_putida_KT2440_110/Pseudomonas_putida_KT2440_110.csv.gz", # gene annotations csv
        "https://pseudomonas.com/downloads/pseudomonas/pgd_r_22_1/Pseudomonas_putida_KT2440_110/Pseudomonas_putida_KT2440_110.gff.gz"] # gff file

for file in urls:
  out = re.search(r"(?<=110\/).+$", file)[0]
  print(out)
  
  response = requests.get(file)
  with open(f'data/pseudomonasDB/{out}', 'wb') as file:
    file.write(response.content)


Pseudomonas_putida_KT2440_110.csv.gz
Pseudomonas_putida_KT2440_110.gff.gz
