# E-Utilities

In [42]:
from Bio import SeqIO
import pandas as pd

In [2]:
!esearch -db pubmed -query "polysaccharides [MAJR:TIAB] AND symbiosis [MAJR:TIAB] AND review [PT]" | efetch -format docsum | xtract -pattern DocumentSummary -sep '\t' -element Id PubDate Source Author Title ELocationID > ../data/eps_in_symbiosis_reviews.txt

In [3]:
# The esearch output file has no header row, so the column labels are
# attached after reading (a length mismatch raises immediately).
cols = [
    'Id',
    'PubDate',
    'Source',
    'Title',
    'ElocationID',
]
sym_reviews = pd.read_csv(
    '../data/eps_in_symbiosis_reviews.txt',
    sep='\t',
    header=None,
)
sym_reviews.columns = cols
sym_reviews.head()

Unnamed: 0,Id,PubDate,Source,Title,ElocationID
0,34207734,2021 Jun 9,Int J Mol Sci,Rhizobial Exopolysaccharides: Genetic Regulati...,pii: 6233. doi: 10.3390/ijms22126233
1,33320473,2020 Mar 31,Postepy Biochem,Structures of rhizobial lipopolysaccharides an...,doi: 10.18388/pb.2020_316
2,33090749,2020 Oct 22,Acta Biochim Pol,Biological activity of Nod factors.,doi: 10.18388/abp.2020_5353
3,32037451,2020 Mar 1,FEMS Microbiol Lett,3D biofilms: in search of the polysaccharides ...,pii: fnaa023. doi: 10.1093/femsle/fnaa023
4,29624823,2018 Jun,Cell Microbiol,Cyclic β-glucans at the bacteria-host cells in...,doi: 10.1111/cmi.12850


# Working with databases

In [13]:
# List the search fields of the Assembly database (abbreviation + full name),
# e.g. ORGN (Organism) and ASLV (Assembly Level), which the queries below use.
!einfo -db assembly -fields

ACCN	Accession
ALL	All Fields
ALLN	All Names
ASAC	Assembly Accession
ASLV	Assembly Level
ASMM	Assembly Method
CL50	Contig L50
CN50	Contig N50
CNTG	Contig Count
COV	Coverage
DESC	Description
EXFV	Expected Final Version
FILT	Filter
FTYP	From Type Material
GCOV	Genome Coverage
GRLS	Date - GenBank Assembly Release
GUID	GenBank ID
INFR	Infraspecific Name
ISOL	Isolate
LEN	Total Sequence Length in Mbp
LINK	Linked Assembly
NAME	Assembly Name
NFRS	Excluded from RefSeq
ORGN	Organism
PLAC	Placed Scaffolds Count
PROJ	BioProject IDs and Accessions
PROP	Properties
RCAT	RefSeq Category
REPL	Chromosome Count
RGAS	Reference Guided Assembly
RLEN	Total Sequence Length
RRLS	Date - RefSeq Assembly Release
RTYP	Release Type
RUID	RefSeq ID
SAMP	BioSample
SCAM	Single Cell Amplification
SEX	Sex
SL50	Scaffold L50
SN50	Scaffold N50
SRDT	Date - Sequences Release
SUBO	Submitter Organization
TCS	Taxonomy Check Status
TECH	Sequencing Technology
TXID	Taxonomy ID
TYPE	Assemb

In [39]:
# Print assembly name and status for Curvibacter assemblies whose assembly
# level (ASLV) is either contig or chromosome.
!esearch -db assembly -query "Curvibacter[ORGN] AND (contig[ASLV] OR chromosome[ASLV])" | efetch -format docsum | xtract -pattern DocumentSummary -element AssemblyName AssemblyStatus

ASM2062199v1	Contig
ASM2024890v1	Contig
ASM1923305v1	Contig
ASM1847400v1	Contig
freshwater MAG --- B4coas_bin-1074	Contig
freshwater MAG --- AM-2014_bin-3866	Contig
freshwater MAG --- AM_bin-3052	Contig
freshwater MAG --- C24_bin-2693	Contig
freshwater MAG --- Kiruna_bin-07108	Contig
freshwater MAG --- MJ-time_bin-9276	Contig
freshwater MAG --- MJ-time_bin-3905	Contig
freshwater MAG --- Ki1-0-7m_bin-0950	Contig
freshwater MAG --- B-4_bin-480	Contig
freshwater MAG --- AM-2014-D1_bin-129	Contig
freshwater MAG --- C-2_bin-299	Contig
ASM429372v1	Contig
ASM263308v1	Contig
ASM216371v1	Chromosome
ASM179731v1	Contig
ASM159226v1	Contig


In [40]:
# Save a summary table of all Curvibacter assemblies (no assembly-level filter here).
# The eight -element fields are the columns that the next cell names when loading the file.
!esearch -db assembly -query "Curvibacter [ORGN]" | efetch -format docsum | xtract -pattern DocumentSummary -element Id AssemblyAccession AssemblyName AssemblyStatus Taxid Organism SpeciesTaxid SpeciesName > ../data/curvibacter_assemblies.table

In [45]:
# Column labels mirror the -element fields requested from xtract when the
# table was written; the file itself carries no header row.
cols = [
    'Id', 'AssemblyAccession', 'AssemblyName', 'AssemblyStatus',
    'Taxid', 'Organism', 'SpeciesTaxid', 'SpeciesName',
]
assemblies = pd.read_csv(
    '../data/curvibacter_assemblies.table',
    sep='\t',
    header=None,
)
assemblies.columns = cols
assemblies.head()

Unnamed: 0,Id,AssemblyAccession,AssemblyName,AssemblyStatus,Taxid,Organism,SpeciesTaxid,SpeciesName
0,11312701,GCA_020621995.1,ASM2062199v1,Contig,1888168,Curvibacter sp. (b-proteobacteria),1888168,Curvibacter sp.
1,11312681,GCA_020621965.1,ASM2062196v1,Scaffold,1888168,Curvibacter sp. (b-proteobacteria),1888168,Curvibacter sp.
2,11068931,GCA_020248905.1,ASM2024890v1,Contig,1888168,Curvibacter sp. (b-proteobacteria),1888168,Curvibacter sp.
3,10501171,GCA_019233055.1,ASM1923305v1,Contig,1888168,Curvibacter sp. (b-proteobacteria),1888168,Curvibacter sp.
4,10449501,GCA_019163435.1,ASM1916343v1,Scaffold,86182,Curvibacter lanceolatus (b-proteobacteria),86182,Curvibacter lanceolatus


## Linking PubMed queries to protein sequences

In [2]:
# Start from PubMed records matching "EPS", follow their links into the protein
# database, keep only Curvibacter entries, and store the sequences as FASTA.
# NOTE(review): "EPS" is a bare, unfielded term and may match unrelated uses of the
# abbreviation — consider a more specific query; confirm the intended scope.
!esearch -db pubmed -query "EPS" | elink -target protein | efilter -organism curvibacter | efetch -format fasta > ../data/curvibacter_eps_proteins.faa

# Query examples for the protein database

In [1]:
# Protein records matching "exopolysaccharide" for organism Curvibacter, saved as FASTA.
# This is the file that fhandle_eps points to further down.
!esearch -db protein -query "exopolysaccharide AND curvibacter [ORGN]" | efetch -format fasta > ../data/curvibacter_exopolysaccharides.fasta

In [2]:
# Protein records matching "lipopolysaccharides" for organism Curvibacter, saved as FASTA.
# This is the file that fhandle_lps points to further down.
!esearch -db protein -query "lipopolysaccharides AND curvibacter [ORGN]" | efetch -format fasta > ../data/curvibacter_lipopolysaccharides.fasta

In [3]:
# Sigma-factor proteins restricted to the single strain Curvibacter sp. AEP1-3, saved as FASTA.
# NOTE(review): no cell visible in this part of the notebook reads this file — confirm it is used later.
!esearch -db protein -query "sigma factor AND curvibacter sp. AEP1-3 [ORGN]" | efetch -format fasta > ../data/curvibacter_sigma_factors.faa

In [4]:
# Paths (plain strings, not open file handles) to the FASTA files written by the
# exopolysaccharide and lipopolysaccharide protein queries above.
fhandle_eps = '../data/curvibacter_exopolysaccharides.fasta'
fhandle_lps = '../data/curvibacter_lipopolysaccharides.fasta'

In [6]:
def fasta_file_to_dictionary_by_organism(fhandle, verbose=False):
    """Group the records of a FASTA file by the organism tag in their header.

    The organism is taken from each record's description as the text after the
    last '[' up to the next ']' (e.g. ``... [Curvibacter sp. AEP1-3]``). If the
    description contains no '[', the description up to its first ']' (normally
    the whole description) is used as the key instead.

    Parameters
    ----------
    fhandle : str or file-like
        Passed unchanged to ``SeqIO.parse(fhandle, 'fasta')``.
    verbose : bool, default False
        When truthy, print two progress lines per record.

    Returns
    -------
    dict
        Maps organism -> list of ``[record.id, record.description, record.seq]``
        entries, in the order the records appear in the file.

    Raises
    ------
    Exception
        If anything fails while parsing; the triggering error is chained as
        ``__cause__`` so the original traceback is not lost.
    """
    print("[+] STARTING iteration through fasta file : {}".format(fhandle))
    sequences_by_organism = {}
    try:
        for record in SeqIO.parse(fhandle, 'fasta'):
            organism = record.description.split('[')[-1].split(']')[0]
            if verbose:
                print("[*] Found {} sequence in {}".format(record.id, organism))
            # setdefault creates the per-organism list on first sight, so the
            # entry is built in exactly one place.
            sequences_by_organism.setdefault(organism, []).append(
                [record.id, record.description, record.seq]
            )
            if verbose:
                print("\t[+] Attached sequence object to dictionary")
        print("[+] DONE parsing")
        return sequences_by_organism
    except Exception as e:
        # Keep the generic Exception type and message for existing callers,
        # but chain the cause explicitly.
        raise Exception('[-] ERROR occured during fasta file parsing. Exception: {}'.format(e)) from e

In [7]:
# Group the exopolysaccharide hits by organism.
# NOTE(review): the recorded output of this cell reports ../data/curvibacter_eps_proteins.faa,
# which is not the current value of fhandle_eps — the output looks stale; re-run to refresh.
curvibacter_eps_sequence_dict = fasta_file_to_dictionary_by_organism(fhandle_eps)

[+] STARTING iteration through fasta file : ../data/curvibacter_eps_proteins.faa
[+] DONE parsing


In [8]:
# Group the lipopolysaccharide hits by organism.
# NOTE(review): the recorded output of this cell reports ../data/curvibacter_lipopolysaccharide_proteins.faa,
# which is not the current value of fhandle_lps — the output looks stale; re-run to refresh.
curvibacter_lps_sequence_dict = fasta_file_to_dictionary_by_organism(fhandle_lps)

[+] STARTING iteration through fasta file : ../data/curvibacter_lipopolysaccharide_proteins.faa
[+] DONE parsing


In [9]:
# Organism labels found in the LPS file. Besides species and strains, the recorded
# output also contains higher-rank labels such as 'Burkholderiales'.
curvibacter_lps_sequence_dict.keys()

dict_keys(['Curvibacter sp. CHRR-16', 'Curvibacter sp. AEP1-3', 'Curvibacter lanceolatus', 'Curvibacter delicatus', 'Burkholderiales', 'Proteobacteria', 'Curvibacter sp. PAE-UM', 'Curvibacter gracilis', 'Curvibacter', 'Comamonadaceae', 'Betaproteobacteria', 'Curvibacter sp.', 'Curvibacter sp. PD_MW3', 'Curvibacter sp. RIFCSPHIGHO2_12_FULL_63_18', 'Curvibacter sp. GWA2_64_110', 'Curvibacter sp. GWA2_63_95', 'Curvibacter sp. IPC7', 'Curvibacter sp. UNPF65', 'uncultured Curvibacter sp.', 'Curvibacter putative symbiont of Hydra magnipapillata'])

In [11]:
# Number of sequence entries grouped under the strain 'Curvibacter sp. AEP1-3'.
len(curvibacter_lps_sequence_dict['Curvibacter sp. AEP1-3'])

7981