In [1]:
# Necessary imports
%load_ext autoreload
%autoreload 1

import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse as sps
import pickle

import dask.dataframe as dd

from openomics.multiomics import MultiOmics
from openomics.transcriptomics import *
from openomics.database import *

from bioservices import BioMart

import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 100)

In [None]:
# Build the STRING database object for species 9606; the captured output shows
# it fetching the protein links, info and sequences files from the given URL.
# The two column-name arguments name the source and target columns.
string = STRING(
    path="https://stringdb-static.org/download/",
    species_id="9606",
    source_col_name="protein1",
    target_col_name="protein2",
)

Fetching file from URL: https://stringdb-static.org/download/protein.links.v11.0/ 9606.protein.links.v11.0.txt.gz
Fetching file from URL: https://stringdb-static.org/download/protein.info.v11.0/ 9606.protein.info.v11.0.txt.gz
Fetching file from URL: https://stringdb-static.org/download/protein.sequences.v11.0/ 9606.protein.sequences.v11.0.fa.gz


In [None]:
# Show the `df` attribute of the STRING object (rendered as the cell's output).
string.df

In [3]:
# Create a bioservices BioMart client for www.ensembl.org and display the
# databases it reports (the cell's last expression is its output).
bm = BioMart(host="www.ensembl.org")
bm.databases

['ensembl_mart_97',
 'genomic_features_mart_97',
 'mouse_mart_97',
 'ontology_mart_97',
 'regulation_mart_97',
 'sequence_mart_97',
 'snp_mart_97']

In [15]:
# Ask the client for mart candidates related to the pattern 'reg'; the result
# is printed as a list of "Candidate:" entries.
bm.lookfor('reg')

Candidate:
     database: regulation_mart_97 
    MART name: ENSEMBL_MART_GENOMIC 
  displayName: Genomic features 97 
        hosts: www.ensembl.org 
Candidate:
     database: snp_mart_97 
    MART name: ENSEMBL_MART_FUNCGEN 
  displayName: Ensembl Regulation 97 
        hosts: www.ensembl.org 


In [16]:
# List the dataset names available under the ENSEMBL_MART_GENOMIC mart, one per line.
# NOTE(review): the printed list does not contain "hsapiens_regulatory_feature",
# and the lookfor() output pairs "Ensembl Regulation 97" with the mart name
# ENSEMBL_MART_FUNCGEN -- confirm this is the mart that was meant to be listed.
datasets = bm.datasets("ENSEMBL_MART_GENOMIC")
for d in datasets:
    print(d)

hsapiens_encode
rnorvegicus_qtl_feature
hsapiens_karyotype_end
gaculeatus_marker_start
hsapiens_marker_start
ggallus_qtl_feature
rnorvegicus_karyotype_end
rnorvegicus_marker_end
mmusculus_marker_end
cfamiliaris_marker_end
oaries_marker_end
btaurus_qtl_feature
gaculeatus_marker_end
hsapiens_marker_end
sscrofa_qtl_feature
oaries_marker_start
mmusculus_marker_start
rnorvegicus_karyotype_start
hsapiens_karyotype_start
dmelanogaster_karyotype_start
oaries_qtl_feature
cfamiliaris_marker_start
dmelanogaster_karyotype_end
ecaballus_qtl_feature
mmusculus_karyotype_end
mmusculus_karyotype_start
rnorvegicus_marker_start


In [6]:
# Tabulate the attributes available for the "hsapiens_regulatory_feature"
# dataset. `bm.attributes(...)` is consumed as a mapping of attribute name ->
# sequence of metadata fields, so the frame is built in a single constructor
# call (one column per attribute) instead of inserting columns one at a time.
temp = pd.DataFrame(dict(bm.attributes("hsapiens_regulatory_feature")))
# Transpose for display: one row per attribute, one column per metadata field.
temp.T

Unnamed: 0,0,1,2,3,4,5
activity,Activity,,regulatory_feature,"html,txt,csv,tsv,xls",hsapiens_regulatory_feature__regulatory_activi...,activity_1036
regulatory_stable_id,Regulatory stable ID,,regulatory_feature,"html,txt,csv,tsv,xls",hsapiens_regulatory_feature__regulatory_featur...,stable_id_1051
bound_seq_region_start,Bound start (bp),,regulatory_feature,"html,txt,csv,tsv,xls",hsapiens_regulatory_feature__regulatory_featur...,bound_seq_region_start
bound_seq_region_end,Bound end (bp),,regulatory_feature,"html,txt,csv,tsv,xls",hsapiens_regulatory_feature__regulatory_featur...,bound_seq_region_end
chromosome_name,Chromosome/scaffold name,,regulatory_feature,"html,txt,csv,tsv,xls",hsapiens_regulatory_feature__regulatory_featur...,seq_region_name_1051
chromosome_start,Start (bp),,regulatory_feature,"html,txt,csv,tsv,xls",hsapiens_regulatory_feature__regulatory_featur...,seq_region_start_1051
chromosome_end,End (bp),,regulatory_feature,"html,txt,csv,tsv,xls",hsapiens_regulatory_feature__regulatory_featur...,seq_region_end_1051
feature_type_name,Feature type,,regulatory_feature,"html,txt,csv,tsv,xls",hsapiens_regulatory_feature__regulatory_featur...,feature_type_name_1051
feature_type_description,Feature type description,,regulatory_feature,"html,txt,csv,tsv,xls",hsapiens_regulatory_feature__regulatory_featur...,feature_type_description_1051
epigenome_name,Epigenome name,,regulatory_feature,"html,txt,csv,tsv,xls",hsapiens_regulatory_feature__regulatory_activi...,epigenome_name_1081


# Downloading Ensembl databases

In [None]:
# Query the "hsapiens_snp" dataset from the useast.ensembl.org host, then print
# the object's `filename` and preview the first rows of its `df` table.
ensembl_snp = EnsemblSNP(
    dataset="hsapiens_snp",
    host="useast.ensembl.org",
)
print(ensembl_snp.filename)
ensembl_snp.df.head()

Querying hsapiens_snp from useast.ensembl.org with attributes ['synonym_name', 'variation_names', 'minor_allele', 'associated_variant_risk_allele', 'ensembl_gene_stable_id', 'ensembl_transcript_stable_id', 'phenotype_name', 'chr_name', 'chrom_start', 'chrom_end']...


In [None]:
# Instantiate EnsemblSomaticVariation with its default arguments and display
# the object's `filename` attribute.
# NOTE(review): presumably this triggers a BioMart download like EnsemblSNP does -- confirm.
ensembl_somatic = EnsemblSomaticVariation()
ensembl_somatic.filename

In [None]:
# Instantiate EnsemblGeneSequences with its default arguments and display the
# object's `filename` attribute.
ensembl_geneseq = EnsemblGeneSequences()
ensembl_geneseq.filename

In [None]:
# Instantiate EnsemblTranscriptSequences with its default arguments; nothing is
# displayed because the cell ends with an assignment.
ensembl_transcriptseq = EnsemblTranscriptSequences()

In [None]:
# Begin a new BioMart query on the client and register the dataset it targets.
# NOTE(review): the name passed here follows the "<dataset>__<table>__dm"
# pattern seen in the table column of the attribute listing, so it looks like
# a table name rather than a dataset name -- confirm the service accepts it.
bm.new_query()
bm.add_dataset_to_xml("hsapiens_gene_ensembl__mart_transcript_variation_som__dm")
# bm.attributes("ensembl_mart_sequence")

In [None]:

# Explicit import: no visible cell imports StringIO by name, so without it the
# read_csv call below only works if a wildcard import happens to provide it.
from io import StringIO

# NOTE(review): `attributes`, `dataset` and `host` are not assigned in any
# visible cell -- define them before running this cell on a fresh kernel.

# Add each requested attribute to the query under construction, then render
# the query as XML.
for at in attributes:
    bm.add_attribute_to_xml(at)
xml_query = bm.get_xml()

print("Querying {} from {}...".format(dataset, host))
results = bm.query(xml_query)
# The response is parsed as headerless tab-separated text, so the requested
# attribute names are supplied as the column labels.
df = pd.read_csv(StringIO(results), header=None, names=attributes, sep="\t", index_col=None)

In [3]:
# Query gene name, Entrez gene and GO id attributes for the human gene dataset
# through gseapy's Biomart helper.
# NOTE(review): this rebinds `bm`, which earlier cells use for the bioservices
# BioMart client; re-running those cells after this one will hit a different
# object type -- consider a distinct variable name.
# NOTE(review): newer Ensembl releases may expose 'entrezgene' under a
# different attribute name (e.g. 'entrezgene_id') -- confirm against the mart.
from gseapy.parser import Biomart
bm = Biomart()
results = bm.query(dataset='hsapiens_gene_ensembl',
                   attributes=['external_gene_name','entrezgene', 'go_id'],
                   # file name passed to the query for saving its output
                   filename="query.results.txt")