
making pull request changes
mcmcgrath13 committed May 14, 2018
1 parent b53b405 commit e949b7f
Showing 14 changed files with 48 additions and 59 deletions.
4 changes: 0 additions & 4 deletions README.md
@@ -41,7 +41,3 @@ To checkout the current master (development) branch:
```julia
Pkg.checkout("BioMedQuery")
```

-### Note:
-
-This package has recently undergone significant changes. EUtils and UMLs APIs are now part of [BioServices.jl](https://github.com/BioJulia/BioServices.jl). BioMedQuery helps parse and save results into MySQL, SQLite, DataFrames, CSV etc. The old master is now tag v0.2.3-depracate.
6 changes: 3 additions & 3 deletions docs/src/pubmed.md
@@ -11,7 +11,7 @@ This module provides utility functions to parse, store and export queries to Pub

## Basics of searching PubMed

-We are often interseted in searching PubMed for all articles related to a search term, and possibly restricted by other search criteria. To do so we use [BioServices.EUtils](http://biojulia.net/BioServices.jl/latest/man/eutils). A basic example of how we may use the functions `esearch` and `efetch` to accomplish such task is illustrated below.
+We are often interested in searching PubMed for all articles related to a search term, and possibly restricted by other search criteria. To do so we use [BioServices.EUtils](http://biojulia.net/BioServices.jl/latest/man/eutils). A basic example of how we may use the functions `esearch` and `efetch` to accomplish such task is illustrated below.

```julia
using BioServices.EUtils
@@ -111,7 +111,7 @@ pwd = ""
# Save results of efetch to database and cleanup intermediate CSV files
const conn = DBUtils.init_mysql_database(host, user, pwd, dbname)
PubMed.create_tables!(conn)
-PubMed.save_efetch!(conn, efetch_doc, false, true) # verbose = false, cleanup = true
+PubMed.save_efetch!(conn, efetch_doc, false, true) # verbose = false, drop_csv = true
```

### Save efetch response to SQLite database
@@ -133,7 +133,7 @@ format of the tables that are created for the sql saving functions (schema image
easily be saved to csv files.

```julia
-dfs = PubMed.pubmed_to_dfs(efetch_doc)
+dfs = PubMed.parse(efetch_doc)

PubMed.dfs_to_csv(dfs, "my/path", "my_file_prefix_")
```
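Read end to end, the renamed workflow from this page of the docs would look roughly like the sketch below. This is a minimal illustration assuming the post-rename exports (`PubMed.parse`, `PubMed.dfs_to_csv`); the search term, result limit, CSV prefix, the XPath helper, and the response-body accessor are illustrative and may vary by BioServices/EzXML version.

```julia
using BioServices.EUtils
using BioMedQuery.PubMed
using EzXML

# Illustrative query; any PubMed search term works here
search_term = "asthma[mh] AND 2018[pdat]"

# esearch returns the matching PMIDs as XML
esearch_response = esearch(db = "pubmed", term = search_term, retmax = 5)
esearch_doc = EzXML.parsexml(String(esearch_response.body))
pmids = nodecontent.(findall("//IdList/Id", esearch_doc))

# efetch returns the full article records for those PMIDs
efetch_response = efetch(db = "pubmed", id = join(pmids, ","), retmode = "xml")
efetch_doc = EzXML.root(EzXML.parsexml(String(efetch_response.body)))

# Parse into one DataFrame per table, then write one CSV per table
dfs = PubMed.parse(efetch_doc)
PubMed.dfs_to_csv(dfs, pwd(), "demo_")
```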
2 changes: 1 addition & 1 deletion src/BioMedQuery.jl
@@ -43,7 +43,7 @@ export save_efetch!,
include("PubMed/eutils_sql_save.jl")

# eutils -> dfs -> csv
-export pubmed_to_dfs,
+export parse,
dfs_to_csv
include("PubMed/pubmed_to_csv.jl")

32 changes: 12 additions & 20 deletions src/Processes/medline_load.jl
@@ -6,13 +6,13 @@ using EzXML
using DataFrames

"""
-load_medline(mysql_host, mysql_user, mysql_pwd, mysql_db; start_file = 1, end_file = 928, overwrite = true, year=2018)
+load_medline(db_con, output_dir; start_file = 1, end_file = 928, year=2018, test=false)
-Given MySQL connection info and optionally the start and end files, fetches the medline files, parses the xml, and loads into a MySQL DB (assumes tables already exist).
+Given a MySQL connection and optionally the start and end files, fetches the medline files, parses the xml, and loads into a MySQL DB (assumes tables already exist). The raw (xml.gz) and parsed (csv) files will be stored in the output_dir.
"""
-function load_medline(mysql_host::String, mysql_user::String, mysql_pwd::String, mysql_db::String, output_dir::String; start_file::Int = 1, end_file::Int = 928, overwrite::Bool=true, year::Int=2018, test::Bool = false)
+function load_medline(db_con::MySQL.Connection, output_dir::String; start_file::Int = 1, end_file::Int = 928, year::Int=2018, test::Bool = false)

-db_con, ftp_con = init_medline(mysql_host, mysql_user, mysql_pwd, mysql_db, output_dir, overwrite, test)
+ftp_con = init_medline(output_dir, test)

set_innodb_checks!(db_con,0,0,0)
drop_mysql_keys!(db_con)
@@ -41,7 +41,7 @@ function load_medline(mysql_host::String, mysql_user::String, mysql_pwd::String,
set_innodb_checks!(db_con)
add_mysql_keys!(db_con)
info("All files processed - closing connections")
-close_cons(db_con, ftp_con)
+close_cons(ftp_con)

return nothing
end
@@ -51,9 +51,9 @@ end
Sets up environment (folders), and connects to MySQL DB and FTP Server returns these connections.
"""
-function init_medline(mysql_host::String, mysql_user::String, mysql_pwd::String, mysql_db::String, output_dir::String, overwrite::Bool, test::Bool=false)
+function init_medline(output_dir::String, test::Bool=false)
## SET UP ENVIRONMENT
info("======Setting up folders and creating FTP, DB Connections======")
info("======Setting up folders and creating FTP Connection======")

try
mkdir(joinpath(output_dir,"medline"))
@@ -71,14 +71,9 @@ function init_medline(mysql_host::String, mysql_user::String, mysql_pwd::String,
# Initialize FTP
ftp_init()

-# Get MySQL Connection
-db_con = init_mysql_database(mysql_host, mysql_user, mysql_pwd, mysql_db, overwrite)
-
ftp_con = get_ftp_con(test)

overwrite && PubMed.create_tables!(db_con)

return db_con, ftp_con
return ftp_con
end


@@ -143,24 +138,21 @@ function parse_ml_file(fname::String, output_dir::String)
doc = EzXML.readxml(path)
raw_articles = EzXML.root(doc)

-dfs = pubmed_to_dfs(raw_articles)
+dfs = PubMed.parse(raw_articles)

dfs_to_csv(dfs, joinpath(output_dir,"medline","parsed_files"), "$(fname[1:end-7])_")

return nothing
end

"""
-close_cons(db_con, ftp_con)
-closes connections and cleans up
+close_cons(ftp_con)
+closes connection and cleans up
"""
-function close_cons(db_con::MySQL.Connection, ftp_con::ConnContext)
+function close_cons(ftp_con::ConnContext)
# Close FTP Connection
ftp_close_connection(ftp_con)
ftp_cleanup()

-# Close MySQL Connection
-MySQL.disconnect(db_con)
-
return nothing
end
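With the connection and table setup moved out of `load_medline`, a caller-side sketch (mirroring the updated test later in this commit) might look like the following; the credentials, database name, file range, and output directory are placeholders.

```julia
using MySQL
using BioMedQuery.PubMed
using BioMedQuery.Processes

# Placeholder credentials and database; adjust for your environment
conn = MySQL.connect("127.0.0.1", "root", ""; db = "medline_load_test")

# The caller now owns table setup: drop and re-create the PubMed tables
PubMed.create_tables!(conn)

# Fetch, parse, and bulk-load the first two MEDLINE baseline files;
# raw .xml.gz and parsed .csv files land under the output directory
load_medline(conn, pwd(), start_file = 1, end_file = 2, year = 2018)

# The caller also owns closing the connection when done
MySQL.disconnect(conn)
```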
2 changes: 1 addition & 1 deletion src/Processes/pubmed_search_and_save.jl
@@ -250,7 +250,7 @@ function pubmed_search_and_parse(email, search_term::String, article_max, verbos
#save the results of an entrez fetch
println("------Save to dataframes--------")

-this_dfs = pubmed_to_dfs(efetch_doc)
+this_dfs = PubMed.parse(efetch_doc)

for (table, df) in this_dfs
if haskey(dfs, table)
2 changes: 1 addition & 1 deletion src/PubMed/citation_manager.jl
@@ -148,7 +148,7 @@ function save_efetch!(output::CitationOutput, articles::EzXML.Node, verbose=fals
fout = open(output_file, "a")
nsuccess=0

-articles_df = pubmed_to_dfs(articles)
+articles_df = PubMed.parse(articles)

for i = 1:n_articles
try
6 changes: 3 additions & 3 deletions src/PubMed/eutils_sql_save.jl
@@ -9,7 +9,7 @@ pubmed_save_efetch(efetch_dict, conn)
Save the results (dictionary) of an entrez-pubmed fetch to the input database.
"""
-function save_efetch!(conn::Union{MySQL.Connection, SQLite.DB}, articles::EzXML.Node, verbose=false, cleanup=false)
+function save_efetch!(conn::Union{MySQL.Connection, SQLite.DB}, articles::EzXML.Node, verbose=false, drop_csv=true)

#Decide type of article based on structure of efetch

Expand All @@ -20,9 +20,9 @@ function save_efetch!(conn::Union{MySQL.Connection, SQLite.DB}, articles::EzXML.

println("Saving " , countelements(articles) , " articles to database")

-parsed = pubmed_to_dfs(articles)
+parsed = PubMed.parse(articles)

-db_insert!(conn, parsed, cleanup=true)
+db_insert!(conn, parsed, drop_csv=drop_csv)

end

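Note that the default also flips here: the old signature ended with `cleanup=false`, while `drop_csv` now defaults to `true`, so intermediate CSVs are removed unless the caller opts out. A hedged sketch, assuming an `efetch_doc` node from an earlier `efetch` call and placeholder connection details:

```julia
using BioMedQuery.DBUtils
using BioMedQuery.PubMed

# Placeholder host/user/password/database
conn = DBUtils.init_mysql_database("127.0.0.1", "root", "", "pubmed_demo")
PubMed.create_tables!(conn)

# Keep the staged CSVs for inspection by passing drop_csv = false
PubMed.save_efetch!(conn, efetch_doc, false, false)  # verbose = false, drop_csv = false
```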
20 changes: 10 additions & 10 deletions src/PubMed/pubmed_sql_utils.jl
@@ -164,7 +164,7 @@ end

function add_mysql_keys!(conn::MySQL.Connection)

-res = db_query(conn, "SHOW INDEX FROM basic WHERE key_name = 'pub_year'")
+res = MySQL.query(conn, "SHOW INDEX FROM basic WHERE key_name = 'pub_year'", DataFrame)
size(res)[1] == 1 && return nothing

MySQL.execute!(conn, "ALTER TABLE `basic`
@@ -213,7 +213,7 @@ end

function drop_mysql_keys!(conn::MySQL.Connection)

-res = db_query(conn, "SHOW INDEX FROM basic WHERE key_name = 'pub_year'")
+res = MySQL.query(conn, "SHOW INDEX FROM basic WHERE key_name = 'pub_year'", DataFrame)
size(res)[1] == 0 && return nothing

MySQL.execute!(conn, "ALTER TABLE `basic`
@@ -411,7 +411,7 @@ function get_article_mesh_by_concept(db, pmid::Integer, umls_concepts...; query_

end

-function db_insert!(db::MySQL.Connection, articles::Dict{String,DataFrame}, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, cleanup=false)
+function db_insert!(db::MySQL.Connection, articles::Dict{String,DataFrame}, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, drop_csv=false)

dfs_to_csv(articles, csv_path, csv_prefix)

@@ -437,15 +437,15 @@ function db_insert!(db::MySQL.Connection, articles::Dict{String,DataFrame}, csv_
meta_sql = """UPDATE file_meta SET ins_end_time = CURRENT_TIMESTAMP WHERE file_name = '$csv_prefix'"""
MySQL.execute!(db, meta_sql)

-if cleanup
+if drop_csv
remove_csvs(articles, csv_path, csv_prefix)
end

return nothing

end

-function db_insert!(db::MySQL.Connection, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, cleanup=false)
+function db_insert!(db::MySQL.Connection, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, drop_csv=false)
paths = Vector{String}()

#Insert csv prefix into files_meta table
@@ -456,7 +456,7 @@ function db_insert!(db::MySQL.Connection, csv_path::String = pwd(), csv_prefix::
# for all non-file_meta tables
if table != "file_meta"
path = joinpath(csv_path, "$(csv_prefix)$(table).csv")
cleanup && push!(paths,path)
drop_csv && push!(paths,path)

headers = CSV.read(path, rows = 1, datarow=1)
# return headers
@@ -476,15 +476,15 @@ function db_insert!(db::MySQL.Connection, csv_path::String = pwd(), csv_prefix::
meta_sql = """UPDATE file_meta SET ins_end_time = CURRENT_TIMESTAMP WHERE file_name = '$csv_prefix'"""
MySQL.execute!(db, meta_sql)

-if cleanup
+if drop_csv
remove_csvs(paths)
end

return nothing

end

-function db_insert!(db::MySQL.Connection, pmid::Int64, articles::Dict{String,DataFrame}, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, cleanup=false)
+function db_insert!(db::MySQL.Connection, pmid::Int64, articles::Dict{String,DataFrame}, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, drop_csv=false)

dfs_to_csv(articles, csv_path, csv_prefix)

@@ -504,15 +504,15 @@ function db_insert!(db::MySQL.Connection, pmid::Int64, articles::Dict{String,Dat
end
end

-if cleanup
+if drop_csv
remove_csvs(articles, csv_path, csv_prefix)
end

return nothing

end

-function db_insert!(db::SQLite.DB, articles::Dict{String,DataFrame}, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, cleanup=false)
+function db_insert!(db::SQLite.DB, articles::Dict{String,DataFrame}, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, drop_csv=false)

#Insert csv prefix into files_meta table
meta_sql = """INSERT INTO file_meta (file_name,ins_start_time) VALUES ('$csv_prefix',CURRENT_TIMESTAMP)"""
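All of these `db_insert!` methods stage the DataFrames as CSVs, bulk-load them, and now delete the CSVs only when `drop_csv` is set. A one-line usage sketch of the MySQL method, assuming `conn` and a parsed `dfs` dictionary from earlier steps; the path, prefix, and qualified name are illustrative:

```julia
# dfs::Dict{String,DataFrame} as returned by PubMed.parse(efetch_doc)
PubMed.db_insert!(conn, dfs, pwd(), "demo_PubMed_"; verbose = false, drop_csv = true)
```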
15 changes: 8 additions & 7 deletions src/PubMed/pubmed_to_csv.jl
@@ -2,6 +2,7 @@ using Missings
using EzXML
using CSV
using DataFrames
+import Base.parse

"""
dict_to_array(dict::Dict)
@@ -45,7 +46,7 @@ Parses the string year and returns an integer with the first year in range.
"""
function parse_year(yr::AbstractString)
try
-parse(Int64, yr[1:4])
+Base.parse(Int64, yr[1:4])
catch
missing
end
@@ -132,7 +133,7 @@ function parse_author(xml::EzXML.Node)
initials = nodecontent(names)
elseif names_name == "Suffix"
suffix = nodecontent(names)
elseif names_name == "Identifer" && names["Source"] == "ORCID"
elseif names_name == "Identifier" && names["Source"] == "ORCID"
orcid = parse_orcid(nodecontent(names))
elseif names_name == "CollectiveName"
collective = nodecontent(names)
@@ -154,7 +155,7 @@ end
Type that matches the NCBI-XML contents for a PubMedArticle
"""
#Constructor from EzXML article element
-function pubmed_to_dfs(xml::EzXML.Node)
+function parse(xml::EzXML.Node)

n_articles = countelements(xml)

@@ -226,7 +227,7 @@ function pubmed_to_dfs(xml::EzXML.Node)
if nodename(tdat) == "MedlineCitation"
for mc in eachelement(tdat)
if nodename(mc) == "PMID"
-this_pmid = parse(Int64, nodecontent(mc)) ::Int64
+this_pmid = Base.parse(Int64, nodecontent(mc)) ::Int64
@inbounds url[i] = string("http://www.ncbi.nlm.nih.gov/pubmed/", this_pmid)
@inbounds pmid[i] = this_pmid
elseif nodename(mc) == "Article"
@@ -365,7 +366,7 @@ function pubmed_to_dfs(xml::EzXML.Node)
for pt in eachelement(a_info)
desc = nodecontent(pt) :: String
ui = pt["UI"] :: String
-uid = length(ui) > 1 ? parse(Int64, ui[2:end]) : -1
+uid = length(ui) > 1 ? Base.parse(Int64, ui[2:end]) : -1
push!(pt_pmid, this_pmid)
push!(pt_uid, uid)
push!(pt_name, desc)
@@ -386,12 +387,12 @@ function pubmed_to_dfs(xml::EzXML.Node)
if header_name == "DescriptorName"
desc = nodecontent(header) :: String
desc_maj = header["MajorTopicYN"] == "Y" ? 1 : 0
-desc_uid = parse(Int, header["UI"][2:end])
+desc_uid = Base.parse(Int, header["UI"][2:end])
mesh_desc[desc_uid] = desc
elseif header_name == "QualifierName"
qual = nodecontent(header)
qual_maj = header["MajorTopicYN"] == "Y" ? 1 : 0 :: Int
-qual_uid = parse(Int, header["UI"][2:end]) :: Int
+qual_uid = Base.parse(Int, header["UI"][2:end]) :: Int

mesh_qual[qual_uid] = qual

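The `import Base.parse` line is what lets this file add a `parse(::EzXML.Node)` method without hiding the built-in one; without it, a module-local `parse` would shadow `Base.parse` and break calls like `parse(Int64, ...)`. A minimal illustration of the difference (not from the repository):

```julia
module Shadowing
parse(x::Vector) = length(x)     # a local `parse` that shadows Base.parse
demo() = parse(Int64, "42")      # resolves to the local parse; no matching method
end

module Extending
import Base.parse                # opt in to extending Base.parse instead
parse(x::Vector) = length(x)     # adds a method to the Base function
demo() = parse(Int64, "42")      # still reaches Base's integer-parsing method
end

Extending.demo()                 # returns 42
Shadowing.demo()                 # throws a MethodError
```

Given the import, the fully qualified `Base.parse(Int64, ...)` calls in this diff should not be strictly required, but they make the intent explicit at each call site.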
3 changes: 0 additions & 3 deletions test/eutils_sql_save.jl
@@ -3,9 +3,6 @@ using BioMedQuery.DBUtils

dbname="test"

-config = Dict(:host=>"127.0.0.1", :dbname=>dbname, :username=>"root",
-:pswd=>"", :overwrite=>true)
-
con = init_mysql_database("127.0.0.1", dbname, "root", "", true)
init_mysql_database(con, dbname, true)
init_mysql_database(con, dbname, false)
8 changes: 5 additions & 3 deletions test/processes_mysql.jl
@@ -87,7 +87,9 @@ end
println("-----------------------------------------")
println(" Testing Medline Loader")

-load_medline(host, mysql_usr, mysql_pswd, dbname, dirname(@__FILE__), start_file=medline_file, end_file=medline_file, year=medline_year, test=true)
+PubMed.create_tables!(conn) #drop and re-create pubmed article tables
+
+load_medline(conn, dirname(@__FILE__), start_file=medline_file, end_file=medline_file, year=medline_year, test=true)

path = joinpath(dirname(@__FILE__),"medline","raw_files",Processes.get_file_name(medline_file, medline_year, true))
doc = EzXML.readxml(path)
@@ -96,8 +98,8 @@ println(" Testing Medline Loader")

all_pmids = PubMed.all_pmids(conn)
@test length(all_pmids) == countelements(raw_articles)
-res = db_query(conn, "SELECT DISTINCT orcid FROM author_ref;")
-@test length(res) > 0
+res = MySQL.query(conn, "SELECT DISTINCT orcid FROM author_ref;", DataFrame)
+@test size(res)[1] > 2

rm(joinpath(dirname(@__FILE__),"medline"), recursive=true)

3 changes: 2 additions & 1 deletion test/pubmed.jl
@@ -3,6 +3,7 @@ using MySQL
using BioServices.EUtils
using XMLDict
using EzXML
+import Base.parse

#------------------ BioMedQuery -------------------
@testset "Testing Eutils/PubMed" begin
@@ -29,7 +30,7 @@
@test haskey(esearch_dict, "IdList")

for id_node in esearch_dict["IdList"]["Id"]
-push!(ids, parse(Int64, id_node))
+push!(ids, Base.parse(Int64, id_node))
end

@test length(ids)==narticles
2 changes: 1 addition & 1 deletion test/pubmed_types.jl → test/pubmed_parse.jl
@@ -10,7 +10,7 @@ if nodename(articles) != "PubmedArticleSet"
end


-parsed = pubmed_to_dfs(articles)
+parsed = PubMed.parse(articles)

@test !ismissing(parsed["basic"][1,:pmid])

2 changes: 1 addition & 1 deletion test/runtests.jl
@@ -16,7 +16,7 @@ my_tests = [
("dbutils_sqlite.jl", " Testing: DBUtils SQLite"),
("dbutils_mysql.jl", " Testing: DBUtils MySQL"),
("pubmed.jl", " Testing: Eutils/PubMed"),
("pubmed_types.jl", " Testing: Entrez Types"),
("pubmed_parse.jl", " Testing: Entrez Parsing"),
("ct.jl", " Testing: CLINICAL TRIALS"),
("processes_mysql.jl", " Testing: Processes MySQL"),
("processes_sqlite.jl", " Testing: Processes SQLite"),
