
making pull request changes
mcmcgrath13 committed May 14, 2018
1 parent b53b405 commit e949b7f
Showing 14 changed files with 48 additions and 59 deletions.
4 changes: 0 additions & 4 deletions README.md
@@ -41,7 +41,3 @@ To checkout the current master (development) branch:
```julia
Pkg.checkout("BioMedQuery")
```

-### Note:
-
-This package has recently undergone significant changes. EUtils and UMLs APIs are now part of [BioServices.jl](https://github.com/BioJulia/BioServices.jl). BioMedQuery helps parse and save results into MySQL, SQLite, DataFrames, CSV etc. The old master is now tag v0.2.3-depracate.
6 changes: 3 additions & 3 deletions docs/src/pubmed.md
@@ -11,7 +11,7 @@ This module provides utility functions to parse, store and export queries to Pub

## Basics of searching PubMed

-We are often interseted in searching PubMed for all articles related to a search term, and possibly restricted by other search criteria. To do so we use [BioServices.EUtils](http://biojulia.net/BioServices.jl/latest/man/eutils). A basic example of how we may use the functions `esearch` and `efetch` to accomplish such task is illustrated below.
+We are often interested in searching PubMed for all articles related to a search term, and possibly restricted by other search criteria. To do so we use [BioServices.EUtils](http://biojulia.net/BioServices.jl/latest/man/eutils). A basic example of how we may use the functions `esearch` and `efetch` to accomplish such task is illustrated below.

```julia
using BioServices.EUtils
@@ -111,7 +111,7 @@ pwd = ""
# Save results of efetch to database and cleanup intermediate CSV files
const conn = DBUtils.init_mysql_database(host, user, pwd, dbname)
PubMed.create_tables!(conn)
-PubMed.save_efetch!(conn, efetch_doc, false, true) # verbose = false, cleanup = true
+PubMed.save_efetch!(conn, efetch_doc, false, true) # verbose = false, drop_csv = true
```

### Save efetch response to SQLite database
@@ -133,7 +133,7 @@ format of the tables that are created for the sql saving functions (schema image
easily be saved to csv files.

```julia
-dfs = PubMed.pubmed_to_dfs(efetch_doc)
+dfs = PubMed.parse(efetch_doc)

PubMed.dfs_to_csv(dfs, "my/path", "my_file_prefix_")
```
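Read end to end, the renamed workflow from this page of the docs would look roughly like the sketch below. This is a minimal illustration assuming the post-rename exports (`PubMed.parse`, `PubMed.dfs_to_csv`); the search term, result limit, CSV prefix, the XPath helper, and the response-body accessor are illustrative and may vary by BioServices/EzXML version.

```julia
using BioServices.EUtils
using BioMedQuery.PubMed
using EzXML

# Illustrative query; any PubMed search term works here
search_term = "asthma[mh] AND 2018[pdat]"

# esearch returns the matching PMIDs as XML
esearch_response = esearch(db = "pubmed", term = search_term, retmax = 5)
esearch_doc = EzXML.parsexml(String(esearch_response.body))
pmids = nodecontent.(findall("//IdList/Id", esearch_doc))

# efetch returns the full article records for those PMIDs
efetch_response = efetch(db = "pubmed", id = join(pmids, ","), retmode = "xml")
efetch_doc = EzXML.root(EzXML.parsexml(String(efetch_response.body)))

# Parse into one DataFrame per table, then write one CSV per table
dfs = PubMed.parse(efetch_doc)
PubMed.dfs_to_csv(dfs, pwd(), "demo_")
```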
2 changes: 1 addition & 1 deletion src/BioMedQuery.jl
@@ -43,7 +43,7 @@ export save_efetch!,
include("PubMed/eutils_sql_save.jl")

# eutils -> dfs -> csv
-export pubmed_to_dfs,
+export parse,
dfs_to_csv
include("PubMed/pubmed_to_csv.jl")

32 changes: 12 additions & 20 deletions src/Processes/medline_load.jl
@@ -6,13 +6,13 @@ using EzXML
using DataFrames

"""
-load_medline(mysql_host, mysql_user, mysql_pwd, mysql_db; start_file = 1, end_file = 928, overwrite = true, year=2018)
+load_medline(db_con, output_dir; start_file = 1, end_file = 928, year=2018, test=false)
-Given MySQL connection info and optionally the start and end files, fetches the medline files, parses the xml, and loads into a MySQL DB (assumes tables already exist).
+Given a MySQL connection and optionally the start and end files, fetches the medline files, parses the xml, and loads into a MySQL DB (assumes tables already exist). The raw (xml.gz) and parsed (csv) files will be stored in the output_dir.
"""
-function load_medline(mysql_host::String, mysql_user::String, mysql_pwd::String, mysql_db::String, output_dir::String; start_file::Int = 1, end_file::Int = 928, overwrite::Bool=true, year::Int=2018, test::Bool = false)
+function load_medline(db_con::MySQL.Connection, output_dir::String; start_file::Int = 1, end_file::Int = 928, year::Int=2018, test::Bool = false)

-db_con, ftp_con = init_medline(mysql_host, mysql_user, mysql_pwd, mysql_db, output_dir, overwrite, test)
+ftp_con = init_medline(output_dir, test)

set_innodb_checks!(db_con,0,0,0)
drop_mysql_keys!(db_con)
@@ -41,7 +41,7 @@ function load_medline(mysql_host::String, mysql_user::String, mysql_pwd::String,
set_innodb_checks!(db_con)
add_mysql_keys!(db_con)
info("All files processed - closing connections")
-close_cons(db_con, ftp_con)
+close_cons(ftp_con)

return nothing
end
@@ -51,9 +51,9 @@ end
Sets up environment (folders), and connects to MySQL DB and FTP Server returns these connections.
"""
-function init_medline(mysql_host::String, mysql_user::String, mysql_pwd::String, mysql_db::String, output_dir::String, overwrite::Bool, test::Bool=false)
+function init_medline(output_dir::String, test::Bool=false)
## SET UP ENVIRONMENT
info("======Setting up folders and creating FTP, DB Connections======")
info("======Setting up folders and creating FTP Connection======")

try
mkdir(joinpath(output_dir,"medline"))
@@ -71,14 +71,9 @@ function init_medline(mysql_host::String, mysql_user::String, mysql_pwd::String,
# Initialize FTP
ftp_init()

-# Get MySQL Connection
-db_con = init_mysql_database(mysql_host, mysql_user, mysql_pwd, mysql_db, overwrite)
-
ftp_con = get_ftp_con(test)

overwrite && PubMed.create_tables!(db_con)

return db_con, ftp_con
return ftp_con
end


@@ -143,24 +138,21 @@ function parse_ml_file(fname::String, output_dir::String)
doc = EzXML.readxml(path)
raw_articles = EzXML.root(doc)

-dfs = pubmed_to_dfs(raw_articles)
+dfs = PubMed.parse(raw_articles)

dfs_to_csv(dfs, joinpath(output_dir,"medline","parsed_files"), "$(fname[1:end-7])_")

return nothing
end

"""
-close_cons(db_con, ftp_con)
-closes connections and cleans up
+close_cons(ftp_con)
+closes connection and cleans up
"""
-function close_cons(db_con::MySQL.Connection, ftp_con::ConnContext)
+function close_cons(ftp_con::ConnContext)
# Close FTP Connection
ftp_close_connection(ftp_con)
ftp_cleanup()

-# Close MySQL Connection
-MySQL.disconnect(db_con)
-
return nothing
end
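With the connection and table setup moved out of `load_medline`, a caller-side sketch (mirroring the updated test later in this commit) might look like the following; the credentials, database name, file range, and output directory are placeholders.

```julia
using MySQL
using BioMedQuery.PubMed
using BioMedQuery.Processes

# Placeholder credentials and database; adjust for your environment
conn = MySQL.connect("127.0.0.1", "root", ""; db = "medline_load_test")

# The caller now owns table setup: drop and re-create the PubMed tables
PubMed.create_tables!(conn)

# Fetch, parse, and bulk-load the first two MEDLINE baseline files;
# raw .xml.gz and parsed .csv files land under the output directory
load_medline(conn, pwd(), start_file = 1, end_file = 2, year = 2018)

# The caller also owns closing the connection when done
MySQL.disconnect(conn)
```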
2 changes: 1 addition & 1 deletion src/Processes/pubmed_search_and_save.jl
@@ -250,7 +250,7 @@ function pubmed_search_and_parse(email, search_term::String, article_max, verbos
#save the results of an entrez fetch
println("------Save to dataframes--------")

-this_dfs = pubmed_to_dfs(efetch_doc)
+this_dfs = PubMed.parse(efetch_doc)

for (table, df) in this_dfs
if haskey(dfs, table)
2 changes: 1 addition & 1 deletion src/PubMed/citation_manager.jl
@@ -148,7 +148,7 @@ function save_efetch!(output::CitationOutput, articles::EzXML.Node, verbose=fals
fout = open(output_file, "a")
nsuccess=0

-articles_df = pubmed_to_dfs(articles)
+articles_df = PubMed.parse(articles)

for i = 1:n_articles
try
6 changes: 3 additions & 3 deletions src/PubMed/eutils_sql_save.jl
@@ -9,7 +9,7 @@ pubmed_save_efetch(efetch_dict, conn)
Save the results (dictionary) of an entrez-pubmed fetch to the input database.
"""
-function save_efetch!(conn::Union{MySQL.Connection, SQLite.DB}, articles::EzXML.Node, verbose=false, cleanup=false)
+function save_efetch!(conn::Union{MySQL.Connection, SQLite.DB}, articles::EzXML.Node, verbose=false, drop_csv=true)

#Decide type of article based on structure of efetch

Expand All @@ -20,9 +20,9 @@ function save_efetch!(conn::Union{MySQL.Connection, SQLite.DB}, articles::EzXML.

println("Saving " , countelements(articles) , " articles to database")

-parsed = pubmed_to_dfs(articles)
+parsed = PubMed.parse(articles)

-db_insert!(conn, parsed, cleanup=true)
+db_insert!(conn, parsed, drop_csv=drop_csv)

end

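Note that the default also flips here: the old signature ended with `cleanup=false`, while `drop_csv` now defaults to `true`, so intermediate CSVs are removed unless the caller opts out. A hedged sketch, assuming an `efetch_doc` node from an earlier `efetch` call and placeholder connection details:

```julia
using BioMedQuery.DBUtils
using BioMedQuery.PubMed

# Placeholder host/user/password/database
conn = DBUtils.init_mysql_database("127.0.0.1", "root", "", "pubmed_demo")
PubMed.create_tables!(conn)

# Keep the staged CSVs for inspection by passing drop_csv = false
PubMed.save_efetch!(conn, efetch_doc, false, false)  # verbose = false, drop_csv = false
```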
20 changes: 10 additions & 10 deletions src/PubMed/pubmed_sql_utils.jl
@@ -164,7 +164,7 @@ end

function add_mysql_keys!(conn::MySQL.Connection)

-res = db_query(conn, "SHOW INDEX FROM basic WHERE key_name = 'pub_year'")
+res = MySQL.query(conn, "SHOW INDEX FROM basic WHERE key_name = 'pub_year'", DataFrame)
size(res)[1] == 1 && return nothing

MySQL.execute!(conn, "ALTER TABLE `basic`
@@ -213,7 +213,7 @@ end

function drop_mysql_keys!(conn::MySQL.Connection)

-res = db_query(conn, "SHOW INDEX FROM basic WHERE key_name = 'pub_year'")
+res = MySQL.query(conn, "SHOW INDEX FROM basic WHERE key_name = 'pub_year'", DataFrame)
size(res)[1] == 0 && return nothing

MySQL.execute!(conn, "ALTER TABLE `basic`
@@ -411,7 +411,7 @@ function get_article_mesh_by_concept(db, pmid::Integer, umls_concepts...; query_

end

-function db_insert!(db::MySQL.Connection, articles::Dict{String,DataFrame}, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, cleanup=false)
+function db_insert!(db::MySQL.Connection, articles::Dict{String,DataFrame}, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, drop_csv=false)

dfs_to_csv(articles, csv_path, csv_prefix)

@@ -437,15 +437,15 @@ function db_insert!(db::MySQL.Connection, articles::Dict{String,DataFrame}, csv_
meta_sql = """UPDATE file_meta SET ins_end_time = CURRENT_TIMESTAMP WHERE file_name = '$csv_prefix'"""
MySQL.execute!(db, meta_sql)

-if cleanup
+if drop_csv
remove_csvs(articles, csv_path, csv_prefix)
end

return nothing

end

-function db_insert!(db::MySQL.Connection, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, cleanup=false)
+function db_insert!(db::MySQL.Connection, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, drop_csv=false)
paths = Vector{String}()

#Insert csv prefix into files_meta table
@@ -456,7 +456,7 @@ function db_insert!(db::MySQL.Connection, csv_path::String = pwd(), csv_prefix::
# for all non-file_meta tables
if table != "file_meta"
path = joinpath(csv_path, "$(csv_prefix)$(table).csv")
cleanup && push!(paths,path)
drop_csv && push!(paths,path)

headers = CSV.read(path, rows = 1, datarow=1)
# return headers
@@ -476,15 +476,15 @@ function db_insert!(db::MySQL.Connection, csv_path::String = pwd(), csv_prefix::
meta_sql = """UPDATE file_meta SET ins_end_time = CURRENT_TIMESTAMP WHERE file_name = '$csv_prefix'"""
MySQL.execute!(db, meta_sql)

-if cleanup
+if drop_csv
remove_csvs(paths)
end

return nothing

end

-function db_insert!(db::MySQL.Connection, pmid::Int64, articles::Dict{String,DataFrame}, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, cleanup=false)
+function db_insert!(db::MySQL.Connection, pmid::Int64, articles::Dict{String,DataFrame}, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, drop_csv=false)

dfs_to_csv(articles, csv_path, csv_prefix)

@@ -504,15 +504,15 @@ function db_insert!(db::MySQL.Connection, pmid::Int64, articles::Dict{String,Dat
end
end

-if cleanup
+if drop_csv
remove_csvs(articles, csv_path, csv_prefix)
end

return nothing

end

-function db_insert!(db::SQLite.DB, articles::Dict{String,DataFrame}, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, cleanup=false)
+function db_insert!(db::SQLite.DB, articles::Dict{String,DataFrame}, csv_path::String = pwd(), csv_prefix::String = "$(Date(now()))_PubMed_"; verbose=false, drop_csv=false)

#Insert csv prefix into files_meta table
meta_sql = """INSERT INTO file_meta (file_name,ins_start_time) VALUES ('$csv_prefix',CURRENT_TIMESTAMP)"""
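All of these `db_insert!` methods stage the DataFrames as CSVs, bulk-load them, and now delete the CSVs only when `drop_csv` is set. A one-line usage sketch of the MySQL method, assuming `conn` and a parsed `dfs` dictionary from earlier steps; the path, prefix, and qualified name are illustrative:

```julia
# dfs::Dict{String,DataFrame} as returned by PubMed.parse(efetch_doc)
PubMed.db_insert!(conn, dfs, pwd(), "demo_PubMed_"; verbose = false, drop_csv = true)
```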
15 changes: 8 additions & 7 deletions src/PubMed/pubmed_to_csv.jl
@@ -2,6 +2,7 @@ using Missings
using EzXML
using CSV
using DataFrames
+import Base.parse

"""
dict_to_array(dict::Dict)
@@ -45,7 +46,7 @@ Parses the string year and returns an integer with the first year in range.
"""
function parse_year(yr::AbstractString)
try
-parse(Int64, yr[1:4])
+Base.parse(Int64, yr[1:4])
catch
missing
end
@@ -132,7 +133,7 @@ function parse_author(xml::EzXML.Node)
initials = nodecontent(names)
elseif names_name == "Suffix"
suffix = nodecontent(names)
elseif names_name == "Identifer" && names["Source"] == "ORCID"
elseif names_name == "Identifier" && names["Source"] == "ORCID"
orcid = parse_orcid(nodecontent(names))
elseif names_name == "CollectiveName"
collective = nodecontent(names)
@@ -154,7 +155,7 @@ end
Type that matches the NCBI-XML contents for a PubMedArticle
"""
#Constructor from EzXML article element
-function pubmed_to_dfs(xml::EzXML.Node)
+function parse(xml::EzXML.Node)

n_articles = countelements(xml)

@@ -226,7 +227,7 @@ function pubmed_to_dfs(xml::EzXML.Node)
if nodename(tdat) == "MedlineCitation"
for mc in eachelement(tdat)
if nodename(mc) == "PMID"
-this_pmid = parse(Int64, nodecontent(mc)) ::Int64
+this_pmid = Base.parse(Int64, nodecontent(mc)) ::Int64
@inbounds url[i] = string("http://www.ncbi.nlm.nih.gov/pubmed/", this_pmid)
@inbounds pmid[i] = this_pmid
elseif nodename(mc) == "Article"
@@ -365,7 +366,7 @@ function pubmed_to_dfs(xml::EzXML.Node)
for pt in eachelement(a_info)
desc = nodecontent(pt) :: String
ui = pt["UI"] :: String
-uid = length(ui) > 1 ? parse(Int64, ui[2:end]) : -1
+uid = length(ui) > 1 ? Base.parse(Int64, ui[2:end]) : -1
push!(pt_pmid, this_pmid)
push!(pt_uid, uid)
push!(pt_name, desc)
@@ -386,12 +387,12 @@ function pubmed_to_dfs(xml::EzXML.Node)
if header_name == "DescriptorName"
desc = nodecontent(header) :: String
desc_maj = header["MajorTopicYN"] == "Y" ? 1 : 0
-desc_uid = parse(Int, header["UI"][2:end])
+desc_uid = Base.parse(Int, header["UI"][2:end])
mesh_desc[desc_uid] = desc
elseif header_name == "QualifierName"
qual = nodecontent(header)
qual_maj = header["MajorTopicYN"] == "Y" ? 1 : 0 :: Int
-qual_uid = parse(Int, header["UI"][2:end]) :: Int
+qual_uid = Base.parse(Int, header["UI"][2:end]) :: Int

mesh_qual[qual_uid] = qual

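The `import Base.parse` line is what lets this file add a `parse(::EzXML.Node)` method without hiding the built-in one; without it, a module-local `parse` would shadow `Base.parse` and break calls like `parse(Int64, ...)`. A minimal illustration of the difference (not from the repository):

```julia
module Shadowing
parse(x::Vector) = length(x)     # a local `parse` that shadows Base.parse
demo() = parse(Int64, "42")      # resolves to the local parse; no matching method
end

module Extending
import Base.parse                # opt in to extending Base.parse instead
parse(x::Vector) = length(x)     # adds a method to the Base function
demo() = parse(Int64, "42")      # still reaches Base's integer-parsing method
end

Extending.demo()                 # returns 42
Shadowing.demo()                 # throws a MethodError
```

Given the import, the fully qualified `Base.parse(Int64, ...)` calls in this diff should not be strictly required, but they make the intent explicit at each call site.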
3 changes: 0 additions & 3 deletions test/eutils_sql_save.jl
@@ -3,9 +3,6 @@ using BioMedQuery.DBUtils

dbname="test"

-config = Dict(:host=>"127.0.0.1", :dbname=>dbname, :username=>"root",
-:pswd=>"", :overwrite=>true)
-
con = init_mysql_database("127.0.0.1", dbname, "root", "", true)
init_mysql_database(con, dbname, true)
init_mysql_database(con, dbname, false)
8 changes: 5 additions & 3 deletions test/processes_mysql.jl
@@ -87,7 +87,9 @@ end
println("-----------------------------------------")
println(" Testing Medline Loader")

-load_medline(host, mysql_usr, mysql_pswd, dbname, dirname(@__FILE__), start_file=medline_file, end_file=medline_file, year=medline_year, test=true)
+PubMed.create_tables!(conn) #drop and re-create pubmed article tables
+
+load_medline(conn, dirname(@__FILE__), start_file=medline_file, end_file=medline_file, year=medline_year, test=true)

path = joinpath(dirname(@__FILE__),"medline","raw_files",Processes.get_file_name(medline_file, medline_year, true))
doc = EzXML.readxml(path)
@@ -96,8 +98,8 @@ println(" Testing Medline Loader")

all_pmids = PubMed.all_pmids(conn)
@test length(all_pmids) == countelements(raw_articles)
-res = db_query(conn, "SELECT DISTINCT orcid FROM author_ref;")
-@test length(res) > 0
+res = MySQL.query(conn, "SELECT DISTINCT orcid FROM author_ref;", DataFrame)
+@test size(res)[1] > 2

rm(joinpath(dirname(@__FILE__),"medline"), recursive=true)

3 changes: 2 additions & 1 deletion test/pubmed.jl
@@ -3,6 +3,7 @@ using MySQL
using BioServices.EUtils
using XMLDict
using EzXML
+import Base.parse

#------------------ BioMedQuery -------------------
@testset "Testing Eutils/PubMed" begin
@@ -29,7 +30,7 @@
@test haskey(esearch_dict, "IdList")

for id_node in esearch_dict["IdList"]["Id"]
-push!(ids, parse(Int64, id_node))
+push!(ids, Base.parse(Int64, id_node))
end

@test length(ids)==narticles
2 changes: 1 addition & 1 deletion test/pubmed_types.jl → test/pubmed_parse.jl
@@ -10,7 +10,7 @@ if nodename(articles) != "PubmedArticleSet"
end


-parsed = pubmed_to_dfs(articles)
+parsed = PubMed.parse(articles)

@test !ismissing(parsed["basic"][1,:pmid])

2 changes: 1 addition & 1 deletion test/runtests.jl
@@ -16,7 +16,7 @@ my_tests = [
("dbutils_sqlite.jl", " Testing: DBUtils SQLite"),
("dbutils_mysql.jl", " Testing: DBUtils MySQL"),
("pubmed.jl", " Testing: Eutils/PubMed"),
("pubmed_types.jl", " Testing: Entrez Types"),
("pubmed_parse.jl", " Testing: Entrez Parsing"),
("ct.jl", " Testing: CLINICAL TRIALS"),
("processes_mysql.jl", " Testing: Processes MySQL"),
("processes_sqlite.jl", " Testing: Processes SQLite"),
