Permalink
Browse files

work on parallel parsing

  • Loading branch information...
1 parent b0bd697 commit 8dacf6a309e602cb8ed48fb5cc7cf44a74fd4267 @dimus dimus committed Nov 17, 2011
Showing with 27 additions and 9 deletions.
  1. +1 −1 .rvmrc
  2. +4 −2 Gemfile
  3. +0 −2 Gemfile.lock
  4. +5 −0 README.rdoc
  5. +1 −0 Rakefile
  6. +7 −4 lib/biodiversity/parser.rb
  7. +9 −0 spec/parser/scientific_name.spec.rb
View
2 .rvmrc
@@ -1 +1 @@
-rvm use ruby-1.9.2-p290@biodiversity--create
+rvm use ruby-1.9.2-p290@biodiversity --create
View
@@ -1,10 +1,12 @@
source "http://rubygems.org"
-gem "jeweler"
gem "treetop"
-gem "facter"
gem "parallel"
+group :development do
+ gem "jeweler"
+end
+
group :test do
gem "ruby-debug19", :require => "ruby-debug"
gem "rspec"
View
@@ -4,7 +4,6 @@ GEM
archive-tar-minitar (0.5.2)
columnize (0.3.4)
diff-lcs (1.1.3)
- facter (1.6.3)
git (1.2.5)
jeweler (1.6.4)
bundler (~> 1.0)
@@ -41,7 +40,6 @@ PLATFORMS
ruby
DEPENDENCIES
- facter
jeweler
parallel
rspec
View
@@ -85,6 +85,11 @@ You can use it as a library
# to get detailed information about elements of the name
parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. Braun & Crous 2003")[:scientificName][:details]
+ # to parse using several CPUs (4 processes seem to be optimal)
+ parser = ParallelParser.new # ParallelParser.new(4) will try to run 4 processes if hardware allows
+ array_of_names = ["Betula alba", "Homo sapiens", ...]
+ parser.parse(array_of_names) # -> {"Betula alba" => "{scientificName...}", "Homo sapiens" => "{scientificName...}", ...}
+
# to resolve lsid and get back RDF file
LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
View
@@ -30,6 +30,7 @@ begin
gem.bindir = 'bin'
gem.executables = ['nnparse', 'parserver']
gem.add_dependency('treetop')
+ gem.add_dependency('parallel')
gem.add_dependency('json') if ruby_version < 19
gem.add_development_dependency "rspec"
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
View
@@ -25,11 +25,14 @@ def self.clean(a_string)
class ParallelParser
- def initialize
- require 'facter'
+ def initialize(processes_num = nil)
require 'parallel'
cpu_num
- @processes_num = cpu_num > 1 ? cpu_num - 1 : 1
+ if processes_num.to_i > 0
+ @processes_num = [processes_num, cpu_num - 1].min
+ else
+ @processes_num = cpu_num > 3 ? cpu_num - 2 : 1
+ end
end
def parse(names_list)
@@ -38,7 +41,7 @@ def parse(names_list)
end
def cpu_num
- @cpu_num ||= Facter.processorcount.to_i
+ @cpu_num ||= Parallel.processor_count
end
private
@@ -61,4 +61,13 @@
res.keys.size.should == names.size
end
+ it "should parse several names in parallel with given num of processes" do
+ names = []
+ read_test_file { |n| names << (n[:name]) if n[:name] }
+ names.uniq!
+ pparser = ParallelParser.new(4)
+ res = pparser.parse(names)
+ names.size.should > 100
+ res.keys.size.should == names.size
+ end
end

0 comments on commit 8dacf6a

Please sign in to comment.