Permalink
Browse files

v 2.0.0 backward incompatible change in parserver options

  • Loading branch information...
dimus committed Mar 12, 2013
1 parent fa53e05 commit 1d9ed79be354ba456f562d0b491549033ae04a15
Showing with 85 additions and 77 deletions.
  1. +1 −1 .rvmrc
  2. +8 −0 CHANGELOG
  3. +4 −4 README.rdoc
  4. +0 −1 Rakefile
  5. +32 −43 bin/parserver
  6. +40 −28 lib/biodiversity/parser.rb
View
2 .rvmrc
@@ -1 +1 @@
-rvm use ruby-1.9.3-p194@biodiversity --create
+rvm use ruby-1.9.3-p392@biodiversity --create
View
@@ -1,3 +1,11 @@
+2.0.0 -- backward incompatible change in parserver, therefore new major number.
+In parserver, removed the option --output=canonical_with_rank; instead added the -r
+option, which adds the rank to canonical forms with either json or canonical
+output
+
+1.2.0 -- changed method invocation signature ScientificNameParser.new
+Now it can take options
+
1.1.3 -- added 'fo' as rank
1.1.2 -- static method for fixing all-caps canonical names, fixing caps
View
@@ -31,14 +31,14 @@ options:
to return a canonical form of the name string
- parserver --output=canonical_with_rank
-
-the same as above, but infraspecies' rank is shown if available
-
parserver --port 5555
run socket server on a different port
+ parserver --canonical_with_rank
+
+to add the rank to canonical forms that have an infraspecific epithet, if the rank is given
+
Then you can access it via 4334 port using a socket client library of your programming language. You can find socket client script example in the examples directory of the gem.
If you want to check if socket server works for you:
View
@@ -63,4 +63,3 @@ task :tt do
`mv #{rf}.tmp #{rf}`
end
end
-
View
@@ -5,78 +5,67 @@ require 'socket'
require 'biodiversity' # Get sockets from stdlib
DEFAULT_PORT = 4334
-RUBY_VERSION_INT = RUBY_VERSION.split(".")[0..1].join('').to_i
+RUBY_VERSION_INT = RUBY_VERSION.split('.')[0..1].join('').to_i
OPTIONS = {
- :output => "json",
- :port => DEFAULT_PORT
+ output: 'json',
+ canonical_with_rank: false,
+ port: DEFAULT_PORT
}
options = {}
ARGV.options do |opts|
script_name = File.basename($0)
opts.banner = "Usage: ruby #{script_name} [options]"
- opts.separator ""
+ opts.separator ''
- opts.on("-o", "--output=output", String,
- "Specifies the type of the output:
+ opts.on('-r',
+ '--canonical_with_rank',
+ 'Adds infraspecies rank to canonical forms'
+ ) { |rank| options[:canonical_with_rank] = rank }
+
+ opts.separator ''
+
+ opts.on('-o', '--output=output', String,
+ 'Specifies the type of the output:
json - parsed results in json
- canonical - canonical version
- canonical_with_rank - canonical with rank",
- "Default: json") { |output| options[:output] = output }
+ canonical - canonical form only',
+ 'Default: json') { |output| options[:output] = output }
- opts.separator ""
+ opts.separator ''
- opts.on("-p", "--port=port", String,
- "Specifies the port number",
+ opts.on('-p', '--port=port', String,
+ 'Specifies the port number',
"Default: #{DEFAULT_PORT}") { |port| options[:port] = port }
- opts.separator ""
+ opts.separator ''
- opts.on("-h", "--help",
- "Show this help message.") { puts opts; exit }
+ opts.on('-h', '--help',
+ 'Show this help message.') { puts opts; exit }
opts.parse!
end
-OPTIONS[:output] = options[:output] if ['canonical', 'canonical_with_rank'].include?(options[:output])
+OPTIONS[:output] = options[:output] if ['canonical'].include?(options[:output])
OPTIONS[:port] = options[:port].to_i if options[:port].to_i > 0
-
-def parser_error(name_string)
- {:scientificName => {:parsed => false, :verbatim => name_string, :error => 'Parser error'}}
-end
+OPTIONS[:canonical_with_rank] = !!options[:canonical_with_rank]
def get_output(name_string, parser)
begin
- if RUBY_VERSION_INT < 19
- old_kcode = $KCODE
- $KCODE = 'NONE'
- end
parsed = parser.parse(name_string)
- if RUBY_VERSION_INT < 19
- $KCODE = old_kcode
- end
rescue
- parsed = parser_error(name_string)
+ parsed = ScientificNameParser::FAILED_RESULT.(name_string)
end
output = OPTIONS[:output]
return parsed.to_json if output == 'json'
- canonical = parsed[:scientificName][:canonical]
- return canonical.to_s if output == 'canonical' || canonical == nil || parsed[:scientificName][:hybrid] || !parsed[:scientificName][:parsed]
- parts = parsed[:scientificName][:canonical].split(" ")
-
- if parts.size > 2 && parsed[:scientificName][:details][0][:infraspecies]
- name_ary = parts[0..1]
- parsed[:scientificName][:details][0][:infraspecies].each do |data|
- name_ary << (data[:rank] && data[:rank] != 'n/a'? "#{data[:rank]} #{data[:string]}" : data[:string])
- end
- canonical = name_ary.join(" ")
- end
- canonical
+ parsed[:scientificName][:canonical].to_s
end
-puts "Running parser service on port #{OPTIONS[:port]}, output type is '#{OPTIONS[:output]}'"
-parser = ScientificNameParser.new
+puts "Running parser service on port %s, output type is '%s'" %
+ [OPTIONS[:port], OPTIONS[:output]]
+opts = {}
+opts = {canonical_with_rank: true} if OPTIONS[:canonical_with_rank]
+parser = ScientificNameParser.new(opts)
server = TCPServer.open(OPTIONS[:port]) # Socket to listen on a port
loop do # Servers run forever
Thread.start(server.accept) do |client|
@@ -85,7 +74,7 @@ loop do # Servers run forever
while a = client.readline rescue nil
count += 1
puts "parsed %s'th name" % count if count % 1000 == 0
- a.force_encoding("utf-8") if a && RUBY_VERSION_INT >= 19
+ a.force_encoding('utf-8') if a && RUBY_VERSION_INT >= 19
if ['end','exit','q', '.'].include? a.strip
client.close
break
View
@@ -15,7 +15,8 @@ module PreProcessor
LAST_WORD_JUNK = /(,\s*|\s+)(spp\.|spp|var\.|var|von|van|ined\.|ined|sensu|new|non|nec|nudum|cf\.|cf|sp\.|sp|ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/i
def self.clean(a_string)
- [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
+ [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
+ TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
a_string = a_string.gsub(i, '')
end
a_string = a_string.tr('ſ','s') #old 's'
@@ -49,10 +50,7 @@ def cpu_num
private
def parse_process(name)
p = ScientificNameParser.new
- failed_res = { scientificName: { parsed: false,
- verbatim: name,
- error: 'Parser error' } }
- p.parse(name) rescue failed_res
+ p.parse(name) rescue ScientificNameParser::FAILED_RESULT.(name)
end
end
@@ -73,6 +71,12 @@ class ScientificNameParser
'..',
'..',
'VERSION')).readline.strip
+
+ FAILED_RESULT = ->(name) do
+ { scientificName:
+ { parsed: false, verbatim: name.to_s.strip, error: 'Parser error' }
+ }
+ end
def self.fix_case(name_string)
name_ary = name_string.split(/\s+/)
@@ -87,17 +91,21 @@ def self.fix_case(name_string)
end
else
if name_ary[0].size > 1
- word1 = UnicodeUtils.upcase(name_ary[0][0]) + UnicodeUtils.downcase(name_ary[0][1..-1])
+ word1 = UnicodeUtils.upcase(name_ary[0][0]) +
+ UnicodeUtils.downcase(name_ary[0][1..-1])
else
word1 = name_ary[0]
end
if name_ary[1].match(/^\(/)
- word2 = name_ary[1].gsub(/\)$/, '') + ")"
- word2 = word2[0] + UnicodeUtils.upcase(word2[1]) + UnicodeUtils.downcase(word2[2..-1])
+ word2 = name_ary[1].gsub(/\)$/, '') + ')'
+ word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
+ UnicodeUtils.downcase(word2[2..-1])
else
word2 = UnicodeUtils.downcase(name_ary[1])
end
- res = word1 + " " + word2 + " " + name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(" ")
+ res = word1 + ' ' +
+ word2 + ' ' +
+ name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(' ')
res.strip!
end
res
@@ -114,7 +122,9 @@ def initialize(opts = {})
end
def virus?(a_string)
- !!(a_string.match(/\sICTV\s*$/) || a_string.match(/\b(virus|viruses|phage|phages|viroid|viroids|satellite|satellites|prion|prions)\b/i) || a_string.match(/[A-Z]?[a-z]+virus\b/))
+ !!(a_string.match(/\sICTV\s*$/) ||
+ a_string.match(/\b(virus|viruses|phage|phages|viroid|viroids|satellite|satellites|prion|prions)\b/i) ||
+ a_string.match(/[A-Z]?[a-z]+virus\b/))
end
def unknown_placement?(a_string)
@@ -126,26 +136,26 @@ def parsed
end
def parse(a_string)
- @verbatim = a_string
+ @verbatim = a_string.strip
a_string = PreProcessor::clean(a_string)
if virus?(a_string)
- @parsed = { :verbatim => a_string, :virus => true }
+ @parsed = { verbatim: a_string, virus: true }
elsif unknown_placement?(a_string)
- @parsed = { :verbatim => a_string }
+ @parsed = { verbatim: a_string }
else
begin
@parsed = @clean.parse(a_string) || @dirty.parse(a_string)
unless @parsed
index = @dirty.index || @clean.index
salvage_match = a_string[0..index].split(/\s+/)[0..-2]
salvage_string = salvage_match ? salvage_match.join(' ') : a_string
- @parsed = @dirty.parse(salvage_string) || @canonical.parse(a_string) || { :verbatim => a_string }
+ @parsed = @dirty.parse(salvage_string) ||
+ @canonical.parse(a_string) ||
+ { verbatim: a_string }
end
rescue
- @parsed = { scientificName: { parsed: false,
- verbatim: name,
- error: 'Parser error' } }
+ @parsed = FAILED_RESULT.(@verbatim)
end
end
@@ -156,22 +166,24 @@ def @parsed.verbatim=(a_string)
def @parsed.all(opts = {})
canonical_with_rank = !!opts[:canonical_with_rank]
parsed = self.class != Hash
- res = { :parsed => parsed, :parser_version => ScientificNameParser::VERSION}
+ res = { parsed: parsed, parser_version: ScientificNameParser::VERSION}
if parsed
hybrid = self.hybrid rescue false
res.merge!({
- :verbatim => @verbatim,
- :normalized => self.value,
- :canonical => self.canonical,
- :hybrid => hybrid,
- :details => self.details,
- :parser_run => self.parser_run,
- :positions => self.pos
+ verbatim: @verbatim,
+ normalized: self.value,
+ canonical: self.canonical,
+ hybrid: hybrid,
+ details: self.details,
+ parser_run: self.parser_run,
+ positions: self.pos
})
else
res.merge!(self)
end
- if canonical_with_rank && canonical.count(" ") > 1 && res[:details][0][:infraspecies]
+ if (canonical_with_rank &&
+ canonical.count(' ') > 1 &&
+ res[:details][0][:infraspecies])
ScientificNameParser.add_rank_to_canonical(res)
end
res = {:scientificName => res}
@@ -192,14 +204,14 @@ def @parsed.all_json
private
def self.add_rank_to_canonical(parsed)
- parts = parsed[:canonical].split(" ")
+ parts = parsed[:canonical].split(' ')
name_ary = parts[0..1]
parsed[:details][0][:infraspecies].each do |data|
infrasp = data[:string]
rank = data[:rank]
name_ary << (rank && rank != 'n/a' ? "#{rank} #{infrasp}" : infrasp)
end
- parsed[:canonical] = name_ary.join(" ")
+ parsed[:canonical] = name_ary.join(' ')
end
end

0 comments on commit 1d9ed79

Please sign in to comment.