Skip to content

Commit

Permalink
Bug fix, removal of BOM, :shipit: v3.1.3
Browse files Browse the repository at this point in the history
We did not have a consistent number of fields in the csv file.
I added tests for all possible combinations of options
to check the consistency of row columns in all test files.

The BOM character \u{feff} is added by Windows when it saves
UTF-8 files. It is supposed to be the very first character
in the file. It prevented the first header from being recognized.
The character is now removed while we are collecting headers.
  • Loading branch information
dimus committed Aug 9, 2017
1 parent 9cad50d commit d3f28d8
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 9 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# ``gn_crossmap`` CHANGELOG

## 3.1.3

* @dimus - Fix formatting bug for empty results, remove BOM char from headers

## 3.1.2

* @dimus - Fixes #37 tab is now default if separator is not found
Expand Down
2 changes: 1 addition & 1 deletion lib/gn_crossmap/collector.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def init_fields_collector
end

# Normalizes a raw header cell into the symbol used to identify a field.
#
# The value is stringified, ":" is treated like "/", and any BOM character
# (U+FEFF, which can precede the first header of a UTF-8 file) is removed.
# The text after the last "/" is then stripped, downcased and symbolized.
#
# @param field [#to_s] raw header value (nil is accepted)
# @return [Symbol] normalized field name, or :none when the header is empty
#   or consists only of ":" / "/" separators
def prepare_field(field)
  cleaned = field.to_s.tr(":", "/").delete("\u{feff}")
  # split drops trailing empty segments, so "" and "/" both yield no segments;
  # guard here instead of calling strip on nil.
  last_segment = cleaned.split("/").last
  return :none if last_segment.nil?
  last_segment.strip.downcase.to_sym
end
Expand Down
17 changes: 12 additions & 5 deletions lib/gn_crossmap/result_processor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,19 @@ def rubyfy(result)
# Records a datum that produced no match and writes its placeholder row.
#
# Increments the counter for match type 0 and the resolved-records counter
# in @stats, then passes the row built by compile_empty_result to @writer.
# Row construction lives only in compile_empty_result so the number of
# columns is decided in one place.
#
# @param datum [Hash] resolver datum, forwarded to compile_empty_result
def write_empty_result(datum)
  @stats.stats[:matches][0] += 1
  @stats.stats[:resolved_records] += 1
  @writer.write(compile_empty_result(datum))
end

# Builds the output row for a datum that has no match.
#
# The row is the stored original row for the supplied id followed by:
# match type 0, the supplied name string, four nils, the input rank and
# five nils. One more nil is appended when @with_classification is set, so
# the row keeps the same width as rows carrying a classification column.
# The stored original row is not modified; a new array is returned.
#
# @param datum [Hash] needs :supplied_id and :supplied_name_string
# @return [Array] the placeholder row
def compile_empty_result(datum)
  id = datum[:supplied_id]
  trailing_blanks = @with_classification ? 6 : 5
  @original_data[id] + [
    GnCrossmap::MATCH_TYPES[0],
    datum[:supplied_name_string],
    *Array.new(4),
    @input[id][:rank],
    *Array.new(trailing_blanks)
  ]
end

def write_result(datum)
collect_stats(datum)
datum[:results].each do |result|
Expand Down Expand Up @@ -72,7 +78,8 @@ def new_data(datum, result)

def canonical(name_string)
parsed = @parser.parse(name_string)[:scientificName]
parsed[:canonical].nil? || parsed[:hybrid] ? nil : parsed[:canonical]
return nil if parsed[:canonical].nil? || parsed[:hybrid]
parsed[:canonical]
rescue StandardError
@parser = ScientificNameParser.new
nil
Expand Down
2 changes: 1 addition & 1 deletion lib/gn_crossmap/version.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Namespace module for crossmapping checklists to GN sources
module GnCrossmap
VERSION = "3.1.2"
VERSION = "3.1.3"

def self.version
VERSION
Expand Down
7 changes: 5 additions & 2 deletions spec/features/checklist_resolver_spec.rb
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
describe "features" do
context "resolving variety of csv files" do
%i(all_fields sciname sciname_auth sciname_rank csv_relaxed).each do |input|
%i(single_field all_fields sciname sciname_auth sciname_rank csv_relaxed).
each do |input|
context input do
it "resolves #{input}" do
opts = { output: "/tmp/#{input}-processed.csv",
input: FILES[input],
data_source_id: 1,
skip_original: true }
with_classification: [true, false].sample,
skip_original: [true, false].sample }
FileUtils.rm(opts[:output]) if File.exist?(opts[:output])
GnCrossmap.run(opts)
expect(File.exist?(opts[:output])).to be true
expect(uniform_rows?(opts[:output])).to be true
end
end
end
Expand Down
11 changes: 11 additions & 0 deletions spec/support/helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,17 @@ def io(file, mode = "r:utf-8")
IO.new(fd, mode: mode)
end

# Checks that every line of a tab-separated file has the same number of
# fields as its first line.
#
# Lines are read one at a time with File.foreach, which closes the file even
# on the early return. Fields are split with a negative limit so trailing
# empty fields are counted on a final line that has no newline.
#
# @param file_path [String] path to the tab-separated file
# @return [Boolean] true when all lines have the same field count (also for
#   an empty file), false at the first line that differs
def uniform_rows?(file_path)
  expected_size = nil
  File.foreach(file_path) do |line|
    size = line.split("\t", -1).size
    expected_size ||= size
    return false unless size == expected_size
  end
  true
end

FILES = {
all_fields: "#{files_path}/all-fields-semicolon.csv",
all_fields_tiny: "#{files_path}/all-fields-tiny.csv",
Expand Down

0 comments on commit d3f28d8

Please sign in to comment.