Bug fix, removal of BOM, v3.1.3

We did not have a consistent number of fields per row in the csv file. I added tests for all possible combinations of options to check the consistency of row columns in all test files. The BOM character \u{feff} is added by Windows when it saves UTF-8 files. It is supposed to be the very first character in the file. It prevented the first header from being recognized. The character is now removed while we are collecting headers.
GlobalNamesArchitecture · Aug 9, 2017 · d3f28d8 · d3f28d8
1 parent 9cad50d
commit d3f28d8
Show file tree

Hide file tree

Showing 6 changed files with 34 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # ``gn_crossmap`` CHANGELOG
 
+## 3.1.3
+
+* @dimus - Fix formatting bug for empty results, remove BOM char from headers
+
 ## 3.1.2
 
 * @dimus - Fixes #37 tab is now default if separator is not found

diff --git a/lib/gn_crossmap/collector.rb b/lib/gn_crossmap/collector.rb
@@ -26,7 +26,7 @@ def init_fields_collector
     end
 
     def prepare_field(field)
-      field = field.to_s.tr(":", "/")
+      field = field.to_s.tr(":", "/").delete("\u{feff}")
       return :none if field == ""
       field.split("/")[-1].strip.downcase.to_sym
     end

diff --git a/lib/gn_crossmap/result_processor.rb b/lib/gn_crossmap/result_processor.rb
@@ -30,13 +30,19 @@ def rubyfy(result)
     def write_empty_result(datum)
       @stats.stats[:matches][0] += 1
       @stats.stats[:resolved_records] += 1
-      res = @original_data[datum[:supplied_id]]
-      res += [GnCrossmap::MATCH_TYPES[0], datum[:supplied_name_string], nil,
-              datum[:supplied_canonical_form], nil,
-              @input[datum[:supplied_id]][:rank], nil, nil, nil, nil]
+      res = compile_empty_result(datum)
       @writer.write(res)
     end
 
+    def compile_empty_result(datum)
+      res = @original_data[datum[:supplied_id]]
+      res += [GnCrossmap::MATCH_TYPES[0], datum[:supplied_name_string],
+              nil, nil, nil, nil,
+              @input[datum[:supplied_id]][:rank], nil, nil, nil, nil, nil]
+      res <<  nil if @with_classification
+      res
+    end
+
     def write_result(datum)
       collect_stats(datum)
       datum[:results].each do |result|
@@ -72,7 +78,8 @@ def new_data(datum, result)
 
     def canonical(name_string)
       parsed = @parser.parse(name_string)[:scientificName]
-      parsed[:canonical].nil? || parsed[:hybrid] ? nil : parsed[:canonical]
+      return nil if parsed[:canonical].nil? || parsed[:hybrid]
+      parsed[:canonical]
     rescue StandardError
       @parser = ScientificNameParser.new
       nil

diff --git a/lib/gn_crossmap/version.rb b/lib/gn_crossmap/version.rb
@@ -2,7 +2,7 @@
 
 # Namespace module for crossmapping checklists to GN sources
 module GnCrossmap
-  VERSION = "3.1.2"
+  VERSION = "3.1.3"
 
   def self.version
     VERSION

diff --git a/spec/features/checklist_resolver_spec.rb b/spec/features/checklist_resolver_spec.rb
@@ -1,15 +1,18 @@
 describe "features" do
   context "resolving variety of csv files" do
-    %i(all_fields sciname sciname_auth sciname_rank csv_relaxed).each do |input|
+    %i(single_field all_fields sciname sciname_auth sciname_rank csv_relaxed).
+      each do |input|
       context input do
         it "resolves #{input}" do
           opts = { output: "/tmp/#{input}-processed.csv",
                    input: FILES[input],
                    data_source_id: 1,
-                   skip_original: true }
+                   with_classification: [true, false].sample,
+                   skip_original: [true, false].sample }
           FileUtils.rm(opts[:output]) if File.exist?(opts[:output])
           GnCrossmap.run(opts)
           expect(File.exist?(opts[:output])).to be true
+          expect(uniform_rows?(opts[:output])).to be true
         end
       end
     end

diff --git a/spec/support/helpers.rb b/spec/support/helpers.rb
@@ -5,6 +5,17 @@ def io(file, mode = "r:utf-8")
   IO.new(fd, mode: mode)
 end
 
+def uniform_rows?(file_path)
+  headers = nil
+  File.open(file_path).each do |l|
+    fields = l.split("\t")
+    headers = fields unless headers
+    require "byebug"; byebug if fields.size != headers.size
+    return false if fields.size != headers.size
+  end
+  true
+end
+
 FILES = {
   all_fields: "#{files_path}/all-fields-semicolon.csv",
   all_fields_tiny: "#{files_path}/all-fields-tiny.csv",