Skip to content

Commit

Permalink
Merge pull request #58 from seajhawk/main
Browse files Browse the repository at this point in the history
Add ability to specify encoding to help with UnicodeDecodeError errors
  • Loading branch information
pnadolny13 committed Aug 26, 2022
2 parents 0496f7b + 65b83ad commit 04525c6
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 4 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ The `config.json` contains an array called `files` that consists of dictionary o
* `entity`: The entity name to be passed to singer (i.e. the table)
* `path`: Local path to the file to be ingested. Note that this may be a directory, in which case all files in that directory and any of its subdirectories will be recursively processed
* `keys`: The names of the columns that constitute the unique keys for that entity
* `encoding`: [Optional] The file encoding to use when reading the file (e.g. "latin1", "UTF-8"). Use this setting when you get a `UnicodeDecodeError`.

Example:

Expand All @@ -38,7 +39,8 @@ Example:
},
{ "entity" : "opportunities",
"path" : "/path/to/opportunities.csv",
"keys" : ["Id"]
"keys" : ["Id"],
"encoding" : "latin1"
}
]
}
Expand Down
4 changes: 2 additions & 2 deletions meltano.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ plugins:
keys:
- col1
settings:
- description: Array of objects with `entity`, `file`, and `keys` keys
- description: "Array of objects containing keys: `entity`, `path`, `keys`, and `encoding` (Optional)"
kind: array
name: files
- description: Project-relative path to JSON file holding array of objects with
`entity`, `file`, and `keys` keys
keys `entity`, `path`, `keys`, and `encoding` (Optional).
documentation: https://gitlab.com/meltano/tap-csv#run
label: CSV Files Definition
name: csv_files_definition
Expand Down
3 changes: 2 additions & 1 deletion tap_csv/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ def is_valid_filename(self, file_path: str) -> bool:

def get_rows(self, file_path: str) -> Iterable[list]:
    """Return a generator of the rows in a particular CSV file.

    Args:
        file_path: Path of the CSV file to read.

    Yields:
        Each row of the file as a list of strings, as produced by
        ``csv.reader``.
    """
    # Optional per-file setting; ``None`` makes ``open`` fall back to its
    # default encoding.
    encoding = self.file_config.get("encoding", None)
    # The csv module requires ``newline=""`` so that line breaks embedded in
    # quoted fields reach the reader untranslated.
    with open(file_path, "r", encoding=encoding, newline="") as f:
        yield from csv.reader(f)
Expand Down
1 change: 1 addition & 0 deletions tap_csv/tap.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class TapCSV(Tap):
th.Property("entity", th.StringType, required=True),
th.Property("path", th.StringType, required=True),
th.Property("keys", th.ArrayType(th.StringType), required=True),
th.Property("encoding", th.StringType, required=False),
)
),
description="An array of csv file stream settings.",
Expand Down
4 changes: 4 additions & 0 deletions tap_csv/tests/data/alphabet_encoding.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
col1,col2,col3
Á,b,c
d,e,f
g,h,i
19 changes: 19 additions & 0 deletions tap_csv/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,22 @@ def test_standard_tap_tests():
tests = get_standard_tap_tests(TapCSV, config=SAMPLE_CONFIG)
for test in tests:
test()


# Run standard built-in tap tests from the SDK, with different encoding:
def test_standard_tap_tests_encoding():
    """Run the SDK's standard tap tests on a file read with `latin1` encoding."""
    data_root = os.path.dirname(os.path.abspath(__file__))
    # Single stream definition that sets the optional `encoding` key.
    encoded_stream = {
        "entity": "test",
        "path": f"{data_root}/data/alphabet_encoding.csv",
        "keys": [],
        "encoding": "latin1",
    }
    tap_config = {"files": [encoded_stream]}
    for run_case in get_standard_tap_tests(TapCSV, config=tap_config):
        run_case()

0 comments on commit 04525c6

Please sign in to comment.