Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

We’re showing branches in this repository, but you can also compare across forks.

base fork: JoeGermuska/csvkit
...
head fork: JoeGermuska/csvkit
  • 1 commit
  • 2 files changed
  • 0 commit comments
  • 1 contributor
Commits on Dec 20, 2011
Joe Germuska is it possible to do type inference (specifically for csvsql) without…
… having to hold all of the file data in memory?
c5eff7f
Showing with 188 additions and 0 deletions.
  1. +109 −0 csvkit/typeinference.py
  2. +79 −0 tests/test_typeinference.py
109 csvkit/typeinference.py
View
@@ -219,3 +219,112 @@ def normalize_table(rows, normal_types=None, accumulate_errors=False):
return new_normal_types, new_normal_columns
+AVAILABLE_TYPES = (bool, int, float, datetime.time, datetime.date, datetime.datetime, unicode)
+
+def can_be_null(val):
+ if not val:
+ return True
+ return val.lower() in NULL_VALUES
+
+def can_be_bool(val):
+ return val.lower() in TRUE_VALUES or val.lower() in FALSE_VALUES
+
+def can_be_int(val):
+ try:
+ int_val = int(val.replace(',', ''))
+ if val[0] == '0' and int(val) != 0:
+ return False
+ return True
+ except ValueError:
+ return False
+
+def can_be_float(val):
+ try:
+ float_val = float(val.replace(',',''))
+ return True
+ except ValueError:
+ return False
+
+def can_be_time(val):
+ try:
+ d = parse(val, default=DEFAULT_DATETIME)
+ return d.date() == NULL_DATE
+ except:
+ return False
+
+def can_be_date(val):
+ try:
+ d = parse(val, default=DEFAULT_DATETIME)
+ return d.time() == NULL_TIME and d.date() != NULL_DATE
+ except:
+ return False
+
+def can_be_datetime(val):
+ try:
+ d = parse(val, default=DEFAULT_DATETIME)
+ return not can_be_date(val) and not can_be_time(val) and d != DEFAULT_DATETIME
+ except:
+ return False
+
+can_be = {
+ bool: can_be_bool,
+ int: can_be_int,
+ float: can_be_float,
+ datetime.time: can_be_time,
+ datetime.date: can_be_date,
+ datetime.datetime: can_be_datetime,
+ unicode: lambda x: True,
+}
+
+def assess_row(row,limitations=None):
+ """Given a row of data, return a sequence whose members are lists of types which could possibly apply to values in the given row.
+ If limitations is not None, it should be a sequence of the same length as 'row', and the return value will not include any types which
+ were not in the input limitations. The expected usage model would be to iteratively call this for each row, passing
+ back the return as the limitations for the next row.
+
+ If 'limitations' is a sequence of all 'unicode' (the "widest" data type) then this call will return
+ the same list immediately. To short circuit iterative calls to this function after that equilibrium has
+ been reached, consider testing before calling using 'all_unicode' defined elsewhere in this module."""
+ if limitations:
+ if all_unicode(limitations): return limitations
+ else:
+ limitations = [list(AVAILABLE_TYPES) for item in row]
+
+ result = []
+ for value, column_limits in zip(row,limitations):
+ new_column_limits = []
+ for limit in column_limits:
+ if not value or can_be_null(value) or can_be[limit](value):
+ new_column_limits.append(limit)
+ result.append(new_column_limits)
+
+ return result
+
+def reduce_assessment(limitations):
+ """In some cases, an entire dataset might be reviewed and assess_row might not have boiled it down to unique values.
+ And in any case, we want our list of lists to be a list of singular values.
+ """
+ result = []
+ for item in limitations:
+ if len(item) > 1:
+ item.remove(unicode)
+ if datetime.datetime in item and datetime.date in item:
+ item.remove(datetime.date)
+ if len(item) == 1:
+ result.append(item[0])
+ elif int in item:
+ result.append(int)
+ elif float in item:
+ result.append(float)
+ else:
+ raise Exception("Don't know how to reduce [%s]" % item)
+
+ return result
+
+def all_unicode(seq):
+ if seq:
+ for item in seq:
+ if item != unicode:
+ return False
+ return True
+ return False
79 tests/test_typeinference.py
View
@@ -216,3 +216,82 @@ def test_normalize_table_known_types_invalid(self):
self.assertEqual(e.errors[2].value, '2.1')
self.assertEqual(e.errors[2].normal_type, int)
+ def test_can_be_bool(self):
+ for val in ['yes', 'y', 'true', 't', 'True', 'TRUE', 'YES', 'no', 'n', 'false', 'f', 'NO', 'False']:
+ self.assertTrue(typeinference.can_be_bool(val), val + " should be a valid boolean string")
+ for val in ['1', '0', 'bob', 'nyet', 'si']:
+ self.assertFalse(typeinference.can_be_bool(val), val + " should not be a valid boolean string")
+
+ def test_can_be_int(self):
+ for val in ['1', '100', '1,000', '5215', '5,235,125']:
+ self.assertTrue(typeinference.can_be_int(val), val + " should be a valid int string")
+ for val in ['bob', 'nyet', '01010', '(312) 555-1212', 'palindrome', '3.14', '1.21']:
+ self.assertFalse(typeinference.can_be_int(val), val + " should not be a valid int string")
+
+ def test_can_be_float(self):
+ for val in ['1', '100', '1,000', '5215', '5,235,125', '1.0', '3.14', '1.21', '0.19', '01010']:
+ self.assertTrue(typeinference.can_be_float(val), val + " should be a valid float string")
+ for val in ['bob', 'nyet', '(312) 555-1212', 'palindrome']:
+ self.assertFalse(typeinference.can_be_float(val), val + " should not be a valid float string")
+
+ def test_can_be_date(self):
+ for val in ['12/25/2011', 'July 6, 1971', '1941-12-07']:
+ self.assertTrue(typeinference.can_be_date(val), val + " should be a valid date string")
+ for val in ['12/25/2011 5:00 pm', '1941-12-07 4:32am', '5:00 pm', '12am', '4:32am']:
+ self.assertFalse(typeinference.can_be_date(val), val + " should not be a valid date string")
+
+ def test_can_be_time(self):
+ for val in ['5:00 pm', '12am', '4:32am']:
+ self.assertTrue(typeinference.can_be_time(val), val + " should be a valid time string")
+ for val in ['12/25/2011 5:00 pm', 'July 6, 1971 12am', '1941-12-07 4:32am', '12/25/2011', 'July 6, 1971', '1941-12-07']:
+ self.assertFalse(typeinference.can_be_time(val), val + " should be a valid time string")
+
+ def test_can_be_datetime(self):
+ for val in ['12/25/2011 5:00 pm', 'July 6, 1971 12:01am', '1941-12-07 4:32am']:
+ self.assertTrue(typeinference.can_be_datetime(val), val + " should be a valid datetime string")
+ for val in ['5:00 pm', '12am', '4:32am', '12/25/2011', 'July 6, 1971', '1941-12-07']:
+ self.assertFalse(typeinference.can_be_datetime(val), val + " should not be a valid datetime string")
+
+ def test_assess_row(self):
+ rows = [
+ ['5', '1', 'bob', 'true', '1/1/2001', '1/1/2002 5:00pm'],
+ ['4', '2.0', 'more strings', 'false', '12-31-2009', '1/1/2002'],
+ ['4', '', 'more strings', '', '4/1/2010', ''],
+ ]
+
+ limits = typeinference.assess_row(rows[0])
+ self.assert_assessments(limits[0], (int, float, datetime.date, unicode))
+ self.assert_assessments(limits[1], (int, float, datetime.date, unicode))
+ self.assert_assessments(limits[2], (unicode,))
+ self.assert_assessments(limits[3], (bool, unicode))
+ self.assert_assessments(limits[4], (datetime.date,unicode))
+ self.assert_assessments(limits[5], (datetime.datetime,unicode))
+
+ limits = typeinference.assess_row(rows[1], limits)
+ self.assert_assessments(limits[0], (int, float, datetime.date, unicode))
+ self.assert_assessments(limits[1], (float, datetime.date, unicode))
+ self.assert_assessments(limits[2], (unicode,))
+ self.assert_assessments(limits[3], (bool, unicode))
+ self.assert_assessments(limits[4], (datetime.date,unicode))
+ self.assert_assessments(limits[5], (datetime.date,unicode))
+
+ limits = typeinference.assess_row(rows[2], limits)
+ self.assert_assessments(limits[0], (int, float, datetime.date, unicode))
+ self.assert_assessments(limits[1], (float, datetime.date, unicode))
+ self.assert_assessments(limits[2], (unicode,))
+ self.assert_assessments(limits[3], (bool, unicode))
+ self.assert_assessments(limits[4], (datetime.date,unicode))
+ self.assert_assessments(limits[5], (datetime.date,unicode))
+
+ limits = typeinference.reduce_assessment(limits)
+ self.assertEqual(int,limits[0])
+ self.assertEqual(float,limits[1])
+ self.assertEqual(unicode,limits[2])
+ self.assertEqual(bool,limits[3])
+
+ def assert_assessments(self, assessment, allowed):
+ self.assertEqual(len(assessment),len(allowed), "len of assessment should be %i, not %i [%s]" % (len(allowed),len(assessment),assessment))
+ for item in allowed:
+ self.assertTrue(item in assessment, 'expected %s' % item)
+
+

No commit comments for this range

Something went wrong with that request. Please try again.