
is it possible to do type inference (specifically for csvsql) without having to hold all of the file data in memory?
1 parent fca1705 commit c5eff7f7a090220a4b2178eba106fcd1cffbfb1d @JoeGermuska committed Dec 20, 2011
Showing with 188 additions and 0 deletions.
  1. +109 −0 csvkit/typeinference.py
  2. +79 −0 tests/test_typeinference.py
@@ -219,3 +219,112 @@ def normalize_table(rows, normal_types=None, accumulate_errors=False):
    return new_normal_types, new_normal_columns
+AVAILABLE_TYPES = (bool, int, float, datetime.time, datetime.date, datetime.datetime, unicode)
+
+def can_be_null(val):
+    if not val:
+        return True
+    return val.lower() in NULL_VALUES
+
+def can_be_bool(val):
+    return val.lower() in TRUE_VALUES or val.lower() in FALSE_VALUES
+
+def can_be_int(val):
+    try:
+        int_val = int(val.replace(',', ''))
+        # Zero-padded strings like '01010' are identifiers, not numbers.
+        # (Compare against int_val, which already has the commas stripped.)
+        if val[0] == '0' and int_val != 0:
+            return False
+        return True
+    except ValueError:
+        return False
+
+def can_be_float(val):
+    try:
+        float(val.replace(',', ''))
+        return True
+    except ValueError:
+        return False
+
+def can_be_time(val):
+    try:
+        d = parse(val, default=DEFAULT_DATETIME)
+        # A pure time leaves the date component at the null default.
+        return d.date() == NULL_DATE
+    except Exception:
+        return False
+
+def can_be_date(val):
+    try:
+        d = parse(val, default=DEFAULT_DATETIME)
+        # A pure date leaves the time component at the null default.
+        return d.time() == NULL_TIME and d.date() != NULL_DATE
+    except Exception:
+        return False
+
+def can_be_datetime(val):
+    try:
+        d = parse(val, default=DEFAULT_DATETIME)
+        # A datetime must set both components; d == DEFAULT_DATETIME means
+        # nothing in val was actually parsed.
+        return d.date() != NULL_DATE and d.time() != NULL_TIME and d != DEFAULT_DATETIME
+    except Exception:
+        return False
+
+can_be = {
+    bool: can_be_bool,
+    int: can_be_int,
+    float: can_be_float,
+    datetime.time: can_be_time,
+    datetime.date: can_be_date,
+    datetime.datetime: can_be_datetime,
+    unicode: lambda x: True,
+}
+
+def assess_row(row, limitations=None):
+    """Given a row of data, return a sequence whose members are lists of the
+    types which could possibly apply to the corresponding values in the row.
+
+    If 'limitations' is not None, it must be a sequence of the same length as
+    'row', and the return value will include only types which were present in
+    it. The expected usage is to call this iteratively for each row, passing
+    the return value back in as the limitations for the next row.
+
+    If 'limitations' is all 'unicode' (the "widest" type), it is returned
+    unchanged immediately. To short-circuit iterative calls once that
+    equilibrium has been reached, consider testing with 'all_unicode'
+    (defined below) before calling."""
+    if limitations:
+        if all_unicode(limitations):
+            return limitations
+    else:
+        limitations = [list(AVAILABLE_TYPES) for item in row]
+
+    result = []
+    for value, column_limits in zip(row, limitations):
+        new_column_limits = []
+        for limit in column_limits:
+            # Null/empty values are compatible with every candidate type.
+            if not value or can_be_null(value) or can_be[limit](value):
+                new_column_limits.append(limit)
+        result.append(new_column_limits)
+
+    return result
+
+def reduce_assessment(limitations):
+    """Even after an entire dataset has been reviewed, assess_row may not have
+    narrowed every column to a single type. Collapse each column's list of
+    candidate types into one type, preferring the narrowest that remains."""
+    result = []
+    for item in limitations:
+        if len(item) > 1:
+            # unicode is always possible, so it only wins when nothing else is left.
+            item.remove(unicode)
+            if datetime.datetime in item and datetime.date in item:
+                item.remove(datetime.date)
+        if len(item) == 1:
+            result.append(item[0])
+        elif int in item:
+            result.append(int)
+        elif float in item:
+            result.append(float)
+        else:
+            raise ValueError("Don't know how to reduce [%s]" % item)
+
+    return result
+
+def all_unicode(seq):
+    """True if seq is non-empty and every member is unicode."""
+    if not seq:
+        return False
+    return all(item == unicode for item in seq)
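
The intended streaming loop, per the assess_row docstring: thread the returned limitations back into each call and stop refining once every column has widened to unicode. A minimal sketch of that usage (the csv module, file name, and header handling here are illustrative, not part of this commit):

    import csv
    from csvkit import typeinference

    with open('data.csv', 'rb') as f:
        reader = csv.reader(f)
        reader.next()  # skip the header row
        limits = None
        for row in reader:
            # Short-circuit once no row could narrow the types any further.
            if limits is not None and typeinference.all_unicode(limits):
                break
            limits = typeinference.assess_row(row, limits)

    # Collapse each column's candidate list to a single type.
    column_types = typeinference.reduce_assessment(limits)

Only the current row and the per-column candidate lists are held in memory at any time, which is the point of the commit message above.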
@@ -216,3 +216,82 @@ def test_normalize_table_known_types_invalid(self):
        self.assertEqual(e.errors[2].value, '2.1')
        self.assertEqual(e.errors[2].normal_type, int)
+    def test_can_be_bool(self):
+        for val in ['yes', 'y', 'true', 't', 'True', 'TRUE', 'YES', 'no', 'n', 'false', 'f', 'NO', 'False']:
+            self.assertTrue(typeinference.can_be_bool(val), val + " should be a valid boolean string")
+        for val in ['1', '0', 'bob', 'nyet', 'si']:
+            self.assertFalse(typeinference.can_be_bool(val), val + " should not be a valid boolean string")
+
+    def test_can_be_int(self):
+        for val in ['1', '100', '1,000', '5215', '5,235,125']:
+            self.assertTrue(typeinference.can_be_int(val), val + " should be a valid int string")
+        for val in ['bob', 'nyet', '01010', '(312) 555-1212', 'palindrome', '3.14', '1.21']:
+            self.assertFalse(typeinference.can_be_int(val), val + " should not be a valid int string")
+
+    def test_can_be_float(self):
+        for val in ['1', '100', '1,000', '5215', '5,235,125', '1.0', '3.14', '1.21', '0.19', '01010']:
+            self.assertTrue(typeinference.can_be_float(val), val + " should be a valid float string")
+        for val in ['bob', 'nyet', '(312) 555-1212', 'palindrome']:
+            self.assertFalse(typeinference.can_be_float(val), val + " should not be a valid float string")
+
+    def test_can_be_date(self):
+        for val in ['12/25/2011', 'July 6, 1971', '1941-12-07']:
+            self.assertTrue(typeinference.can_be_date(val), val + " should be a valid date string")
+        for val in ['12/25/2011 5:00 pm', '1941-12-07 4:32am', '5:00 pm', '12am', '4:32am']:
+            self.assertFalse(typeinference.can_be_date(val), val + " should not be a valid date string")
+
+    def test_can_be_time(self):
+        for val in ['5:00 pm', '12am', '4:32am']:
+            self.assertTrue(typeinference.can_be_time(val), val + " should be a valid time string")
+        for val in ['12/25/2011 5:00 pm', 'July 6, 1971 12am', '1941-12-07 4:32am', '12/25/2011', 'July 6, 1971', '1941-12-07']:
+            self.assertFalse(typeinference.can_be_time(val), val + " should not be a valid time string")
+
+    def test_can_be_datetime(self):
+        for val in ['12/25/2011 5:00 pm', 'July 6, 1971 12:01am', '1941-12-07 4:32am']:
+            self.assertTrue(typeinference.can_be_datetime(val), val + " should be a valid datetime string")
+        for val in ['5:00 pm', '12am', '4:32am', '12/25/2011', 'July 6, 1971', '1941-12-07']:
+            self.assertFalse(typeinference.can_be_datetime(val), val + " should not be a valid datetime string")
+
+    def test_assess_row(self):
+        rows = [
+            ['5', '1', 'bob', 'true', '1/1/2001', '1/1/2002 5:00pm'],
+            ['4', '2.0', 'more strings', 'false', '12-31-2009', '1/1/2002'],
+            ['4', '', 'more strings', '', '4/1/2010', ''],
+        ]
+
+        limits = typeinference.assess_row(rows[0])
+        self.assert_assessments(limits[0], (int, float, datetime.date, unicode))
+        self.assert_assessments(limits[1], (int, float, datetime.date, unicode))
+        self.assert_assessments(limits[2], (unicode,))
+        self.assert_assessments(limits[3], (bool, unicode))
+        self.assert_assessments(limits[4], (datetime.date, unicode))
+        self.assert_assessments(limits[5], (datetime.datetime, unicode))
+
+        limits = typeinference.assess_row(rows[1], limits)
+        self.assert_assessments(limits[0], (int, float, datetime.date, unicode))
+        self.assert_assessments(limits[1], (float, datetime.date, unicode))
+        self.assert_assessments(limits[2], (unicode,))
+        self.assert_assessments(limits[3], (bool, unicode))
+        self.assert_assessments(limits[4], (datetime.date, unicode))
+        self.assert_assessments(limits[5], (datetime.date, unicode))
+
+        limits = typeinference.assess_row(rows[2], limits)
+        self.assert_assessments(limits[0], (int, float, datetime.date, unicode))
+        self.assert_assessments(limits[1], (float, datetime.date, unicode))
+        self.assert_assessments(limits[2], (unicode,))
+        self.assert_assessments(limits[3], (bool, unicode))
+        self.assert_assessments(limits[4], (datetime.date, unicode))
+        self.assert_assessments(limits[5], (datetime.date, unicode))
+
+        limits = typeinference.reduce_assessment(limits)
+        self.assertEqual(int, limits[0])
+        self.assertEqual(float, limits[1])
+        self.assertEqual(unicode, limits[2])
+        self.assertEqual(bool, limits[3])
+
+    def assert_assessments(self, assessment, allowed):
+        self.assertEqual(len(assessment), len(allowed), "len of assessment should be %i, not %i [%s]" % (len(allowed), len(assessment), assessment))
+        for item in allowed:
+            self.assertTrue(item in assessment, 'expected %s' % item)
+
+
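
Since the commit message targets csvsql specifically, the reduced Python types would still need to be mapped onto SQL column types before a CREATE TABLE statement could be emitted. A hypothetical sketch of that final step (the SQL_TYPES mapping and the make_create_table helper are illustrative only, not csvkit's actual implementation):

    import datetime

    # Hypothetical mapping from reduced Python types to generic SQL types.
    SQL_TYPES = {
        bool: 'BOOLEAN',
        int: 'INTEGER',
        float: 'FLOAT',
        datetime.time: 'TIME',
        datetime.date: 'DATE',
        datetime.datetime: 'TIMESTAMP',
        unicode: 'VARCHAR(255)',
    }

    def make_create_table(table_name, headers, column_types):
        # Pair each header with the SQL type of its reduced Python type.
        cols = ', '.join('%s %s' % (h, SQL_TYPES[t]) for h, t in zip(headers, column_types))
        return 'CREATE TABLE %s (%s);' % (table_name, cols)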
