Skip to content

Commit

Permalink
is it possible to do type inference (specifically for csvsql) without…
Browse files Browse the repository at this point in the history
… having to hold all of the file data in memory?
  • Loading branch information
JoeGermuska committed Dec 20, 2011
1 parent fca1705 commit c5eff7f
Show file tree
Hide file tree
Showing 2 changed files with 188 additions and 0 deletions.
109 changes: 109 additions & 0 deletions csvkit/typeinference.py
Expand Up @@ -219,3 +219,112 @@ def normalize_table(rows, normal_types=None, accumulate_errors=False):

return new_normal_types, new_normal_columns

AVAILABLE_TYPES = (bool, int, float, datetime.time, datetime.date, datetime.datetime, unicode)

def can_be_null(val):
if not val:
return True
return val.lower() in NULL_VALUES

def can_be_bool(val):
return val.lower() in TRUE_VALUES or val.lower() in FALSE_VALUES

def can_be_int(val):
try:
int_val = int(val.replace(',', ''))
if val[0] == '0' and int(val) != 0:
return False
return True
except ValueError:
return False

def can_be_float(val):
try:
float_val = float(val.replace(',',''))
return True
except ValueError:
return False

def can_be_time(val):
try:
d = parse(val, default=DEFAULT_DATETIME)
return d.date() == NULL_DATE
except:
return False

def can_be_date(val):
try:
d = parse(val, default=DEFAULT_DATETIME)
return d.time() == NULL_TIME and d.date() != NULL_DATE
except:
return False

def can_be_datetime(val):
try:
d = parse(val, default=DEFAULT_DATETIME)
return not can_be_date(val) and not can_be_time(val) and d != DEFAULT_DATETIME
except:
return False

can_be = {
bool: can_be_bool,
int: can_be_int,
float: can_be_float,
datetime.time: can_be_time,
datetime.date: can_be_date,
datetime.datetime: can_be_datetime,
unicode: lambda x: True,
}

def assess_row(row,limitations=None):
"""Given a row of data, return a sequence whose members are lists of types which could possibly apply to values in the given row.
If limitations is not None, it should be a sequence of the same length as 'row', and the return value will not include any types which
were not in the input limitations. The expected usage model would be to iteratively call this for each row, passing
back the return as the limitations for the next row.
If 'limitations' is a sequence of all 'unicode' (the "widest" data type) then this call will return
the same list immediately. To short circuit iterative calls to this function after that equilibrium has
been reached, consider testing before calling using 'all_unicode' defined elsewhere in this module."""
if limitations:
if all_unicode(limitations): return limitations
else:
limitations = [list(AVAILABLE_TYPES) for item in row]

result = []
for value, column_limits in zip(row,limitations):
new_column_limits = []
for limit in column_limits:
if not value or can_be_null(value) or can_be[limit](value):
new_column_limits.append(limit)
result.append(new_column_limits)

return result

def reduce_assessment(limitations):
"""In some cases, an entire dataset might be reviewed and assess_row might not have boiled it down to unique values.
And in any case, we want our list of lists to be a list of singular values.
"""
result = []
for item in limitations:
if len(item) > 1:
item.remove(unicode)
if datetime.datetime in item and datetime.date in item:
item.remove(datetime.date)
if len(item) == 1:
result.append(item[0])
elif int in item:
result.append(int)
elif float in item:
result.append(float)
else:
raise Exception("Don't know how to reduce [%s]" % item)

return result

def all_unicode(seq):
if seq:
for item in seq:
if item != unicode:
return False
return True
return False
79 changes: 79 additions & 0 deletions tests/test_typeinference.py
Expand Up @@ -216,3 +216,82 @@ def test_normalize_table_known_types_invalid(self):
self.assertEqual(e.errors[2].value, '2.1')
self.assertEqual(e.errors[2].normal_type, int)

def test_can_be_bool(self):
for val in ['yes', 'y', 'true', 't', 'True', 'TRUE', 'YES', 'no', 'n', 'false', 'f', 'NO', 'False']:
self.assertTrue(typeinference.can_be_bool(val), val + " should be a valid boolean string")
for val in ['1', '0', 'bob', 'nyet', 'si']:
self.assertFalse(typeinference.can_be_bool(val), val + " should not be a valid boolean string")

def test_can_be_int(self):
for val in ['1', '100', '1,000', '5215', '5,235,125']:
self.assertTrue(typeinference.can_be_int(val), val + " should be a valid int string")
for val in ['bob', 'nyet', '01010', '(312) 555-1212', 'palindrome', '3.14', '1.21']:
self.assertFalse(typeinference.can_be_int(val), val + " should not be a valid int string")

def test_can_be_float(self):
for val in ['1', '100', '1,000', '5215', '5,235,125', '1.0', '3.14', '1.21', '0.19', '01010']:
self.assertTrue(typeinference.can_be_float(val), val + " should be a valid float string")
for val in ['bob', 'nyet', '(312) 555-1212', 'palindrome']:
self.assertFalse(typeinference.can_be_float(val), val + " should not be a valid float string")

def test_can_be_date(self):
for val in ['12/25/2011', 'July 6, 1971', '1941-12-07']:
self.assertTrue(typeinference.can_be_date(val), val + " should be a valid date string")
for val in ['12/25/2011 5:00 pm', '1941-12-07 4:32am', '5:00 pm', '12am', '4:32am']:
self.assertFalse(typeinference.can_be_date(val), val + " should not be a valid date string")

def test_can_be_time(self):
for val in ['5:00 pm', '12am', '4:32am']:
self.assertTrue(typeinference.can_be_time(val), val + " should be a valid time string")
for val in ['12/25/2011 5:00 pm', 'July 6, 1971 12am', '1941-12-07 4:32am', '12/25/2011', 'July 6, 1971', '1941-12-07']:
self.assertFalse(typeinference.can_be_time(val), val + " should be a valid time string")

def test_can_be_datetime(self):
for val in ['12/25/2011 5:00 pm', 'July 6, 1971 12:01am', '1941-12-07 4:32am']:
self.assertTrue(typeinference.can_be_datetime(val), val + " should be a valid datetime string")
for val in ['5:00 pm', '12am', '4:32am', '12/25/2011', 'July 6, 1971', '1941-12-07']:
self.assertFalse(typeinference.can_be_datetime(val), val + " should not be a valid datetime string")

def test_assess_row(self):
rows = [
['5', '1', 'bob', 'true', '1/1/2001', '1/1/2002 5:00pm'],
['4', '2.0', 'more strings', 'false', '12-31-2009', '1/1/2002'],
['4', '', 'more strings', '', '4/1/2010', ''],
]

limits = typeinference.assess_row(rows[0])
self.assert_assessments(limits[0], (int, float, datetime.date, unicode))
self.assert_assessments(limits[1], (int, float, datetime.date, unicode))
self.assert_assessments(limits[2], (unicode,))
self.assert_assessments(limits[3], (bool, unicode))
self.assert_assessments(limits[4], (datetime.date,unicode))
self.assert_assessments(limits[5], (datetime.datetime,unicode))

limits = typeinference.assess_row(rows[1], limits)
self.assert_assessments(limits[0], (int, float, datetime.date, unicode))
self.assert_assessments(limits[1], (float, datetime.date, unicode))
self.assert_assessments(limits[2], (unicode,))
self.assert_assessments(limits[3], (bool, unicode))
self.assert_assessments(limits[4], (datetime.date,unicode))
self.assert_assessments(limits[5], (datetime.date,unicode))

limits = typeinference.assess_row(rows[2], limits)
self.assert_assessments(limits[0], (int, float, datetime.date, unicode))
self.assert_assessments(limits[1], (float, datetime.date, unicode))
self.assert_assessments(limits[2], (unicode,))
self.assert_assessments(limits[3], (bool, unicode))
self.assert_assessments(limits[4], (datetime.date,unicode))
self.assert_assessments(limits[5], (datetime.date,unicode))

limits = typeinference.reduce_assessment(limits)
self.assertEqual(int,limits[0])
self.assertEqual(float,limits[1])
self.assertEqual(unicode,limits[2])
self.assertEqual(bool,limits[3])

def assert_assessments(self, assessment, allowed):
self.assertEqual(len(assessment),len(allowed), "len of assessment should be %i, not %i [%s]" % (len(allowed),len(assessment),assessment))
for item in allowed:
self.assertTrue(item in assessment, 'expected %s' % item)


0 comments on commit c5eff7f

Please sign in to comment.