From 45538f4499d865228816111faf54841e95dd0640 Mon Sep 17 00:00:00 2001 From: Seth Denner Date: Wed, 6 May 2015 12:52:42 -0400 Subject: [PATCH] Range Filtering, Deleting and Counting Improvements There was an error in how the backend determines wether or not a range query can be evaluated efficiently by Cassandra. It was allowing attempts of efficent filtering on non-clustering columns. Had some errors in how I was handling deleting and counting methods. I thought the queries were build at that point in code execution. I was mistaken but now the queries are build correctly and the results are as expected. --- .../db/backends/cassandra/compiler.py | 56 +++++++++++++------ .../db/backends/cassandra/predicate.py | 56 ++++++++++++++----- djangocassandra/db/fields.py | 13 +++++ 3 files changed, 94 insertions(+), 31 deletions(-) diff --git a/djangocassandra/db/backends/cassandra/compiler.py b/djangocassandra/db/backends/cassandra/compiler.py index 6ec392d3..f1007925 100644 --- a/djangocassandra/db/backends/cassandra/compiler.py +++ b/djangocassandra/db/backends/cassandra/compiler.py @@ -65,7 +65,11 @@ def __init__( else: self.cassandra_meta = None - self.pk_column = self.meta.pk.column + self.pk_column = ( + self.meta.pk.db_column + if self.meta.pk.db_column + else self.meta.pk.column + ) self.column_family = self.meta.db_table self.columns = self.meta.fields self.where = None @@ -91,12 +95,12 @@ def __init__( ] if hasattr(self.cassandra_meta, 'clustering_keys'): - self.clustering_keys = ( + self.clustering_columns = ( self.query.model.CassandraMeta.clustering_keys ) else: - self.clustering_keys = [] + self.clustering_columns = [] self.cql_query = self.column_family_class.objects.values_list( *self.column_names @@ -106,7 +110,7 @@ def __init__( def filterable_columns(self): return itertools.chain( [self.pk_column], - self.clustering_keys, + self.clustering_columns, self.indexed_columns ) @@ -122,7 +126,7 @@ def _get_rows_by_indexed_column(self, range_predicates): predicates_by_column[self.pk_column] ) - for column in self.clustering_keys: + for column in self.clustering_columns: if column in predicates_by_column: sorted_predicates.append( predicates_by_column[column] @@ -180,20 +184,19 @@ def filter_range(query, predicate): end_op: predicate.end }) - query = self.cql_query for predicate in sorted_predicates: - query = filter_range( - query, + self.cql_query = filter_range( + self.cql_query, predicate ) for predicate in indexed_predicates: - query = filter_range( - query, + self.cql_query = filter_range( + self.cql_query, predicate ) - return query + return self.cql_query def get_row_range(self, range_predicates): ''' @@ -242,13 +245,30 @@ def count( self, limit=None ): - return self.cql_query.count() + return len( + self.root_predicate.get_matching_rows(self) + ) + def delete( self, columns=set() ): - return self.cql_query.delete() + if self.root_predicate.can_evaluate_efficiently( + self.pk_column, + self.clustering_columns, + self.indexed_columns + ): + self.root_predicate.get_matching_rows(self) + for r in self.cql_query: + r.delete() + + else: + rows = self.root_predicate.get_matching_rows(self) + for row in rows: + self.column_family_class.get(**{ + self.pk_column: row[self.pk_column] + }).delete() def order_by( self, @@ -291,14 +311,14 @@ def order_by( ) if ( - not partition_key_filtered or - field_name not in self.clustering_keys or - self.inefficient_ordering + partition_key_filtered + and field_name in self.clustering_columns + and not self.inefficient_ordering ): - self.add_inefficient_order_by(order_string) + self.ordering.append(order_string) else: - self.ordering.append(order_string) + self.add_inefficient_order_by(order_string) @safe_call def add_inefficient_order_by(self, ordering): diff --git a/djangocassandra/db/backends/cassandra/predicate.py b/djangocassandra/db/backends/cassandra/predicate.py index 45572ef0..3aee30bd 100644 --- a/djangocassandra/db/backends/cassandra/predicate.py +++ b/djangocassandra/db/backends/cassandra/predicate.py @@ -85,11 +85,21 @@ def _is_exact(self): self.end_inclusive ) - def can_evaluate_efficiently(self, pk_column, indexed_columns): - return ( - self.column == pk_column or - self.column in indexed_columns - ) + def can_evaluate_efficiently( + self, + pk_column, + clustering_columns, + indexed_columns + ): + if self._is_exact(): + return ( + self.column == pk_column or + self.column in indexed_columns or + self.column in clustering_columns + ) + + else: + return self.column in clustering_columns def incorporate_range_op(self, column, op, value, parent_compound_op): if column != self.column: @@ -213,7 +223,12 @@ def __init__(self, column, op, value=None): def __repr__(self): return '(OP: ' + self.op + ':' + unicode(self.value) + ')' - def can_evaluate_efficiently(self, pk_column, indexed_columns): + def can_evaluate_efficiently( + self, + pk_column, + clustering_columns, + indexed_columns + ): return False def row_matches(self, row): @@ -275,18 +290,31 @@ def __repr__(self): s += ')' return s - def can_evaluate_efficiently(self, pk_column, indexed_columns): + def can_evaluate_efficiently( + self, + pk_column, + clustering_columns, + indexed_columns + ): if self.negated: return False if self.op == COMPOUND_OP_AND: for child in self.children: - if child.can_evaluate_efficiently(pk_column, indexed_columns): + if child.can_evaluate_efficiently( + pk_column, + clustering_columns, + indexed_columns + ): return True else: return False elif self.op == COMPOUND_OP_OR: for child in self.children: - if not child.can_evaluate_efficiently(pk_column, indexed_columns): + if not child.can_evaluate_efficiently( + pk_column, + clustering_columns, + indexed_columns + ): return False else: return True @@ -378,16 +406,18 @@ def get_matching_rows(self, query): # of rows so we only have to run the inefficient query predicates # over this smaller number of rows. if self.can_evaluate_efficiently( - pk_column, - query.filterable_columns + pk_column, + query.clustering_columns, + query.indexed_columns ): range_predicates = [] inefficient_predicates = [] result = None for predicate in self.children: if predicate.can_evaluate_efficiently( - pk_column, - query.filterable_columns + pk_column, + query.clustering_columns, + query.filterable_columns ): range_predicates.append(predicate) diff --git a/djangocassandra/db/fields.py b/djangocassandra/db/fields.py index 33bc7533..f73373e7 100644 --- a/djangocassandra/db/fields.py +++ b/djangocassandra/db/fields.py @@ -45,3 +45,16 @@ def get_internal_type(self): def get_auto_value(self): return uuid.uuid4() + + def value_to_string(self, value): + if isinstance(value, basestring): + return value + + try: + return value.hex + except (TypeError, ValueError): + raise exceptions.ValidationError( + self.error_messages['invalid'], + code='invalid', + params={'value': value}, + )