JBKahn · JBKahn · Aug 20, 2016 · May 1, 2016 · May 1, 2016 · May 1, 2016
diff --git a/.gitignore b/.gitignore
@@ -30,3 +30,7 @@ _book
 
 # node
 node_modules
+
+# Local dev
+Dockerfile
+docker-compose.yml
diff --git a/django_sharding_library/constants.py b/django_sharding_library/constants.py
@@ -1,4 +1,4 @@
 class Backends(object):
-    MYSQL = 'django.db.backends.mysql'
-    POSTGRES = 'django.db.backends.postgresql_psycopg2'
-    SQLITE = 'django.db.backends.sqlite3'
+    MYSQL = ('django.db.backends.mysql', 'django.contrib.gis.db.backends.mysql')  # each constant is a tuple of accepted ENGINE paths; test membership with "in"
+    POSTGRES = ('django.db.backends.postgresql_psycopg2', 'django.db.backends.postgresql', 'django.contrib.gis.db.backends.postgis')
+    SQLITE = ('django.db.backends.sqlite3', 'django.contrib.gis.db.backends.spatialite')  # GeoDjango's SQLite backend module is spelled "spatialite"
diff --git a/django_sharding_library/decorators.py b/django_sharding_library/decorators.py
@@ -1,9 +1,15 @@
 from django.conf import settings
+from django.apps import apps
+from django_sharding_library.constants import Backends
+from django.utils.six import iteritems
+from django.db.models import Manager
 
 from django_sharding_library.exceptions import NonExistentDatabaseException, ShardedModelInitializationException
-from django_sharding_library.fields import ShardedIDFieldMixin
 from django_sharding_library.manager import ShardManager
-from django.db.models import Manager
+from django_sharding_library.fields import ShardedIDFieldMixin, PostgresShardGeneratedIDField
+from django_sharding_library.utils import register_migration_signal_for_model_receiver
+
+PRE_MIGRATION_DISPATCH_UID = "PRE_MIGRATE_FOR_MODEL_%s"
 
 
 def model_config(shard_group=None, database=None, sharded_by_field=None):
@@ -26,13 +32,29 @@ def configure(cls):
                 )
             setattr(cls, 'django_sharding__database', database)
 
+        postgres_shard_id_fields = list(filter(lambda field: issubclass(type(field), PostgresShardGeneratedIDField), cls._meta.fields))
+        if postgres_shard_id_fields:
+            database_dicts = [settings.DATABASES[database]] if database else [db_settings for db, db_settings in
+                                                                              iteritems(settings.DATABASES) if
+                                                                              db_settings["SHARD_GROUP"] == shard_group]
+            if any([database_dict['ENGINE'] not in Backends.POSTGRES for database_dict in database_dicts]):
+                raise ShardedModelInitializationException(
+                    'You cannot use a PostgresShardGeneratedIDField on a non-Postgres database.')
+
+            register_migration_signal_for_model_receiver(apps.get_app_config(cls._meta.app_label),
+                                                         PostgresShardGeneratedIDField.migration_receiver,
+                                                         dispatch_uid=PRE_MIGRATION_DISPATCH_UID % cls._meta.app_label)
+
         if shard_group:
             sharded_fields = list(filter(lambda field: issubclass(type(field), ShardedIDFieldMixin), cls._meta.fields))
-            if not sharded_fields:
-                raise ShardedModelInitializationException('All sharded models require a ShardedIDFieldMixin.')
+            if not sharded_fields and not postgres_shard_id_fields:
+                raise ShardedModelInitializationException('All sharded models require a ShardedIDFieldMixin or a '
+                                                          'PostgresShardGeneratedIDField.')
 
-            if not list(filter(lambda field: field == cls._meta.pk, sharded_fields)):
-                raise ShardedModelInitializationException('All sharded models require the ShardedAutoIDField to be the primary key. Set primary_key=True on the field.')
+            if not list(filter(lambda field: field == cls._meta.pk, sharded_fields)) and not postgres_shard_id_fields:
+                raise ShardedModelInitializationException('All sharded models require the ShardedAutoIDField or '
+                                                          'PostgresShardGeneratedIDFieldto be the primary key. Set '
+                                                          'primary_key=True on the field.')
 
             if not callable(getattr(cls, 'get_shard', None)):
                 raise ShardedModelInitializationException('You must define a get_shard method on the sharded model.')

diff --git a/django_sharding_library/fields.py b/django_sharding_library/fields.py
@@ -1,8 +1,15 @@
 from django.apps import apps
 from django.conf import settings
-from django.db.models import AutoField, CharField, ForeignKey
+from django.db.models import AutoField, CharField, ForeignKey, BigIntegerField, OneToOneField
 
 from django_sharding_library.constants import Backends
+from django.db import connections, transaction, DatabaseError
+from django_sharding_library.utils import create_postgres_global_sequence, create_postgres_shard_id_function
+
+try:
+    from django.db.backends.postgresql.base import DatabaseWrapper as PostgresDatabaseWrapper
+except ImportError:
+    from django.db.backends.postgresql_psycopg2.base import DatabaseWrapper as PostgresDatabaseWrapper
 
 
 class BigAutoField(AutoField):
@@ -11,12 +18,15 @@ class BigAutoField(AutoField):
     9223372036854775807.
     """
     def db_type(self, connection):
-        if connection.settings_dict['ENGINE'] == Backends.MYSQL:
+        if connection.settings_dict['ENGINE'] in Backends.MYSQL:  # Backends.MYSQL is a tuple of engine paths, hence "in"
             return 'serial'
-        if connection.settings_dict['ENGINE'] == Backends.POSTGRES:
+        if connection.settings_dict['ENGINE'] in Backends.POSTGRES:  # matches any of the listed Postgres engine paths
             return 'bigserial'
         return super(BigAutoField, self).db_type(connection)
 
+    def get_internal_type(self):
+        return "BigIntegerField"  # reported internal type; PostgresShardForeignKey.db_type checks for this exact string
+
 
 class ShardedIDFieldMixin(object):
     """
@@ -156,3 +166,64 @@ class ShardForeignKeyStorageField(ShardForeignKeyStorageFieldMixin, ForeignKey):
     the shard using a pre_save signal.
     """
     pass
+
+
+class PostgresShardGeneratedIDField(AutoField):
+    """
+    An AutoField whose Postgres column is ``bigint DEFAULT next_sharded_id()``, so the ID is generated on the database.
+    """
+    def db_type(self, connection, *args, **kwargs):
+        # Column type for `connection`; the extra *args/**kwargs are accepted but not used.
+        if not hasattr(settings, 'SHARD_EPOCH'):
+            raise ValueError("PostgresShardGeneratedIDField requires a SHARD_EPOCH to be defined in your settings file.")
+
+        if connection.vendor == PostgresDatabaseWrapper.vendor:
+            return "bigint DEFAULT next_sharded_id()"  # the stored function supplies the value at insert time
+        else:
+            return super(PostgresShardGeneratedIDField, self).db_type(connection)
+
+    def get_internal_type(self):
+        return 'BigIntegerField'  # PostgresShardForeignKey / PostgresShardOneToOne look for this string
+
+    def rel_db_type(self, connection):
+        return BigIntegerField().db_type(connection=connection)  # plain big-integer type, without the DEFAULT clause
+
+    @staticmethod
+    def migration_receiver(*args, **kwargs):  # signal receiver; reads the database alias from kwargs['using']
+        sequence_name = "global_id_sequence"  # sequence that next_sharded_id() reads via nextval()
+        db_alias = kwargs.get('using')
+        if not db_alias:
+            raise EnvironmentError("A pre-migration receiver did not receive a database alias. "
+                                   "Perhaps your app is not registered correctly?")
+        if settings.DATABASES[db_alias]['ENGINE'] in Backends.POSTGRES:  # non-Postgres databases are left untouched
+            shard_id = settings.DATABASES[db_alias].get('SHARD_ID', 0)  # falls back to 0 when no SHARD_ID is configured
+            create_postgres_global_sequence(sequence_name, db_alias, True)  # True => an existing sequence is reset to 1
+            create_postgres_shard_id_function(sequence_name, db_alias, shard_id)
+
+
+class PostgresShardForeignKey(ForeignKey):
+    """A ForeignKey that uses a plain big-integer column when its target is a big-integer field."""
+    def db_type(self, connection):
+        """Return the database column type of this key for ``connection``."""
+        # A target whose get_internal_type() is "BigIntegerField" (for example
+        # PostgresShardGeneratedIDField) gets BigIntegerField's column type, so the
+        # key column does not copy the target's own db_type string.
+        # Compare with ==, not `is`: string identity depends on interning.
+        rel_field = self.target_field
+        if rel_field.get_internal_type() == "BigIntegerField":
+            return BigIntegerField().db_type(connection=connection)
+        return super(PostgresShardForeignKey, self).db_type(connection)
+
+
+class PostgresShardOneToOne(OneToOneField):
+    """A OneToOneField that uses a plain big-integer column when its target is a big-integer field."""
+    def db_type(self, connection):
+        """Return the database column type of this key for ``connection``."""
+        # A target whose get_internal_type() is "BigIntegerField" (for example
+        # PostgresShardGeneratedIDField) gets BigIntegerField's column type, so the
+        # key column does not copy the target's own db_type string.
+        # Compare with ==, not `is`: string identity depends on interning.
+        rel_field = self.target_field
+        if rel_field.get_internal_type() == "BigIntegerField":
+            return BigIntegerField().db_type(connection=connection)
+        return super(PostgresShardOneToOne, self).db_type(connection)
diff --git a/django_sharding_library/id_generation_strategies.py b/django_sharding_library/id_generation_strategies.py
@@ -36,7 +36,7 @@ def get_next_id(self, database=None):
         """
         from django.conf import settings
         backing_table_db = getattr(self.backing_model, 'database', 'default')
-        if settings.DATABASES[backing_table_db]['ENGINE'] == Backends.MYSQL:
+        if settings.DATABASES[backing_table_db]['ENGINE'] in Backends.MYSQL:
             with transaction.atomic(backing_table_db):
                 cursor = connections[backing_table_db].cursor()
                 sql = "REPLACE INTO `{0}` (`stub`) VALUES ({1})".format(

diff --git a/django_sharding_library/management/commands/migrate.py b/django_sharding_library/management/commands/migrate.py
@@ -17,7 +17,7 @@ def handle(self, *args, **options):
             options['database'] = database
             # Writen in green text to stand out from the surrouding headings
             if options['verbosity'] >= 1:
-                self.stdout.write(self.style.MIGRATE_SUCCESS("\nDatabase: {}\n").format(database))
+                self.stdout.write(getattr(self.style, "MIGRATE_SUCCESS", getattr(self.style, "SUCCESS"))("\nDatabase: {}\n").format(database))
             super(Command, self).handle(*args, **options)
 
     def get_all_but_replica_dbs(self):

diff --git a/django_sharding_library/settings_helpers.py b/django_sharding_library/settings_helpers.py
@@ -70,8 +70,9 @@ def database_configs(databases_dict):
     }
     """
     configuration = {}
+    shard_id_hash = {}  # Keep track of the IDs of the shards currently. Used to help with migrations.
     for (databases, is_sharded) in [(databases_dict.get('unsharded_databases', []), False), (databases_dict.get('sharded_databases', []), True)]:
-        for database in databases:
+        for idx, database in enumerate(databases):
             db_config = database_config(
                 database['environment_variable'],
                 database['default_database_url'],
@@ -89,4 +90,13 @@ def database_configs(databases_dict):
                 )
                 if db_config:
                     configuration[replica['name']] = db_config
+
+            # We assume the numeric shard ID is constant based on the entries in the configuration helper (we assume
+            # they wont change order, and that new shards will be appended and not inserted randomly)
+            # This is noted in the docs, leaving this comment for whomever may work on this in the future.
+            if is_sharded:
+                shard_id = shard_id_hash.get(configuration[database['name']]['SHARD_GROUP'], 0)
+                configuration[database['name']]['SHARD_ID'] = shard_id
+                shard_id_hash[configuration[database['name']]['SHARD_GROUP']] = shard_id + 1
+
     return configuration
diff --git a/django_sharding_library/sql.py b/django_sharding_library/sql.py
@@ -0,0 +1,16 @@
+postgres_shard_id_function_sql = """CREATE OR REPLACE FUNCTION next_sharded_id(OUT result bigint) AS $$
+DECLARE
+    start_epoch bigint := %(shard_epoch)d;
+    seq_id bigint;
+    now_millis bigint;
+    shard_id int := %(shard_id)d;
+BEGIN
+    -- there is a typo here in the online example, which is corrected here
+    SELECT nextval('%(sequence_name)s') %% 1024 INTO seq_id;
+
+    SELECT FLOOR(EXTRACT(EPOCH FROM clock_timestamp()) * 1000) INTO now_millis;
+    result := (now_millis - start_epoch) << 23;
+    result := result | (shard_id << 10);
+    result := result | (seq_id);
+END;
+$$ LANGUAGE PLPGSQL;"""
diff --git a/django_sharding_library/utils.py b/django_sharding_library/utils.py
@@ -0,0 +1,32 @@
+from django.db import connections, DatabaseError, transaction
+from django.conf import settings
+from django_sharding_library.sql import postgres_shard_id_function_sql
+from django.db.models import signals
+
+
+def create_postgres_global_sequence(sequence_name, db_alias, reset_sequence=False):
+    """Create ``sequence_name`` on ``db_alias``; if creation fails, optionally reset the sequence to 1."""
+    with connections[db_alias].cursor() as cursor:  # context manager closes the cursor even when a statement raises
+        sid = transaction.savepoint(db_alias)  # lets a failed CREATE be undone without aborting the outer transaction
+        try:
+            cursor.execute("CREATE SEQUENCE %s;" % sequence_name)
+        except DatabaseError:  # presumably "sequence already exists" -- any DatabaseError takes this path
+            transaction.savepoint_rollback(sid, using=db_alias)
+            if reset_sequence:
+                cursor.execute("SELECT setval('%s', 1, false)" % (sequence_name,))
+        else:
+            transaction.savepoint_commit(sid, using=db_alias)
+
+
+def create_postgres_shard_id_function(sequence_name, db_alias, shard_id):
+    """Run the next_sharded_id() SQL template on ``db_alias``, filled in with the epoch, shard ID and sequence."""
+    with connections[db_alias].cursor() as cursor:  # context manager closes the cursor even if execute raises
+        cursor.execute(postgres_shard_id_function_sql % {'shard_epoch': settings.SHARD_EPOCH,
+                                                         'shard_id': shard_id,
+                                                         'sequence_name': sequence_name})
+
+
+def register_migration_signal_for_model_receiver(model, function, dispatch_uid=None):  # connects `function` to the pre_migrate signal
+    signals.pre_migrate.connect(function, sender=model, dispatch_uid=dispatch_uid)  # `model` is used as the signal sender; the caller in decorators.py passes an AppConfig
+
+
diff --git a/docs/components/IDGeneration.md b/docs/components/IDGeneration.md
@@ -2,7 +2,7 @@
 
 In order to shard your database, one of the first decisions to makee is how you assign identifiers to the sharded objects. While it is not required, it is highly recommended that you choose a unique identifier. The main reason here being that you may want to either move data across shards later or that you may choose to analyze data across various shards for analytics and you will have to differentiate those objects before moving them to another server.
 
-This repository is initially shipping with two strategies but you may impliment your own. The base requirement at the moment is that you define a class like this:
+This repository is initially shipping with three strategies but you may implement your own. The base requirement for defining your own strategy at the moment is that you define a class like this:
 
 ```python
 class BaseIDGenerationStrategy(object):
@@ -22,6 +22,7 @@ The two included in the package are:
 
 1. Use an autoincrement field to mimic the way a default table handles the operation
 2. Assign each item a UUID with the shard name appended to the end.
+3. A postgres-specific field that works similarly to Django's auto field, but in a shard safe way (only works for Postgres, don't try it with anything else!)
 
 ##### The Autoincrement Method
 
@@ -33,6 +34,10 @@ Note: The MySQL implementation uses a single row to accomplish this task while P
 
 While the odds of a UUID collision are very low, it is still possible and so we append the database shard name as a way to guarantee that they remain unique. The only drawback to this method is that the items cannot be moved across shards. However, it is the recommendation of the author that you refrain from shard rebalancing and instead focus on maintaining lots of shards rather than worry about balancing few large ones.
 
+##### The PostgresShardGeneratedIDField Method
+
+This strategy is an automated implementation of how Instagram does shard IDs. It uses built-in Postgres functionality to generate a shard-safe ID on the database server at the time of the insert. A stored procedure is created and uses a user-defined epoch time and a shard ID to make sure the IDs it generates are unique. This method (currently) supports up to 8192 shards (shard IDs 0 through 8191) and up to 1024 inserts per millisecond, which should be more than enough for most use cases, up to and including Instagram scale usage!
+
 ##### Pinterest
 
 They recently wrote a [lovely article](https://engineering.pinterest.com/blog/sharding-pinterest-how-we-scaled-our-mysql-fleet) about their sharding strategy. They use a 64 bit ID that works like so:

diff --git a/docs/usage/Migrations.md b/docs/usage/Migrations.md
@@ -65,3 +65,14 @@ class Command(MigrationCommand):
 ```
 
 By using the included router, it's as simple as calling migrate on all the primary databases in the system and allowing the system to decide which databases to run the migration on. The above changes were made to make the interface more simple than having to specify all the relevant databases.
+
+### PostgresShardGeneratedIDField Migration Info
+
+This library hooks into the Django migrations and creates (or updates) the necessary stored procedures before every migration. We made it work this way for two reasons:
+
+1. Django does not have a good way to force a field-specific migration dependency without having to edit the migration files themselves after they are generated
+2. This allows unit tests to be run on any arbitrary (PostgreSQL) database without any administrative overhead.
+
+The migration hooks should not affect you in any way, but you should be aware that there is a little bit of "magic" going on to make this field work with Django's migrations, without actually being part of the migration file itself.
+
+If the Django team ever makes migrations easier to customize by adding dependency injection based on specific fields, we will update this and add the migration step to your migration files when they are generated!
diff --git a/docs/usage/ShardingAModel.md b/docs/usage/ShardingAModel.md
@@ -111,3 +111,26 @@ CoolGuyShardedModel.objects.filter(user_pk=123, some_field='some_value')
 ```
 
 Once you've defined your model, we can move onto how to run migrations.
+
+### Using the PostgresShardGeneratedIDField
+
+If you would like to use the PostgresShardGeneratedIDField, there are a few subtle differences and caveats that you need to be aware of.
+
+1. If you define a PostgresShardGeneratedIDField, you should not use another shard ID generation strategy with that model. Additionally, the field should be marked as the primary key. An example of a model with a PostgresShardGeneratedIDField:
+```python
+@model_config(shard_group='default')
+class CoolGuyShardedModel(models.Model):
+    id = PostgresShardGeneratedIDField(primary_key=True)
+    cool_guy_string = models.CharField(max_length=120)
+    user_pk = models.PositiveIntegerField()
+```
+2. You must define a "SHARD_EPOCH" variable in your Django settings file. This can be any epoch start time you want, but once chosen, should NEVER be changed. Here is an example of what it should look like (which will make your shard epoch Jan 1, 2016):
+```python
+import time
+from datetime import datetime
+# other settings go here...
+SHARD_EPOCH=int(time.mktime(datetime(2016, 1, 1).timetuple()) * 1000)
+```
+
+3. When you are editing your DATABASES settings, the order of the shards MUST be maintained. If you add a new shard, it needs to be added to the end of the list of databases, not to the beginning or middle.
+4. There is a maximum number of logical shards supported by this field. You can only have up to 8192 logical shards (shard IDs 0 through 8191): if you try to go beyond, you will get duplicate IDs between your shards. Do not try to add more than 8192 shards. If you need more than that, I recommend you choose one of the other ID generation strategies.
diff --git a/runtests.py b/runtests.py
@@ -1,5 +1,7 @@
 import os
 import sys
+from datetime import datetime
+import time
 
 try:
     import django
@@ -64,6 +66,7 @@
     ],
     SITE_ID=1,
     MIDDLEWARE_CLASSES=(),
+    SHARD_EPOCH=int(time.mktime(datetime(2016, 1, 1).timetuple()) * 1000),
 )
 django.setup()
 

diff --git a/tests/migrations/0001_initial.py b/tests/migrations/0001_initial.py
@@ -107,4 +107,12 @@ class Migration(migrations.Migration):
             name='test',
             field=models.ForeignKey(to='tests.UnshardedTestModel'),
         ),
+        migrations.CreateModel(
+            name='PostgresCustomIDModel',
+            fields=[
+                ('id', django_sharding_library.fields.PostgresShardGeneratedIDField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
+                ('random_string', models.CharField(max_length=120)),
+                ('user_pk', models.PositiveIntegerField()),
+            ],
+        ),
     ]