adding distributed_by support from annotation in ddlog

HazyResearch · Dec 12, 2015 · 90ccb78 · 90ccb78
1 parent 959282f
commit 90ccb78
Showing 1 changed file with 52 additions and 1 deletion.
diff --git a/database/db-driver/greenplum/schema_json_to_sql b/database/db-driver/greenplum/schema_json_to_sql
diff --git a/database/db-driver/greenplum/schema_json_to_sql b/database/db-driver/greenplum/schema_json_to_sql
@@ -0,0 +1,52 @@
+#! /usr/bin/env python
+# Generate create table statement given a ddlog exported schema and a table name.
+# Usage: ddlog_initdb SCHEMA.JSON TABLE_NAME
+
+#This is a variation of the postgresql driver's version.
+#It generates the extra sql component "DISTRIBUTED BY (column_name(s))" when the annotation @distributed_by is added below one or several columns in a table declaration in ddlog
+
+import json, sys
+
+def generate_create_table_sql(schema, table):
+  columns_json = schema["relations"][table]["columns"]
+  # variable relation
+  if "variable_type" in schema["relations"][table]:
+    columns = range(len(columns_json) + 2)
+    columns[-2] = "id bigint"
+    label_type = "boolean" if schema["relations"][table]["variable_type"] == "boolean" else "int"
+    columns[-1] = "label " + label_type
+  else:
+    columns = range(len(columns_json))
+  distributed_by_array=[]
+  for k, v in columns_json.iteritems():
+    columns[v["index"]] = "%s %s" %(k, v["type"])
+    if "annotations" in v.keys():
+      for sub_dict_annotations in v["annotations"]:
+        if sub_dict_annotations["name"] == "distributed_by":
+          distributed_by_array.append(k)
+  if len(distributed_by_array) == 0:
+    return "DROP TABLE IF EXISTS %s CASCADE; CREATE TABLE %s(%s);" %(table, table, ", ".join(columns))
+  else:
+    return "DROP TABLE IF EXISTS %s CASCADE; CREATE TABLE %s(%s) DISTRIBUTED BY (%s);" %(table, table, ", ".join(columns), ", ".join(distributed_by_array))
+
+
+def main():
+  # load schema.json
+  with open(sys.argv[1]) as schema_file:
+    schema = json.load(schema_file)
+  if "relations" not in schema:
+    # nothing to do
+    sys.exit(0)
+  # initialize all tables
+  if len(sys.argv) <= 2:
+    print ' '.join([generate_create_table_sql(schema, table) for table in (schema["relations"].keys() if "relations" in schema else [])])
+  else:
+    table = sys.argv[2]
+    # the given table is not in the schema, exit with error
+    if table in schema["relations"]:
+      print generate_create_table_sql(schema, table)
+    else:
+      sys.exit(1)
+
+if __name__ == "__main__":
+  main()