In [1]:
import json
import os
import sys

from argparse import ArgumentParser
from collections import OrderedDict
from contextlib import contextmanager
from operator import itemgetter
from time import time

from pyspark import broadcast
from pyspark.sql import Row, SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
LABEL_COL = 0
INT_COLS = list(range(1, 14))
CAT_COLS = list(range(14, 40))

In [3]:
def get_column_counts_with_frequency_limit(df, frequency_limit = None):
    cols = ['_c%d' % i for i in CAT_COLS]
    df = (df
        .select(posexplode(array(*cols)))
        .withColumnRenamed('pos', 'column_id')
        .withColumnRenamed('col', 'data')
        .filter('data is not null')
        .groupBy('column_id', 'data')
        .count())

    if frequency_limit:
        frequency_limit = frequency_limit.split(",")
        exclude = []
        default_limit = None
        for fl in frequency_limit:
            frequency_pair = fl.split(":")
            if len(frequency_pair) == 1:
                default_limit = int(frequency_pair[0])
            elif len(frequency_pair) == 2:
                df = df.filter((col('column_id') != int(frequency_pair[0]) - CAT_COLS[0]) | (col('count') >= int(frequency_pair[1])))
                exclude.append(int(frequency_pair[0]))
        if default_limit:
            remain = [x - CAT_COLS[0] for x in CAT_COLS if x not in exclude]
            df = df.filter((~col('column_id').isin(remain)) | (col('count') >= default_limit))
            # for comparing isin and separate filter
            # for i in remain:
            #     df = df.filter((col('column_id') != i - CAT_COLS[0]) | (col('count') >= default_limit))
    return df


def assign_id_with_window(df):
    windowed = Window.partitionBy('column_id').orderBy(desc('count'))
    return (df
            .withColumn('id', row_number().over(windowed))
            .withColumnRenamed('count', 'model_count'))


def assign_low_mem_partial_ids(df):
    # To avoid some scaling issues with a simple window operation, we use a more complex method
    # to compute the same thing, but in a more distributed spark specific way
    df = df.orderBy(asc('column_id'), desc('count'))
    # The monotonically_increasing_id is the partition id in the top 31 bits and the rest
    # is an increasing count of the rows within that partition.  So we split it into two parts,
    # the partion id part_id and the count mono_id
    df = df.withColumn('part_id', spark_partition_id())
    return df.withColumn('mono_id', monotonically_increasing_id() - shiftLeft(col('part_id'), 33))


def assign_low_mem_final_ids(df):
    # Now we can find the minimum and maximum mono_ids within a given column/partition pair
    sub_model = df.groupBy('column_id', 'part_id').agg(max('mono_id').alias('top'), min('mono_id').alias('bottom'))
    sub_model = sub_model.withColumn('diff', col('top') - col('bottom') + 1)
    sub_model = sub_model.drop('top')
    # This window function is over aggregated column/partition pair table. It will do a running sum of the rows
    # within that column
    windowed = Window.partitionBy('column_id').orderBy('part_id').rowsBetween(Window.unboundedPreceding, -1)
    sub_model = sub_model.withColumn('running_sum', sum('diff').over(windowed)).na.fill(0, ["running_sum"])

    joined = df.withColumnRenamed('column_id', 'i_column_id')
    joined = joined.withColumnRenamed('part_id', 'i_part_id')
    joined = joined.withColumnRenamed('count', 'model_count')

    # Then we can join the original input with the pair it is a part of
    joined = joined.join(sub_model, (col('i_column_id') == col('column_id')) & (col('part_id') == col('i_part_id')))

    # So with all that we can subtract bottom from mono_id makeing it start at 0 for each partition
    # and then add in the running_sum so the id is contiguous and unique for the entire column. + 1 to make it match the 1 based indexing
    # for row_number
    ret = joined.select(col('column_id'),
                        col('data'),
                        (col('mono_id') - col('bottom') + col('running_sum') + 1).cast(IntegerType()).alias('id'),
                        col('model_count'))
    return ret


def get_column_models(combined_model):
    for i in CAT_COLS:
        model = (combined_model
            .filter('column_id == %d' % (i - CAT_COLS[0]))
            .drop('column_id'))
        yield i, model


def col_of_rand_long():
    return (rand() * (1 << 52)).cast(LongType())

def skewed_join(df, model, col_name, cutoff):
    # Most versions of spark don't have a good way
    # to deal with a skewed join out of the box.
    # Some do and if you want to replace this with
    # one of those that would be great.
    
    # Because we have statistics about the skewedness
    # that we can used we divide the model up into two parts
    # one part is the highly skewed part and we do a
    # broadcast join for that part, but keep the result in
    # a separate column
    b_model = broadcast(model.filter(col('model_count') >= cutoff)
            .withColumnRenamed('data', col_name)
            .drop('model_count'))
    
    df = (df
            .join(b_model, col_name, how='left')
            .withColumnRenamed('id', 'id_tmp'))
    
    # We also need to spread the skewed data that matched
    # evenly.  We will use a source of randomness for this
    # but use a -1 for anything that still needs to be matched
    if 'ordinal' in df.columns:
        rand_column = col('ordinal')
    else:
        rand_column = col_of_rand_long()

    df = df.withColumn('join_rand',
            # null values are not in the model, they are filtered out
            # but can be a source of skewedness so include them in
            # the even distribution
            when(col('id_tmp').isNotNull() | col(col_name).isNull(), rand_column)
            .otherwise(lit(-1)))
    
    # Null out the string data that already matched to save memory
    df = df.withColumn(col_name,
            when(col('id_tmp').isNotNull(), None)
            .otherwise(col(col_name)))
    
    # Now do the second join, which will be a non broadcast join.
    # Sadly spark is too smart for its own good and will optimize out
    # joining on a column it knows will always be a constant value.
    # So we have to make a convoluted version of assigning a -1 to the
    # randomness column for the model itself to work around that.
    nb_model = (model
            .withColumn('join_rand', when(col('model_count') < cutoff, lit(-1)).otherwise(lit(-2)))
            .filter(col('model_count') < cutoff)
            .withColumnRenamed('data', col_name)
            .drop('model_count'))
    
    df = (df
            .join(nb_model, ['join_rand', col_name], how='left')
            .drop(col_name, 'join_rand')
            # Pick either join result as an answer
            .withColumn(col_name, coalesce(col('id'), col('id_tmp')))
            .drop('id', 'id_tmp'))

    return df


def apply_models(df, models, broadcast_model = False, skew_broadcast_pct = 1.0):
    # sort the models so broadcast joins come first. This is
    # so we reduce the amount of shuffle data sooner than later
    # If we parsed the string hex values to ints early on this would
    # not make a difference.
    models = sorted(models, key=itemgetter(3), reverse=True)
    for i, model, original_rows, would_broadcast in models:
        col_name = '_c%d' % i
        if not (would_broadcast or broadcast_model):
            # The data is highly skewed so we need to offset that
            cutoff = int(original_rows * skew_broadcast_pct/100.0)
            df = skewed_join(df, model, col_name, cutoff)
        else:
            # broadcast joins can handle skewed data so no need to
            # do anything special
            model = (model.drop('model_count')
                          .withColumnRenamed('data', col_name))
            model = broadcast(model) if broadcast_model else model
            df = (df
                .join(model, col_name, how='left')
                .drop(col_name)
                .withColumnRenamed('id', col_name))
    return df.fillna(0, ['_c%d' % i for i in CAT_COLS])


def transform_log(df, transform_log = False):
    cols = ['_c%d' % i for i in INT_COLS]
    if transform_log:
        for col_name in cols:
            df = df.withColumn(col_name, log(df[col_name] + 3))
    return df.fillna(0, cols)


def would_broadcast(spark, str_path):
    sc = spark.sparkContext
    config = sc._jsc.hadoopConfiguration()
    path = sc._jvm.org.apache.hadoop.fs.Path(str_path)
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(config)
    stat = fs.listFiles(path, True)
    sum = 0
    while stat.hasNext():
       sum = sum + stat.next().getLen()
    sql_conf = sc._jvm.org.apache.spark.sql.internal.SQLConf()
    cutoff = sql_conf.autoBroadcastJoinThreshold() * sql_conf.fileCompressionFactor()
    return sum <= cutoff

def delete_data_source(spark, path):
    sc = spark.sparkContext
    config = sc._jsc.hadoopConfiguration()
    path = sc._jvm.org.apache.hadoop.fs.Path(path)
    sc._jvm.org.apache.hadoop.fs.FileSystem.get(config).delete(path, True)


def load_raw(spark, folder, day_range):
    label_fields = [StructField('_c%d' % LABEL_COL, IntegerType())]
    int_fields = [StructField('_c%d' % i, IntegerType()) for i in INT_COLS]
    str_fields = [StructField('_c%d' % i, StringType()) for i in CAT_COLS]

    schema = StructType(label_fields + int_fields + str_fields)
    paths = [os.path.join(folder, 'day_%d' % i) for i in day_range]
    return (spark
        .read
        .schema(schema)
        .option('sep', '\t')
        .csv(paths))

def rand_ordinal(df):
    # create a random long from the double precision float.  
    # The fraction part of a double is 52 bits, so we try to capture as much
    # of that as possible
    return df.withColumn('ordinal', col_of_rand_long())

def day_from_ordinal(df, num_days):
    return df.withColumn('day', (col('ordinal') % num_days).cast(IntegerType()))

def day_from_input_file(df):
    return df.withColumn('day', substring_index(input_file_name(), '_', -1).cast(IntegerType()))

def psudo_sort_by_day_plus(spark, df, num_days):
    # Sort is very expensive because it needs to calculate the partitions
    # which in our case may involve rereading all of the data.  In some cases
    # we can avoid this by repartitioning the data and sorting within a single partition
    shuffle_parts = int(spark.conf.get('spark.sql.shuffle.partitions'))
    extra_parts = int(shuffle_parts/num_days)
    if extra_parts <= 0:
        df = df.repartition('day')
    else:
        #We want to spread out the computation to about the same amount as shuffle_parts
        divided = (col('ordinal') / num_days).cast(LongType())
        extra_ident = divided % extra_parts
        df = df.repartition(col('day'), extra_ident)
    return df.sortWithinPartitions('day', 'ordinal')


def load_combined_model(spark, model_folder):
    path = os.path.join(model_folder, 'combined.parquet')
    return spark.read.parquet(path)


def save_combined_model(df, model_folder, mode=None):
    path = os.path.join(model_folder, 'combined.parquet')
    df.write.parquet(path, mode=mode)


def delete_combined_model(spark, model_folder):
    path = os.path.join(model_folder, 'combined.parquet')
    delete_data_source(spark, path)


def load_low_mem_partial_ids(spark, model_folder):
    path = os.path.join(model_folder, 'partial_ids.parquet')
    return spark.read.parquet(path)


def save_low_mem_partial_ids(df, model_folder, mode=None):
    path = os.path.join(model_folder, 'partial_ids.parquet')
    df.write.parquet(path, mode=mode)


def delete_low_mem_partial_ids(spark, model_folder):
    path = os.path.join(model_folder, 'partial_ids.parquet')
    delete_data_source(spark, path)


def load_column_models(spark, model_folder, count_required):
    for i in CAT_COLS:
        path = os.path.join(model_folder, '%d.parquet' % i)
        df = spark.read.parquet(path)
        if count_required:
            values = df.agg(sum('model_count').alias('sum'), count('*').alias('size')).collect()
        else:
            values = df.agg(sum('model_count').alias('sum')).collect()
        yield i, df, values[0], would_broadcast(spark, path)

def save_column_models(column_models, model_folder, mode=None):
    for i, model in column_models:
        path = os.path.join(model_folder, '%d.parquet' % i)
        model.write.parquet(path, mode=mode)


def save_model_size(model_size, path, write_mode):
    if os.path.exists(path) and write_mode == 'errorifexists':
        print('Error: model size file %s exists' % path)
        sys.exit(1)

    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
    with open(path, 'w') as fp:
        json.dump(model_size, fp, indent=4)


_benchmark = {}


@contextmanager
def _timed(step):
    start = time()
    yield
    end = time()
    _benchmark[step] = end - start


In [4]:
def _parse_args(input_args):
    parser = ArgumentParser()

    parser.add_argument(
        '--mode',
        required=True,
        choices=['generate_models', 'transform'])

    parser.add_argument('--days', required=True)
    parser.add_argument('--input_folder', required=True)
    parser.add_argument('--output_folder')
    parser.add_argument('--model_size_file')
    parser.add_argument('--model_folder', required=True)
    parser.add_argument(
        '--write_mode',
        choices=['overwrite', 'errorifexists'],
        default='errorifexists')

    parser.add_argument('--frequency_limit')
    parser.add_argument('--no_numeric_log_col', action='store_true')
    #Support for running in a lower memory environment
    parser.add_argument('--low_mem', action='store_true')
    parser.add_argument(
        '--output_ordering',
        choices=['total_random', 'day_random', 'any', 'input'],
        default='total_random')

    parser.add_argument(
        '--output_partitioning',
        choices=['day', 'none'],
        default='none')

    parser.add_argument('--dict_build_shuffle_parallel_per_day', type=int, default=2)
    parser.add_argument('--apply_shuffle_parallel_per_day', type=int, default=25)
    parser.add_argument('--skew_broadcast_pct', type=float, default=1.0)

    parser.add_argument('--debug_mode', action='store_true')
    
    # Parameters are defined here
    args = parser.parse_args(args=input_args)

    start, end = args.days.split('-')
    args.day_range = list(range(int(start), int(end) + 1))
    args.days = len(args.day_range)

    return args

In [5]:
def _main(input_args):
    args = _parse_args(input_args)
    spark = SparkSession.builder.getOrCreate()

    df = load_raw(spark, args.input_folder, args.day_range)

    if args.mode == 'generate_models':
        spark.conf.set('spark.sql.shuffle.partitions', args.days * args.dict_build_shuffle_parallel_per_day)
        with _timed('generate models'):
            col_counts = get_column_counts_with_frequency_limit(df, args.frequency_limit)
            if args.low_mem:
                # in low memory mode we have to save an intermediate result
                # because if we try to do it in one query spark ends up assigning the
                # partial ids in two different locations that are not guaranteed to line up
                # this prevents that from happening by assigning the partial ids
                # and then writeing them out.
                save_low_mem_partial_ids(
                        assign_low_mem_partial_ids(col_counts),
                        args.model_folder,
                        args.write_mode)
                save_combined_model(
                        assign_low_mem_final_ids(load_low_mem_partial_ids(spark, args.model_folder)),
                        args.model_folder,
                        args.write_mode)
                if not args.debug_mode:
                    delete_low_mem_partial_ids(spark, args.model_folder)

            else:
                save_combined_model(
                        assign_id_with_window(col_counts),
                        args.model_folder,
                        args.write_mode)
            save_column_models(
                get_column_models(load_combined_model(spark, args.model_folder)),
                args.model_folder,
                args.write_mode)
            if not args.debug_mode:
                delete_combined_model(spark, args.model_folder)

    if args.mode == 'transform':
        spark.conf.set('spark.sql.shuffle.partitions', args.days * args.apply_shuffle_parallel_per_day)
        with _timed('transform'):
            if args.output_ordering == 'total_random':
                df = rand_ordinal(df)
                if args.output_partitioning == 'day':
                    df = day_from_ordinal(df, args.days)
            elif args.output_ordering == 'day_random':
                df = rand_ordinal(df)
                df = day_from_input_file(df)
            elif args.output_ordering == 'input':
                df = df.withColumn('ordinal', monotonically_increasing_id())
                if args.output_partitioning == 'day':
                    df = day_from_input_file(df)
            else: # any ordering
                if args.output_partitioning == 'day':
                    df = day_from_input_file(df)

            models = list(load_column_models(spark, args.model_folder, bool(args.model_size_file)))
            if args.model_size_file:
                save_model_size(
                    OrderedDict(('_c%d' % i, agg.size) for i, _, agg, _ in models),
                    args.model_size_file,
                    args.write_mode)
            models = [(i, df, agg.sum, flag) for i, df, agg, flag in models]

            df = apply_models(
                df,
                models,
                not args.low_mem,
                args.skew_broadcast_pct)
            df = transform_log(df, not args.no_numeric_log_col)


            if args.output_partitioning == 'day':
                partitionBy = 'day'
            else:
                partitionBy = None

            if args.output_ordering == 'total_random':
                if args.output_partitioning == 'day':
                    df = psudo_sort_by_day_plus(spark, df, args.days)
                else: # none
                    # Don't do a full sort it is expensive. Order is random so
                    # just make it random
                    df = df.repartition('ordinal').sortWithinPartitions('ordinal')

                df = df.drop('ordinal')
            elif args.output_ordering == 'day_random':
                df = psudo_sort_by_day_plus(spark, df, args.days)
                df = df.drop('ordinal')
                if args.output_partitioning != 'day':
                    df = df.drop('day')
            elif args.output_ordering == 'input':
                if args.low_mem:
                    # This is the slowest option. We totally messed up the order so we have to put
                    # it back in the correct order
                    df = df.orderBy('ordinal')
                else:
                    # Applying the dictionary happened within a single task so we are already really
                    # close to the correct order, just need to sort within the partition
                    df = df.sortWithinPartitions('ordinal')
                df = df.drop('ordinal')
                if args.output_partitioning != 'day':
                    df = df.drop('day')
            # else: any ordering so do nothing the ordering does not matter

            df.write.parquet(
                args.output_folder,
                mode=args.write_mode,
                partitionBy=partitionBy)

    print('=' * 100)
    print(_benchmark)

In [6]:
_main(['--mode', 'generate_models',
        '--frequency_limit', '15',
        '--input_folder', '/data',
        '--days', '4-4',
        '--model_folder', '/models',
        '--write_mode', 'overwrite',
        '--low_mem'])

21/12/22 12:48:56 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
21/12/22 12:50:23 WARN GpuOverrides:                                            
              !Exec <WindowExec> cannot run on GPU because not all expressions can be replaced
                @Expression <Alias> sum(diff#143L) windowspecdefinition(column_id#120, part_id#123 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), -1)) AS running_sum#154L could run on GPU
                  !Expression <WindowExpression> sum(diff#143L) windowspecdefinition(column_id#120, part_id#123 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), -1)) cannot run on GPU because upper-bounds behind the current row is not supported. Found -1
                    @Expression <AggregateExpression> sum(diff#143L) could run on GPU
                      @Expression <Sum> sum(diff#143L) could run on G

{'generate models': 94.8656804561615}


In [7]:
_main(['--mode', 'transform',
        '--input_folder', '/data',
        '--days', '4-4',
       '--output_folder', '/data/train',
       '--model_size_file', '/data/model_size.json',
        '--model_folder', '/models',
        '--write_mode', 'overwrite',
        '--low_mem'])

21/12/22 12:50:34 WARN GpuOverrides: 
                                                                                                                !Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced
                                                                                                                  @Expression <AttributeReference> _c0#390 could run on GPU
                                                                                                                  @Expression <AttributeReference> _c1#391 could run on GPU
                                                                                                                  @Expression <AttributeReference> _c2#392 could run on GPU
                                                                                                                  @Expression <AttributeReference> _c3#393 could run on GPU
                                                                                



{'generate models': 94.8656804561615, 'transform': 276.87314462661743}


                                                                                

In [8]:
import pandas as pd
import json
with open('/data/model_size.json', 'r') as f:
    data = json.load(f)
    rename_df = pd.DataFrame(data, index=[0])
    print(rename_df)
    rename_df.to_csv("/data/dimensions.csv")

     _c14   _c15   _c16  _c17   _c18  _c19  _c20  _c21  _c22    _c23  ...  \
0  327597  21882  14396  7063  19090     3  6484  1261    49  296349  ...   

   _c30  _c31  _c32    _c33    _c34    _c35   _c36  _c37  _c38  _c39  
0     3   938    14  327656  217544  319293  85130  9545    72    33  

[1 rows x 26 columns]


# training

In [9]:
import argparse
import math
import pprint
import sys
# This needs to happen first to avoid pyarrow serialization errors.
from pyspark.sql import SparkSession

# Make sure pyarrow is referenced before anything else to avoid segfault due to conflict
# with TensorFlow libraries.  Use `pa` package reference to ensure it's loaded before
# functions like `deserialize_model` which are implemented at the top level.
# See https://jira.apache.org/jira/browse/ARROW-3346
import pyarrow as pa

import horovod
import horovod.tensorflow.keras as hvd
import tensorflow as tf
from horovod.spark.common.backend import SparkBackend
from tensorflow.keras.layers import BatchNormalization, Input, Embedding, Concatenate, Dense, Flatten
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding

In [10]:
PETASTORM_DATALOADER = 'petastorm'
NVTABULAR_DATALOADER = 'nvtabular'

CONTINUOUS_COLUMNS = [f'_c{i}' for i in range(1,14)]
CATEGORICAL_COLUMNS = [f'_c{c}' for c in range(14,40)]
ALL_COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS
LABEL_COLUMNS = ['_c0']

In [11]:
def get_category_dimensions(spark, data_dir):
    df = spark.read.csv(f'{data_dir}/dimensions.csv', header=True).toPandas()
    dimensions = df.to_dict('records')[0]
    pprint.pprint(dimensions)
    return dimensions

In [12]:
def build_model(dimensions, args):
    
    inputs = {
        **{i: Input(shape=(1,), name=i, dtype=tf.float32) for i in CONTINUOUS_COLUMNS},
        **{c: Input(shape=(1,), name=c, dtype=tf.int32) for c in CATEGORICAL_COLUMNS}
    }

    one_hots = []
    embeddings = []
    for c in CATEGORICAL_COLUMNS:
        dimension = int(dimensions[c]) + 1
        # dimension <= 128, smaller size for demo
        if dimension <= 4:
            one_hots.append(CategoryEncoding(num_tokens=dimension, name=f'one_hot_{c}')(inputs[c]))
        else:
            # embedding_size = int(math.floor(0.6 * dimension ** 0.25)), smaller model size for demo
            embedding_size = 2
            embeddings.append(Embedding(input_dim=dimension,
                                        output_dim=embedding_size,
                                        input_length=1,
                                        name=f'embedding_{c}')(inputs[c]))

    x = Concatenate(name='embeddings_concat')(embeddings)
    x = Flatten(name='embeddings_flatten')(x)
    x = Concatenate(name='inputs_concat')([x] + one_hots + [inputs[i] for i in CONTINUOUS_COLUMNS])
    x = BatchNormalization()(x)
#     x = Flatten(input_shape=(39,)
    x = Dense(64, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dense(64, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dense(64, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dense(32, activation='relu')(x)
    output = Dense(1, activation='sigmoid', name='output')(x)
    model = tf.keras.Model(inputs=[inputs[c] for c in ALL_COLUMNS], outputs=output)
    if hvd.rank() == 0:
        model.summary()

    opt = tf.keras.optimizers.Adam(learning_rate=args.learning_rate)
    opt = hvd.DistributedOptimizer(opt)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

    return model

In [13]:
def train_fn(dimensions, train_rows, val_rows, args):
    # Make sure pyarrow is referenced before anything else to avoid segfault due to conflict
    # with TensorFlow libraries.  Use `pa` package reference to ensure it's loaded before
    # functions like `deserialize_model` which are implemented at the top level.
    # See https://jira.apache.org/jira/browse/ARROW-3346
    pa

    import atexit
    import horovod.tensorflow.keras as hvd
    from horovod.spark.task import get_available_devices
    import os
    import tempfile
    import tensorflow as tf
    import tensorflow.keras.backend as K
    import shutil

    gpus = get_available_devices()
    if gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = gpus[0]
    if args.dataloader == NVTABULAR_DATALOADER:
        os.environ['TF_MEMORY_ALLOCATION'] = '0.55'
        from nvtabular.loader.tensorflow import KerasSequenceLoader

    # Horovod: initialize Horovod inside the trainer.
    hvd.init()

    # Horovod: restore from checkpoint, use hvd.load_model under the hood.
    model = build_model(dimensions, args)

    # Horovod: adjust learning rate based on number of processes.
    scaled_lr = K.get_value(model.optimizer.lr) * hvd.size()
    K.set_value(model.optimizer.lr, scaled_lr)

    # Horovod: print summary logs on the first worker.
    verbose = 1 if hvd.rank() == 0 else 0

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(root_rank=0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=5, verbose=verbose),

        # Reduce LR if the metric is not improved for 10 epochs, and stop training
        # if it has not improved for 20 epochs.
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_auc', patience=10, verbose=verbose),
        tf.keras.callbacks.EarlyStopping(monitor='val_auc', mode='min', patience=20, verbose=verbose),
        tf.keras.callbacks.TerminateOnNaN(),

        # Log Tensorboard events.
        tf.keras.callbacks.TensorBoard(log_dir=args.logs_dir, write_steps_per_second=True, update_freq=10)
    ]

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        ckpt_dir = tempfile.mkdtemp()
        ckpt_file = os.path.join(ckpt_dir, 'checkpoint.h5')
        atexit.register(lambda: shutil.rmtree(ckpt_dir))
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(
            ckpt_file, monitor='val_auc', mode='min', save_best_only=True))

    if args.dataloader == PETASTORM_DATALOADER:
        from petastorm import make_batch_reader
        from petastorm.tf_utils import make_petastorm_dataset

        # Make Petastorm readers.
        with make_batch_reader(f'{args.data_dir}/train',
                               num_epochs=None,
                               cur_shard=hvd.rank(),
                               shard_count=hvd.size(),
                               hdfs_driver='libhdfs') as train_reader:
            with make_batch_reader(f'{args.data_dir}/val',
                                   num_epochs=None,
                                   cur_shard=hvd.rank(),
                                   shard_count=hvd.size(),
                                   hdfs_driver='libhdfs') as val_reader:
                # Convert readers to tf.data.Dataset.
                train_ds = make_petastorm_dataset(train_reader) \
                    .unbatch() \
                    .shuffle(10 * args.batch_size) \
                    .batch(args.batch_size) \
                    .map(lambda x: (tuple(getattr(x, c) for c in ALL_COLUMNS), x.clicked))

                val_ds = make_petastorm_dataset(val_reader) \
                    .unbatch() \
                    .batch(args.batch_size) \
                    .map(lambda x: (tuple(getattr(x, c) for c in ALL_COLUMNS), x.clicked))

                history = model.fit(train_ds,
                                    validation_data=val_ds,
                                    steps_per_epoch=int(train_rows / args.batch_size / hvd.size()),
                                    validation_steps=int(val_rows / args.batch_size / hvd.size()),
                                    callbacks=callbacks,
                                    verbose=verbose,
                                    epochs=args.epochs)

    else:
        import cupy

        def seed_fn():
            """
            Generate consistent dataloader shuffle seeds across workers
            Reseeds each worker's dataloader each epoch to get fresh a shuffle
            that's consistent across workers.
            """
            min_int, max_int = tf.int32.limits
            max_rand = max_int // hvd.size()
            # Generate a seed fragment on each worker
            seed_fragment = cupy.random.randint(0, max_rand).get()
            # Aggregate seed fragments from all Horovod workers
            seed_tensor = tf.constant(seed_fragment)
            reduced_seed = hvd.allreduce(seed_tensor, name="shuffle_seed", op=hvd.Sum)
            return reduced_seed % max_rand

        train_ds = KerasSequenceLoader(
            f'{args.data_dir}/train',
            batch_size=args.batch_size,
            label_names=LABEL_COLUMNS,
            cat_names=CATEGORICAL_COLUMNS,
            cont_names=CONTINUOUS_COLUMNS,
            engine="parquet",
            shuffle=True,
            buffer_size=0.06,  # how many batches to load at once
            parts_per_chunk=1,
            global_size=hvd.size(),
            global_rank=hvd.rank(),
            seed_fn=seed_fn)

        val_ds = KerasSequenceLoader(
            f'{args.data_dir}/val',
            batch_size=args.batch_size,
            label_names=LABEL_COLUMNS,
            cat_names=CATEGORICAL_COLUMNS,
            cont_names=CONTINUOUS_COLUMNS,
            engine="parquet",
            shuffle=False,
            buffer_size=0.06,  # how many batches to load at once
            parts_per_chunk=1,
            global_size=hvd.size(),
            global_rank=hvd.rank())

        history = model.fit(train_ds,
                            validation_data=val_ds,
                            steps_per_epoch=int(train_rows / args.batch_size / hvd.size()),
                            validation_steps=int(val_rows / args.batch_size / hvd.size()),
                            callbacks=callbacks,
                            verbose=verbose,
                            epochs=args.epochs)

    if hvd.rank() == 0:
        return history.history

In [14]:
def train(dimensions, train_rows, val_rows, args):
    # Horovod: run training.
    history = horovod.spark.run(train_fn,
                                args=(dimensions, train_rows, val_rows, args),
                                env={'PATH':os.environ['PATH']},
                                num_proc=args.num_proc,
                                extra_mpi_args='-mca btl_tcp_if_include enp134s0f0 -x NCCL_IB_GID_INDEX=3',
                                stdout=sys.stdout,
                                stderr=sys.stderr,
                                verbose=2,
                                nics={},
                                prefix_output_with_timestamp=True)[0]

    best_val_loss = min(history['val_loss'])
    print('Best Loss: %f' % best_val_loss)

In [15]:
def main():
    parser = argparse.ArgumentParser(description='Criteo Spark Keras Training Example',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--data-dir', default='file:///opt/data/criteo/parquet',
                        help='location of the transformed Criteo dataset in Parquet format')
    parser.add_argument('--logs-dir', default='/opt/experiments/criteo', help='location of TensorFlow logs')
    parser.add_argument('--dataloader', default=PETASTORM_DATALOADER,
                        choices=[PETASTORM_DATALOADER, NVTABULAR_DATALOADER],
                        help='dataloader to use')
    parser.add_argument('--num-proc', type=int, default=1, help='number of worker processes for training')
    parser.add_argument('--learning-rate', type=float, default=0.0001, help='initial learning rate')
    parser.add_argument('--batch-size', type=int, default=64 * 1024, help='batch size')
    parser.add_argument('--epochs', type=int, default=3, help='number of epochs to train')
    parser.add_argument('--local-checkpoint-file', default='checkpoint', help='model checkpoint')
    args = parser.parse_args(args=['--num-proc', '1', '--data-dir', 'file:///data/', 
                                   '--dataloader', 'nvtabular', '--learning-rate', '0.001',
                                   '--batch-size', '6553','--epochs', '1', '--logs-dir', 'tf_logs',
                                   '--local-checkpoint-file', 'ckpt_file'])
                                   

    dimensions = get_category_dimensions(spark, args.data_dir)

    train_df = spark.read.parquet(f'{args.data_dir}/train')
    val_df = spark.read.parquet(f'{args.data_dir}/val')
    test_df = spark.read.parquet(f'{args.data_dir}/test')
    train_rows, val_rows, test_rows = train_df.count(), val_df.count(), test_df.count()
    print('Training: %d' % train_rows)
    print('Validation: %d' % val_rows)
    print('Test: %d' % test_rows)

    train(dimensions, train_rows, val_rows, args)

#     spark.stop()

In [16]:
main()

21/12/22 12:55:13 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
    !Exec <FileSourceScanExec> cannot run on GPU because unsupported file format: org.apache.spark.sql.execution.datasources.text.TextFileFormat

21/12/22 12:55:13 WARN GpuOverrides: 
! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec
  ! <Invoke> value#5359.toString cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invo

{'_c0': '0',
 '_c14': '327597',
 '_c15': '21882',
 '_c16': '14396',
 '_c17': '7063',
 '_c18': '19090',
 '_c19': '3',
 '_c20': '6484',
 '_c21': '1261',
 '_c22': '49',
 '_c23': '296349',
 '_c24': '103536',
 '_c25': '86419',
 '_c26': '10',
 '_c27': '2182',
 '_c28': '8235',
 '_c29': '61',
 '_c30': '3',
 '_c31': '938',
 '_c32': '14',
 '_c33': '327656',
 '_c34': '217544',
 '_c35': '319293',
 '_c36': '85130',
 '_c37': '9545',
 '_c38': '72',
 '_c39': '33'}
Training: 152115810
Validation: 12171190
Test: 12163666


[Stage 155:>                                                        (0 + 1) / 1]

Checking whether extension tensorflow was built with MPI.
Extension tensorflow was built with MPI.
mpirun --allow-run-as-root --tag-output -np 1 -H 7123d402fe1d-43f1b58d65cc025cf85d240d6e66ce8a:1 -bind-to none -map-by slot -mca pml ob1 -mca btl ^openib --timestamp-output     -x PATH -mca btl_tcp_if_include enp134s0f0 -x NCCL_IB_GID_INDEX=3 -x NCCL_DEBUG=INFO -mca plm_rsh_agent "/databricks/conda/bin/python -m horovod.spark.driver.mpirun_rsh gAWVOgAAAAAAAAB9lCiMAmxvlF2UjAkxMjcuMC4wLjGUTUdOhpRhjARldGgwlF2UjAoxNzIuMTcuMC4ylE1HToaUYXUu gAWVBQMAAAAAAACMI2hvcm92b2QucnVubmVyLmNvbW1vbi51dGlsLnNldHRpbmdzlIwIU2V0dGluZ3OUk5QpgZR9lCiMCG51bV9wcm9jlEsBjAd2ZXJib3NllEsCjAhzc2hfcG9ydJROjBFzc2hfaWRlbnRpdHlfZmlsZZROjA5leHRyYV9tcGlfYXJnc5SMOS1tY2EgYnRsX3RjcF9pZl9pbmNsdWRlIGVucDEzNHMwZjAgLXggTkNDTF9JQl9HSURfSU5ERVg9M5SMCHRjcF9mbGFnlE6MDGJpbmRpbmdfYXJnc5ROjANrZXmUTowNc3RhcnRfdGltZW91dJSMImhvcm92b2QucnVubmVyLmNvbW1vbi51dGlsLnRpbWVvdXSUjAdUaW1lb3V0lJOUKYGUfZQojAhfdGltZW91dJRNWAKMC190aW1lb3V0X2F0lEdB2HDIooA8aY

Wed Dec 22 12:55:20 2021[1,0]<stderr>:2021-12-22 12:55:20.339416: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
Wed Dec 22 12:55:20 2021[1,0]<stderr>:2021-12-22 12:55:20.339945: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
Wed Dec 22 12:55:20 2021[1,0]<stderr>:2021-12-22 12:55:20.340149: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
--------------------------------------------------------------------------
value will be ignored.

  Local host: 7123d402fe1d
  Value:      enp134s0f0
  Message:    Unknown interface name
--------------------------------------

Wed Dec 22 12:55:22 2021[1,0]<stdout>:Model: "model"
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________
Wed Dec 22 12:55:22 2021[1,0]<stdout>:Layer (type)                    Output Shape         Param #     Connected to                     
Wed Dec 22 12:55:22 2021[1,0]<stdout>:_c14 (InputLayer)               [(None, 1)]          0           []                               
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________
Wed Dec 22 12:55:22 2021[1,0]<stdout>:_c15 (InputLayer)               [(None, 1)]          0           []                               
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________
Wed Dec 22 12:55:22 2021[1,0]<stdout>:_c16 (InputLayer)               [(None, 1)]          0           []                    

Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________
Wed Dec 22 12:55:22 2021[1,0]<stdout>:embedding__c20 (Embedding)      (None, 1, 2)         12970       ['_c20[0][0]']                   
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________
Wed Dec 22 12:55:22 2021[1,0]<stdout>:embedding__c21 (Embedding)      (None, 1, 2)         2524        ['_c21[0][0]']                   
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________
Wed Dec 22 12:55:22 2021[1,0]<stdout>:embedding__c22 (Embedding)      (None, 1, 2)         100         ['_c22[0][0]']                   
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________
Wed Dec 22 12:55:22 2021[1,0]<stdout>:emb



Wed Dec 22 12:55:22 2021[1,0]<stdout>:embedding__c35 (Embedding)      (None, 1, 2)         638588      ['_c35[0][0]']                   
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________


Wed Dec 22 12:55:22 2021[1,0]<stderr>:2021-12-22 12:55:22.770363: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
Wed Dec 22 12:55:22 2021[1,0]<stderr>:2021-12-22 12:55:22.770391: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.


Wed Dec 22 12:55:22 2021[1,0]<stdout>:embedding__c36 (Embedding)      (None, 1, 2)         170262      ['_c36[0][0]']                   
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________Wed Dec 22 12:55:22 2021[1,0]<stdout>:


Wed Dec 22 12:55:22 2021[1,0]<stderr>:2021-12-22 12:55:22.771391: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1614] Profiler found 1 GPUs


Wed Dec 22 12:55:22 2021[1,0]<stdout>:embedding__c37 (Embedding)      (None, 1, 2)         19092       ['_c37[0][0]']                   
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________Wed Dec 22 12:55:22 2021[1,0]<stdout>:
Wed Dec 22 12:55:22 2021[1,0]<stdout>:embedding__c38 (Embedding)      (None, 1, 2)         146         ['_c38[0][0]']                   
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________
Wed Dec 22 12:55:22 2021[1,0]<stdout>:embedding__c39 (Embedding)      (None, 1, 2)         68          ['_c39[0][0]']                   
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________
Wed Dec 22 12:55:22 2021[1,0]<stdout>:embeddings_concat (Concatenate  (None, 1, 48)        0           ['embedding__c14[0][0]',         
Wed

Wed Dec 22 12:55:22 2021[1,0]<stdout>:_c9 (InputLayer)                [(None, 1)]          0           []                               
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________
Wed Dec 22 12:55:22 2021[1,0]<stdout>:_c10 (InputLayer)               [(None, 1)]          0           []                               
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________
Wed Dec 22 12:55:22 2021[1,0]<stdout>:_c11 (InputLayer)               [(None, 1)]          0           []                               
Wed Dec 22 12:55:22 2021[1,0]<stdout>:__________________________________________________________________________________________________
Wed Dec 22 12:55:22 2021[1,0]<stdout>:_c12 (InputLayer)               [(None, 1)]          0           []                               
Wed Dec 22 12:55:22 2021[1,0]<stdout>:___

Wed Dec 22 12:55:23 2021[1,0]<stderr>:2021-12-22 12:55:23.242916: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
Wed Dec 22 12:55:23 2021[1,0]<stderr>:2021-12-22 12:55:23.244633: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1748] CUPTI activity buffer flushed


Wed Dec 22 12:55:32 2021[1,0]<stdout>:

Wed Dec 22 12:55:33 2021[1,0]<stderr>:2021-12-22 12:55:33.491105: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
Wed Dec 22 12:55:33 2021[1,0]<stderr>:2021-12-22 12:55:33.491136: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.


    1/23213 [..............................]Wed Dec 22 12:55:32 2021[1,0]<stdout>: - ETA: 47:44:40 - loss: 0.8806 - auc: 0.4889Wed Dec 22 12:55:33 2021[1,0]<stdout>:Wed Dec 22 12:55:33 2021[1,0]<stdout>:

Wed Dec 22 12:55:33 2021[1,0]<stderr>:2021-12-22 12:55:33.912884: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
Wed Dec 22 12:55:33 2021[1,0]<stderr>:2021-12-22 12:55:33.913423: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1748] CUPTI activity buffer flushed
Wed Dec 22 12:55:33 2021[1,0]<stderr>:2021-12-22 12:55:33.973720: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:673]  GpuTracer has collected 1624 callback api events and 1606 activity events. 
Wed Dec 22 12:55:33 2021[1,0]<stderr>:2021-12-22 12:55:33.999093: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
Wed Dec 22 12:55:34 2021[1,0]<stderr>:2021-12-22 12:55:34.048614: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: tf_logs/plugins/profile/2021_12_22_12_55_34
Wed Dec 22 12:55:34 2021[1,0]<stderr>:
Wed Dec 22 12:55:34 2021[1,0]<stderr>:2021-12-22 12:55:34.080719: I tensorflow/core/profiler/rpc/clien

Wed Dec 22 12:55:33 2021[1,0]<stdout>:    2/23213 [..............................]Wed Dec 22 12:55:33 2021[1,0]<stdout>: - ETA: 7:46:49 - loss: 0.8471 - auc: 0.5131 Wed Dec 22 12:55:34 2021[1,0]<stdout>:Wed Dec 22 12:55:34 2021[1,0]<stdout>:



Wed Dec 22 12:56:14 2021[1,0]<stdout>: 2658/23213 [==>...........................]Wed Dec 22 12:56:14 2021[1,0]<stdout>: - ETA: 5:26 - loss: 0.1375 - auc: 0.7458Wed Dec 22 12:56:14 2021[1,0]<stdout>Wed Dec 22 12:56:14 2021[1,0]<stdout>:

[Stage 155:>                                                        (0 + 1) / 1]



[Stage 155:>                                                        (0 + 1) / 1]



[Stage 155:>                                                        (0 + 1) / 1]



[Stage 155:>                                                        (0 + 1) / 1]



[Stage 155:>                                                        (0 + 1) / 1]



[Stage 155:>                                                        (0 + 1) / 1]



                                                                                

TypeError: Invalid argument, not a string or column: [0.12802616] of type <class 'list'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.