# PySpark Huggingface Inferencing
### Sentence Transformers

From: https://huggingface.co/sentence-transformers

In [1]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Sentences we want to encode, given as a list of strings (a single example here).
sentence = ['This framework generates embeddings for each input sentence']


# Encode the sentences by calling model.encode(); the resulting array is
# displayed in the next cell.
embedding = model.encode(sentence)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
embedding

array([[-1.76214516e-01,  1.20601304e-01, -2.93624043e-01,
        -2.29858175e-01, -8.22924003e-02,  2.37709180e-01,
         3.39985013e-01, -7.80964136e-01,  1.18127793e-01,
         1.63374111e-01, -1.37715325e-01,  2.40282863e-01,
         4.25125599e-01,  1.72417879e-01,  1.05279416e-01,
         5.18164098e-01,  6.22219592e-02,  3.99285942e-01,
        -1.81652382e-01, -5.85578799e-01,  4.49718162e-02,
        -1.72750458e-01, -2.68443376e-01, -1.47386298e-01,
        -1.89217895e-01,  1.92150623e-01, -3.83842528e-01,
        -3.96006793e-01,  4.30648834e-01, -3.15320045e-01,
         3.65949810e-01,  6.05160184e-02,  3.57326001e-01,
         1.59736484e-01, -3.00984085e-01,  2.63250291e-01,
        -3.94310981e-01,  1.84855387e-01, -3.99549156e-01,
        -2.67889678e-01, -5.45117497e-01, -3.13403830e-02,
        -4.30644304e-01,  1.33278236e-01, -1.74793929e-01,
        -4.35465544e-01, -4.77378905e-01,  7.12556019e-02,
        -7.37000927e-02,  5.69136977e-01, -2.82579631e-0

## PySpark

## Inference using Spark DL API
Note: you can restart the kernel and run from this point to simulate running on a different node or in a different environment.

In [3]:
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.functions import col, struct
from pyspark.sql.types import ArrayType, FloatType

In [4]:
# Only use the first 100 rows, since inference is slow. The subset is cached
# because the inference cells below reuse it.
df = spark.read.parquet("imdb_test").limit(100).cache()

                                                                                

In [5]:
df.show(truncate=120)



+------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                   lines|
+------------------------------------------------------------------------------------------------------------------------+
|...But not this one! I always wanted to know "what happened" next. We will never know for sure what happened because ...|
|Hard up, No proper jobs going down at the pit, why not rent your kids! DIY pimp story without the gratuitous sex scen...|
|I watched this movie to see the direction one of the most promising young talents in movies was going. Unfortunately,...|
|This movie makes you wish imdb would let you vote a zero. One of the two movies I've ever walked out of. It's very ha...|
|I never want to see this movie again!<br /><br />Not only is it dreadfully bad, but I can't stand seeing my hero Stan...|
|(As a note, I'd

                                                                                

In [6]:
def predict_batch_fn():
    """Load the sentence-transformer model and return a batch encoding function.

    The model is created once, when this factory is called; the returned
    function closes over it and reuses it for every batch.

    Returns:
        A function taking a numpy array of sentences and returning whatever
        ``SentenceTransformer.encode`` produces for that list of sentences.
    """
    import sentence_transformers

    encoder = sentence_transformers.SentenceTransformer("paraphrase-MiniLM-L6-v2")

    def encode_batch(inputs):
        # The model is handed a plain Python list rather than the numpy array.
        sentences = inputs.tolist()
        return encoder.encode(sentences)

    return encode_batch

In [7]:
# Wrap predict_batch_fn as a Spark UDF that returns an array of floats per row,
# using a batch size of 10.
encode = predict_batch_udf(predict_batch_fn,
                           return_type=ArrayType(FloatType()),
                           batch_size=10)

In [8]:
%%time
# First pass caches the model/predict function, so it is slower than later passes.
# The input column is passed wrapped in a struct.
embeddings = df.withColumn("encoding", encode(struct("lines")))
results = embeddings.collect()

[Stage 4:>                                                          (0 + 1) / 1]

CPU times: user 17.8 ms, sys: 9.35 ms, total: 27.2 ms
Wall time: 8.36 s


                                                                                

In [9]:
%%time
# Same inference, passing the input column by name.
embeddings = df.withColumn("encoding", encode("lines"))
results = embeddings.collect()

[Stage 6:>                                                          (0 + 1) / 1]

CPU times: user 17.7 ms, sys: 1.13 ms, total: 18.8 ms
Wall time: 3.25 s


                                                                                

In [10]:
%%time
# Same inference, passing the input as a Column object.
embeddings = df.withColumn("encoding", encode(col("lines")))
results = embeddings.collect()

[Stage 8:>                                                          (0 + 1) / 1]

CPU times: user 16.9 ms, sys: 0 ns, total: 16.9 ms
Wall time: 2.91 s


                                                                                

In [11]:
embeddings.show(truncate=60)

+------------------------------------------------------------+------------------------------------------------------------+
|                                                       lines|                                                    encoding|
+------------------------------------------------------------+------------------------------------------------------------+
|...But not this one! I always wanted to know "what happen...|[0.050629966, -0.19899231, 2.686046E-4, 0.13270327, -0.16...|
|Hard up, No proper jobs going down at the pit, why not re...|[0.08634103, -0.002254839, 0.10213216, -0.03454912, -0.23...|
|I watched this movie to see the direction one of the most...|[0.008758117, -0.0083419345, -0.119090386, 0.025434377, -...|
|This movie makes you wish imdb would let you vote a zero....|[0.24080081, -0.14614257, 0.18119521, 0.118741795, 0.1022...|
|I never want to see this movie again!<br /><br />Not only...|[0.32271573, -0.14145091, 0.09245593, 0.04562203, 0.07219...|
|(As a n

### Using Triton Inference Server

Note: you can restart the kernel and run from this point to simulate running on a different node or in a different environment.

This notebook uses the [Python backend with a custom execution environment](https://github.com/triton-inference-server/python_backend#creating-custom-execution-environments), using a conda-pack environment created as follows:
```
conda create -n huggingface -c conda-forge python=3.8
conda activate huggingface

export PYTHONUSERSITE=True
pip install conda-pack sentencepiece sentence_transformers transformers

conda-pack  # huggingface.tar.gz
```

In [12]:
import numpy as np
import os
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.functions import col, struct
from pyspark.sql.types import ArrayType, FloatType

In [13]:
%%bash
# Abort on the first failing command, so that a missing source file does not
# silently leave an incomplete model repository behind.
set -e

# copy custom model to expected layout for Triton
rm -rf models
mkdir -p models
cp -r models_config/hf_transformer models

# add custom execution environment
cp huggingface.tar.gz models

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


#### Start Triton Server on each executor

In [14]:
# Number of partitions (one task each) used to start/stop Triton; presumably
# this should match the number of executors in the cluster.
num_executors = 1
# Host directories that are bind-mounted into the Triton container.
triton_models_dir = f"{os.getcwd()}/models"
huggingface_cache_dir = f"{os.path.expanduser('~')}/.cache/huggingface"
# One element per partition, so each task receives exactly one item.
nodeRDD = sc.parallelize([*range(num_executors)], num_executors)

def start_triton(it):
    """Start a Triton Inference Server container on this node unless one is already running.

    If a container whose name matches "spark-triton" is already listed, it is
    reported and nothing is started. Otherwise a detached
    ``nvcr.io/nvidia/tritonserver:23.04-py3`` container is launched with GPU
    device "0", host networking, the model repository mounted read-only at
    ``/models`` and the Hugging Face cache mounted read-write at ``/cache``.
    The function then polls the server's gRPC endpoint until it reports ready.

    Args:
        it: partition iterator supplied by ``mapPartitions``; not used.

    Returns:
        ``[True]``, so that ``collect()`` yields one entry per partition.

    Raises:
        TimeoutError: if the newly started server does not report ready
            within the startup timeout.
    """
    import docker
    import time
    import tritonclient.grpc as grpcclient

    startup_timeout_s = 600  # upper bound on waiting for readiness after the initial delay
    poll_interval_s = 5      # pause between readiness checks

    docker_client = docker.from_env()
    containers = docker_client.containers.list(filters={"name": "spark-triton"})
    if containers:
        print(">>>> containers: {}".format([c.short_id for c in containers]))
    else:
        container = docker_client.containers.run(
            "nvcr.io/nvidia/tritonserver:23.04-py3", "tritonserver --model-repository=/models",
            detach=True,
            device_requests=[docker.types.DeviceRequest(device_ids=["0"], capabilities=[['gpu']])],
            environment=[
                "TRANSFORMERS_CACHE=/cache"
            ],
            name="spark-triton",
            network_mode="host",
            remove=True,
            shm_size="512M",
            volumes={
                triton_models_dir: {"bind": "/models", "mode": "ro"},
                huggingface_cache_dir: {"bind": "/cache", "mode": "rw"}
            }
        )
        print(">>>> starting triton: {}".format(container.short_id))

        # wait for triton to be running
        time.sleep(15)
        triton_client = grpcclient.InferenceServerClient("localhost:8001")
        deadline = time.monotonic() + startup_timeout_s
        while True:
            try:
                if triton_client.is_server_ready():
                    break
            except Exception:
                # The endpoint is not accepting requests yet; retry below.
                pass
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    "Triton server did not become ready within {} seconds".format(startup_timeout_s)
                )
            # Sleep on every unsuccessful check, including a plain "not ready"
            # answer, so the loop never spins without pausing.
            time.sleep(poll_interval_s)

    return [True]

# Run start_triton once per partition in a barrier stage, so that all tasks are scheduled together.
nodeRDD.barrier().mapPartitions(start_triton).collect()

                                                                                

[True]

#### Run inference

In [15]:
from functools import partial
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.functions import col, struct
from pyspark.sql.types import ArrayType, FloatType

In [16]:
# Only use the first 100 rows, since inference is slow. The subset is cached
# because the inference cells below reuse it.
df = spark.read.parquet("imdb_test").limit(100).cache()

23/05/19 19:15:37 WARN CacheManager: Asked to cache already cached data.


In [17]:
def triton_fn(triton_uri, model_name):
    """Build a predict function that sends batches to a Triton Inference Server over gRPC.

    The client connection and the model metadata are obtained once, when this
    factory is called, and reused by the returned function.

    Args:
        triton_uri: address of the Triton gRPC endpoint, e.g. "localhost:8001".
        model_name: name of the model to query on the server.

    Returns:
        A function ``predict(inputs)``. ``inputs`` is either a single numpy
        array (sent as the model's first input) or a dict of numpy arrays
        keyed by model input name. The result is a single numpy array when the
        model has one output, otherwise a dict of numpy arrays keyed by output
        name.
    """
    import numpy as np
    import tritonclient.grpc as grpcclient

    # Triton datatype name -> numpy dtype that request data is cast to.
    np_types = {
        "BOOL": np.dtype(np.bool_),  # np.bool8 was a deprecated alias, removed in NumPy 2.0
        "INT8": np.dtype(np.int8),
        "INT16": np.dtype(np.int16),
        "INT32": np.dtype(np.int32),
        "INT64": np.dtype(np.int64),
        "FP16": np.dtype(np.float16),
        "FP32": np.dtype(np.float32),
        "FP64": np.dtype(np.float64),
        "BYTES": np.dtype(object),
    }

    client = grpcclient.InferenceServerClient(triton_uri)
    model_meta = client.get_model_metadata(model_name)

    def _make_input(meta, data):
        # Create one request tensor, cast to the dtype the model declares for it.
        tensor = grpcclient.InferInput(meta.name, data.shape, meta.datatype)
        tensor.set_data_from_numpy(data.astype(np_types[meta.datatype]))
        return tensor

    def predict(inputs):
        if isinstance(inputs, np.ndarray):
            # single ndarray input
            request = [_make_input(model_meta.inputs[0], inputs)]
        else:
            # dict of multiple ndarray inputs, keyed by model input name
            request = [_make_input(meta, inputs[meta.name]) for meta in model_meta.inputs]

        response = client.infer(model_name, inputs=request)

        if len(model_meta.outputs) > 1:
            # return dictionary of numpy arrays
            return {o.name: response.as_numpy(o.name) for o in model_meta.outputs}
        else:
            # return single numpy array
            return response.as_numpy(model_meta.outputs[0].name)

    return predict

In [18]:
# Bind the server address and model name, then wrap the Triton client function
# as a Spark UDF that returns an array of floats per row. The single input is
# declared with tensor shape [1], and the batch size is 100.
encode = predict_batch_udf(partial(triton_fn, triton_uri="localhost:8001", model_name="hf_transformer"),
                           return_type=ArrayType(FloatType()),
                           input_tensor_shapes=[[1]],
                           batch_size=100)

In [19]:
%%time
# First pass caches the model/predict function, so it is slower than later passes.
# The input column is passed wrapped in a struct.
embeddings = df.withColumn("encoding", encode(struct("lines")))
results = embeddings.collect()

[Stage 14:>                                                         (0 + 1) / 1]

CPU times: user 19.7 ms, sys: 3.95 ms, total: 23.6 ms
Wall time: 2.67 s


                                                                                

In [20]:
%%time
# Same inference, passing the input column by name.
embeddings = df.withColumn("encoding", encode("lines"))
results = embeddings.collect()

CPU times: user 7.66 ms, sys: 3.04 ms, total: 10.7 ms
Wall time: 265 ms


In [21]:
%%time
# Same inference, passing the input as a Column object.
embeddings = df.withColumn("encoding", encode(col("lines")))
results = embeddings.collect()

CPU times: user 12.6 ms, sys: 570 µs, total: 13.2 ms
Wall time: 261 ms


In [22]:
embeddings.show(truncate=60)

+------------------------------------------------------------+------------------------------------------------------------+
|                                                       lines|                                                    encoding|
+------------------------------------------------------------+------------------------------------------------------------+
|...But not this one! I always wanted to know "what happen...|[0.050629944, -0.19899224, 2.68735E-4, 0.13270333, -0.160...|
|Hard up, No proper jobs going down at the pit, why not re...|[0.08634147, -0.002254737, 0.10213226, -0.034549147, -0.2...|
|I watched this movie to see the direction one of the most...|[0.008757966, -0.008341991, -0.11909033, 0.02543464, -0.2...|
|This movie makes you wish imdb would let you vote a zero....|[0.24080098, -0.14614293, 0.1811954, 0.11874188, 0.102292...|
|I never want to see this movie again!<br /><br />Not only...|[0.3227157, -0.14145078, 0.0924558, 0.045622032, 0.072197...|
|(As a n

#### Stop Triton Server on each executor

In [23]:
def stop_triton(it):
    """Stop the Triton Inference Server container(s) on this node.

    Lists the containers whose name matches "spark-triton", prints their short
    ids, and stops each of them, allowing up to 120 seconds per container.

    Args:
        it: partition iterator supplied by ``mapPartitions``; not used.

    Returns:
        ``[True]``, so that ``collect()`` yields one entry per partition.
    """
    import docker

    client = docker.from_env()
    containers = client.containers.list(filters={"name": "spark-triton"})
    print(">>>> stopping containers: {}".format([c.short_id for c in containers]))
    # Stop every container reported above, not only the first match.
    for container in containers:
        container.stop(timeout=120)

    return [True]

# Run stop_triton once per partition in a barrier stage, so that all tasks are scheduled together.
nodeRDD.barrier().mapPartitions(stop_triton).collect()

                                                                                

[True]

In [24]:
spark.stop()