# PySpark Huggingface Inferencing
### Text Classification using Pipelines

Based on: https://huggingface.co/docs/transformers/quicktour#pipeline-usage

In [1]:
import pandas as pd

from inspect import signature
from pyspark.sql.functions import col, pandas_udf
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# No model or revision is passed here, so transformers falls back to its default
# checkpoint for the "text-classification" task (named in the warning printed below).
pipe = pipeline("text-classification")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [3]:
pipe("What can I say that hasn't been said already. I think this place is totally worth the hype.")

[{'label': 'POSITIVE', 'score': 0.9994712471961975}]

In [4]:
pipe("I will not say much about this film, because there is not much to say, because there is not much there to talk about.")

[{'label': 'NEGATIVE', 'score': 0.9997401833534241}]

## Inference using Spark DL API

In [5]:
import pandas as pd
from pyspark.sql.functions import col, struct, pandas_udf
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.types import FloatType, StringType, StructField, StructType

In [6]:
# only use first sentence of IMDB reviews
@pandas_udf("string")
def first_sentence(text: pd.Series) -> pd.Series:
    """Keep only the text that precedes the first "." in every string of `text`."""
    leading = []
    for review in text:
        # partition() yields (before, separator, after); only the part before is kept
        head, _, _ = review.partition(".")
        leading.append(head)
    return pd.Series(leading)

# Read the reviews, derive the "sentence" column, keep at most 100 rows of it,
# and cache the result because several later cells reuse `df`.
df = (
    spark.read.parquet("imdb_test")
    .withColumn("sentence", first_sentence(col("lines")))
    .select("sentence")
    .limit(100)
    .cache()
)
df.show(truncate=80)



+--------------------------------------------------------------------------------+
|                                                                        sentence|
+--------------------------------------------------------------------------------+
|                                                                                |
|Hard up, No proper jobs going down at the pit, why not rent your kids! DIY pi...|
|I watched this movie to see the direction one of the most promising young tal...|
|                        This movie makes you wish imdb would let you vote a zero|
|I never want to see this movie again!<br /><br />Not only is it dreadfully ba...|
|(As a note, I'd like to say that I saw this movie at my annual church camp, w...|
|                 Don't get me wrong, I love the TV series of League Of Gentlemen|
|Did you ever think, like after watching a horror movie with a group of friend...|
|                                                             Awful, awful, awful|
|Thi

                                                                                

In [7]:
def predict_batch_fn():
    """Create the per-batch prediction function handed to `predict_batch_udf`.

    The transformers import and the pipeline construction live inside this
    factory, so they execute when the factory is called rather than when this
    cell is run.

    The checkpoint and revision are pinned explicitly to the ones the library
    reported as its defaults for "text-classification" earlier in this
    notebook. Without the pin, the model that gets loaded depends on whichever
    default the installed transformers version ships, which the library itself
    warns against.

    Returns:
        A callable `predict(inputs)` where `inputs` is a batch exposing
        `.tolist()` (a numpy array of strings when invoked through
        `predict_batch_udf`). It returns the pipeline's output for that batch:
        a list with one `{"label": ..., "score": ...}` dict per input.
    """
    from transformers import pipeline

    pipe = pipeline(
        "text-classification",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        revision="af0f99b",
    )

    def predict(inputs):
        # hand the pipeline a plain Python list of strings rather than an ndarray
        return pipe(inputs.tolist())

    return predict

In [8]:
# Wrap the pipeline factory as a Spark UDF. The struct fields mirror the
# "label" / "score" keys of the dicts produced by the pipeline.
classify = predict_batch_udf(
    predict_batch_fn,
    return_type=StructType(
        [
            StructField("label", StringType(), True),
            StructField("score", FloatType(), True),
        ]
    ),
    batch_size=10,
)

In [9]:
%%time
# variant 1: the input column is passed wrapped in struct(...)
# note: "preds.*" expands the struct return_type into top-level "label" and "score" columns
preds = df.withColumn("preds", classify(struct("sentence"))).select("sentence", "preds.*")
results = preds.collect()

[Stage 4:>                                                          (0 + 1) / 1]

CPU times: user 21.2 ms, sys: 4.33 ms, total: 25.5 ms
Wall time: 12.3 s


                                                                                

In [10]:
%%time
# variant 2: the input column is passed by name, as a plain string
# note: "preds.*" expands the struct return_type into top-level "label" and "score" columns
preds = df.withColumn("preds", classify("sentence")).select("sentence", "preds.*")
results = preds.collect()

[Stage 6:>                                                          (0 + 1) / 1]

CPU times: user 8.67 ms, sys: 4.24 ms, total: 12.9 ms
Wall time: 5.44 s


                                                                                

In [11]:
%%time
# variant 3: the input column is passed as a Column object built with col(...)
# note: "preds.*" expands the struct return_type into top-level "label" and "score" columns
preds = df.withColumn("preds", classify(col("sentence"))).select("sentence", "preds.*")
results = preds.collect()

[Stage 8:>                                                          (0 + 1) / 1]

CPU times: user 13.8 ms, sys: 1.52 ms, total: 15.4 ms
Wall time: 5.46 s


                                                                                

In [12]:
preds.show(truncate=80)

[Stage 10:>                                                         (0 + 1) / 1]

+--------------------------------------------------------------------------------+--------+----------+
|                                                                        sentence|   label|     score|
+--------------------------------------------------------------------------------+--------+----------+
|                                                                                |POSITIVE|0.74812096|
|Hard up, No proper jobs going down at the pit, why not rent your kids! DIY pi...|NEGATIVE|0.99967253|
|I watched this movie to see the direction one of the most promising young tal...|POSITIVE| 0.9994943|
|                        This movie makes you wish imdb would let you vote a zero|NEGATIVE| 0.9981305|
|I never want to see this movie again!<br /><br />Not only is it dreadfully ba...|NEGATIVE| 0.9988337|
|(As a note, I'd like to say that I saw this movie at my annual church camp, w...|POSITIVE| 0.9901974|
|                 Don't get me wrong, I love the TV series of League Of G

                                                                                

### Using Triton Inference Server

Note: you can restart the kernel and run the notebook from this point onward to simulate running on a different node or in a different environment.

This notebook uses the [Python backend with a custom execution environment](https://github.com/triton-inference-server/python_backend#creating-custom-execution-environments), using a conda-pack environment created as follows:
```
conda create -n huggingface -c conda-forge python=3.8
conda activate huggingface

export PYTHONUSERSITE=True
pip install conda-pack sentencepiece sentence_transformers transformers

conda-pack  # huggingface.tar.gz
```

In [13]:
import numpy as np
import pandas as pd
import os
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.functions import col, struct, pandas_udf
from pyspark.sql.types import FloatType, StringType, StructField, StructType

In [14]:
%%bash
# copy custom model to expected layout for Triton
# (models/ is deleted and recreated first, so anything left in it by a previous run is discarded)
rm -rf models
mkdir -p models
cp -r models_config/hf_pipeline models

# add custom execution environment (the conda-pack archive) alongside the model
cp huggingface.tar.gz models

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


#### Start Triton Server on each executor

In [15]:
# Number of partitions to create in the helper RDD below.
num_executors = 1
# Host directories that start_triton mounts into the Triton container.
triton_models_dir = f"{os.getcwd()}/models"
huggingface_cache_dir = f"{os.path.expanduser('~')}/.cache/huggingface"
# num_executors elements spread over num_executors partitions.
nodeRDD = sc.parallelize(list(range(num_executors)), num_executors)

def start_triton(it):
    """Make sure a Triton server container is running on this node and is ready.

    Written to be passed to `mapPartitions`. If a running container whose name
    matches "spark-triton" is found, it is reused. Otherwise a new container is
    launched from the Triton image, with `triton_models_dir` mounted read-only
    at /models and `huggingface_cache_dir` mounted read-write at /cache (both
    names come from the enclosing notebook scope).

    In both cases the server's gRPC endpoint is then polled until it reports
    ready, so a reused container that is still starting up is not mistaken for
    a usable server.

    Args:
        it: Partition iterator supplied by Spark; it is not consumed.

    Returns:
        `[True]` once the server reports ready.

    Raises:
        TimeoutError: If the server has not reported ready when polling gives up.
    """
    import docker
    import time
    import tritonclient.grpc as grpcclient

    docker_client = docker.from_env()
    containers = docker_client.containers.list(filters={"name": "spark-triton"})
    if containers:
        print(">>>> containers: {}".format([c.short_id for c in containers]))
    else:
        container = docker_client.containers.run(
            "nvcr.io/nvidia/tritonserver:23.04-py3", "tritonserver --model-repository=/models",
            detach=True,
            device_requests=[docker.types.DeviceRequest(device_ids=["0"], capabilities=[['gpu']])],
            environment=[
                "TRANSFORMERS_CACHE=/cache"
            ],
            name="spark-triton",
            network_mode="host",
            remove=True,
            shm_size="256M",
            volumes={
                triton_models_dir: {"bind": "/models", "mode": "ro"},
                huggingface_cache_dir: {"bind": "/cache", "mode": "rw"}
            }
        )
        print(">>>> starting triton: {}".format(container.short_id))
        # give the freshly started server a head start before polling it
        time.sleep(15)

    # kept separate from docker_client so neither handle shadows the other
    triton_client = grpcclient.InferenceServerClient("localhost:8001")

    poll_interval = 5
    timeout = 120
    elapsed = 0
    ready = False
    last_error = None
    while True:
        try:
            ready = triton_client.is_server_ready()
        except Exception as e:
            # errors are expected while the server is still coming up; keep the
            # most recent one so a timeout can report why the server was unreachable
            last_error = e
        if ready or elapsed >= timeout:
            break
        time.sleep(poll_interval)
        elapsed += poll_interval

    if not ready:
        # fail loudly instead of reporting success for a server that never came up
        raise TimeoutError(
            "Triton server at localhost:8001 not ready after polling for {}s (last error: {})".format(
                timeout, last_error
            )
        )

    return [True]

nodeRDD.barrier().mapPartitions(start_triton).collect()

                                                                                

[True]

#### Run inference

In [16]:
# only use first sentence of IMDB reviews
@pandas_udf("string")
def first_sentence(text: pd.Series) -> pd.Series:
    """Keep only the text that precedes the first "." in every string of `text`."""
    leading = []
    for review in text:
        # partition() yields (before, separator, after); only the part before is kept
        head, _, _ = review.partition(".")
        leading.append(head)
    return pd.Series(leading)

# Read the reviews, derive the "sentence" column and keep at most 1000 rows of it.
df = (
    spark.read.parquet("imdb_test")
    .withColumn("sentence", first_sentence(col("lines")))
    .select("sentence")
    .limit(1000)
)

In [17]:
def triton_fn(triton_uri, model_name):
    """Build a prediction function that forwards batches to a Triton server over gRPC.

    A client is created and the model's metadata is fetched once, when this
    factory is called; the returned `predict` function reuses both.

    Args:
        triton_uri: Address passed to `grpcclient.InferenceServerClient`.
        model_name: Name of the model to query for metadata and inference.

    Returns:
        A callable `predict(inputs)`. `inputs` is either a single numpy array
        (sent as the model's first declared input) or a mapping from input name
        to numpy array (one entry per declared input). The result is a single
        numpy array when the model declares one output, otherwise a dict
        mapping each output name to its numpy array.
    """
    import numpy as np
    import tritonclient.grpc as grpcclient

    # Triton datatype name -> numpy dtype that inputs are cast to before sending.
    # np.bool_ is used because the np.bool8 alias is deprecated and removed in NumPy 2.0.
    np_types = {
        "BOOL": np.dtype(np.bool_),
        "INT8": np.dtype(np.int8),
        "INT16": np.dtype(np.int16),
        "INT32": np.dtype(np.int32),
        "INT64": np.dtype(np.int64),
        "UINT8": np.dtype(np.uint8),
        "UINT16": np.dtype(np.uint16),
        "UINT32": np.dtype(np.uint32),
        "UINT64": np.dtype(np.uint64),
        "FP16": np.dtype(np.float16),
        "FP32": np.dtype(np.float32),
        "FP64": np.dtype(np.float64),
        "BYTES": np.dtype(object)
    }

    client = grpcclient.InferenceServerClient(triton_uri)
    model_meta = client.get_model_metadata(model_name)

    def predict(inputs):
        if isinstance(inputs, np.ndarray):
            # single ndarray input: bind it to the model's first declared input
            meta = model_meta.inputs[0]
            request = [grpcclient.InferInput(meta.name, inputs.shape, meta.datatype)]
            request[0].set_data_from_numpy(inputs.astype(np_types[meta.datatype]))
        else:
            # dict of multiple ndarray inputs: one InferInput per declared input
            request = []
            for meta in model_meta.inputs:
                infer_input = grpcclient.InferInput(meta.name, inputs[meta.name].shape, meta.datatype)
                infer_input.set_data_from_numpy(inputs[meta.name].astype(np_types[meta.datatype]))
                request.append(infer_input)

        response = client.infer(model_name, inputs=request)

        if len(model_meta.outputs) > 1:
            # return dictionary of numpy arrays
            return {o.name: response.as_numpy(o.name) for o in model_meta.outputs}
        else:
            # return single numpy array
            return response.as_numpy(model_meta.outputs[0].name)

    return predict

In [18]:
from functools import partial

# partial() binds the server address and model name, leaving a factory that
# takes no further arguments. The struct fields name the "label" and "score"
# columns that the results are later expanded into.
classify = predict_batch_udf(
    partial(triton_fn, triton_uri="localhost:8001", model_name="hf_pipeline"),
    return_type=StructType(
        [
            StructField("label", StringType(), True),
            StructField("score", FloatType(), True),
        ]
    ),
    input_tensor_shapes=[[1]],
    batch_size=100,
)

In [19]:
%%time
# first pass caches model/fn
# variant 1: the input column is passed wrapped in struct(...)
# note: "preds.*" expands the struct return_type into top-level "label" and "score" columns
preds = df.withColumn("preds", classify(struct("sentence"))).select("sentence", "preds.*")
results = preds.collect()

[Stage 13:>                                                         (0 + 1) / 1]

CPU times: user 25.5 ms, sys: 0 ns, total: 25.5 ms
Wall time: 5.42 s


                                                                                

In [20]:
%%time
# variant 2: the input column is passed by name, as a plain string
# note: "preds.*" expands the struct return_type into top-level "label" and "score" columns
preds = df.withColumn("preds", classify("sentence")).select("sentence", "preds.*")
results = preds.collect()

[Stage 14:>                                                         (0 + 1) / 1]

CPU times: user 11.6 ms, sys: 7.68 ms, total: 19.3 ms
Wall time: 4.52 s


                                                                                

In [21]:
%%time
# variant 3: the input column is passed as a Column object built with col(...)
# note: "preds.*" expands the struct return_type into top-level "label" and "score" columns
preds = df.withColumn("preds", classify(col("sentence"))).select("sentence", "preds.*")
results = preds.collect()

[Stage 15:>                                                         (0 + 1) / 1]

CPU times: user 13.1 ms, sys: 5.65 ms, total: 18.7 ms
Wall time: 4.51 s


                                                                                

In [22]:
preds.show(truncate=80)

[Stage 16:>                                                         (0 + 1) / 1]

+--------------------------------------------------------------------------------+--------+----------+
|                                                                        sentence|   label|     score|
+--------------------------------------------------------------------------------+--------+----------+
|                                                                                |POSITIVE| 0.7481212|
|Hard up, No proper jobs going down at the pit, why not rent your kids! DIY pi...|NEGATIVE|0.99967253|
|I watched this movie to see the direction one of the most promising young tal...|POSITIVE| 0.9994943|
|                        This movie makes you wish imdb would let you vote a zero|NEGATIVE| 0.9981305|
|I never want to see this movie again!<br /><br />Not only is it dreadfully ba...|NEGATIVE| 0.9988337|
|(As a note, I'd like to say that I saw this movie at my annual church camp, w...|POSITIVE| 0.9901974|
|                 Don't get me wrong, I love the TV series of League Of G

                                                                                

#### Stop Triton Server on each executor

In [23]:
def stop_triton(it):
    """Stop every running container on this node whose name matches "spark-triton".

    Written to be passed to `mapPartitions`. The short ids of all matching
    containers are printed, and each of them is then stopped, so the printed
    list and the containers actually stopped agree.

    Args:
        it: Partition iterator supplied by Spark; it is not consumed.

    Returns:
        `[True]`, whether or not any container was found.
    """
    import docker

    client = docker.from_env()
    containers = client.containers.list(filters={"name": "spark-triton"})
    print(">>>> stopping containers: {}".format([c.short_id for c in containers]))
    for container in containers:
        # allow up to 120 seconds for the container to stop before it is killed
        container.stop(timeout=120)

    return [True]

nodeRDD.barrier().mapPartitions(stop_triton).collect()

                                                                                

[True]

In [24]:
spark.stop()