# PySpark Huggingface Inferencing
## Conditional generation with Tensorflow

From: https://huggingface.co/docs/transformers/model_doc/t5

### Using TensorFlow

In [1]:
from transformers import AutoTokenizer, TFT5ForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm
2024-09-25 17:37:28.748970: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-25 17:37:28.769960: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-25 17:37:28.775143: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-25 17:37:28.788944: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler fl

In [2]:
import tensorflow as tf

# Enable GPU memory growth
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)
        
print(tf.__version__)

2.17.0


In [3]:
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

max_source_length = 512
max_target_length = 128

task_prefix = "translate English to German: "

lines = [
    "The house is wonderful",
    "Welcome to NYC",
    "HuggingFace is a company"
]

input_sequences = [task_prefix + l for l in lines]

2024-09-25 17:37:34.350552: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31135 MB memory:  -> device: 0, name: Tesla V100-SXM3-32GB-H, pci bus id: 0000:34:00.0, compute capability: 7.0
2024-09-25 17:37:34.351970: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 31135 MB memory:  -> device: 1, name: Tesla V100-SXM3-32GB-H, pci bus id: 0000:36:00.0, compute capability: 7.0
2024-09-25 17:37:34.353233: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 31135 MB memory:  -> device: 2, name: Tesla V100-SXM3-32GB-H, pci bus id: 0000:39:00.0, compute capability: 7.0
2024-09-25 17:37:34.354461: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 31135 MB memory:  -> device: 3, name: Tesla V100-SXM3-32GB-H, pc

In [4]:
input_ids = tokenizer(input_sequences, 
                      padding="longest", 
                      max_length=max_source_length,
                      return_tensors="tf").input_ids
outputs = model.generate(input_ids)

I0000 00:00:1727285857.016900 2000247 service.cc:146] XLA service 0x7f8e980456a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1727285857.016924 2000247 service.cc:154]   StreamExecutor device (0): Tesla V100-SXM3-32GB-H, Compute Capability 7.0
I0000 00:00:1727285857.016927 2000247 service.cc:154]   StreamExecutor device (1): Tesla V100-SXM3-32GB-H, Compute Capability 7.0
I0000 00:00:1727285857.016930 2000247 service.cc:154]   StreamExecutor device (2): Tesla V100-SXM3-32GB-H, Compute Capability 7.0
I0000 00:00:1727285857.016932 2000247 service.cc:154]   StreamExecutor device (3): Tesla V100-SXM3-32GB-H, Compute Capability 7.0
I0000 00:00:1727285857.016934 2000247 service.cc:154]   StreamExecutor device (4): Tesla V100-SXM3-32GB-H, Compute Capability 7.0
I0000 00:00:1727285857.016936 2000247 service.cc:154]   StreamExecutor device (5): Tesla V100-SXM3-32GB-H, Compute Capability 7.0
I0000 00:00:1727285857.016939 2000247 service.cc:15

In [5]:
[tokenizer.decode(o, skip_special_tokens=True) for o in outputs]

['Das Haus ist wunderbar',
 'Willkommen in NYC',
 'HuggingFace ist ein Unternehmen']

In [6]:
model.framework

'tf'

## PySpark

In [7]:
import os
from pathlib import Path
from datasets import load_dataset

In [8]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [9]:
num_threads = 6

# Creating a local Spark session for demonstration, in case it hasn't already been created.

_config = {
    "spark.master": f"local[{num_threads}]",
    "spark.driver.host": "127.0.0.1",
    "spark.task.maxFailures": "1",
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
    "spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
    "spark.sql.pyspark.jvmStacktrace.enabled": "true",
    "spark.sql.execution.arrow.pyspark.enabled": "true",
    "spark.python.worker.reuse": "true",
}
spark = SparkSession.builder.appName("spark-dl-example")
for key, value in _config.items():
    spark = spark.config(key, value)
spark = spark.getOrCreate()

sc = spark.sparkContext

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
24/09/25 17:37:39 WARN Utils: Your hostname, dgx2h0194.spark.sjc4.nvmetal.net resolves to a loopback address: 127.0.1.1; using 10.150.30.2 instead (on interface enp134s0f0np0)
24/09/25 17:37:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/25 17:37:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
# load IMDB reviews (test) dataset
data = load_dataset("imdb", split="test")

In [11]:
lines = []
for example in data:
    lines.append([example["text"].split(".")[0]])

len(lines)

25000

### Create PySpark DataFrame

In [12]:
df = spark.createDataFrame(lines, ['lines']).repartition(10)
df.schema

StructType([StructField('lines', StringType(), True)])

In [13]:
df.take(1)

                                                                                

[Row(lines="i do not understand at all why this movie received such good grades from critics - - i've seen tens of documentaries (on TV) about the wine world which were much much better when (if) you watch it, please think of two very annoying aspects of mondovino : first, the filming is just awful and terrible and upsetting : to me, it looked like the guy behind the camera just received the material and was playing with it : plenty of zooms (for no purpose other than pushing the button in/out) for instance - - i almost stopped to watch it because of that ! secondly, the interviewer (the director i think) is not really relevant : he looks like and ask questions like a boy scout, not like a journalist, even if the general idea and themes would have been interesting, too bad conclusion: overrated documentary, maybe only for guys who do not know nothing about wine => not recommended at all (2/10)")]

### Save the test dataset as parquet files

In [14]:
df.write.mode("overwrite").parquet("imdb_test")

                                                                                

### Check arrow memory configuration

In [15]:
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "512")
# This line will fail if the vectorized reader runs out of memory
assert len(df.head()) > 0, "`df` should not be empty"

## Inference using Spark DL API (PyTorch)
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [16]:
import pandas as pd
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.functions import col, pandas_udf, struct
from pyspark.sql.types import StringType

In [17]:
# only use first sentence and add prefix for conditional generation
def preprocess(text: pd.Series, prefix: str = "") -> pd.Series:
    @pandas_udf("string")
    def _preprocess(text: pd.Series) -> pd.Series:
        return pd.Series([prefix + s.split(".")[0] for s in text])
    return _preprocess(text)

In [18]:
# only use first N examples, since this is slow
df = spark.read.parquet("imdb_test").limit(100)
df.show(truncate=120)
df.count()

+------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                   lines|
+------------------------------------------------------------------------------------------------------------------------+
|A ridiculous movie, a terrible editing job, worst screenplay, ridiculous acting, a story that is completely ununderst...|
|                                                        Most of this film was okay, for a sequel of a sequel of a sequel|
|                                                                                                                 I tried|
|                                             This movie attempted to make Stu Ungar's life interesting by being creative|
|After I saw this I concluded that it was most likely a chick flick; afterward I found out that Keira's mother wrote t...|
|Jeff Speakman n

100

In [19]:
# only use first 100 rows, since generation takes a while
df1 = df.withColumn("input", preprocess(col("lines"), "Translate English to German: ")).select("input").limit(100).cache()

In [20]:
df1.count()

100

In [21]:
df1.show(truncate=120)

+------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                   input|
+------------------------------------------------------------------------------------------------------------------------+
|Translate English to German: A ridiculous movie, a terrible editing job, worst screenplay, ridiculous acting, a story...|
|                           Translate English to German: Most of this film was okay, for a sequel of a sequel of a sequel|
|                                                                                    Translate English to German: I tried|
|                Translate English to German: This movie attempted to make Stu Ungar's life interesting by being creative|
|Translate English to German: After I saw this I concluded that it was most likely a chick flick; afterward I found ou...|
|Translate Engli

In [22]:
def predict_batch_fn():
    import tensorflow as tf
    import numpy as np
    from transformers import TFT5ForConditionalGeneration, AutoTokenizer

    # Enable GPU memory growth
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as e:
            print(e)

    model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
    tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")

    def predict(inputs):
        flattened = np.squeeze(inputs).tolist()   # convert 2d numpy array of string into flattened python list
        input_ids = tokenizer(flattened, 
                              padding="longest", 
                              max_length=128,
                              return_tensors="tf").input_ids
        output_ids = model.generate(input_ids)
        string_outputs = np.array([tokenizer.decode(o, skip_special_tokens=True) for o in output_ids])
        print("predict: {}".format(len(flattened)))
        return string_outputs
    
    return predict

In [23]:
generate = predict_batch_udf(predict_batch_fn,
                             return_type=StringType(),
                             batch_size=10)

In [24]:
%%time
# first pass caches model/fn
preds = df1.withColumn("preds", generate(struct("input")))
results = preds.collect()

2024-09-25 17:37:52.935639: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-25 17:37:52.953270: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-25 17:37:52.958829: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-25 17:37:52.972056: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-25 17:37:58.208058: I tensorflow/core

CPU times: user 102 ms, sys: 126 ms, total: 228 ms
Wall time: 29.1 s


predict: 10
                                                                                

In [25]:
%%time
preds = df1.withColumn("preds", generate("input"))
results = preds.collect()

predict: 10                                                         (0 + 1) / 1]
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10


CPU times: user 49.6 ms, sys: 102 ms, total: 151 ms
Wall time: 19.3 s


predict: 10
                                                                                

In [26]:
%%time
preds = df1.withColumn("preds", generate(col("input")))
results = preds.collect()

predict: 10                                                         (0 + 1) / 1]
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10


CPU times: user 60.2 ms, sys: 103 ms, total: 163 ms
Wall time: 20.9 s


predict: 10
                                                                                

In [27]:
preds.show(truncate=60)

predict: 10                                                         (0 + 1) / 1]
predict: 10


+------------------------------------------------------------+------------------------------------------------------------+
|                                                       input|                                                       preds|
+------------------------------------------------------------+------------------------------------------------------------+
|Translate English to German: A ridiculous movie, a terrib...|Ein lächerlicher Film, eine schreckliche Bearbeitung, sch...|
|Translate English to German: Most of this film was okay, ...|Der größte Teil dieses Films war okay, für eine Fortsetzu...|
|                        Translate English to German: I tried|                   Ich habe versucht, Englisch zu übersetzen|
|Translate English to German: This movie attempted to make...|Dieser Film versuchte, das Leben von Stu Ungar interessan...|
|Translate English to German: After I saw this I concluded...|Nach meiner Anzeige kam ich zu dem Schluss, dass es höchs...|
|Transla

predict: 121
                                                                                

In [28]:
# only use first 100 rows, since generation takes a while
df2 = df.withColumn("input", preprocess(col("lines"), "Translate English to French: ")).select("input").limit(100).cache()

In [29]:
df2.show(truncate=120)

+------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                   input|
+------------------------------------------------------------------------------------------------------------------------+
|Translate English to French: A ridiculous movie, a terrible editing job, worst screenplay, ridiculous acting, a story...|
|                           Translate English to French: Most of this film was okay, for a sequel of a sequel of a sequel|
|                                                                                    Translate English to French: I tried|
|                Translate English to French: This movie attempted to make Stu Ungar's life interesting by being creative|
|Translate English to French: After I saw this I concluded that it was most likely a chick flick; afterward I found ou...|
|Translate Engli

In [30]:
%%time
# first pass caches model/fn
preds = df2.withColumn("preds", generate(struct("input")))
result = preds.collect()

2024-09-25 17:39:09.073889: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-25 17:39:09.092523: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-25 17:39:09.097933: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-25 17:39:09.111018: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-25 17:39:14.649415: I tensorflow/core

CPU times: user 118 ms, sys: 119 ms, total: 237 ms
Wall time: 30.6 s


predict: 10
                                                                                

In [31]:
%%time
preds = df2.withColumn("preds", generate("input"))
result = preds.collect()

predict: 10                                                         (0 + 1) / 1]
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10


CPU times: user 80.7 ms, sys: 70.2 ms, total: 151 ms
Wall time: 20.5 s


predict: 10
                                                                                

In [32]:
%%time
preds = df2.withColumn("preds", generate(col("input")))
result = preds.collect()

predict: 10                                                         (0 + 1) / 1]
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10
predict: 10


CPU times: user 65 ms, sys: 80.8 ms, total: 146 ms
Wall time: 20.4 s


predict: 10
                                                                                

In [33]:
preds.show(truncate=60)

predict: 10                                                         (0 + 1) / 1]
predict: 10


+------------------------------------------------------------+------------------------------------------------------------+
|                                                       input|                                                       preds|
+------------------------------------------------------------+------------------------------------------------------------+
|Translate English to French: A ridiculous movie, a terrib...|Un film ridicule, un terrible travail de rédaction, le pi...|
|Translate English to French: Most of this film was okay, ...|La plupart de ce film était en bonne et due forme, pour u...|
|                        Translate English to French: I tried|                                                 J'ai essayé|
|Translate English to French: This movie attempted to make...|Ce film tentait de rendre la vie de Stu Ungar intéressant...|
|Translate English to French: After I saw this I concluded...|Après avoir vu ce film, j'ai conclu qu'il était très prob...|
|Transla

predict: 121
                                                                                

### Using Triton Inference Server

Note: you can restart the kernel and run from this point to simulate running in a different node or environment.  
While the examples above use Tensorflow, note that inference on the Triton server is run using PyTorch. PyTorch will be included in the tarball below, and no further environment changes are required by the user.

This notebook uses the [Python backend with a custom execution environment](https://github.com/triton-inference-server/python_backend#creating-custom-execution-environments) with the compatible versions of Python/Numpy for Triton 24.08, using a conda-pack environment created as follows:
```
conda create -n huggingface -c conda-forge python=3.10.0
conda activate huggingface

export PYTHONNOUSERSITE=True
pip install numpy<2 conda-pack sentencepiece sentence_transformers transformers

conda-pack  # huggingface.tar.gz
```

In [34]:
import os

In [35]:
%%bash
# copy custom model to expected layout for Triton
rm -rf models
mkdir -p models
cp -r models_config/hf_generation models

# add custom execution environment
cp huggingface.tar.gz models

#### Start Triton Server on each executor

In [36]:
num_executors = 1
triton_models_dir = "{}/models".format(os.getcwd())
huggingface_cache_dir = "{}/.cache/huggingface".format(os.path.expanduser('~'))
nodeRDD = sc.parallelize(list(range(num_executors)), num_executors)

def start_triton(it):
    import docker
    import time
    import tritonclient.grpc as grpcclient
    
    client=docker.from_env()
    containers=client.containers.list(filters={"name": "spark-triton"})
    if containers:
        print(">>>> containers: {}".format([c.short_id for c in containers]))
    else:
        container=client.containers.run(
            "nvcr.io/nvidia/tritonserver:24.08-py3", "tritonserver --model-repository=/models",
            detach=True,
            device_requests=[docker.types.DeviceRequest(device_ids=["0"], capabilities=[['gpu']])],
            environment=[
                "TRANSFORMERS_CACHE=/cache"
            ],
            name="spark-triton",
            network_mode="host",
            remove=True,
            shm_size="1G",
            volumes={
                triton_models_dir: {"bind": "/models", "mode": "ro"},
                huggingface_cache_dir: {"bind": "/cache", "mode": "rw"}
            }
        )
        print(">>>> starting triton: {}".format(container.short_id))

        # wait for triton to be running
        time.sleep(15)
        client = grpcclient.InferenceServerClient("localhost:8001")
        ready = False
        while not ready:
            try:
                ready = client.is_server_ready()
            except Exception as e:
                time.sleep(5)

    return [True]

nodeRDD.barrier().mapPartitions(start_triton).collect()

>>>> starting triton: 5e892cc5bbfe                                  (0 + 1) / 1]
                                                                                

[True]

#### Run inference

In [37]:
import pandas as pd
from functools import partial
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.functions import col, pandas_udf, struct
from pyspark.sql.types import StringType

In [38]:
# only use first N examples, since this is slow
df = spark.read.parquet("imdb_test").limit(100).cache()

In [39]:
# only use first sentence and add prefix for conditional generation
def preprocess(text: pd.Series, prefix: str = "") -> pd.Series:
    @pandas_udf("string")
    def _preprocess(text: pd.Series) -> pd.Series:
        return pd.Series([prefix + s.split(".")[0] for s in text])
    return _preprocess(text)

In [40]:
# only use first 100 rows, since generation takes a while
df1 = df.withColumn("input", preprocess(col("lines"), "Translate English to German: ")).select("input").limit(100)

In [41]:
df1.show(truncate=120)

+------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                   input|
+------------------------------------------------------------------------------------------------------------------------+
|Translate English to German: A ridiculous movie, a terrible editing job, worst screenplay, ridiculous acting, a story...|
|                           Translate English to German: Most of this film was okay, for a sequel of a sequel of a sequel|
|                                                                                    Translate English to German: I tried|
|                Translate English to German: This movie attempted to make Stu Ungar's life interesting by being creative|
|Translate English to German: After I saw this I concluded that it was most likely a chick flick; afterward I found ou...|
|Translate Engli

In [42]:
def triton_fn(triton_uri, model_name):
    import numpy as np
    import tritonclient.grpc as grpcclient
    
    np_types = {
      "BOOL": np.dtype(np.bool_),
      "INT8": np.dtype(np.int8),
      "INT16": np.dtype(np.int16),
      "INT32": np.dtype(np.int32),
      "INT64": np.dtype(np.int64),
      "FP16": np.dtype(np.float16),
      "FP32": np.dtype(np.float32),
      "FP64": np.dtype(np.float64),
      "FP64": np.dtype(np.double),
      "BYTES": np.dtype(object)
    }

    client = grpcclient.InferenceServerClient(triton_uri)
    model_meta = client.get_model_metadata(model_name)
    
    def predict(inputs):
        if isinstance(inputs, np.ndarray):
            # single ndarray input
            request = [grpcclient.InferInput(model_meta.inputs[0].name, inputs.shape, model_meta.inputs[0].datatype)]
            request[0].set_data_from_numpy(inputs.astype(np_types[model_meta.inputs[0].datatype]))
        else:
            # dict of multiple ndarray inputs
            request = [grpcclient.InferInput(i.name, inputs[i.name].shape, i.datatype) for i in model_meta.inputs]
            for i in request:
                i.set_data_from_numpy(inputs[i.name()].astype(np_types[i.datatype()]))
        
        response = client.infer(model_name, inputs=request)
        
        if len(model_meta.outputs) > 1:
            # return dictionary of numpy arrays
            return {o.name: response.as_numpy(o.name) for o in model_meta.outputs}
        else:
            # return single numpy array
            return response.as_numpy(model_meta.outputs[0].name)
        
    return predict

In [43]:
generate = predict_batch_udf(partial(triton_fn, triton_uri="localhost:8001", model_name="hf_generation"),
                             return_type=StringType(),
                             input_tensor_shapes=[[1]],
                             batch_size=100)

In [44]:
%%time
# first pass caches model/fn
preds = df1.withColumn("preds", generate(struct("input")))
results = preds.collect()



CPU times: user 12.1 ms, sys: 14.1 ms, total: 26.1 ms
Wall time: 2.62 s


                                                                                

In [45]:
%%time
preds = df1.withColumn("preds", generate("input"))
results = preds.collect()

[Stage 47:>                                                         (0 + 1) / 1]

CPU times: user 12.6 ms, sys: 6.19 ms, total: 18.8 ms
Wall time: 1.93 s


                                                                                

In [46]:
%%time
preds = df1.withColumn("preds", generate(col("input")))
results = preds.collect()

[Stage 49:>                                                         (0 + 1) / 1]

CPU times: user 11.9 ms, sys: 6.23 ms, total: 18.2 ms
Wall time: 1.93 s


                                                                                

In [47]:
preds.show(truncate=60)

+------------------------------------------------------------+------------------------------------------------------------+
|                                                       input|                                                       preds|
+------------------------------------------------------------+------------------------------------------------------------+
|Translate English to German: A ridiculous movie, a terrib...|Ein lächerlicher Film, eine schreckliche Bearbeitung, sch...|
|Translate English to German: Most of this film was okay, ...|Der größte Teil dieses Films war okay, für eine Fortsetzu...|
|                        Translate English to German: I tried|                   Ich habe versucht, Englisch zu übersetzen|
|Translate English to German: This movie attempted to make...|Dieser Film versuchte, das Leben von Stu Ungar interessan...|
|Translate English to German: After I saw this I concluded...|Nach meiner Anzeige kam ich zu dem Schluss, dass es höchs...|
|Transla

In [48]:
# only use first 100 rows, since generation takes a while
df2 = df.withColumn("input", preprocess(col("lines"), "Translate English to French: ")).select("input").limit(100).cache()

24/09/25 17:41:32 WARN CacheManager: Asked to cache already cached data.


In [49]:
df2.show(truncate=120)

+------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                   input|
+------------------------------------------------------------------------------------------------------------------------+
|Translate English to French: A ridiculous movie, a terrible editing job, worst screenplay, ridiculous acting, a story...|
|                           Translate English to French: Most of this film was okay, for a sequel of a sequel of a sequel|
|                                                                                    Translate English to French: I tried|
|                Translate English to French: This movie attempted to make Stu Ungar's life interesting by being creative|
|Translate English to French: After I saw this I concluded that it was most likely a chick flick; afterward I found ou...|
|Translate Engli

In [50]:
%%time
preds = df2.withColumn("preds", generate(struct("input")))
results = preds.collect()

                                                                                

CPU times: user 226 ms, sys: 86.9 ms, total: 313 ms
Wall time: 3.01 s


In [51]:
%%time
preds = df2.withColumn("preds", generate("input"))
results = preds.collect()

[Stage 57:>                                                         (0 + 1) / 1]

CPU times: user 11.4 ms, sys: 9.74 ms, total: 21.2 ms
Wall time: 2.04 s


                                                                                

In [52]:
%%time
preds = df2.withColumn("preds", generate(col("input")))
results = preds.collect()

[Stage 59:>                                                         (0 + 1) / 1]

CPU times: user 14 ms, sys: 5 ms, total: 19 ms
Wall time: 2.05 s


                                                                                

In [53]:
preds.show(truncate=60)

+------------------------------------------------------------+------------------------------------------------------------+
|                                                       input|                                                       preds|
+------------------------------------------------------------+------------------------------------------------------------+
|Translate English to French: A ridiculous movie, a terrib...|Un film ridicule, un terrible travail de rédaction, le pi...|
|Translate English to French: Most of this film was okay, ...|La plupart de ce film était en bonne et due forme, pour u...|
|                        Translate English to French: I tried|                                                 J'ai essayé|
|Translate English to French: This movie attempted to make...|Ce film tentait de rendre la vie de Stu Ungar intéressant...|
|Translate English to French: After I saw this I concluded...|Après avoir vu ce film, j'ai conclu qu'il était très prob...|
|Transla

#### Stop Triton Server on each executor

In [54]:
def stop_triton(it):
    import docker
    import time
    
    client=docker.from_env()
    containers=client.containers.list(filters={"name": "spark-triton"})
    print(">>>> stopping containers: {}".format([c.short_id for c in containers]))
    if containers:
        container=containers[0]
        container.stop(timeout=120)

    return [True]

nodeRDD.barrier().mapPartitions(stop_triton).collect()

>>>> stopping containers: ['5e892cc5bbfe']
                                                                                

[True]

In [55]:
spark.stop()