# GPU-Accelerated Spark Connect Demo - ETL and ML Pipeline (Spark 4.0+)

Based on the Data and AI Summit 2025 session: [GPU Accelerated Spark Connect](https://www.databricks.com/dataaisummit/session/gpu-accelerated-spark-connect)


## Connect to Spark via Spark Connect


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Open a session against the GPU-accelerated Spark Connect (4.0+) endpoint
spark = SparkSession.builder \
    .remote('sc://spark-connect-server') \
    .appName('GPU-Accelerated-ETL-ML-Demo') \
    .getOrCreate()

# Print the Spark version reported by the session
print(f'Spark version: {spark.version}')

Spark version: 4.0.0


## Smoke Test GPU


In [2]:
# Smoke test: bucket 2**35 generated ids by `id % 10` and count each bucket.
df = (
  spark.range(2 ** 35)
    .withColumn('mod10', col('id') % lit(10))
    .groupBy('mod10').agg(count('*'))
    .orderBy('mod10')
)
df.show()
# Workaround to get a plan with GpuOverrides applied: explain with adaptive
# execution disabled. The previous setting is remembered and restored in
# `finally`, so a failing explain() cannot leave adaptive execution off and a
# session that already had it disabled is not silently switched on.
_aqe_key = 'spark.sql.adaptive.enabled'
_aqe_before = spark.conf.get(_aqe_key)
spark.conf.set(_aqe_key, False)
try:
  df.explain(mode='formatted')
finally:
  spark.conf.set(_aqe_key, _aqe_before)

+-----+----------+
|mod10|  count(1)|
+-----+----------+
|    0|3435973837|
|    1|3435973837|
|    2|3435973837|
|    3|3435973837|
|    4|3435973837|
|    5|3435973837|
|    6|3435973837|
|    7|3435973837|
|    8|3435973836|
|    9|3435973836|
+-----+----------+

== Physical Plan ==
GpuColumnarToRow (10)
+- GpuSort (9)
   +- GpuShuffleCoalesce (8)
      +- GpuColumnarExchange (7)
         +- GpuHashAggregate (6)
            +- GpuShuffleCoalesce (5)
               +- GpuColumnarExchange (4)
                  +- GpuHashAggregate (3)
                     +- GpuProject (2)
                        +- GpuRange (1)


(1) GpuRange
Output [1]: [id#1L]
Arguments: 0, 34359738368, 1, 64, [id#1L], 536870912

(2) GpuProject
Input [1]: [id#1L]
Arguments: [(id#1L % 10) AS mod10#36L], true

(3) GpuHashAggregate
Input [1]: [mod10#36L]
Keys [1]: [mod10#36L]
Functions [1]: [partial_gpucount(1, false)]
Aggregate Attributes [1]: [count#39L]
Results [2]: [mod10#36L, count#40L]
Lore: 

(4) GpuColumnarExch

##  Should GPU Be Used from the next cell on?


In [3]:
# Single flag consumed by the cells below: it is passed to
# 'spark.rapids.sql.enabled' and selects the Spark Connect ML backend.
accelerate_on_gpu = True

### ETL on GPU?

In [4]:
# Apply the flag to this session's 'spark.rapids.sql.enabled' setting.
# NOTE(review): presumably this is the RAPIDS switch for running SQL/ETL on the GPU — confirm against the plugin docs.
spark.conf.set('spark.rapids.sql.enabled', accelerate_on_gpu)  

### ML on GPU?

In [5]:
# Register the NVIDIA plugin class as the Spark Connect ML backend when GPU
# acceleration is requested; otherwise clear the setting for this session.
_ml_backend_key = 'spark.connect.ml.backend.classes'
if not accelerate_on_gpu:
  spark.conf.unset(_ml_backend_key)
else:
  spark.conf.set(_ml_backend_key, 'com.nvidia.rapids.ml.Plugin')

## Normalize references to the same bank 

In [6]:
import csv

# Load the seller-name normalization table. Each CSV row is expected to hold
# two fields: the raw seller name and the name it should be mapped to.
# newline='' is what the csv module asks of file objects it reads, so that
# newlines embedded in quoted fields are handled correctly.
with open('work/name_mapping.csv', 'r', newline='') as name_mapping_file:
  nm_reader = csv.reader(name_mapping_file)
  # csv.reader yields [] for blank lines; drop them because an empty row
  # does not fit the two-column schema used below.
  name_mapping = [r for r in nm_reader if r]
name_mapping_df = spark.createDataFrame(name_mapping, ['from_seller_name', 'to_seller_name'])

# Sanity check: list every raw spelling that is mapped to 'Wells Fargo'.
(
  name_mapping_df
    .where(col('to_seller_name') == 'Wells Fargo')
    .show(truncate=False)
)

+------------------------------------------------------+--------------+
|from_seller_name                                      |to_seller_name|
+------------------------------------------------------+--------------+
|WELLS FARGO CREDIT RISK TRANSFER SECURITIES TRUST 2015|Wells Fargo   |
|WELLS FARGO BANK,  NA                                 |Wells Fargo   |
|WELLS FARGO BANK, N.A.                                |Wells Fargo   |
|WELLS FARGO BANK, NA                                  |Wells Fargo   |
+------------------------------------------------------+--------------+



In [7]:
# String columns (converted to numeric ids later in the pipeline)
cate_col_names = [
  'orig_channel', 'first_home_buyer', 'loan_purpose', 'property_type',
  'occupancy_status', 'property_state', 'product_type',
  'relocation_mortgage_indicator', 'seller_name', 'mod_flag',
]
# Label column; it is appended as the last entry of the numeric columns
label_col_name = 'delinquency_12'
# Numeric columns
numeric_col_names = [
  'orig_interest_rate', 'orig_upb', 'orig_loan_term',
  'orig_ltv', 'orig_cltv', 'num_borrowers',
  'dti', 'borrower_credit_score', 'num_units',
  'zip', 'mortgage_insurance_percent', 'current_loan_delinquency_status',
  'current_actual_upb', 'interest_rate', 'loan_age',
  'msa', 'non_interest_bearing_upb',
]
numeric_col_names.append(label_col_name)
# Every column kept in the final table: string columns first, then numeric
all_col_names = [*cate_col_names, *numeric_col_names]

## Define ETL Process

### Functions to read raw columns

In [8]:
def read_raw_csv(spark, path):
  """Load the header-less, '|'-delimited raw CSV at `path` as a DataFrame.

  The schema is read as a DDL string from 'work/csv_raw_schema.ddl' and empty
  fields are treated as null. A 'quarter' column is appended, derived from the
  input file name: the text before the first '.' of the file path, reduced to
  its last '/'-separated component.
  """
  with open('work/csv_raw_schema.ddl', 'r') as ddl_file:
    raw_schema_ddl = ddl_file.read()

  # file path -> everything before the first '.' -> last path segment
  path_up_to_first_dot = substring_index(input_file_name(), '.', 1)
  quarter_col = substring_index(path_up_to_first_dot, '/', -1)

  csv_reader = spark.read.format('csv').options(
    nullValue='',
    header=False,
    delimiter='|',
  )
  raw_df = csv_reader.schema(raw_schema_ddl).load(path)
  return raw_df.withColumn('quarter', quarter_col)

def extract_perf_columns(rawDf):
  """Select the loan-performance columns from the raw DataFrame.

  Date columns are parsed from their raw 'MMyyyy' form and re-rendered as
  strings ('MM/dd/yyyy' or 'MM/yyyy'), `servicer` is upper-cased, and every
  other column is passed through unchanged. Only rows satisfying
  `current_actual_upb != 0.0` are returned.
  """
  def _reformatted(name, out_pattern):
    # Parse the 'MMyyyy' string and format it again, keeping the column name.
    return date_format(to_date(col(name), 'MMyyyy'), out_pattern).alias(name)

  perf_columns = [
    col('loan_id'),
    _reformatted('monthly_reporting_period', 'MM/dd/yyyy'),
    upper(col('servicer')).alias('servicer'),
    col('interest_rate'),
    col('current_actual_upb'),
    col('loan_age'),
    col('remaining_months_to_legal_maturity'),
    col('adj_remaining_months_to_maturity'),
    _reformatted('maturity_date', 'MM/yyyy'),
    col('msa'),
    col('current_loan_delinquency_status'),
    col('mod_flag'),
    col('zero_balance_code'),
    _reformatted('zero_balance_effective_date', 'MM/yyyy'),
    _reformatted('last_paid_installment_date', 'MM/dd/yyyy'),
    _reformatted('foreclosed_after', 'MM/dd/yyyy'),
    _reformatted('disposition_date', 'MM/dd/yyyy'),
    col('foreclosure_costs'),
    col('prop_preservation_and_repair_costs'),
    col('asset_recovery_costs'),
    col('misc_holding_expenses'),
    col('holding_taxes'),
    col('net_sale_proceeds'),
    col('credit_enhancement_proceeds'),
    col('repurchase_make_whole_proceeds'),
    col('other_foreclosure_proceeds'),
    col('non_interest_bearing_upb'),
    col('principal_forgiveness_upb'),
    col('repurchase_make_whole_proceeds_flag'),
    col('foreclosure_principal_write_off_amount'),
    col('servicing_activity_indicator'),
    col('quarter'),
  ]
  projected = rawDf.select(*perf_columns)
  return projected.filter('current_actual_upb != 0.0')

def extract_acq_columns(rawDf):
  """Select the loan-acquisition columns from the raw DataFrame.

  `seller_name` is upper-cased, `orig_date` and `first_pay_date` are parsed
  from 'MMyyyy' and re-rendered as 'MM/yyyy' strings, and a `rank` column is
  added: the dense rank of each row within its `loan_id`, ordered ascending by
  the parsed `monthly_reporting_period`. Only rows with `rank == 1` are
  returned; the `rank` column is kept in the result.
  """
  # Window is not brought into scope by `from pyspark.sql.functions import *`,
  # so import it here; without it this function raises NameError when called.
  from pyspark.sql import Window

  # Rank rows per loan by reporting period so the first period can be kept.
  first_period_window = Window.partitionBy('loan_id').orderBy(
    to_date(col('monthly_reporting_period'), 'MMyyyy')
  )

  acqDf = rawDf.select(
    col('loan_id'),
    col('orig_channel'),
    upper(col('seller_name')).alias('seller_name'),
    col('orig_interest_rate'),
    col('orig_upb'),
    col('orig_loan_term'),
    date_format(to_date(col('orig_date'),'MMyyyy'), 'MM/yyyy').alias('orig_date'),
    date_format(to_date(col('first_pay_date'),'MMyyyy'), 'MM/yyyy').alias('first_pay_date'),
    col('orig_ltv'),
    col('orig_cltv'),
    col('num_borrowers'),
    col('dti'),
    col('borrower_credit_score'),
    col('first_home_buyer'),
    col('loan_purpose'),
    col('property_type'),
    col('num_units'),
    col('occupancy_status'),
    col('property_state'),
    col('zip'),
    col('mortgage_insurance_percent'),
    col('product_type'),
    col('coborrow_credit_score'),
    col('mortgage_insurance_type'),
    col('relocation_mortgage_indicator'),
    dense_rank().over(first_period_window).alias('rank'),
    col('quarter')
  )

  return acqDf.select('*').filter(col('rank')==1)

### Define function to parse date in Performance data 

In [9]:
def _parse_dates(perf):
  """Convert the string date columns of the performance data to dates.

  `monthly_reporting_period` is parsed with 'MM/dd/yyyy' and additionally
  split into `_month`, `_year` and `_day` columns. The remaining date columns
  are parsed in place with the pattern listed next to each of them.
  """
  parsed = perf.withColumn(
    "monthly_reporting_period",
    to_date(col("monthly_reporting_period"), "MM/dd/yyyy"),
  )

  # Month / year / day-of-month extracted from the parsed reporting period
  reporting_parts = (
    ("monthly_reporting_period_month", month),
    ("monthly_reporting_period_year", year),
    ("monthly_reporting_period_day", dayofmonth),
  )
  for part_name, extract in reporting_parts:
    parsed = parsed.withColumn(part_name, extract(col("monthly_reporting_period")))

  # Remaining date strings, each paired with the pattern used to parse it
  other_dates = (
    ("last_paid_installment_date", "MM/dd/yyyy"),
    ("foreclosed_after", "MM/dd/yyyy"),
    ("disposition_date", "MM/dd/yyyy"),
    ("maturity_date", "MM/yyyy"),
    ("zero_balance_effective_date", "MM/yyyy"),
  )
  for date_name, pattern in other_dates:
    parsed = parsed.withColumn(date_name, to_date(col(date_name), pattern))

  return parsed

### Define function to create delinquency data frame from Performance data

The computed `delinquency_12` column denotes whether a loan will become delinquent by 3, 6, or 9 months, 
or not delinquent, within the next 12 month period.   

It will be the target label for ML multi-class prediction.

In [10]:
def _create_perf_deliquency(spark, perf):
    """Attach the `delinquency_12` label and related columns to `perf`.

    Steps performed below:
      1. Aggregate per (quarter, loan_id) into `ever_30/90/180` flags and the
         smallest reporting period at which the delinquency status reached
         1, 3 and 6.
      2. Join those aggregates back onto the per-month rows.
      3. Expand every row over `month_y` = 0..11, group, and derive a
         `timestamp_year` / `timestamp_month` pair and a `delinquency_12`
         value from the grouped maximum status and minimum UPB.
      4. Left-join the result onto `perf` by quarter, loan_id, year and month.

    `perf` is expected to carry the columns produced by `_parse_dates`
    (`monthly_reporting_period_month` / `_year` are referenced here).
    The `spark` parameter is not used inside this function.

    NOTE(review): `max` / `min` below are called with column names inside
    `agg`, so they are presumably the pyspark.sql.functions aggregates brought
    in by the star import rather than the Python builtins — confirm.
    """
    # Step 1: per (quarter, loan_id) aggregates. Each `when` without an
    # `otherwise` yields the reporting period only for rows meeting the
    # threshold, so the `min` picks the smallest such period.
    aggDF = perf.select(
            col("quarter"),
            col("loan_id"),
            col("current_loan_delinquency_status"),
            when(col("current_loan_delinquency_status") >= 1, col("monthly_reporting_period")).alias("delinquency_30"),
            when(col("current_loan_delinquency_status") >= 3, col("monthly_reporting_period")).alias("delinquency_90"),
            when(col("current_loan_delinquency_status") >= 6, col("monthly_reporting_period")).alias("delinquency_180")) \
            .groupBy("quarter", "loan_id") \
            .agg(
                max("current_loan_delinquency_status").alias("delinquency_12"),
                min("delinquency_30").alias("delinquency_30"),
                min("delinquency_90").alias("delinquency_90"),
                min("delinquency_180").alias("delinquency_180")) \
            .select(
                col("quarter"),
                col("loan_id"),
                (col("delinquency_12") >= 1).alias("ever_30"),
                (col("delinquency_12") >= 3).alias("ever_90"),
                (col("delinquency_12") >= 6).alias("ever_180"),
                col("delinquency_30"),
                col("delinquency_90"),
                col("delinquency_180"))
    #aggDF.printSchema()
    # Step 2: rename the per-month columns (status -> delinquency_12,
    # current UPB -> upb_12, reporting period -> timestamp*) and attach the
    # per-loan aggregates with a left outer join.
    joinedDf = perf \
            .withColumnRenamed("monthly_reporting_period", "timestamp") \
            .withColumnRenamed("monthly_reporting_period_month", "timestamp_month") \
            .withColumnRenamed("monthly_reporting_period_year", "timestamp_year") \
            .withColumnRenamed("current_loan_delinquency_status", "delinquency_12") \
            .withColumnRenamed("current_actual_upb", "upb_12") \
            .select("quarter", "loan_id", "timestamp", "delinquency_12", "upb_12", "timestamp_month", "timestamp_year") \
            .join(aggDF, ["loan_id", "quarter"], "left_outer")

    # calculate the 12 month delinquency and upb values
    months = 12
    # Literal offsets 0..11; exploding them below yields 12 rows per input row.
    monthArray = [lit(x) for x in range(0, 12)]

    # Step 3: for each offset `month_y`, compute `josh_mody_n` as
    # floor((year * 12 + month - 24000 - month_y) / months), then group on it
    # to take the max status and min UPB. `josh_mody` is selected here but is
    # not a grouping key, so it does not survive the aggregation.
    # After the aggregation:
    #   timestamp_year  = floor((24000 + josh_mody_n * months + (month_y - 1)) / 12)
    #   timestamp_month = pmod(24000 + josh_mody_n * months + month_y, 12), with 0 mapped to 12
    #   delinquency_12  = (max status > 9) + (> 6) + (> 3) + (min upb_12 == 0), each cast to int
    testDf = joinedDf \
            .withColumn("month_y", explode(array(monthArray))) \
            .select(
                    col("quarter"),
                    floor(((col("timestamp_year") * 12 + col("timestamp_month")) - 24000) / months).alias("josh_mody"),
                    floor(((col("timestamp_year") * 12 + col("timestamp_month")) - 24000 - col("month_y")) / months).alias("josh_mody_n"),
                    col("ever_30"),
                    col("ever_90"),
                    col("ever_180"),
                    col("delinquency_30"),
                    col("delinquency_90"),
                    col("delinquency_180"),
                    col("loan_id"),
                    col("month_y"),
                    col("delinquency_12"),
                    col("upb_12")) \
            .groupBy("quarter", "loan_id", "josh_mody_n", "ever_30", "ever_90", "ever_180", "delinquency_30", "delinquency_90", "delinquency_180", "month_y") \
            .agg(max("delinquency_12").alias("delinquency_12"), min("upb_12").alias("upb_12")) \
            .withColumn("timestamp_year", floor((lit(24000) + (col("josh_mody_n") * lit(months)) + (col("month_y") - 1)) / lit(12))) \
            .selectExpr("*", "pmod(24000 + (josh_mody_n * {}) + month_y, 12) as timestamp_month_tmp".format(months)) \
            .withColumn("timestamp_month", when(col("timestamp_month_tmp") == lit(0), lit(12)).otherwise(col("timestamp_month_tmp"))) \
            .withColumn("delinquency_12", ((col("delinquency_12") > 9).cast("int") + (col("delinquency_12") > 6).cast("int") + (col("delinquency_12") > 3).cast("int") + (col("upb_12") == 0).cast("int")).alias("delinquency_12")) \
            .drop("timestamp_month_tmp", "josh_mody_n", "month_y")

    # Step 4: left-join the derived columns onto the original performance rows
    # by quarter, loan and (year, month); the two join-only timestamp columns
    # are dropped from the result.
    return perf.withColumnRenamed("monthly_reporting_period_month", "timestamp_month") \
            .withColumnRenamed("monthly_reporting_period_year", "timestamp_year") \
            .join(testDf, ["quarter", "loan_id", "timestamp_year", "timestamp_month"], "left") \
            .drop("timestamp_year", "timestamp_month")

### Define function to create acquisition data frame from Acquisition data

In [11]:
def _create_acquisition(spark, acq):
    """Normalize seller names and parse the date columns of the acquisition data.

    `acq` is left-joined with the module-level `name_mapping_df` on
    `seller_name == from_seller_name`. The original name is kept in
    `old_name`, and `seller_name` becomes the mapped name when one exists.
    `orig_date` and `first_pay_date` are parsed with the 'MM/yyyy' pattern.
    The `spark` parameter is not used inside this function.
    """
    with_mapping = acq.join(
        name_mapping_df,
        col("seller_name") == col("from_seller_name"),
        "left",
    ).drop("from_seller_name")

    # Keep the raw name, then prefer the mapped name when the join found one
    normalized = (
        with_mapping
        .withColumn("old_name", col("seller_name"))
        .withColumn("seller_name", coalesce(col("to_seller_name"), col("seller_name")))
        .drop("to_seller_name")
    )

    for date_name in ("orig_date", "first_pay_date"):
        normalized = normalized.withColumn(date_name, to_date(col(date_name), "MM/yyyy"))
    return normalized

### Define Casting Process


This part casts string columns to numeric ones.
Example:
```
col_1
 "a"
 "b"
 "c"
 "a"
# After String ====> Numeric
col_1
 0
 1
 2
 0
```  

### Define function to get column dictionary

Example

```
col1 = [row(data="a",id=0), row(data="b",id=1)]
```

In [12]:
def _gen_dictionary(etl_df, col_names):
    """Build a (column_id, data, id) lookup table for the given columns.

    `column_id` is the position of a column within `col_names`, `data` is a
    non-null value seen in that column, and `id` is the value's row number
    within its column when values are ordered by descending occurrence count.
    Because `row_number` is used, ids start at 1.
    """
    # Window is not brought into scope by `from pyspark.sql.functions import *`,
    # so import it here; without it this function raises NameError when called.
    from pyspark.sql import Window

    # One row per (column position, value) with its number of occurrences
    cnt_table = (
        etl_df.select(posexplode(array([col(i) for i in col_names])))
        .withColumnRenamed("pos", "column_id")
        .withColumnRenamed("col", "data")
        .filter("data is not null")
        .groupBy("column_id", "data")
        .count()
    )
    # Order by count, then by the value itself: values with equal counts would
    # otherwise receive ids in an arbitrary, run-dependent order.
    windowed = Window.partitionBy("column_id").orderBy(desc("count"), col("data"))
    return cnt_table.withColumn("id", row_number().over(windowed)).drop("count")

### Define function to convert string columns to numeric



In [13]:
def _cast_string_columns_to_numeric(spark, input_df):
    """Replace every column listed in `cate_col_names` by its dictionary id.

    The dictionary from `_gen_dictionary` is cached and reused for each
    column: its slice for the column is broadcast-joined (left join) onto the
    data, the original column is dropped and the joined `id` takes its name.
    The `spark` parameter is not used inside this function.
    """
    dictionary = _gen_dictionary(input_df, cate_col_names).cache()
    encoded = input_df
    # Swap the string columns for their ids one column at a time
    for position, name in enumerate(cate_col_names):
        lookup = (
            dictionary
            .filter(col("column_id") == position)
            .drop("column_id")
            .withColumnRenamed("data", name)
        )
        joined = encoded.join(broadcast(lookup), name, "left")
        encoded = joined.drop(name).withColumnRenamed("id", name)
    return encoded

### Define Main Function

In this function:
1. Parse date in Performance data by calling _parse_dates (parsed_perf)
2. Create delinquency dataframe (perf_deliqency) from Performance data by calling _create_perf_deliquency
3. Create cleaned acquisition dataframe(cleaned_acq) from Acquisition data by calling _create_acquisition
4. Join delinquency dataframe (perf_deliqency) and cleaned acquisition dataframe (cleaned_acq), get clean_df
5. Cast String column to Numeric in clean_df by calling _cast_string_columns_to_numeric, get casted_clean_df
6. Return casted_clean_df as final result

In [14]:
def run_mortgage(spark, perf, acq):
    """Run the full ETL and return the all-numeric table.

    Parses the performance dates, derives the delinquency columns, cleans the
    acquisition data, inner-joins both on (loan_id, quarter), converts the
    string columns to ids and keeps only `all_col_names`. Label values that
    are not greater than 0 are replaced by 0, and remaining nulls are filled
    with 0.0.
    """
    perf_with_label = _create_perf_deliquency(spark, _parse_dates(perf))
    acquisition = _create_acquisition(spark, acq)

    joined = perf_with_label.join(acquisition, ["loan_id", "quarter"], "inner")
    numeric_only = _cast_string_columns_to_numeric(spark, joined.drop("quarter"))

    label = col(label_col_name)
    labelled = numeric_only.select(all_col_names).withColumn(
        label_col_name,
        when(label > 0, label).otherwise(0),
    )
    return labelled.fillna(float(0))

### Run ETL Pipeline

#### Read Raw Data and Run ETL Process, Save the Result

Convert CSV to Parquet

In [15]:
# Read the raw CSV and store it as Parquet, overwriting any previous output.
# NOTE(review): the output recorded for this cell shows the write failing on an
# executor with "Mkdirs failed to create file:/opt/spark/work-dir/parquet/_temporary/...";
# presumably the target directory must be creatable by the executors — confirm.
rawDf = read_raw_csv(spark, '/data/input.csv')
rawDf.write.mode('overwrite').parquet('/opt/spark/work-dir/parquet')

SparkException: Job aborted.

JVM stacktrace:
org.apache.spark.SparkException
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.writeAndCommit(GpuFileFormatWriter.scala:316)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.executeWrite(GpuFileFormatWriter.scala:331)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.executeWrite$(GpuFileFormatWriter.scala:323)
	at org.apache.spark.sql.rapids.GpuFileFormatWriter$.executeWrite(GpuFileFormatWriter.scala:493)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.write(GpuFileFormatWriter.scala:197)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.write$(GpuFileFormatWriter.scala:83)
	at org.apache.spark.sql.rapids.GpuFileFormatWriter$.write(GpuFileFormatWriter.scala:493)
	at org.apache.spark.sql.rapids.GpuInsertIntoHadoopFsRelationCommand.runColumnar(GpuInsertIntoHadoopFsRelationCommand.scala:190)
	at com.nvidia.spark.rapids.GpuDataWritingCommandExec.sideEffectResult$lzycompute(GpuDataWritingCommandExec.scala:125)
	at com.nvidia.spark.rapids.GpuDataWritingCommandExec.sideEffectResult(GpuDataWritingCommandExec.scala:120)
	at com.nvidia.spark.rapids.GpuDataWritingCommandExec.internalDoExecuteColumnar(GpuDataWritingCommandExec.scala:157)
	at com.nvidia.spark.rapids.GpuExec.doExecuteColumnar(GpuExec.scala:193)
	at com.nvidia.spark.rapids.GpuExec.doExecuteColumnar$(GpuExec.scala:191)
	at com.nvidia.spark.rapids.GpuDataWritingCommandExec.doExecuteColumnar(GpuDataWritingCommandExec.scala:116)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeColumnarRDD$1(SparkPlan.scala:222)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1378)
	at org.apache.spark.util.LazyTry.tryT$lzycompute(LazyTry.scala:46)
	at org.apache.spark.util.LazyTry.tryT(LazyTry.scala:46)
	at org.apache.spark.util.LazyTry.get(LazyTry.scala:58)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeColumnar$1(SparkPlan.scala:236)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:260)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:257)
	at org.apache.spark.sql.execution.SparkPlan.executeColumnar(SparkPlan.scala:232)
	at com.nvidia.spark.rapids.GpuColumnarToRowExec.doExecute(GpuColumnarToRowExec.scala:365)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeRDD$1(SparkPlan.scala:188)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1378)
	at org.apache.spark.util.LazyTry.tryT$lzycompute(LazyTry.scala:46)
	at org.apache.spark.util.LazyTry.tryT(LazyTry.scala:46)
	at org.apache.spark.util.LazyTry.get(LazyTry.scala:58)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:201)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:260)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:257)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:197)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:378)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:459)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$eagerlyExecuteCommands$2(QueryExecution.scala:155)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$8(SQLExecution.scala:162)
	at org.apache.spark.sql.execution.SQLExecution$.withSessionTagsApplied(SQLExecution.scala:268)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$7(SQLExecution.scala:124)
	at org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:94)
	at org.apache.spark.sql.artifact.ArtifactManager.$anonfun$withResources$1(ArtifactManager.scala:112)
	at org.apache.spark.util.Utils$.withContextClassLoader(Utils.scala:186)
	at org.apache.spark.sql.artifact.ArtifactManager.withClassLoaderIfNeeded(ArtifactManager.scala:102)
	at org.apache.spark.sql.artifact.ArtifactManager.withResources(ArtifactManager.scala:111)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$6(SQLExecution.scala:124)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:291)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$1(SQLExecution.scala:123)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId0(SQLExecution.scala:77)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:233)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$eagerlyExecuteCommands$1(QueryExecution.scala:155)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:654)
	at org.apache.spark.sql.execution.QueryExecution.org$apache$spark$sql$execution$QueryExecution$$eagerlyExecute$1(QueryExecution.scala:154)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$3.applyOrElse(QueryExecution.scala:169)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$3.applyOrElse(QueryExecution.scala:164)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:470)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:86)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:470)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:360)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:356)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:446)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:164)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyCommandExecuted$1(QueryExecution.scala:126)
	at org.apache.spark.util.Utils$.getTryWithCallerStacktrace(Utils.scala:1439)
	at org.apache.spark.util.LazyTry.get(LazyTry.scala:58)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:131)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:192)
	at org.apache.spark.sql.classic.DataFrameWriter.runCommand(DataFrameWriter.scala:622)
	at org.apache.spark.sql.classic.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:273)
	at org.apache.spark.sql.classic.DataFrameWriter.saveInternal(DataFrameWriter.scala:241)
	at org.apache.spark.sql.classic.DataFrameWriter.save(DataFrameWriter.scala:118)
	at org.apache.spark.sql.connect.planner.SparkConnectPlanner.handleWriteOperation(SparkConnectPlanner.scala:2955)
	at org.apache.spark.sql.connect.planner.SparkConnectPlanner.process(SparkConnectPlanner.scala:2492)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.handleCommand(ExecuteThreadRunner.scala:322)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.$anonfun$executeInternal$1(ExecuteThreadRunner.scala:224)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.$anonfun$executeInternal$1$adapted(ExecuteThreadRunner.scala:196)
	at org.apache.spark.sql.connect.service.SessionHolder.$anonfun$withSession$2(SessionHolder.scala:341)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
	at org.apache.spark.sql.connect.service.SessionHolder.$anonfun$withSession$1(SessionHolder.scala:341)
	at org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:94)
	at org.apache.spark.sql.artifact.ArtifactManager.$anonfun$withResources$1(ArtifactManager.scala:112)
	at org.apache.spark.util.Utils$.withContextClassLoader(Utils.scala:186)
	at org.apache.spark.sql.artifact.ArtifactManager.withClassLoaderIfNeeded(ArtifactManager.scala:102)
	at org.apache.spark.sql.artifact.ArtifactManager.withResources(ArtifactManager.scala:111)
	at org.apache.spark.sql.connect.service.SessionHolder.withSession(SessionHolder.scala:340)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.executeInternal(ExecuteThreadRunner.scala:196)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.org$apache$spark$sql$connect$execution$ExecuteThreadRunner$$execute(ExecuteThreadRunner.scala:125)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner$ExecutionThread.run(ExecuteThreadRunner.scala:347)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 3.0 failed 4 times, most recent failure: Lost task 0.3 in stage 3.0 (TID 134) (172.18.0.3 executor 0): java.io.IOException: Mkdirs failed to create file:/opt/spark/work-dir/parquet/_temporary/0/_temporary/attempt_202509140626461081785401780376536_0003_m_000000_134 (exists=false, cwd=file:/opt/spark/work-dir/app-20250914062613-0000/0)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:715)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:700)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1233)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1210)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1091)
	at com.nvidia.spark.rapids.ColumnarOutputWriter.openOutputStream(ColumnarOutputWriter.scala:125)
	at com.nvidia.spark.rapids.ColumnarOutputWriter.getOutputStream(ColumnarOutputWriter.scala:135)
	at com.nvidia.spark.rapids.ColumnarOutputWriter.<init>(ColumnarOutputWriter.scala:139)
	at com.nvidia.spark.rapids.GpuParquetWriter.<init>(GpuParquetFileFormat.scala:318)
	at com.nvidia.spark.rapids.GpuParquetFileFormat$$anon$1.newInstance(GpuParquetFileFormat.scala:290)
	at org.apache.spark.sql.rapids.GpuSingleDirectoryDataWriter.newOutputWriter(GpuFileFormatDataWriter.scala:273)
	at org.apache.spark.sql.rapids.GpuSingleDirectoryDataWriter.<init>(GpuFileFormatDataWriter.scala:247)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.executeTask(GpuFileFormatWriter.scala:417)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.executeTask$(GpuFileFormatWriter.scala:381)
	at org.apache.spark.sql.rapids.GpuFileFormatWriter$.executeTask(GpuFileFormatWriter.scala:493)
	at org.apache.spark.sql.execution.datasources.GpuWriteFilesExec.$anonfun$doExecuteColumnarWrite$1(GpuWriteFiles.scala:156)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:901)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:901)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$3(DAGScheduler.scala:2935)
	at scala.Option.getOrElse(Option.scala:201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2935)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2927)
	at scala.collection.immutable.List.foreach(List.scala:334)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2927)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1295)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1295)
	at scala.Option.foreach(Option.scala:437)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1295)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3207)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3141)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3130)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:50)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1009)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2484)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.$anonfun$executeWrite$6(GpuFileFormatWriter.scala:335)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.writeAndCommit(GpuFileFormatWriter.scala:299)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.executeWrite(GpuFileFormatWriter.scala:331)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.executeWrite$(GpuFileFormatWriter.scala:323)
	at org.apache.spark.sql.rapids.GpuFileFormatWriter$.executeWrite(GpuFileFormatWriter.scala:493)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.write(GpuFileFormatWriter.scala:197)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.write$(GpuFileFormatWriter.scala:83)
	at org.apache.spark.sql.rapids.GpuFileFormatWriter$.write(GpuFileFormatWriter.scala:493)
	at org.apache.spark.sql.rapids.GpuInsertIntoHadoopFsRelationCommand.runColumnar(GpuInsertIntoHadoopFsRelationCommand.scala:190)
	at com.nvidia.spark.rapids.GpuDataWritingCommandExec.sideEffectResult$lzycompute(GpuDataWritingCommandExec.scala:125)
	at com.nvidia.spark.rapids.GpuDataWritingCommandExec.sideEffectResult(GpuDataWritingCommandExec.scala:120)
	at com.nvidia.spark.rapids.GpuDataWritingCommandExec.internalDoExecuteColumnar(GpuDataWritingCommandExec.scala:157)
	at com.nvidia.spark.rapids.GpuExec.doExecuteColumnar(GpuExec.scala:193)
	at com.nvidia.spark.rapids.GpuExec.doExecuteColumnar$(GpuExec.scala:191)
	at com.nvidia.spark.rapids.GpuDataWritingCommandExec.doExecuteColumnar(GpuDataWritingCommandExec.scala:116)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeColumnarRDD$1(SparkPlan.scala:222)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1378)
	at org.apache.spark.util.LazyTry.tryT$lzycompute(LazyTry.scala:46)
	at org.apache.spark.util.LazyTry.tryT(LazyTry.scala:46)
	at org.apache.spark.util.LazyTry.get(LazyTry.scala:58)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeColumnar$1(SparkPlan.scala:236)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:260)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:257)
	at org.apache.spark.sql.execution.SparkPlan.executeColumnar(SparkPlan.scala:232)
	at com.nvidia.spark.rapids.GpuColumnarToRowExec.doExecute(GpuColumnarToRowExec.scala:365)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeRDD$1(SparkPlan.scala:188)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1378)
	at org.apache.spark.util.LazyTry.tryT$lzycompute(LazyTry.scala:46)
	at org.apache.spark.util.LazyTry.tryT(LazyTry.scala:46)
	at org.apache.spark.util.LazyTry.get(LazyTry.scala:58)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:201)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:260)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:257)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:197)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:378)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:459)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$eagerlyExecuteCommands$2(QueryExecution.scala:155)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$8(SQLExecution.scala:162)
	at org.apache.spark.sql.execution.SQLExecution$.withSessionTagsApplied(SQLExecution.scala:268)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$7(SQLExecution.scala:124)
	at org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:94)
	at org.apache.spark.sql.artifact.ArtifactManager.$anonfun$withResources$1(ArtifactManager.scala:112)
	at org.apache.spark.util.Utils$.withContextClassLoader(Utils.scala:186)
	at org.apache.spark.sql.artifact.ArtifactManager.withClassLoaderIfNeeded(ArtifactManager.scala:102)
	at org.apache.spark.sql.artifact.ArtifactManager.withResources(ArtifactManager.scala:111)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$6(SQLExecution.scala:124)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:291)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$1(SQLExecution.scala:123)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId0(SQLExecution.scala:77)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:233)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$eagerlyExecuteCommands$1(QueryExecution.scala:155)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:654)
	at org.apache.spark.sql.execution.QueryExecution.org$apache$spark$sql$execution$QueryExecution$$eagerlyExecute$1(QueryExecution.scala:154)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$3.applyOrElse(QueryExecution.scala:169)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$3.applyOrElse(QueryExecution.scala:164)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:470)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:86)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:470)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:360)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:356)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:446)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:164)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyCommandExecuted$1(QueryExecution.scala:126)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1378)
	at org.apache.spark.util.LazyTry.tryT$lzycompute(LazyTry.scala:46)
	at org.apache.spark.util.LazyTry.tryT(LazyTry.scala:46)
	at org.apache.spark.util.LazyTry.get(LazyTry.scala:58)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:131)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:192)
	at org.apache.spark.sql.classic.DataFrameWriter.runCommand(DataFrameWriter.scala:622)
	at org.apache.spark.sql.classic.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:273)
	at org.apache.spark.sql.classic.DataFrameWriter.saveInternal(DataFrameWriter.scala:241)
	at org.apache.spark.sql.classic.DataFrameWriter.save(DataFrameWriter.scala:118)
	at org.apache.spark.sql.connect.planner.SparkConnectPlanner.handleWriteOperation(SparkConnectPlanner.scala:2955)
	at org.apache.spark.sql.connect.planner.SparkConnectPlanner.process(SparkConnectPlanner.scala:2492)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.handleCommand(ExecuteThreadRunner.scala:322)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.$anonfun$executeInternal$1(ExecuteThreadRunner.scala:224)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.$anonfun$executeInternal$1$adapted(ExecuteThreadRunner.scala:196)
	at org.apache.spark.sql.connect.service.SessionHolder.$anonfun$withSession$2(SessionHolder.scala:341)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
	at org.apache.spark.sql.connect.service.SessionHolder.$anonfun$withSession$1(SessionHolder.scala:341)
	at org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:94)
	at org.apache.spark.sql.artifact.ArtifactManager.$anonfun$withResources$1(ArtifactManager.scala:112)
	at org.apache.spark.util.Utils$.withContextClassLoader(Utils.scala:186)
	at org.apache.spark.sql.artifact.ArtifactManager.withClassLoaderIfNeeded(ArtifactManager.scala:102)
	at org.apache.spark.sql.artifact.ArtifactManager.withResources(ArtifactManager.scala:111)
	at org.apache.spark.sql.connect.service.SessionHolder.withSession(SessionHolder.scala:340)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.executeInternal(ExecuteThreadRunner.scala:196)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.org$apache$spark$sql$connect$execution$ExecuteThreadRunner$$execute(ExecuteThreadRunner.scala:125)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner$ExecutionThread.run(ExecuteThreadRunner.scala:347)
Caused by: java.io.IOException: Mkdirs failed to create file:/opt/spark/work-dir/parquet/_temporary/0/_temporary/attempt_202509140626461081785401780376536_0003_m_000000_134 (exists=false, cwd=file:/opt/spark/work-dir/app-20250914062613-0000/0)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:715)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:700)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1233)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1210)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1091)
	at com.nvidia.spark.rapids.ColumnarOutputWriter.openOutputStream(ColumnarOutputWriter.scala:125)
	at com.nvidia.spark.rapids.ColumnarOutputWriter.getOutputStream(ColumnarOutputWriter.scala:135)
	at com.nvidia.spark.rapids.ColumnarOutputWriter.<init>(ColumnarOutputWriter.scala:139)
	at com.nvidia.spark.rapids.GpuParquetWriter.<init>(GpuParquetFileFormat.scala:318)
	at com.nvidia.spark.rapids.GpuParquetFileFormat$$anon$1.newInstance(GpuParquetFileFormat.scala:290)
	at org.apache.spark.sql.rapids.GpuSingleDirectoryDataWriter.newOutputWriter(GpuFileFormatDataWriter.scala:273)
	at org.apache.spark.sql.rapids.GpuSingleDirectoryDataWriter.<init>(GpuFileFormatDataWriter.scala:247)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.executeTask(GpuFileFormatWriter.scala:417)
	at org.apache.spark.sql.rapids.GpuFileFormatWriterBase.executeTask$(GpuFileFormatWriter.scala:381)
	at org.apache.spark.sql.rapids.GpuFileFormatWriter$.executeTask(GpuFileFormatWriter.scala:493)
	at org.apache.spark.sql.execution.datasources.GpuWriteFilesExec.$anonfun$doExecuteColumnarWrite$1(GpuWriteFiles.scala:156)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:901)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:901)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.lang.Thread.run(Thread.java:840)