-
Notifications
You must be signed in to change notification settings - Fork 221
/
GpuTransitionOverrides.scala
855 lines (789 loc) · 39 KB
/
GpuTransitionOverrides.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.nvidia.spark.rapids
import java.util.concurrent.atomic.AtomicInteger
import scala.annotation.tailrec
import scala.collection.mutable
import com.nvidia.spark.rapids.shims.{GpuBatchScanExec, SparkShimImpl}
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, AttributeReference, Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical.IdentityBroadcastMode
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution._
import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, BroadcastQueryStageExec, ShuffleQueryStageExec}
import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec
import org.apache.spark.sql.execution.command.{DataWritingCommandExec, ExecutedCommandExec}
import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2ScanExecBase, DropTableExec, ShowTablesExec}
import org.apache.spark.sql.execution.exchange.{BroadcastExchangeLike, Exchange, ReusedExchangeExec, ShuffleExchangeLike}
import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BroadcastNestedLoopJoinExec, HashedRelationBroadcastMode}
import org.apache.spark.sql.rapids.{GpuDataSourceScanExec, GpuFileSourceScanExec, GpuInputFileBlockLength, GpuInputFileBlockStart, GpuInputFileName, GpuShuffleEnv, GpuTaskMetrics}
import org.apache.spark.sql.rapids.execution.{ExchangeMappingCache, GpuBroadcastExchangeExec, GpuBroadcastExchangeExecBase, GpuBroadcastToRowExec, GpuCustomShuffleReaderExec, GpuHashJoin, GpuShuffleExchangeExecBase}
import org.apache.spark.sql.types.StructType
/**
* Rules that run after the row to columnar and columnar to row transitions have been inserted.
* These rules insert transitions to and from the GPU, and then optimize various transitions.
*/
class GpuTransitionOverrides extends Rule[SparkPlan] {
// previous name of the field `conf` collides with Rule#conf as of Spark 3.1.1
var rapidsConf: RapidsConf = null
/**
 * Optimizes the CPU<->GPU transitions inserted around the plan: fuses
 * `HostColumnarToGpu(RowToColumnarExec(_))` into a single [[GpuRowToColumnarExec]] and
 * `ColumnarToRowExec(GpuBringBackToHost(_))` into a single [[GpuColumnarToRowExec]],
 * re-applying any pre/post projections recorded as plan tags by [[GpuOverrides]].
 */
def optimizeGpuPlanTransitions(plan: SparkPlan): SparkPlan = plan match {
  case HostColumnarToGpu(r2c: RowToColumnarExec, goal) =>
    val optimizedChild = optimizeGpuPlanTransitions(r2c.child)
    // Re-apply any CPU-side projection tagged to run before the row-to-columnar conversion
    val projectedChild =
      r2c.child.getTagValue(GpuOverrides.preRowToColProjection).map { preProcessing =>
        ProjectExec(preProcessing, optimizedChild)
      }.getOrElse(optimizedChild)
    GpuRowToColumnarExec(projectedChild, goal)
  case ColumnarToRowExec(bb: GpuBringBackToHost) =>
    GpuColumnarToRowExec(optimizeGpuPlanTransitions(bb.child))
  // inserts postColumnarToRowTransition into newly-created GpuColumnarToRowExec
  case p if p.getTagValue(GpuOverrides.postColToRowProjection).nonEmpty =>
    // NOTE(review): assumes the first optimized child is a GpuColumnarToRowExec whenever
    // the post-projection tag is present — the cast will throw otherwise; confirm upstream
    // tagging guarantees this
    val c2r = p.children.map(optimizeGpuPlanTransitions).head.asInstanceOf[GpuColumnarToRowExec]
    val newChild = p.getTagValue(GpuOverrides.postColToRowProjection).map { exprs =>
      ProjectExec(exprs, c2r)
    }.getOrElse(c2r)
    p.withNewChildren(Array(newChild))
  case p =>
    p.withNewChildren(p.children.map(optimizeGpuPlanTransitions))
}
/**
 * Adds the appropriate coalesce node after a shuffle, depending on which kind of
 * shuffle is configured: the RAPIDS (GPU) shuffle keeps batches on the GPU so a plain
 * batch coalesce works, while the normal shuffle produces serialized host tables that
 * need [[GpuShuffleCoalesceExec]] instead.
 */
private def addPostShuffleCoalesce(plan: SparkPlan): SparkPlan = {
  val targetSize = rapidsConf.gpuTargetBatchSizeBytes
  if (!GpuShuffleEnv.useGPUShuffle(rapidsConf)) {
    GpuShuffleCoalesceExec(plan, targetSize)
  } else {
    GpuCoalesceBatches(plan, TargetSize(targetSize))
  }
}
/**
 * Optimizes transitions when adaptive query execution (AQE) is enabled. With AQE each
 * query stage is planned independently, so an exchange at the root of a stage must be
 * returned directly (as a `BroadcastExchangeLike`/`ShuffleExchangeLike`) rather than
 * wrapped in host transitions, and post-shuffle coalesce nodes are re-inserted in the
 * stages that read the shuffle output.
 *
 * @param plan the node being optimized
 * @param parent the parent of `plan`, or None at the root of a query stage
 */
def optimizeAdaptiveTransitions(
    plan: SparkPlan,
    parent: Option[SparkPlan]): SparkPlan = plan match {
  case bb @ GpuBringBackToHost(child) if parent.isEmpty =>
    // This is hacky but we need to remove the GpuBringBackToHost from the final
    // query stage, if there is one. It gets inserted by
    // GpuTransitionOverrides.insertColumnarFromGpu around columnar adaptive
    // plans when we are writing to columnar formats on the GPU. It would be nice to avoid
    // inserting it in the first place but we just don't have enough context
    // at the time GpuTransitionOverrides is applying rules.
    optimizeAdaptiveTransitions(child, Some(bb))
  // HostColumnarToGpu(RowToColumnarExec(..)) => GpuRowToColumnarExec(..)
  case HostColumnarToGpu(r2c: RowToColumnarExec, goal) =>
    val child = optimizeAdaptiveTransitions(r2c.child, Some(r2c))
    child match {
      case a: AdaptiveSparkPlanExec =>
        // we hit this case when we have an adaptive plan wrapped in a write
        // to columnar file format on the GPU
        val columnarAdaptivePlan = SparkShimImpl.columnarAdaptivePlan(a, goal)
        optimizeAdaptiveTransitions(columnarAdaptivePlan, None)
      case _ =>
        // Re-apply any tagged pre-processing projection before converting rows to columns
        val newChild = child.getTagValue(GpuOverrides.preRowToColProjection).map { exprs =>
          ProjectExec(exprs, child)
        }.getOrElse(child)
        GpuRowToColumnarExec(newChild, goal)
    }
  // adaptive plan final query stage with columnar output
  case r2c @ RowToColumnarExec(child) if parent.isEmpty =>
    val optimizedChild = optimizeAdaptiveTransitions(child, Some(r2c))
    val projectedChild =
      optimizedChild.getTagValue(GpuOverrides.preRowToColProjection).map { exprs =>
        ProjectExec(exprs, optimizedChild)
      }.getOrElse(optimizedChild)
    GpuRowToColumnarExec(projectedChild, TargetSize(rapidsConf.gpuTargetBatchSizeBytes))
  case ColumnarToRowExec(bb: GpuBringBackToHost) =>
    // We typically want the final operator in the plan (the operator that has no parent) to be
    // wrapped in `ColumnarToRowExec(GpuBringBackToHost(_))` operators to
    // bring the data back onto the host and be translated to rows so that it can be returned
    // from the Spark API. However, in the case of AQE, each exchange operator is treated as an
    // individual query with no parent and we need to remove these operators in this case
    // because we need to return an operator that implements `BroadcastExchangeLike` or
    // `ShuffleExchangeLike`.
    bb.child match {
      case GpuShuffleCoalesceExec(e: GpuShuffleExchangeExecBase, _) if parent.isEmpty =>
        // The coalesce step gets added back into the plan later on, in a
        // future query stage that reads the output from this query stage. This
        // is handled in the case clauses below.
        e.withNewChildren(e.children.map(c => optimizeAdaptiveTransitions(c, Some(e))))
      case GpuCoalesceBatches(e: GpuShuffleExchangeExecBase, _) if parent.isEmpty =>
        // The coalesce step gets added back into the plan later on, in a
        // future query stage that reads the output from this query stage. This
        // is handled in the case clauses below.
        e.withNewChildren(e.children.map(c => optimizeAdaptiveTransitions(c, Some(e))))
      case _ => optimizeAdaptiveTransitions(bb.child, Some(bb)) match {
        case e: GpuBroadcastExchangeExecBase => e
        case e: GpuShuffleExchangeExecBase => e
        case other => GpuColumnarToRowExec(other)
      }
    }
  case s: ShuffleQueryStageExec =>
    // When reading a materialized shuffle query stage in AQE mode, we need to insert an
    // operator to coalesce batches. We either insert it directly around the shuffle query
    // stage, or around the custom shuffle reader, if one exists.
    val plan = GpuTransitionOverrides.getNonQueryStagePlan(s)
    if (plan.supportsColumnar && plan.isInstanceOf[GpuExec]) {
      (plan, parent) match {
        case (_, Some(x)) if SparkShimImpl.isCustomReaderExec(x) =>
          // We can't insert a coalesce batches operator between a custom shuffle reader
          // and a shuffle query stage, so we instead insert it around the custom shuffle
          // reader later on, in the next top-level case clause.
          s
        case (ex: ShuffleExchangeLike, Some(x))
            if SparkShimImpl.shuffleParentReadsShuffleData(ex, x) =>
          // In some cases, the parent might have to read the shuffle data directly, so
          // we don't need the post-shuffle coalesce exec since the parent should
          // coalesce the shuffle data as needed
          s
        case _ =>
          // Directly wrap shuffle query stage with coalesce batches operator
          addPostShuffleCoalesce(s)
      }
    } else {
      // Not a columnar GPU stage: copy the pre-row-to-columnar projection tag up from
      // the wrapped plan onto the query stage node itself so later rules can find it
      s.plan.getTagValue(GpuOverrides.preRowToColProjection).foreach { p =>
        s.setTagValue(GpuOverrides.preRowToColProjection, p)
      }
      s
    }
  case e: GpuCustomShuffleReaderExec =>
    // We wrap custom shuffle readers with a coalesce batches operator here.
    addPostShuffleCoalesce(e.copy(child = optimizeAdaptiveTransitions(e.child, Some(e))))
  case ColumnarToRowExec(e: ShuffleQueryStageExec) =>
    val c2r = GpuColumnarToRowExec(optimizeAdaptiveTransitions(e, Some(plan)))
    SparkShimImpl.addRowShuffleToQueryStageTransitionIfNeeded(c2r, e)
  case ColumnarToRowExec(e: BroadcastQueryStageExec) =>
    e.plan match {
      case ReusedExchangeExec(output, b: GpuBroadcastExchangeExec) =>
        // we can't directly re-use a GPU broadcast exchange to feed a CPU broadcast
        // join but Spark will sometimes try and do this
        val keys = output.map { a => a.asInstanceOf[Expression] }
        // NOTE: `keys` bound by the HashedRelationBroadcastMode pattern below shadows the
        // `keys` above; the outer value is what is passed positionally to GpuBroadcastToRowExec
        val (index, keyExprs) = b.mode match {
          case HashedRelationBroadcastMode(keys, _) => (None, Some(keys))
          case IdentityBroadcastMode => (Some(0), None)
          case m => throw new UnsupportedOperationException(s"Unknown broadcast mode $m")
        }
        GpuBroadcastToRowExec(keys, b.mode, e)(index, keyExprs)
      case _ => GpuColumnarToRowExec(optimizeAdaptiveTransitions(e, Some(plan)))
    }
  // inserts postColumnarToRowTransition into newly-created GpuColumnarToRowExec
  case p if p.getTagValue(GpuOverrides.postColToRowProjection).nonEmpty =>
    val c2r = p.children.map(optimizeAdaptiveTransitions(_, Some(p))).head
      .asInstanceOf[GpuColumnarToRowExec]
    val newChild = p.getTagValue(GpuOverrides.postColToRowProjection).map { exprs =>
      ProjectExec(exprs, c2r)
    }.getOrElse(c2r)
    p.withNewChildren(Array(newChild))
  case p =>
    p.withNewChildren(p.children.map(c => optimizeAdaptiveTransitions(c, Some(p))))
}
/**
 * Fixes up instances of HostColumnarToGpu that are operating on nested types.
 * Spark's ColumnVector has no batch accessors for nested types, so HostColumnarToGpu
 * does not support them for performance reasons. When nested types are present, route
 * through a CPU columnar-to-row transition followed by a GPU row-to-columnar transition,
 * which is the optimized path for these types.
 * This runs as a fixup pass because earlier transition optimizations pattern-match on
 * HostColumnarToGpu nodes.
 */
def fixupHostColumnarTransitions(plan: SparkPlan): SparkPlan = {
  plan match {
    case h @ HostColumnarToGpu(child, goal) =>
      if (DataTypeUtils.hasNestedTypes(child.schema)) {
        // Nested types: go host-columnar -> rows -> GPU columnar instead
        GpuRowToColumnarExec(ColumnarToRowExec(fixupHostColumnarTransitions(child)), goal)
      } else {
        h.withNewChildren(h.children.map(fixupHostColumnarTransitions))
      }
    case other =>
      other.withNewChildren(other.children.map(fixupHostColumnarTransitions))
  }
}
/** Returns true when the node is (or, for a shuffle query stage, wraps) a GPU shuffle-like exec. */
@tailrec
private def isGpuShuffleLike(execNode: SparkPlan): Boolean = execNode match {
  case qs: ShuffleQueryStageExec => isGpuShuffleLike(qs.plan)
  case _: GpuShuffleExchangeExecBase => true
  case _: GpuCustomShuffleReaderExec => true
  case _ => false
}
/**
 * This optimizes the plan to remove [[GpuCoalesceBatches]] nodes that are unnecessary
 * or undesired in some situations.
 *
 * @note This does not examine [[GpuShuffleCoalesceExec]] nodes in the plan, as they
 *       are always required after GPU columnar exchanges during normal shuffle
 *       to place the data after shuffle on the GPU. Those nodes also do not
 *       coalesce to the same goal as used by [[GpuCoalesceBatches]], so a
 *       [[GpuShuffleCoalesceExec]] immediately followed by a [[GpuCoalesceBatches]] is
 *       not unusual.
 */
def optimizeCoalesce(plan: SparkPlan): SparkPlan = plan match {
  case c2r @ GpuColumnarToRowExec(gpuCoalesce: GpuCoalesceBatches, _)
      if !isGpuShuffleLike(gpuCoalesce.child) =>
    // Don't build a batch if we are just going to go back to ROWS
    // and there isn't a GPU shuffle involved
    c2r.withNewChildren(gpuCoalesce.children.map(optimizeCoalesce))
  case GpuCoalesceBatches(r2c: GpuRowToColumnarExec, goal: TargetSize) =>
    // TODO in the future we should support this for all goals, but
    // GpuRowToColumnarExec preallocates all of the memory, and the builder does not
    // support growing the sizes dynamically....
    // Don't build batches and then coalesce, just build the right sized batch
    GpuRowToColumnarExec(optimizeCoalesce(r2c.child),
      CoalesceGoal.maxRequirement(goal, r2c.goal).asInstanceOf[CoalesceSizeGoal])
  case GpuCoalesceBatches(co: GpuCoalesceBatches, goal) =>
    // Adjacent coalesce nodes: collapse into one carrying the stricter of the two goals
    GpuCoalesceBatches(optimizeCoalesce(co.child), CoalesceGoal.maxRequirement(goal, co.goal))
  case GpuCoalesceBatches(child: GpuExec, goal)
      if CoalesceGoal.satisfies(child.outputBatching, goal) =>
    // The goal is already satisfied so remove the batching
    child.withNewChildren(child.children.map(optimizeCoalesce))
  case p =>
    p.withNewChildren(p.children.map(optimizeCoalesce))
}
/**
 * Removes `GpuCoalesceBatches(GpuShuffleCoalesceExec(build side))` for the build side
 * for the shuffled hash join. The coalesce logic has been moved to the
 * `GpuShuffleCoalesceExec` class, and is handled differently to prevent holding onto the
 * GPU semaphore for stream IO.
 */
def shuffledHashJoinOptimizeShuffle(plan: SparkPlan): SparkPlan = plan match {
  // Build side on the right: drop the GpuCoalesceBatches/GpuShuffleCoalesceExec pair and
  // recurse directly into the shuffle coalesce's child
  case x @ GpuShuffledHashJoinExec(
      _, _, _, buildSide, _,
      left: GpuShuffleCoalesceExec,
      GpuCoalesceBatches(GpuShuffleCoalesceExec(rc, _), _), _) if buildSide == GpuBuildRight =>
    x.withNewChildren(
      Seq(shuffledHashJoinOptimizeShuffle(left), shuffledHashJoinOptimizeShuffle(rc)))
  // Mirror case: build side on the left
  case x @ GpuShuffledHashJoinExec(
      _, _, _, buildSide, _,
      GpuCoalesceBatches(GpuShuffleCoalesceExec(lc, _), _),
      right: GpuShuffleCoalesceExec, _) if buildSide == GpuBuildLeft =>
    x.withNewChildren(
      Seq(shuffledHashJoinOptimizeShuffle(lc), shuffledHashJoinOptimizeShuffle(right)))
  case p => p.withNewChildren(p.children.map(shuffledHashJoinOptimizeShuffle))
}
/**
 * Inserts coalesce nodes above each child plan according to that child's requested
 * [[CoalesceGoal]] (null means no coalesce requested).
 *
 * @param disableUntilInput when true, skip coalescing except where a single batch is
 *                          strictly required for correctness
 */
private def insertCoalesce(plans: Seq[SparkPlan], goals: Seq[CoalesceGoal],
    disableUntilInput: Boolean): Seq[SparkPlan] = {
  plans.zip(goals).map { case (childPlan, goal) =>
    val updatedChild = insertCoalesce(childPlan, disableUntilInput)
    goal match {
      case null =>
        // No coalesce requested for this child
        updatedChild
      case single: RequireSingleBatchLike =>
        // Even when coalescing is disabled, a single batch is required to make this
        // operator work. This should not cause bugs because we require a single batch in
        // situations where Spark also buffers data, so any operator that needs coalesce
        // disabled would also get an incorrect answer in regular Spark.
        GpuCoalesceBatches(updatedChild, single)
      case _ if disableUntilInput =>
        // We wanted to coalesce the input but cannot because it could cause errors
        updatedChild
      case g =>
        GpuCoalesceBatches(updatedChild, g)
    }
  }
}
/**
 * Essentially check if this plan is in the same task as a file input.
 */
private def hasDirectLineToInput(plan: SparkPlan): Boolean = plan match {
  // An exchange is a task boundary, so stop searching this branch
  case _: Exchange => false
  // Any kind of scan (CPU or GPU, v1 or v2, or a raw RDD read) counts as an input
  case _: DataSourceScanExec | _: GpuDataSourceScanExec |
       _: DataSourceV2ScanExecBase | _: RDDScanExec => true
  case other => other.children.exists(hasDirectLineToInput)
}
/**
 * Essentially check if we have hit a boundary of a task.
 */
private def shouldEnableCoalesce(plan: SparkPlan): Boolean = plan match {
  // Exchanges and scans (including a raw RDD read) mark a task boundary where
  // coalescing may be re-enabled
  case _: Exchange | _: DataSourceScanExec | _: GpuDataSourceScanExec |
       _: DataSourceV2ScanExecBase | _: RDDScanExec => true
  case _ => false
}
/**
 * Because we cannot change the executors in spark itself we need to try and account for
 * the ones that might have issues with coalesce here.
 */
private def disableCoalesceUntilInput(plan: SparkPlan): Boolean =
  plan.expressions.exists { expr =>
    GpuTransitionOverrides.checkHasInputFileExpressions(expr)
  }
/** Returns true if the expression tree contains any input-file expression (CPU or GPU). */
private def disableScanUntilInput(exec: Expression): Boolean = exec match {
  case _: InputFileName | _: InputFileBlockStart | _: InputFileBlockLength |
       _: GpuInputFileName | _: GpuInputFileBlockStart | _: GpuInputFileBlockLength =>
    true
  case other =>
    other.children.exists(disableScanUntilInput)
}
/** Returns true if any expression of this plan node contains an input-file expression. */
private def disableScanUntilInput(plan: SparkPlan): Boolean =
  plan.expressions.exists(e => disableScanUntilInput(e))
// This walks from the output to the input to look for any uses of InputFileName,
// InputFileBlockStart, or InputFileBlockLength when we use a Parquet read because
// we can't support the coalesce file reader optimization when this is used.
/**
 * Walks the plan top-down, flagging GPU scans so they disable the coalescing file
 * reader when an input-file expression appears in the same task as the scan.
 *
 * @param disableUntilInput true when an ancestor already required input-file handling
 */
private def updateScansForInputAndOrder(plan: SparkPlan,
    disableUntilInput: Boolean = false): SparkPlan = {
  plan match {
    case batchScan: GpuBatchScanExec =>
      if (disableUntilInput || disableScanUntilInput(batchScan)) {
        // Ask the scan for an input-file-aware version; only copy when it actually changed
        val newScan = batchScan.scan.withInputFile()
        if (newScan ne batchScan.scan) {
          batchScan.copy(scan = newScan)
        } else {
          batchScan
        }
      } else {
        batchScan
      }
    case fileSourceScan: GpuFileSourceScanExec =>
      if (disableUntilInput || disableScanUntilInput(fileSourceScan)) {
        fileSourceScan.copy(queryUsesInputFile = true)(fileSourceScan.rapidsConf)
      } else {
        fileSourceScan
      }
    case p =>
      // Only propagate the flag downward when this node uses input-file expressions AND
      // is in the same task as a file input (no exchange in between)
      val planDisableUntilInput = disableScanUntilInput(p) && hasDirectLineToInput(p)
      p.withNewChildren(p.children.map { c =>
        updateScansForInputAndOrder(c, planDisableUntilInput || disableUntilInput)
      })
  }
}
/** Collects every [[AttributeReference]] found anywhere in the expression tree. */
def extractAttrReferences[A <: Expression](expression: A): Seq[AttributeReference] =
  expression match {
    case ref: AttributeReference => Seq(ref)
    case other => other.children.flatMap(extractAttrReferences)
  }
/**
 * Returns a copy of the scan whose partition schema is pruned down to only the
 * partition columns actually referenced by `referenceList`.
 */
private def withPrunedPartSchema(
    fss: GpuFileSourceScanExec,
    referenceList: Seq[Expression]): GpuFileSourceScanExec = {
  // Luckily partition columns do not support nested types. So only need to prune the
  // top level columns.
  // Prune the output of the Scan so it only includes things that the referenceList will
  // actually read
  val neededExprIds = referenceList.flatMap(extractAttrReferences).map(_.exprId).toSet
  // Partition attributes follow the data (required) schema columns in the scan's output
  val partOutAttrs = fss.output.drop(fss.requiredSchema.length)
  val neededPartIndexes = partOutAttrs.zipWithIndex.collect {
    case (provided, index) if neededExprIds.contains(provided.exprId) => index
  }
  // StructType is applied as a function here: index -> StructField
  val prunedPartSchema = Some(StructType(neededPartIndexes.map(fss.relation.partitionSchema)))
  fss.copy(requiredPartitionSchema = prunedPartSchema)(fss.rapidsConf)
}
// This tries to prune the partition schema for GpuFileSourceScanExec by leveraging
// the project list of the first GpuProjectExec after a GpuFileSourceScanExec.
// Each case below handles one specific plan shape (project/filter/transition layering,
// with CPU or GPU variants); anything else recurses into the children.
private def prunePartitionForFileSourceScan(plan: SparkPlan): SparkPlan = plan match {
  case p @ GpuProjectExecLike(projectList, fss: GpuFileSourceScanExec) =>
    // A ProjectExec next to FileSourceScanExec, for cases like
    // df.groupBy("b").agg(max($"a")), or
    // df.select("b")
    p.withNewChildren(Seq(withPrunedPartSchema(fss, projectList)))
  case p @ GpuProjectExecLike(
      projectList,
      f @ GpuFilterExec(condition, fss: GpuFileSourceScanExec)) =>
    // A FilterExec is between the ProjectExec and FileSourceScanExec, for cases like
    // df.select("a").filter("a != 1")
    // The filter condition's references must be kept too, hence projectList :+ condition
    p.withNewChildren(Seq(
      f.withNewChildren(Seq(withPrunedPartSchema(fss, projectList :+ condition)))))
  // Partial GPU plan cases.
  // This rule executes before rules override the ColumnarToRowExec, so the exec is the
  // CPU version here.
  case p @ ProjectExec(projectList, cr @ ColumnarToRowExec(fss: GpuFileSourceScanExec)) =>
    // cpu project + gpu file scan
    p.withNewChildren(Seq(
      cr.withNewChildren(Seq(withPrunedPartSchema(fss, projectList)))))
  case p @ ProjectExec(
      projectList,
      cr @ ColumnarToRowExec(f @ GpuFilterExec(condition, fss: GpuFileSourceScanExec))) =>
    // cpu project + gpu filter + gpu file scan
    p.withNewChildren(Seq(
      cr.withNewChildren(Seq(
        f.withNewChildren(Seq(withPrunedPartSchema(fss, projectList :+ condition)))))))
  case p @ ProjectExec(
      projectList,
      f @ FilterExec(condition, cr @ ColumnarToRowExec(fss: GpuFileSourceScanExec))) =>
    // cpu project + cpu filter + gpu file scan
    p.withNewChildren(Seq(
      f.withNewChildren(Seq(
        cr.withNewChildren(Seq(withPrunedPartSchema(fss, projectList :+ condition)))))))
  case p @ GpuProjectExecLike(
      projectList,
      rc @ RowToColumnarExec(
        f @ FilterExec(condition, cr @ ColumnarToRowExec(fss: GpuFileSourceScanExec)))) =>
    // gpu project + cpu filter + gpu file scan
    p.withNewChildren(Seq(
      rc.withNewChildren(Seq(
        f.withNewChildren(Seq(
          cr.withNewChildren(Seq(withPrunedPartSchema(fss, projectList :+ condition)))))))))
  case _ =>
    plan.withNewChildren(plan.children.map(prunePartitionForFileSourceScan))
}
// This walks from the output to the input so disableUntilInput can walk its way from when
// we hit something that cannot allow for coalesce up until the input
/**
 * Inserts [[GpuCoalesceBatches]] nodes where GPU execs request them, disabling
 * coalescing in stages where it could break input-file expressions.
 */
private def insertCoalesce(plan: SparkPlan,
    disableUntilInput: Boolean = false): SparkPlan = plan match {
  case exec: GpuExec =>
    // We will disable coalesce if it is already disabled and we cannot re-enable it
    val shouldDisable = (disableUntilInput && !shouldEnableCoalesce(exec)) ||
      //or if we should disable it and it is in a stage with a file input that would matter
      (exec.disableCoalesceUntilInput() && hasDirectLineToInput(exec))
    val tmp = exec.withNewChildren(insertCoalesce(exec.children, exec.childrenCoalesceGoal,
      shouldDisable))
    if (exec.coalesceAfter && !shouldDisable) {
      GpuCoalesceBatches(tmp, TargetSize(rapidsConf.gpuTargetBatchSizeBytes))
    } else {
      tmp
    }
  case p =>
    // We will disable coalesce if it is already disabled and we cannot re-enable it
    // (parenthesized for consistency with the GpuExec branch above; precedence unchanged)
    val shouldDisable = (disableUntilInput && !shouldEnableCoalesce(p)) ||
      //or if we should disable it and it is in a stage with a file input that would matter
      (disableCoalesceUntilInput(p) && hasDirectLineToInput(p))
    p.withNewChildren(p.children.map(c => insertCoalesce(c, shouldDisable)))
}
/**
 * Inserts a shuffle coalesce after every shuffle to coalesce the serialized tables
 * on the host before copying the data to the GPU.
 * @note This should not be used in combination with the RAPIDS shuffle.
 */
private def insertShuffleCoalesce(plan: SparkPlan): SparkPlan = plan match {
  case exec: GpuShuffleExchangeExecBase =>
    // always follow a GPU shuffle with a shuffle coalesce
    GpuShuffleCoalesceExec(exec.withNewChildren(exec.children.map(insertShuffleCoalesce)),
      rapidsConf.gpuTargetBatchSizeBytes)
  case exec =>
    // use the matched binding consistently (original mixed `exec` and `plan`, which
    // alias the same node here)
    exec.withNewChildren(exec.children.map(insertShuffleCoalesce))
}
/**
 * Inserts a transition to be running on the CPU columnar
 */
private def insertColumnarFromGpu(plan: SparkPlan): SparkPlan = plan match {
  case gpu: GpuExec if gpu.supportsColumnar =>
    // Leaving GPU execution: bring the columnar data back to the host
    GpuBringBackToHost(insertColumnarToGpu(gpu))
  case trans: ColumnarToRowTransition if trans.isInstanceOf[GpuExec] =>
    trans.withNewChildren(trans.children.map(insertColumnarToGpu))
  case other =>
    other.withNewChildren(other.children.map(insertColumnarFromGpu))
}
/** Returns true if the plan is a host columnar plan */
private def isHostColumnar(plan: SparkPlan): Boolean = {
  if (!plan.supportsColumnar || plan.isInstanceOf[GpuExec]) {
    false
  } else {
    plan match {
      // An adaptive plan that will probably run on the GPU is not host columnar
      case ap: AdaptiveSparkPlanExec => !GpuOverrides.probablyGpuPlan(ap, rapidsConf)
      case _ => true
    }
  }
}
/**
 * Inserts a transition to be running on the GPU from CPU columnar
 */
private def insertColumnarToGpu(plan: SparkPlan): SparkPlan = {
  // Look through query stage wrappers to see what will actually execute
  val unwrapped = GpuTransitionOverrides.getNonQueryStagePlan(plan)
  if (!isHostColumnar(unwrapped)) {
    plan.withNewChildren(plan.children.map(insertColumnarToGpu))
  } else {
    HostColumnarToGpu(insertColumnarFromGpu(plan), TargetSize(rapidsConf.gpuTargetBatchSizeBytes))
  }
}
// If a GPU hash-based operation, such as GpuHashJoin or GpuHashAggregateExec,
// is followed eventually by a data writing command without an intermediate node
// changing the sort order, insert a sort to optimize the output file size.
/**
 * @param hasWriteParent true when a data writing command has been seen above this node
 */
private def insertHashOptimizeSorts(plan: SparkPlan,
    hasWriteParent: Boolean = false): SparkPlan = {
  if (rapidsConf.enableHashOptimizeSort) {
    // Insert a sort after the last hash-based op before the query result if there are no
    // intermediate nodes that have a specified sort order. This helps with the size of
    // Parquet and ORC files.
    // Note that this is using a GPU SortOrder expression as the CPU SortOrder which should
    // normally be avoided. However since we have checked that no node later in the plan
    // needs a particular sort order, it should not be a problem in practice that would
    // trigger a redundant sort in the plan.
    plan match {
      // look for any writing command, not just a GPU writing command
      case _: GpuDataWritingCommandExec | _: DataWritingCommandExec =>
        plan.withNewChildren(plan.children.map(c => insertHashOptimizeSorts(c, true)))
      case _: GpuHashJoin | _: GpuHashAggregateExec if hasWriteParent =>
        // First hash-based op under a write: sort each batch on the full output
        val gpuSortOrder = getOptimizedSortOrder(plan)
        GpuSortExec(gpuSortOrder, false, plan, SortEachBatch)(gpuSortOrder)
      case _: GpuHashJoin | _: GpuHashAggregateExec => plan
      case p =>
        if (p.outputOrdering.isEmpty) {
          plan.withNewChildren(plan.children.map(c => insertHashOptimizeSorts(c, hasWriteParent)))
        } else {
          // A node below the write requires its own ordering; don't disturb it
          plan
        }
    }
  } else {
    plan
  }
}
/** Builds an ascending GPU SortOrder over every output attribute of the plan. */
private def getOptimizedSortOrder(plan: SparkPlan): Seq[SortOrder] =
  plan.output.map { attr =>
    val exprMeta = GpuOverrides.wrapExpr(attr, rapidsConf, None)
    exprMeta.tagForGpu()
    // Output attributes are plain references, so they must be replaceable on the GPU
    assert(exprMeta.canThisBeReplaced)
    SortOrder(exprMeta.convertToGpu(), Ascending)
  }
/**
 * Test-only check that an expression tree is fully GPU-enabled, except for classes
 * explicitly allowed by the test configuration.
 */
def assertIsOnTheGpu(exp: Expression, conf: RapidsConf): Unit = {
  // There are no GpuAttributeReference or GpuSortOrder
  val knownCpuOk = exp match {
    case _: AttributeReference | _: SortOrder | _: GpuExpression => true
    case _ => false
  }
  if (!knownCpuOk) {
    val classBaseName = PlanUtils.getBaseNameFromClass(exp.getClass.toString)
    if (!conf.testingAllowedNonGpu.contains(classBaseName)) {
      throw new IllegalArgumentException(s"The expression $exp is not columnar ${exp.getClass}")
    }
  }
  exp.children.foreach(child => assertIsOnTheGpu(child, conf))
}
/**
 * Test-only assertion that the plan has been fully converted to run on the GPU, with
 * carve-outs for AQE limitations, metadata-only operations, and classes explicitly
 * exempted via configuration.
 *
 * @throws IllegalArgumentException if a non-exempt part of the plan is not columnar
 */
def assertIsOnTheGpu(plan: SparkPlan, conf: RapidsConf): Unit = {
  // True when this exec's class is in the test allow-list of non-GPU execs
  def isTestExempted(plan: SparkPlan): Boolean = {
    conf.testingAllowedNonGpu.exists(PlanUtils.sameClass(plan, _))
  }
  val isAdaptiveEnabled = plan.conf.adaptiveExecutionEnabled
  plan match {
    case _: BroadcastExchangeLike if isAdaptiveEnabled =>
      // broadcasts are left on CPU for now when AQE is enabled
    case _: BroadcastHashJoinExec | _: BroadcastNestedLoopJoinExec
        if isAdaptiveEnabled =>
      // broadcasts are left on CPU for now when AQE is enabled
    case p if SparkShimImpl.isAqePlan(p) =>
      // we do not yet fully support GPU-acceleration when AQE is enabled, so we skip checking
      // the plan in this case - https://github.com/NVIDIA/spark-rapids/issues/5
    case lts: LocalTableScanExec =>
      if (!lts.expressions.forall(_.isInstanceOf[AttributeReference])) {
        throw new IllegalArgumentException("It looks like some operations were " +
          s"pushed down to LocalTableScanExec ${lts.expressions.mkString(",")}")
      }
    case imts: InMemoryTableScanExec =>
      if (!imts.expressions.forall(_.isInstanceOf[AttributeReference])) {
        throw new IllegalArgumentException("It looks like some operations were " +
          s"pushed down to InMemoryTableScanExec ${imts.expressions.mkString(",")}")
      }
    // some metadata operations, may add more when needed
    case _: ShowTablesExec =>
    case _: DropTableExec =>
    case _: RDDScanExec => () // Ignored
    case p if SparkShimImpl.skipAssertIsOnTheGpu(p) => () // Ignored
    case p: ExecutedCommandExec if !isTestExempted(p) =>
      val meta = GpuOverrides.wrapPlan(p, conf, None)
      if (!meta.suppressWillWorkOnGpuInfo) {
        throw new IllegalArgumentException("Part of the plan is not columnar " +
          s"${p.getClass}\n$p")
      }
    case _ =>
      if (!plan.supportsColumnar &&
          // There are some python execs that are not columnar because of a little
          // used feature. This prevents those from failing tests. This also allows
          // the columnar to row transitions to not cause test issues because they too
          // are not columnar (they output rows) but are instances of GpuExec.
          !plan.isInstanceOf[GpuExec] &&
          !isTestExempted(plan)) {
        throw new IllegalArgumentException(s"Part of the plan is not columnar " +
          s"${plan.getClass}\n$plan")
      }
      // Check child expressions if this is a GPU node
      plan match {
        case gpuExec: GpuExec =>
          // filter out the output expressions since those are not GPU expressions
          val planOutput = gpuExec.output.toSet
          gpuExec.gpuExpressions.filter(_ match {
            case a: Attribute => !planOutput.contains(a)
            case _ => true
          }).foreach(assertIsOnTheGpu(_, conf))
        case _ =>
      }
  }
  // Recurse into every child regardless of which case matched above
  plan.children.foreach(assertIsOnTheGpu(_, conf))
}
/**
 * This is intended for testing only and this only supports looking for an exec once.
 */
private def validateExecsInGpuPlan(plan: SparkPlan, conf: RapidsConf): Unit = {
  val expectedExecs = conf.validateExecsInGpuPlan.toSet
  if (expectedExecs.nonEmpty) {
    // Collect (as a set, to dedupe) every operator whose simple class name was requested
    val foundExecs =
      PlanUtils.findOperators(plan, p => expectedExecs.contains(p.getClass.getSimpleName)).toSet
    val missingExecs = expectedExecs.diff(foundExecs.map(_.getClass.getSimpleName))
    require(missingExecs.isEmpty,
      s"Plan ${plan.toString()} does not contain the following execs: " +
        missingExecs.mkString(","))
  }
}
/**
 * If the plan is a DeserializeToObjectExec sitting directly on top of a
 * GpuColumnarToRowExec, rebuild the transition with exportColumnarRdd enabled so the
 * final columnar data can be exported as an RDD. Any other plan is returned unchanged.
 *
 * @param plan the plan to inspect
 * @return the plan, possibly with the columnar-to-row transition re-tagged for export
 */
def detectAndTagFinalColumnarOutput(plan: SparkPlan): SparkPlan = plan match {
  case d: DeserializeToObjectExec =>
    // Pattern match instead of the original isInstanceOf/asInstanceOf cast pair.
    d.child match {
      case c2r: GpuColumnarToRowExec =>
        plan.withNewChildren(Seq(GpuColumnarToRowExec(c2r.child, exportColumnarRdd = true)))
      case _ => plan
    }
  case _ => plan
}
/** Mark nodes as GPU planning completed. */
private def markGpuPlanningComplete(plan: SparkPlan): SparkPlan = {
  // Notify every GPU broadcast exchange in the tree that planning is finished,
  // then hand back the (unmodified) plan so this can be used in a pipeline.
  plan.collect {
    case broadcast: GpuBroadcastExchangeExec => broadcast
  }.foreach(_.markGpuPlanningComplete())
  plan
}
/**
 * On some Spark platforms, AQE planning ends up not reusing as many GPU exchanges as possible.
 * This searches the plan for any GPU broadcast exchanges and checks if their original CPU plans
 * match any other previously seen GPU broadcasts with the same CPU plan.
 */
private def fixupAdaptiveExchangeReuse(p: SparkPlan): SparkPlan = {
  // Rewrite each GPU broadcast exchange to reuse a previously-seen exchange that has
  // the same canonical CPU plan, when one exists.
  def remapExchanges(plan: SparkPlan): SparkPlan = plan.transformUp {
    case exchange: GpuBroadcastExchangeExec =>
      ExchangeMappingCache.findGpuExchangeReplacement(exchange.cpuCanonical) match {
        case Some(prior) if !(prior eq exchange) =>
          ReusedExchangeExec(exchange.output, prior)
        case _ => exchange
      }
  }
  // If an exchange is at the top of the plan being remapped, this is likely due to AQE
  // re-planning, and we're not allowed to change an exchange to a reused exchange in that case.
  p match {
    case e: Exchange => e.mapChildren(remapExchanges)
    case other => remapExchanges(other)
  }
}
// Entry point for stage-level metric insertion: set up the stage id generator and the
// per-stage metrics map, then recurse starting from stage id 0.
private def insertStageLevelMetrics(plan: SparkPlan): Unit = {
  val sparkContext = SparkSession.active.sparkContext
  val stageIdGen = new AtomicInteger(0)
  val metricsByStage = mutable.Map.empty[Int, GpuTaskMetrics]
  insertStageLevelMetrics(sparkContext, plan, stageIdGen.getAndIncrement(), stageIdGen,
    metricsByStage)
}
/**
 * Recursively walks the plan assigning a shared GpuTaskMetrics instance to every
 * columnar GpuExec in the same stage. An Exchange marks a stage boundary, so each of
 * its children starts a new stage id from the generator.
 *
 * @param sc the active SparkContext used to register new metrics
 * @param plan the subtree being walked
 * @param currentStageId the stage id assigned to this subtree
 * @param stageIdGen generator for fresh stage ids at exchange boundaries
 * @param allMetrics cache of the metrics instance created for each stage id
 */
private def insertStageLevelMetrics(sc: SparkContext,
    plan: SparkPlan,
    currentStageId: Int,
    stageIdGen: AtomicInteger,
    allMetrics: mutable.Map[Int, GpuTaskMetrics]): Unit = {
  plan match {
    case shuffle: Exchange =>
      // An exchange is a stage boundary: each child subtree gets a fresh stage id.
      shuffle.children.foreach { child =>
        insertStageLevelMetrics(sc, child, stageIdGen.getAndIncrement(), stageIdGen, allMetrics)
      }
    case gpu: GpuExec if gpu.supportsColumnar =>
      // We only want to insert metrics for one of the execs per stage, but that can
      // have problems because we want it to be deserialized before any of the metrics
      // are used, but depending on how the iterators work, that might not happen, so to
      // be safe for now we are going to include it everywhere
      // getOrElseUpdate replaces the original hand-rolled getOrElse + put pattern.
      val metrics = allMetrics.getOrElseUpdate(currentStageId, {
        val newMetrics = new GpuTaskMetrics
        newMetrics.register(sc)
        newMetrics
      })
      gpu.setTaskMetrics(metrics)
      gpu.children.foreach { child =>
        insertStageLevelMetrics(sc, child, currentStageId, stageIdGen, allMetrics)
      }
    case other =>
      other.children.foreach { child =>
        insertStageLevelMetrics(sc, child, currentStageId, stageIdGen, allMetrics)
      }
  }
}
/**
 * Entry point for the transition-override rule. When SQL-on-GPU is enabled this runs
 * the full, order-dependent sequence of post-planning rewrites (transition insertion,
 * coalescing, AQE handling, test assertions, metric wiring); otherwise the plan is
 * returned untouched. The step ordering below is deliberate — do not reorder.
 */
override def apply(sparkPlan: SparkPlan): SparkPlan = GpuOverrideUtil.tryOverride { plan =>
this.rapidsConf = new RapidsConf(plan.conf)
if (rapidsConf.isSqlEnabled && rapidsConf.isSqlExecuteOnGPU) {
GpuOverrides.logDuration(rapidsConf.shouldExplain,
t => f"GPU plan transition optimization took $t%.2f ms") {
var updatedPlan = insertHashOptimizeSorts(plan)
updatedPlan = updateScansForInputAndOrder(updatedPlan)
if (rapidsConf.isFileScanPrunePartitionEnabled) {
updatedPlan = prunePartitionForFileSourceScan(updatedPlan)
}
// Insert row/columnar transitions, then batch coalescing on top of them.
updatedPlan = insertColumnarFromGpu(updatedPlan)
updatedPlan = insertCoalesce(updatedPlan)
// only insert shuffle coalesces when using normal shuffle
if (!GpuShuffleEnv.useGPUShuffle(rapidsConf)) {
updatedPlan = insertShuffleCoalesce(updatedPlan)
}
// AQE and non-AQE plans need different transition optimization strategies.
if (plan.conf.adaptiveExecutionEnabled) {
updatedPlan = optimizeAdaptiveTransitions(updatedPlan, None)
} else {
updatedPlan = optimizeGpuPlanTransitions(updatedPlan)
}
updatedPlan = fixupHostColumnarTransitions(updatedPlan)
updatedPlan = optimizeCoalesce(updatedPlan)
if (rapidsConf.shuffledHashJoinOptimizeShuffle) {
updatedPlan = shuffledHashJoinOptimizeShuffle(updatedPlan)
}
if (rapidsConf.exportColumnarRdd) {
updatedPlan = detectAndTagFinalColumnarOutput(updatedPlan)
}
// Test-only validation: everything that should be on the GPU actually is.
if (rapidsConf.isTestEnabled) {
assertIsOnTheGpu(updatedPlan, rapidsConf)
// Generate the canonicalized plan to ensure no incompatibilities.
// The plan itself is not currently checked.
updatedPlan.canonicalized
validateExecsInGpuPlan(updatedPlan, rapidsConf)
}
// Some distributions of Spark don't properly transform the plan after the
// plugin performs its final transformations of the plan. In this case, we
// need to apply any remaining rules that should have been applied.
updatedPlan = SparkShimImpl.applyPostShimPlanRules(updatedPlan)
updatedPlan = markGpuPlanningComplete(updatedPlan)
// Exchange-reuse fixup only makes sense when AQE and exchange reuse are both on.
if (rapidsConf.isAqeExchangeReuseFixupEnabled &&
plan.conf.adaptiveExecutionEnabled && plan.conf.exchangeReuseEnabled) {
updatedPlan = fixupAdaptiveExchangeReuse(updatedPlan)
}
if (rapidsConf.logQueryTransformations) {
logWarning(s"Transformed query:" +
s"\nOriginal Plan:\n$plan\nTransformed Plan:\n$updatedPlan")
}
// Metrics are wired last so they attach to the final shape of the plan.
insertStageLevelMetrics(updatedPlan)
updatedPlan
}
} else {
plan
}
}(sparkPlan)
}
object GpuTransitionOverrides {
  /**
   * Returning the underlying plan of a query stage, or the plan itself if it is not a
   * query stage. This method is typically used when we want to determine if a plan is
   * a GpuExec or not, and this gets hidden by the query stage wrapper.
   */
  def getNonQueryStagePlan(plan: SparkPlan): SparkPlan = {
    plan match {
      case bqse: BroadcastQueryStageExec =>
        // Typed pattern match replaces the original isInstanceOf/asInstanceOf casts.
        bqse.plan match {
          case reused: ReusedExchangeExec => reused.child
          case other => other
        }
      case sqse: ShuffleQueryStageExec =>
        sqse.plan match {
          case reused: ReusedExchangeExec => reused.child
          case other => other
        }
      case _ => plan
    }
  }

  /**
   * Check the Expression is or has Input File expressions.
   * @param exec expression to check
   * @return true or false
   */
  def checkHasInputFileExpressions(exec: Expression): Boolean = exec match {
    // The three input-file expression types share one result, so use a single
    // alternative pattern instead of three identical cases.
    case _: InputFileName | _: InputFileBlockStart | _: InputFileBlockLength => true
    case e => e.children.exists(checkHasInputFileExpressions)
  }
}