In [26]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
spark = (
    SparkSession
    .builder
    .appName("07_chap")
    .config("spark.sql.catalogImplementation", "hive")
    .getOrCreate()
    )
sc = spark.sparkContext

# Sort-Merge Sort

In [2]:
import random

spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

# Prepare mappings and RNG
states = {
    0: "AZ", 1: "CO", 2: "CA",
    3: "TX", 4: "NY", 5: "MI"
}
items = {
    0: "SKU-0", 1: "SKU-1", 2: "SKU-2",
    3: "SKU-3", 4: "SKU-4", 5: "SKU-5"
}
rnd = random.Random(42)

# Generate users data (uid, login, email, user_state)
users_data = [
    (
        uid,
        f"user_{uid}",
        f"user_{uid}@databricks.com",
        states[rnd.randint(0, 5)]
    )
    for uid in range(0, 1_000_001)
]

# Generate orders data
# (transaction_id, quantity, users_id, amount, state, items)
orders_data = [
    (
        tid,
        tid,                             # quantity = tid (to match the Scala example)
        rnd.randint(0, 9999),            # users_id
        10 * tid * 0.2,                  # amount
        states[rnd.randint(0, 5)],       # state
        items[rnd.randint(0, 5)]         # item
    )
    for tid in range(0, 1_000_001)
]

# Create DataFrames
usersDF = spark.createDataFrame(
    users_data,
    schema=["uid", "login", "email", "user_state"]
)

ordersDF = spark.createDataFrame(
    orders_data,
    schema=["transaction_id", "quantity", "users_id", "amount", "state", "items"]
)

# Perform the join
usersOrdersDF = ordersDF.join(
    usersDF,
    ordersDF.users_id == usersDF.uid
)

# Show results
usersOrdersDF.show(truncate=False)

25/05/17 18:36:52 WARN TaskSetManager: Stage 0 contains a task of very large size (2105 KiB). The maximum recommended task size is 1000 KiB.
25/05/17 18:36:56 WARN TaskSetManager: Stage 1 contains a task of very large size (3993 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+--------------+--------+--------+--------+-----+-----+---+------+---------------------+----------+
|transaction_id|quantity|users_id|amount  |state|items|uid|login |email                |user_state|
+--------------+--------+--------+--------+-----+-----+---+------+---------------------+----------+
|6183          |6183    |0       |12366.0 |TX   |SKU-0|0  |user_0|user_0@databricks.com|MI        |
|19031         |19031   |0       |38062.0 |CA   |SKU-3|0  |user_0|user_0@databricks.com|MI        |
|22870         |22870   |0       |45740.0 |NY   |SKU-1|0  |user_0|user_0@databricks.com|MI        |
|26533         |26533   |0       |53066.0 |MI   |SKU-1|0  |user_0|user_0@databricks.com|MI        |
|44187         |44187   |0       |88374.0 |MI   |SKU-5|0  |user_0|user_0@databricks.com|MI        |
|59757         |59757   |0       |119514.0|NY   |SKU-2|0  |user_0|user_0@databricks.com|MI        |
|67663         |67663   |0       |135326.0|CA   |SKU-3|0  |user_0|user_0@databricks.com|MI        |


In [3]:
usersOrdersDF.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [users_id#10L], [uid#0L], Inner
   :- Sort [users_id#10L ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(users_id#10L, 200), ENSURE_REQUIREMENTS, [plan_id=136]
   :     +- Filter isnotnull(users_id#10L)
   :        +- Scan ExistingRDD[transaction_id#8L,quantity#9L,users_id#10L,amount#11,state#12,items#13]
   +- Sort [uid#0L ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(uid#0L, 200), ENSURE_REQUIREMENTS, [plan_id=137]
         +- Filter isnotnull(uid#0L)
            +- Scan ExistingRDD[uid#0L,login#1,email#2,user_state#3]




# Broadcast

In [18]:
import random

spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

# Prepare mappings and RNG
states = {
    0: "AZ", 1: "CO", 2: "CA",
    3: "TX", 4: "NY", 5: "MI"
}
items = {
    0: "SKU-0", 1: "SKU-1", 2: "SKU-2",
    3: "SKU-3", 4: "SKU-4", 5: "SKU-5"
}
rnd = random.Random(42)

# Generate users data (uid, login, email, user_state)
users_data = [
    (
        uid,
        f"user_{uid}",
        f"user_{uid}@databricks.com",
        states[rnd.randint(0, 5)]
    )
    for uid in range(0, 1_000)
]

# Generate orders data
# (transaction_id, quantity, users_id, amount, state, items)
orders_data = [
    (
        tid,
        tid,                             # quantity = tid (to match the Scala example)
        rnd.randint(0, 9999),            # users_id
        10 * tid * 0.2,                  # amount
        states[rnd.randint(0, 5)],       # state
        items[rnd.randint(0, 5)]         # item
    )
    for tid in range(0, 1_000_001)
]

# Create DataFrames
usersDF = spark.createDataFrame(
    users_data,
    schema=["uid", "login", "email", "user_state"]
)

ordersDF = spark.createDataFrame(
    orders_data,
    schema=["transaction_id", "quantity", "users_id", "amount", "state", "items"]
)

# Perform the join
usersOrdersDF = ordersDF.join(
    usersDF,
    ordersDF.users_id == usersDF.uid
)

# Show results
usersOrdersDF.show(truncate=False)

25/05/17 18:43:12 WARN TaskSetManager: Stage 16 contains a task of very large size (2105 KiB). The maximum recommended task size is 1000 KiB.
[Stage 17:>                                                       (0 + 12) / 12]

+--------------+--------+--------+--------+-----+-----+---+------+---------------------+----------+
|transaction_id|quantity|users_id|amount  |state|items|uid|login |email                |user_state|
+--------------+--------+--------+--------+-----+-----+---+------+---------------------+----------+
|4256          |4256    |0       |8512.0  |CO   |SKU-4|0  |user_0|user_0@databricks.com|MI        |
|5772          |5772    |0       |11544.0 |CA   |SKU-3|0  |user_0|user_0@databricks.com|MI        |
|11253         |11253   |0       |22506.0 |MI   |SKU-2|0  |user_0|user_0@databricks.com|MI        |
|11359         |11359   |0       |22718.0 |CA   |SKU-5|0  |user_0|user_0@databricks.com|MI        |
|48657         |48657   |0       |97314.0 |CA   |SKU-2|0  |user_0|user_0@databricks.com|MI        |
|51286         |51286   |0       |102572.0|AZ   |SKU-1|0  |user_0|user_0@databricks.com|MI        |
|69723         |69723   |0       |139446.0|CO   |SKU-4|0  |user_0|user_0@databricks.com|MI        |


                                                                                

In [22]:
joinedDF = ordersDF.join(broadcast(usersDF), ordersDF.users_id == usersDF.uid)

In [24]:
joinedDF.show(truncate=False)

25/05/17 18:44:29 WARN TaskSetManager: Stage 22 contains a task of very large size (2105 KiB). The maximum recommended task size is 1000 KiB.
[Stage 22:>                                                         (0 + 1) / 1]

+--------------+--------+--------+------+-----+-----+---+--------+-----------------------+----------+
|transaction_id|quantity|users_id|amount|state|items|uid|login   |email                  |user_state|
+--------------+--------+--------+------+-----+-----+---+--------+-----------------------+----------+
|5             |5       |986     |10.0  |NY   |SKU-5|986|user_986|user_986@databricks.com|TX        |
|29            |29      |423     |58.0  |AZ   |SKU-1|423|user_423|user_423@databricks.com|NY        |
|45            |45      |888     |90.0  |CO   |SKU-4|888|user_888|user_888@databricks.com|MI        |
|53            |53      |224     |106.0 |NY   |SKU-3|224|user_224|user_224@databricks.com|MI        |
|79            |79      |251     |158.0 |MI   |SKU-1|251|user_251|user_251@databricks.com|AZ        |
|101           |101     |153     |202.0 |TX   |SKU-2|153|user_153|user_153@databricks.com|NY        |
|110           |110     |359     |220.0 |MI   |SKU-3|359|user_359|user_359@databri

25/05/17 18:44:33 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 22 (TID 103): Attempting to kill Python Worker
                                                                                

# Optimizing

In [29]:
usersDF \
    .orderBy(asc("uid")) \
    .write \
    .format("parquet") \
    .bucketBy(8, "uid") \
    .mode('overwrite') \
    .saveAsTable("UsersTbl")

# Orders: bucket by 8 on "users_id"
ordersDF \
    .orderBy(asc("users_id")) \
    .write \
    .format("parquet") \
    .bucketBy(8, "users_id") \
    .mode('overwrite') \
    .saveAsTable("OrdersTbl")

# 2. Cache the tables in memory
spark.sql("CACHE TABLE UsersTbl")
spark.sql("CACHE TABLE OrdersTbl")

# 3. Read them back in as DataFrames
usersBucketDF = spark.table("UsersTbl")
ordersBucketDF = spark.table("OrdersTbl")

# 4. Perform the bucket-aware join and show results
joinUsersOrdersBucketDF = ordersBucketDF.join(
    usersBucketDF,
    ordersBucketDF.users_id == usersBucketDF.uid
)

joinUsersOrdersBucketDF.show(truncate=False)

25/05/17 18:50:05 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/05/17 18:50:05 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/05/17 18:50:09 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
25/05/17 18:50:09 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore khodosevichleo@198.18.1.200
25/05/17 18:50:09 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
25/05/17 18:50:13 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
25/05/17 18:50:13 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
25/05/17 18:50:13 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/05/17 18:50

+--------------+--------+--------+--------+-----+-----+---+------+---------------------+----------+
|transaction_id|quantity|users_id|amount  |state|items|uid|login |email                |user_state|
+--------------+--------+--------+--------+-----+-----+---+------+---------------------+----------+
|20137         |20137   |2       |40274.0 |CO   |SKU-1|2  |user_2|user_2@databricks.com|AZ        |
|36842         |36842   |2       |73684.0 |CO   |SKU-5|2  |user_2|user_2@databricks.com|AZ        |
|41557         |41557   |2       |83114.0 |NY   |SKU-2|2  |user_2|user_2@databricks.com|AZ        |
|62786         |62786   |2       |125572.0|CO   |SKU-4|2  |user_2|user_2@databricks.com|AZ        |
|65482         |65482   |2       |130964.0|TX   |SKU-3|2  |user_2|user_2@databricks.com|AZ        |
|77367         |77367   |2       |154734.0|AZ   |SKU-4|2  |user_2|user_2@databricks.com|AZ        |
|79424         |79424   |2       |158848.0|MI   |SKU-5|2  |user_2|user_2@databricks.com|AZ        |


In [30]:
joinUsersOrdersBucketDF.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [users_id#449L], [uid#310L], Inner
   :- Sort [users_id#449L ASC NULLS FIRST], false, 0
   :  +- Filter isnotnull(users_id#449L)
   :     +- Scan In-memory table OrdersTbl [transaction_id#447L, quantity#448L, users_id#449L, amount#450, state#451, items#452], [isnotnull(users_id#449L)]
   :           +- InMemoryRelation [transaction_id#447L, quantity#448L, users_id#449L, amount#450, state#451, items#452], StorageLevel(disk, memory, deserialized, 1 replicas)
   :                 +- *(1) ColumnarToRow
   :                    +- FileScan parquet spark_catalog.default.orderstbl[transaction_id#447L,quantity#448L,users_id#449L,amount#450,state#451,items#452] Batched: true, Bucketed: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/khodosevichleo/Desktop/Weiterbildung/Spark/Learning-Spark-..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<transaction_id:bigint,quantity

In [31]:
spark.sql("DROP TABLE UsersTbl")
spark.sql("DROP TABLE OrdersTbl")

DataFrame[]

25/05/17 23:39:48 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1077134 ms exceeds timeout 120000 ms
25/05/17 23:39:48 WARN SparkContext: Killing executors is not supported by current scheduler.
25/05/17 23:56:00 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$