In [1]:
# Turn off automatic broadcast joins (a threshold of -1 disables the feature).
# NOTE(review): presumably done so the bucketed join in the next cell is not
# planned as a broadcast join -- confirm this is still the intent.
spark.conf.set(key="spark.sql.autoBroadcastJoinThreshold", value="-1")

In [2]:
from pyspark.sql.functions import col

# ---------------- Session configuration ---------------- #
# Let Spark try to eliminate the shuffle by using the partitioning reported
# by a compatible V2 data source.
spark.conf.set(key="spark.sql.sources.v2.bucketing.enabled", value="true")

# Retain partitioning info during query planning, so that unnecessary
# repartitioning (and the shuffle cost that comes with it) is avoided.
spark.conf.set(key="spark.sql.iceberg.planning.preserve-data-grouping", value="true")

# Related options that are NOT set in this cell (kept here for reference):
#
#   spark.sql.sources.v2.bucketing.pushPartValues.enabled  (candidate value: "true")
#       Try to eliminate the shuffle when one side of the join is missing
#       partition values that exist on the other side.
#
#   spark.sql.requireAllClusterKeysForCoPartition  (candidate value: "false")
#       When true, the join or MERGE keys must be the same as, and in the same
#       order as, the partition keys for the shuffle to be eliminated; setting
#       it to "false" would relax that requirement.
#
#   spark.sql.sources.v2.bucketing.allowJoinKeysSubsetOfPartitionKeys.enabled
#       Try to avoid the shuffle when the join condition does not include all
#       partition columns.


# Load both bucketed tables and keep only rows whose match_id is present.
bucketed_match_details_df = (
    spark.read.table("bootcamp.bucketed_match_details")
    .where(col("match_id").isNotNull())
)
bucketed_matches_df = (
    spark.read.table("bootcamp.bucketed_matches")
    .where(col("match_id").isNotNull())
)

# # Check for skewness in match_details_df
# match_details_skewness = bucketed_match_details_df.groupBy("match_id").count().orderBy("count", ascending=False)
# match_details_skewness.show()

# # Check for skewness in matches_df
# matches_skewness = bucketed_matches_df.groupBy("match_id").count().orderBy("count", ascending=False)
# matches_skewness.show()

# # Repartition skewed data
# bucketed_match_details_df = bucketed_match_details_df.repartition(16, col("match_id"))
# bucketed_matches_df = bucketed_matches_df.repartition(16, col("match_id"))

# Inner-join match details (alias "md") to matches (alias "m") on match_id and
# project the columns of interest; match_id and mapid are renamed on the way out.
match_details_with_matches_df = (
    bucketed_match_details_df.alias("md")
    .join(
        other=bucketed_matches_df.alias("m"),
        on=col("md.match_id") == col("m.match_id"),
        how="inner",
    )
    .select(
        col("md.match_id").alias("md_match_id"),
        "md.player_gamertag",
        "md.player_total_kills",
        "m.playlist_id",
        col("m.mapid").alias("map_id"),
    )
)

# Print the formatted plan so the join strategy can be inspected.
match_details_with_matches_df.explain("FORMATTED")

# Run the count as the action that executes the join. Only this call is
# guarded: a failure here is printed rather than raised, while errors from the
# table reads or from explain() above are not caught by this handler.
try:
    result_count = match_details_with_matches_df.count()
except Exception as count_error:
    print("Error during execution:", count_error)
else:
    print(f"Count Result: {result_count}")


AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `bootcamp`.`bucketed_match_details` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.;
'UnresolvedRelation [bootcamp, bucketed_match_details], [], false
