In [None]:
from pyspark.sql import SparkSession
from dotenv import load_dotenv
from pyspark.sql.types import LongType
from pyspark.sql import functions as F
import os
import time

# Print the "IPAddress" lines from `docker inspect` for the `minioserver` container.
# NOTE(review): presumably this is where the IP in the S3A endpoint setting comes from — confirm.
!docker inspect minioserver | grep IPAddress

# Same lookup for the `spark-master` container.
# NOTE(review): presumably the source of the IP in the Spark master URL — confirm.
!docker inspect spark-master | grep IPAddress

In [None]:
# Load variables from a .env file into the process environment.
load_dotenv()

# Get credentials from .env
MINIO_USER = os.getenv("MINIO_ROOT_USER")
MINIO_PASSWORD = os.getenv("MINIO_ROOT_PASSWORD")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY")

# Fail fast: os.getenv returns None for an unset variable, and a missing S3A
# key would otherwise only surface much later as an opaque Spark/S3 error.
_missing_minio_vars = [
    var_name
    for var_name, var_value in (
        ("MINIO_ACCESS_KEY", MINIO_ACCESS_KEY),
        ("MINIO_SECRET_KEY", MINIO_SECRET_KEY),
    )
    if not var_value
]
if _missing_minio_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_minio_vars)
        + ". Define them in .env or export them before running this notebook."
    )

In [None]:
# Connection targets. The defaults are the container IPs this notebook was
# written against; Docker can hand out different addresses after a restart,
# so both may be overridden through environment variables instead of editing
# the cell.
SPARK_MASTER_URL = os.getenv("SPARK_MASTER_URL", "spark://172.18.0.3:7077")
S3A_ENDPOINT_URL = os.getenv("S3A_ENDPOINT_URL", "http://172.18.0.2:9000")

# Jars that provide the S3A filesystem. Listed once and reused for every
# setting below so the three entries cannot drift apart.
S3A_JARS = [
    "./shared-data/hadoop-aws-3.3.4.jar",
    "./shared-data/aws-java-sdk-bundle-1.12.792.jar",
]
S3A_JARS_CSV = ",".join(S3A_JARS)    # spark.jars takes a comma-separated list
S3A_CLASSPATH = ":".join(S3A_JARS)   # classpath entries are colon-separated

# Create Spark session connecting to your Docker cluster
spark = (SparkSession.builder
    .appName("Partition_Performance_Test")
    .master(SPARK_MASTER_URL)  # Your Spark master in Docker
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.jars", S3A_JARS_CSV)
    .config("spark.driver.extraClassPath", S3A_CLASSPATH)
    .config("spark.executor.extraClassPath", S3A_CLASSPATH)  # executors need the jars too
    .config("spark.hadoop.fs.s3a.endpoint", S3A_ENDPOINT_URL)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    # Path-style access: the bucket name goes in the URL path, not the hostname.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # The default endpoint is plain http, so SSL is disabled.
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider",
            "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .getOrCreate())

print("✓ Spark session created!")
print(f"Spark version: {spark.version}")

In [None]:
# Both copies of the dataset live under the same prefix of the "datalake"
# bucket and differ only in the layout directory.
_RAW_DATA_PREFIX = "s3a://datalake/raw-data"
_DATASET_DIR = "nyc yellow taxi"

MINIO_UNPARTITIONED_TARGET_PATH = f"{_RAW_DATA_PREFIX}/unpartitioned/{_DATASET_DIR}"
MINIO_PARTITIONED_TARGET_PATH = f"{_RAW_DATA_PREFIX}/partitioned/{_DATASET_DIR}"

In [None]:
def create_monthly_revenue(df):
    """Aggregate trips into one row per pickup year and month.

    Args:
        df: DataFrame with ``tpep_pickup_datetime``, ``total_amount`` and
            ``trip_distance`` columns.

    Returns:
        DataFrame with columns ``year``, ``month``, ``total_trips``,
        ``total_revenue``, ``avg_fare`` (mean of ``total_amount``) and
        ``total_distance``, ordered by year then month.
    """
    pickup_col = "tpep_pickup_datetime"

    group_keys = [
        F.year(pickup_col).alias("year"),
        F.month(pickup_col).alias("month"),
    ]
    metrics = [
        F.count("*").alias("total_trips"),
        F.sum("total_amount").alias("total_revenue"),
        F.avg("total_amount").alias("avg_fare"),
        F.sum("trip_distance").alias("total_distance"),
    ]

    monthly = df.groupBy(*group_keys).agg(*metrics)
    return monthly.orderBy("year", "month")

In [None]:
def test_performance(path, name, filter_year=2023, filter_months=(6,), is_partitioned=False):
    """
    Test performance using NOOP write to isolate read/compute speed.

    Reads the Parquet data at ``path``, keeps the rows for the requested
    year/months, runs the monthly revenue aggregation and writes the result
    to the ``noop`` sink. The timed region covers the read, filter,
    aggregation and write; the query plan is printed afterwards so that plan
    rendering does not inflate the measurement.

    Args:
        path: Location of the Parquet dataset.
        name: Label printed in the test banner.
        filter_year: Year to keep.
        filter_months: Iterable of month numbers to keep.
        is_partitioned: When True, filter on the ``trip_year`` / ``trip_month``
            columns; otherwise derive year and month from
            ``tpep_pickup_datetime``.

    Returns:
        Elapsed time in seconds as a float.
    """
    # Column.isin only unpacks a single list/set argument, so normalise any
    # iterable (including the immutable tuple default) to a list.
    months = list(filter_months)

    print(f"\n{'='*60}")
    print(f"TEST: {name}")
    print(f"{'='*60}")

    # perf_counter is monotonic, unlike time.time(), so the interval cannot be
    # skewed by system clock adjustments.
    start = time.perf_counter()

    if is_partitioned:
        df = spark.read.option("basePath", path).parquet(path)

        df_filtered = df.filter(
            (F.col("trip_year") == filter_year) &
            (F.col("trip_month").isin(months))
        )
    else:
        df = spark.read.parquet(path)

        df_filtered = df.filter(
            (F.year("tpep_pickup_datetime") == filter_year) &
            (F.month("tpep_pickup_datetime").isin(months))
        )

    result_df = create_monthly_revenue(df_filtered)

    # The noop sink runs the whole query but discards the output.
    result_df.write.format("noop").mode("overwrite").save()

    duration = time.perf_counter() - start

    # Printed outside the timed region on purpose.
    result_df.explain()

    print(f"\n⏱️ Time Taken: {duration:.4f} seconds")

    return duration

# Warm Up

In [None]:
# Push a tiny throwaway job through the noop sink before the timed tests.
# NOTE(review): presumably so that first-job startup cost is not charged to
# the first benchmark — confirm.
warmup_df = spark.range(100)
warmup_df.write.format("noop").mode("overwrite").save()

# Unpartitioned vs. Partitioned

In [None]:
# Baseline: June 2023 query against the unpartitioned copy, filtering on
# year/month derived from the pickup timestamp.
t_unpartitioned = test_performance(
    MINIO_UNPARTITIONED_TARGET_PATH,
    "Unpartitioned (Full Scan)",
    filter_year=2023,
    filter_months=[6],
    is_partitioned=False,
)

In [None]:
# Same June 2023 query against the partitioned copy, filtering on the
# trip_year / trip_month columns.
t_partitioned = test_performance(
    MINIO_PARTITIONED_TARGET_PATH,
    "Partitioned (Directory Pruning)",
    filter_year=2023,
    filter_months=[6],
    is_partitioned=True,
)

# Summary of Results

In [None]:
# Report both timings side by side; the comparison is the point of the notebook.
print(f"\n{'='*60}")
print("FINAL RESULTS")
print(f"{'='*60}")
print(f"Unpartitioned Time: {t_unpartitioned:.4f}s")
print(f"Partitioned Time:   {t_partitioned:.4f}s")

# Guard the ratio against a zero-length measurement.
if t_partitioned > 0:
    print(f"Speedup:            {t_unpartitioned / t_partitioned:.2f}x")

In [None]:
# Shut down the Spark session now that the benchmarks are done.
spark.stop()