In [0]:
# Read the transactions CSV from DBFS, treating the first line as the header.
# No schema inference is requested, so every column is loaded as a string.
# NOTE(review): the preview shows values still wrapped in single quotes
# (e.g. 'M'); confirm whether the reader's `quote` option should be set.
df = spark.read.csv(
    "dbfs:/FileStore/tables/transactions.csv",
    header=True,
)
# Print the first rows as a quick sanity check of the load.
df.show()

+----+-------------+---+------+----------+-------------+-----------+-------------------+------+-----+
|step|     customer|age|gender|zipcodeOri|     merchant|zipMerchant|           category|amount|fraud|
+----+-------------+---+------+----------+-------------+-----------+-------------------+------+-----+
|   0|'C1093826151'|'4'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'|  4.55|    0|
|   0| 'C352968107'|'2'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 39.68|    0|
|   0|'C2054744914'|'4'|   'F'|   '28007'|'M1823072687'|    '28007'|'es_transportation'| 26.89|    0|
|   0|'C1760612790'|'3'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 17.25|    0|
|   0| 'C757503768'|'5'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 35.72|    0|
|   0|'C1315400589'|'3'|   'F'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 25.81|    0|
|   0| 'C765155274'|'1'|   'F'|   '28007'| 'M348934600'|    '28007'|'es_transporta

In [0]:
# Write the full DataFrame as headered CSV to the S3 prefix, replacing
# whatever is already stored there.
# NOTE(review): this is the same prefix stream_to_s3 writes its chunk_*/
# folders into; re-running this cell after streaming would presumably wipe
# those chunks -- confirm that is intended.
df.write.csv(
    "s3a://banktransactionskrnl1/chunks/",
    mode="overwrite",
    header=True,
)

In [0]:
from pyspark.sql import Row
from datetime import datetime
import time

def stream_to_s3(df, chunk_size=10000, max_chunks=None,
                 bucket="s3a://banktransactionskrnl1/chunks/"):
    """
    Write a Spark DataFrame to S3 as a sequence of CSV chunks.

    Rows are numbered with ``zipWithIndex`` and split into consecutive
    ranges of ``chunk_size`` rows. Each range is written as a single
    headered CSV part file under ``{bucket}chunk_{i}_{timestamp}/``, and the
    function sleeps one second after each write. (The pause is one second;
    the wall-clock gap between chunks also includes the write time.)

    Parameters:
    - df (Spark DataFrame): The input dataframe to chunk and stream
    - chunk_size (int): Number of rows per chunk; must be > 0 (default: 10000)
    - max_chunks (int or None): Maximum number of chunks to stream
      (default: None = stream all)
    - bucket (str): Destination prefix for the chunk folders; a trailing
      slash is added if missing (default: the original hard-coded prefix)

    Raises:
    - ValueError: if ``chunk_size`` is not positive.
    """
    # Fail fast with a clear message instead of a ZeroDivisionError (or a
    # silently wrong chunk count) further down.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Local import: only this function needs these types.
    from pyspark.sql.types import LongType, StructField, StructType

    print("Starting Mechanism X: Streaming to S3")

    # Helper column holding the row number. A private-looking name is used so
    # that an input column called "row_id" is not overwritten and dropped.
    index_col = "__row_id"

    # Reuse the input schema and append the index column explicitly. This
    # avoids schema inference, which fails on an empty RDD, and keeps the
    # original column order and types.
    indexed_schema = StructType(
        list(df.schema.fields) + [StructField(index_col, LongType(), False)]
    )
    df_with_index = (
        df.rdd.zipWithIndex()
        .map(lambda pair: tuple(pair[0]) + (pair[1],))
        .toDF(indexed_schema)
    )

    # Persist the indexed frame: without this every chunk filter below would
    # re-read the source and re-run zipWithIndex from scratch.
    df_with_index.persist()
    try:
        total = df_with_index.count()

        # Ceiling division: a trailing partial chunk still counts as a chunk.
        num_chunks = (total + chunk_size - 1) // chunk_size
        if max_chunks is not None:
            num_chunks = min(num_chunks, max_chunks)

        prefix = bucket.rstrip("/") + "/"
        row_id = df_with_index[index_col]

        for i in range(num_chunks):
            begin = i * chunk_size
            end = begin + chunk_size
            chunk_df = df_with_index.filter((row_id >= begin) & (row_id < end)).drop(index_col)

            timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
            output_path = f"{prefix}chunk_{i}_{timestamp}/"

            # repartition(1) so each chunk lands as a single CSV part file.
            chunk_df.repartition(1).write.option("header", True).mode("overwrite").csv(output_path)

            # Indices are contiguous 0..total-1, so the chunk size is known
            # arithmetically; no extra Spark job is needed to count the rows.
            rows_in_chunk = min(chunk_size, total - begin)
            print(f"[{timestamp}] ✅ Uploaded chunk {i} with {rows_in_chunk} rows to {output_path}")
            time.sleep(1)
    finally:
        # Release the cached data even if a write fails part-way through.
        df_with_index.unpersist()

    print("✅ All chunks uploaded. Mechanism X complete.")


In [0]:
# Reload the transactions CSV (first line is the header; all columns are
# read as strings because no schema inference is requested).
df = spark.read.csv(
    "dbfs:/FileStore/tables/transactions.csv",
    header=True,
)

# Kick off Mechanism X: push the data to S3 in 10,000-row chunks.
stream_to_s3(df, chunk_size=10000)


Starting Mechanism X: Streaming to S3
[20250605035607] ✅ Uploaded chunk 0 with 10000 rows to s3a://banktransactionskrnl1/chunks/chunk_0_20250605035607/
[20250605035643] ✅ Uploaded chunk 1 with 10000 rows to s3a://banktransactionskrnl1/chunks/chunk_1_20250605035643/
[20250605035718] ✅ Uploaded chunk 2 with 10000 rows to s3a://banktransactionskrnl1/chunks/chunk_2_20250605035718/
[20250605035754] ✅ Uploaded chunk 3 with 10000 rows to s3a://banktransactionskrnl1/chunks/chunk_3_20250605035754/
[20250605035829] ✅ Uploaded chunk 4 with 10000 rows to s3a://banktransactionskrnl1/chunks/chunk_4_20250605035829/
[20250605035904] ✅ Uploaded chunk 5 with 10000 rows to s3a://banktransactionskrnl1/chunks/chunk_5_20250605035904/
[20250605035939] ✅ Uploaded chunk 6 with 10000 rows to s3a://banktransactionskrnl1/chunks/chunk_6_20250605035939/
[20250605040015] ✅ Uploaded chunk 7 with 10000 rows to s3a://banktransactionskrnl1/chunks/chunk_7_20250605040015/
[20250605040051] ✅ Uploaded chunk 8 with 10000 row