## Importing Libraries

In [0]:
from datetime import datetime, timedelta
import time
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, count
from pyspark.sql.functions import expr, current_timestamp, rand, randn, lit, datediff, date_sub
from pyspark.sql.types import IntegerType
import numpy as np
np.random.seed(42)

### Spark memory usage

In [0]:
def generate_synthetic_data_spark(num_records, seed=None):
    """Build a lazy Spark DataFrame of synthetic transaction records.

    Only transformations are chained here; no Spark job runs until an action
    is invoked on the returned DataFrame.

    Columns produced (in addition to ``id`` from ``spark.range``):
        customer_id: integer derived from ``rand() * 9999 + 1``.
        category: one of 'Electronics', 'Clothing', 'Food', 'Books', 'Home'.
        amount: ``randn() * 50 + 100`` (may be negative).
        transaction_date: current timestamp minus 0-364 days.

    Args:
        num_records: Number of rows to generate.
        seed: Optional integer seed for the Spark random generators. Seeding
            NumPy does not affect Spark's ``rand``/``randn``, so pass a seed
            here for repeatable data. Each column gets its own derived seed so
            the columns are not correlated with each other. With ``None``
            (the default) Spark chooses the seeds, as before.
            NOTE(review): repeatability also assumes the same partitioning
            between runs - confirm for your cluster configuration.

    Returns:
        The generated (unevaluated) DataFrame.
    """
    def _column_seed(offset):
        # A distinct seed per column keeps seeded columns independent.
        return None if seed is None else int(seed) + offset

    category_seed = _column_seed(1)
    category_rand = "rand()" if category_seed is None else f"rand({category_seed})"
    category_sql = (
        "array('Electronics', 'Clothing', 'Food', 'Books', 'Home')"
        f"[cast({category_rand} * 5 as int)]"
    )

    return (
        spark.range(num_records)
        .withColumn("customer_id", (rand(_column_seed(0)) * 9999 + 1).cast(IntegerType()))
        .withColumn("category", expr(category_sql))
        .withColumn("amount", randn(_column_seed(2)) * 50 + 100)
        .withColumn(
            "transaction_date",
            date_sub(current_timestamp(), (rand(_column_seed(3)) * 365).cast("int")),
        )
    )


def spark_analysis(df: pyspark.sql.dataframe.DataFrame):
    """Aggregate ``amount`` per ``category`` and time the Spark job.

    Spark transformations are lazy, so merely building the ``groupby``/``agg``
    plan performs no work. The aggregate is therefore cached and forced with
    an action inside the timed region, so the reported time covers the actual
    computation rather than just plan construction.

    Args:
        df: DataFrame with at least ``category`` and ``amount`` columns.

    Returns:
        A ``(result, processing_time)`` tuple. ``result`` is the cached
        DataFrame with columns ``category``, ``mean`` and ``count``;
        ``processing_time`` is the elapsed wall-clock time in seconds.
    """
    print(f'Number of partitions: {df.rdd.getNumPartitions()}')
    # perf_counter is monotonic, so the interval is immune to clock changes.
    start_time = time.perf_counter()

    # Calculate average amount and row count by category
    result = df.groupby('category').agg(
        avg('amount').alias('mean'),
        count('*').alias('count')
    ).cache()
    # Action: executes the job and fills the cache, so callers can display
    # the (tiny) result afterwards without recomputing the whole input.
    result.count()

    processing_time = time.perf_counter() - start_time
    return result, processing_time

In [0]:
# Preview the generator output on a small sample before the large runs.
df = generate_synthetic_data_spark(num_records=100)
# show() is an action: it runs the job and prints the first rows.
df.show()

In [0]:
# Run the category aggregation at increasing input sizes and report timings.
sizes = [10_000_000, 100_000_000, 1_000_000_000] # 10M, 100M, 1B
for size in sizes:
    print(f"\nGenerating {size:,} records...")
    # Attach a human-readable description to the jobs submitted from here on.
    spark.sparkContext.setJobDescription(f"Generating {size:,} records")
    print(f"Running Spark analysis... for {size} records")
    # Only defines the DataFrame; the generator chains transformations
    # without calling an action.
    spark_df = generate_synthetic_data_spark(size)
    # Returns the aggregated DataFrame and the elapsed time in seconds.
    spark_result, spark_time = spark_analysis(spark_df)
    print(f"Spark processing time for {size:,} records: {spark_time:.2f} seconds")
    print("Spark Results:")