## Importing Libraries

In [0]:
from datetime import datetime, timedelta
import time
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, count
from pyspark.sql.functions import expr, current_timestamp, rand, randn, lit, datediff, date_sub
from pyspark.sql.types import IntegerType
import numpy as np
np.random.seed(42)

### Spark memory usage

In [0]:
def generate_synthetic_data_spark(num_records, seed=None):
    """Build a lazy Spark DataFrame of synthetic transaction records.

    Columns produced (in addition to the ``id`` column from ``spark.range``):

    * ``customer_id``      -- integer in 1..9999 (uniform draw, truncated).
    * ``category``         -- one of five fixed category names, picked uniformly.
    * ``amount``           -- normal draw scaled to mean 100 / std-dev 50; values
      are not clipped, so negative amounts can occur.
    * ``transaction_date`` -- the current date minus 0..364 days.

    Args:
        num_records: Number of rows to generate.
        seed: Optional integer seed. When ``None`` (the default) every random
            column is unseeded, exactly as before. When given, each column
            gets its own seed derived from it (``seed``, ``seed + 1``, ...) so
            the columns stay independent of one another while the draws are
            repeatable. NOTE(review): Spark's seeded ``rand``/``randn`` are
            only repeatable while the partitioning stays the same -- confirm
            for your cluster. ``transaction_date`` still depends on the day
            the job runs.

    Returns:
        A Spark DataFrame; nothing is computed until an action is called on it.
    """
    if seed is not None:
        seed = int(seed)

    def column_seed(offset):
        # Distinct seed per column: reusing one seed would make the columns
        # perfectly correlated with each other.
        return None if seed is None else seed + offset

    # The category pick is a SQL expression, so the seed has to be rendered
    # into the SQL text rather than passed as a Python argument.
    category_seed = column_seed(1)
    category_rand_sql = "rand()" if category_seed is None else f"rand({category_seed})"
    category_sql = (
        "array('Electronics', 'Clothing', 'Food', 'Books', 'Home')"
        f"[cast({category_rand_sql} * 5 as int)]"
    )

    return (
        spark.range(num_records)
        .withColumn("customer_id", (rand(column_seed(0)) * 9999 + 1).cast(IntegerType()))
        .withColumn("category", expr(category_sql))
        .withColumn("amount", randn(column_seed(2)) * 50 + 100)
        .withColumn(
            "transaction_date",
            date_sub(current_timestamp(), (rand(column_seed(3)) * 365).cast("int")),
        )
    )


def spark_analysis(df: pyspark.sql.dataframe.DataFrame):
    """Summarise ``amount`` per ``category`` with Spark.

    Prints the number of partitions backing ``df``, then returns a DataFrame
    with one row per category holding the average amount (``mean``) and the
    row count (``count``). No action is called on the result here.
    """
    partition_total = df.rdd.getNumPartitions()
    print(f'Number of partitions: {partition_total}')

    per_category = df.groupby('category')
    summary_columns = [
        avg('amount').alias('mean'),
        count('*').alias('count'),
    ]
    return per_category.agg(*summary_columns)

In [0]:
# Sanity check: generate a small 100-row sample and print it to inspect the
# generated columns before running the large benchmark sizes.
df = generate_synthetic_data_spark(100)
df.show()

+---+-----------+-----------+-------------------+----------------+
| id|customer_id|   category|             amount|transaction_date|
+---+-----------+-----------+-------------------+----------------+
|  0|       1339|   Clothing| 63.247145742748486|      2024-04-28|
|  1|       8958|       Home|  143.3590236152635|      2024-04-20|
|  2|       2250|Electronics| 139.92614709273062|      2024-12-13|
|  3|       2102|   Clothing| 175.57782730903995|      2024-04-28|
|  4|        416|      Books| 100.19031133891677|      2025-02-15|
|  5|       7585|       Food|-26.609133469386734|      2024-09-09|
|  6|       2999|       Food| 25.591294385776024|      2024-03-01|
|  7|        677|       Food|  168.7696468161506|      2024-11-29|
|  8|       7132|      Books| 101.39833832202068|      2024-04-26|
|  9|       1830|      Books| 56.913955594264166|      2024-04-21|
| 10|       6407|      Books| 15.117522767188902|      2025-02-03|
| 11|       3209|   Clothing| 101.68086740885728|      2024-05

In [0]:
# Benchmark the per-category aggregation at increasing dataset sizes.
sizes = [10_000_000, 100_000_000, 1_000_000_000] # 10M, 100M, 1B
for size in sizes:
    print(f"\nGenerating {size:,} records...")
    # Label the jobs triggered below so they are easy to find in the Spark UI.
    spark.sparkContext.setJobDescription(f"Generating {size:,} records")
    print(f"Running Spark analysis... for {size} records")
    spark_df = generate_synthetic_data_spark(size)
    spark_result = spark_analysis(spark_df)
    print("Spark Results:")
    # Generation and aggregation are lazy; show() is the action that actually
    # executes them, so timing it covers the whole job.
    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # which follows the wall clock and can jump if the system time is adjusted.
    start = time.perf_counter()
    spark_result.show()
    end = time.perf_counter()
    print(f"Spark analysis completed in {end - start:.3f} seconds")


Generating 10,000,000 records...
Running Spark analysis... for 10000000 records
Number of partitions: 4
Spark Results:
+-----------+------------------+-------+
|   category|              mean|  count|
+-----------+------------------+-------+
|       Home| 99.98281523812383|1999345|
|       Food| 99.99680065397315|1999161|
|Electronics|100.03276193670644|2001755|
|   Clothing| 99.97572706591164|2000398|
|      Books| 99.99831397194481|1999341|
+-----------+------------------+-------+

Spark analysis completed in 0.533 seconds

Generating 100,000,000 records...
Running Spark analysis... for 100000000 records
Number of partitions: 4
Spark Results:
+-----------+------------------+--------+
|   category|              mean|   count|
+-----------+------------------+--------+
|       Home|100.00026971164223|19999857|
|       Food|100.00178138044673|20000552|
|Electronics| 99.99485588940858|19997703|
|   Clothing| 100.0176901641435|20007582|
|      Books|  99.9901987618742|19994306|
+---------