## Importing Libraries

In [0]:
from datetime import datetime, timedelta
import time
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, count
from pyspark.sql.functions import expr, current_timestamp, rand, randn, lit, datediff, date_sub
from pyspark.sql.types import IntegerType
import numpy as np
np.random.seed(42)

### Spark memory usage

In [0]:
def generate_synthetic_data_spark(num_records):

   return spark.range(num_records) \
       .withColumn("customer_id", (rand() * 9999 + 1).cast(IntegerType())) \
       .withColumn("category", expr("array('Electronics', 'Clothing', 'Food', 'Books', 'Home')[cast(rand() * 5 as int)]")) \
       .withColumn("amount", randn() * 50 + 100) \
       .withColumn("transaction_date", date_sub(current_timestamp(), (rand() * 365).cast("int")))


def spark_analysis(df: pyspark.sql.dataframe.DataFrame):
    """Perform analysis using Spark"""
    print(f'Number of partitions: {df.rdd.getNumPartitions()}')
    start_time = time.time()
    
    # Calculate average amount by category
    result = df.groupby('category').agg(
        avg('amount').alias('mean'),
        count('*').alias('count')
    )
    end_time = time.time()
    processing_time = end_time - start_time
    return result, processing_time

In [0]:
df = generate_synthetic_data_spark(100)
df.show()

+---+-----------+-----------+------------------+----------------+
| id|customer_id|   category|            amount|transaction_date|
+---+-----------+-----------+------------------+----------------+
|  0|       8143|       Home| 98.13690682012054|      2024-04-18|
|  1|       6696|Electronics| 128.2076472286725|      2024-12-26|
|  2|       5001|Electronics|167.32913691201458|      2024-11-27|
|  3|       6369|      Books|131.55744868111262|      2024-10-30|
|  4|       4789|      Books|62.727262645812004|      2024-12-27|
|  5|       7190|      Books|101.35237257201412|      2024-03-22|
|  6|       8880|       Food| 75.03604431763159|      2024-10-03|
|  7|       8639|       Home|  211.166402805631|      2024-07-06|
|  8|       4540|       Food|112.22600697123166|      2025-01-23|
|  9|       1199|       Food|20.123007696566873|      2024-08-18|
| 10|       1099|       Food| 46.81029026244269|      2024-06-01|
| 11|       3228|      Books|58.770660685041726|      2024-10-08|
| 12|     

In [0]:
sizes = [10_000_000, 100_000_000, 1_000_000_000] # 10M, 100M, 1B
for size in sizes:
    print(f"\nGenerating {size:,} records...")
    spark.sparkContext.setJobDescription(f"Generating {size:,} records")
    print(f"Running Spark analysis... for {size} records")
    spark_df = generate_synthetic_data_spark(size)
    spark_result, spark_time = spark_analysis(spark_df)
    print(f"Spark processing time for {size:,} records: {spark_time:.2f} seconds")
    print("Spark Results:")


Generating 10,000,000 records...
Running Spark analysis... for 10000000 records
Number of partitions: 4
Spark processing time for 10,000,000 records: 0.55 seconds
Spark Results:

Generating 100,000,000 records...
Running Spark analysis... for 100000000 records
Number of partitions: 4
Spark processing time for 100,000,000 records: 0.19 seconds
Spark Results:

Generating 1,000,000,000 records...
Running Spark analysis... for 1000000000 records
Number of partitions: 4
Spark processing time for 1,000,000,000 records: 0.58 seconds
Spark Results:
