# Setup

In [1]:
# SPARK_START_FROM_SCRATCH = True
# DOCKER_INTERNAL_HOST = "host.docker.internal"

# SPARK_VPN_SELF_HOSTNAME = "mavasbel.vpn.itam.mx"
# SPARK_VPN_SELF_IP = "10.15.20.2"
# SPARK_WORKDIR = "/opt/spark/work-dir"
# SPARK_JUPYTER_DOCKER_TAG = "spark-jupyter:python3"
# SPARK_JOB_VENV_DOCKER_TAG = "spark-job-venv:python3"
# SPARK_JOB_VENV_BUILD_DIR = "/opt/spark/venv-build"

# SPARK_MASTER_PORT = 6077
# SPARK_MASTER_WUBUI_PORT = 6080

# JUPYTER_PORT = 6888
# JUPYTER_MONITOR_PORT = 4040
# JUPYTER_TOKEN = ""

In [None]:
SPARK_START_FROM_SCRATCH = True
DOCKER_INTERNAL_HOST = "host.docker.internal"
DOCKER_DNS = ["10.15.20.1"]

SPARK_JUPYTER_LAB_DOCKER_TAG = "spark-jupyter:python3"
SPARK_JOB_VENV_DOCKER_TAG = "spark-job-venv:python3"
SPARK_JOB_VENV_BUILD_DIR = "/opt/spark/venv-build"

SPARK_MASTER_NAME = "spark-master"
SPARK_MASTER_HOSTNAME = f"{SPARK_MASTER_NAME}.mavasbel.vpn.itam.mx"
SPARK_MASTER_IP = "10.15.20.2"
SPARK_MASTER_WUBUI_PORT = 6080
SPARK_MASTER_PORT = 6077

SPARK_TOTAL_WORKERS = 3
SPARK_WORKER_NAMES = [f"spark-worker-{i+1}" for i in range(SPARK_TOTAL_WORKERS)]
SPARK_WORKER_HOSTNAMES = [
    f"{SPARK_WORKER_NAMES[i]}.mavasbel.vpn.itam.mx" for i in range(SPARK_TOTAL_WORKERS)
]
SPARK_WORKER_IPS = ["10.15.20.2"] * SPARK_TOTAL_WORKERS
SPARK_WORKER_WEBUI_PORTS = [6080 + (i + 1) for i in range(SPARK_TOTAL_WORKERS)]

SPARK_WORKDIR = "/opt/spark/work-dir"

JUPYTER_LAB_NAME = "spark-jupyter"
JUPYTER_LAB_HOSTNAME = "spark-jupyter.mavasbel.vpn.itam.mx"
JUPYTER_LAB_IP = "10.15.20.2"
JUPYTER_LAB_PORT = 6888
JUPYTER_LAB_MONITOR_PORT = 4040
JUPYTER_LAB_TOKEN = ""

In [3]:
import os
from pathlib import Path

SPARK_DATADIR = Path(os.path.join(os.path.abspath(Path.cwd()), "data"))
SPARK_DATADIR.mkdir(parents=True, exist_ok=True)

In [4]:
!pip install faker

Collecting faker
  Downloading faker-40.1.0-py3-none-any.whl (2.0 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.0/2.0 MB[0m [31m54.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
Installing collected packages: faker
Successfully installed faker-40.1.0
[0m

##### Cleaning Spark context

In [5]:
from pyspark import SparkContext

# Forcefully kill any existing or "ghost" contexts
try:
    sc = SparkContext.getOrCreate()
    sc.stop()
    print("üßπ Ghost SparkContext cleaned up.")
except Exception:
    print("‚ú® No existing SparkContext to clean.")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/10 08:12:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


üßπ Ghost SparkContext cleaned up.


# Spark session

In [6]:
import sys
from pyspark.sql import SparkSession
from datetime import datetime

spark = (
    SparkSession.builder.master(
        f"spark://{SPARK_MASTER_HOSTNAME}:{SPARK_MASTER_PORT}"
    )
    .appName(f"SparkLab_{datetime.now().strftime('%Y-%m-%d_%H:%M:%S.%f')}")
    .config("spark.archives", f"{SPARK_WORKDIR}/spark_job_env.tar.gz#environment")
    .config("spark.driver.host", f"{JUPYTER_LAB_HOSTNAME}")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.driver.memory", "512m")
    .config("spark.executorEnv.PYSPARK_PYTHON", "./environment/bin/python3")
    .config("spark.executor.memory", "1G")
    .config(
        "spark.executorEnv.PYTHONPATH",
        f"./environment/lib/python{'.'.join(str(n) for n in sys.version_info[:2])}/site-packages",
    )
    .getOrCreate()
)

print("‚úÖ Spark Session is now active.")

‚úÖ Spark Session is now active.


# Data generation

In [7]:
total_rows = 10000
partitions = 10


def batch_generator(ids):
    import socket
    import random
    from faker import Faker

    node_name = socket.gethostname()
    faker = Faker()
    for _ in ids:
        yield (
            faker.uuid4(),
            node_name,
            faker.date_time(),
            faker.first_name(),
            faker.last_name(),
            faker.email(),
            faker.basic_phone_number(),
            random.random() * 1000.0,
        )


df_column_names = [
    "id",
    "worker",
    "timestamp",
    "first_name",
    "last_name",
    "email",
    "phone",
    "amount",
]
df_column_types = spark.createDataFrame(
    list(batch_generator(range(1))), schema=df_column_names
).schema
print(f"‚úÖ batch_generator schema: {df_column_types}")

‚úÖ batch_generator schema: StructType([StructField('id', StringType(), True), StructField('worker', StringType(), True), StructField('timestamp', TimestampType(), True), StructField('first_name', StringType(), True), StructField('last_name', StringType(), True), StructField('email', StringType(), True), StructField('phone', StringType(), True), StructField('amount', DoubleType(), True)])


In [8]:
from pyspark.sql import functions as F
from IPython.display import Markdown, display

df = spark.createDataFrame(
    list(batch_generator(range(total_rows))), df_column_names
).repartition(partitions)
df.write.mode("overwrite").csv(f"{SPARK_DATADIR}/faker.csv")
print(f"‚úÖ Created {SPARK_DATADIR}/faker.csv")


partition_stats = (
    df.withColumn("partition_id", F.spark_partition_id())
    .groupBy("worker", "partition_id")
    .count()
    .orderBy("worker", "partition_id")
)
# partition_stats.show()
display(partition_stats.toPandas())


                                                                                

‚úÖ Created /opt/spark/work-dir/data/faker.csv


Unnamed: 0,worker,partition_id,count
0,spark-jupyter,0,1002
1,spark-jupyter,1,1002
2,spark-jupyter,2,999
3,spark-jupyter,3,998
4,spark-jupyter,4,999
5,spark-jupyter,5,999
6,spark-jupyter,6,1000
7,spark-jupyter,7,999
8,spark-jupyter,8,1000
9,spark-jupyter,9,1002


In [9]:
from IPython.display import Markdown, display

rdd = spark.sparkContext.parallelize(range(total_rows), partitions).mapPartitions(
    batch_generator
)
df = rdd.toDF(df_column_names)
df.write.mode("overwrite").parquet(f"{SPARK_DATADIR}/faker.parquet")
print(f"‚úÖ Created {SPARK_DATADIR}/faker.parquet")


partition_stats = (
    df.withColumn("partition_id", F.spark_partition_id())
    .groupBy("worker", "partition_id")
    .count()
    .orderBy("worker", "partition_id")
)
# partition_stats.show()
display(partition_stats.toPandas())

                                                                                

‚úÖ Created /opt/spark/work-dir/data/faker.parquet


                                                                                

Unnamed: 0,worker,partition_id,count
0,spark-worker-1.mavasbel.vpn.itam.mx,2,1000
1,spark-worker-1.mavasbel.vpn.itam.mx,5,1000
2,spark-worker-1.mavasbel.vpn.itam.mx,9,1000
3,spark-worker-2.mavasbel.vpn.itam.mx,1,1000
4,spark-worker-2.mavasbel.vpn.itam.mx,4,1000
5,spark-worker-2.mavasbel.vpn.itam.mx,6,1000
6,spark-worker-3.mavasbel.vpn.itam.mx,0,1000
7,spark-worker-3.mavasbel.vpn.itam.mx,3,1000
8,spark-worker-3.mavasbel.vpn.itam.mx,7,1000
9,spark-worker-3.mavasbel.vpn.itam.mx,8,1000


In [10]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import pandas_udf
import pandas as pd


@pandas_udf(df_column_types)
def generate_batch_vectorized(batch_ser: pd.Series) -> pd.DataFrame:
    return pd.DataFrame(list(batch_generator(batch_ser)))


df: DataFrame = (
    spark.range(total_rows, numPartitions=partitions)
    .withColumn("data", generate_batch_vectorized("id"))
    .select("data.*")
)
df.write.mode("overwrite").parquet(f"{SPARK_DATADIR}/faker_vectorized.parquet")
print(f"‚úÖ Created {SPARK_DATADIR}/faker_vectorized.parquet")


partition_stats = (
    df.withColumn("partition_id", F.spark_partition_id())
    .groupBy("worker", "partition_id")
    .count()
    .orderBy("worker", "partition_id")
)
# partition_stats.show()
display(partition_stats.toPandas())

                                                                                

‚úÖ Created /opt/spark/work-dir/data/faker_vectorized.parquet


                                                                                

Unnamed: 0,worker,partition_id,count
0,spark-worker-1.mavasbel.vpn.itam.mx,2,1000
1,spark-worker-1.mavasbel.vpn.itam.mx,5,1000
2,spark-worker-2.mavasbel.vpn.itam.mx,1,1000
3,spark-worker-2.mavasbel.vpn.itam.mx,4,1000
4,spark-worker-2.mavasbel.vpn.itam.mx,8,1000
5,spark-worker-2.mavasbel.vpn.itam.mx,9,1000
6,spark-worker-3.mavasbel.vpn.itam.mx,0,1000
7,spark-worker-3.mavasbel.vpn.itam.mx,3,1000
8,spark-worker-3.mavasbel.vpn.itam.mx,6,1000
9,spark-worker-3.mavasbel.vpn.itam.mx,7,1000


In [11]:
from IPython.display import Markdown, display
from pyspark.sql import functions as F

# Read it back and check the schema/count
df_verify = spark.read.parquet(f"{SPARK_DATADIR}/faker_vectorized.parquet").repartition(partitions)
print(f"Generated rows: {df_verify.count()}")

print("\nFirst 10 by timestamp desc:")
# df_verify.sort(F.col("timestamp").desc()).show(10)
display(df_verify.sort(F.col("timestamp").desc()).toPandas())

print("\nFirst 10 by count(first_name) desc:")
# df_verify.groupBy("first_name").count().sort(F.col("count").desc()).show(10)
display(df_verify.groupBy("first_name").count().sort(F.col("count").desc()).toPandas())

Generated rows: 10000

First 10 by timestamp desc:


Unnamed: 0,id,worker,timestamp,first_name,last_name,email,phone,amount
0,3aa5625b-67d9-4a08-ae85-84b41ab555a1,spark-worker-2.mavasbel.vpn.itam.mx,2026-01-10 04:51:45.993281,David,Evans,aanderson@example.org,2965689995,824.042507
1,19f6cf3c-7f78-416d-ba4b-dc6825d511e2,spark-worker-3.mavasbel.vpn.itam.mx,2026-01-06 18:46:54.507877,Lindsey,Mason,cstephenson@example.net,(298)365-0281,601.962074
2,e8bd74cb-6806-4162-a94b-ad718ee0a915,spark-worker-1.mavasbel.vpn.itam.mx,2026-01-06 17:04:24.422867,David,Carter,ashleyallen@example.com,(324)291-7800,576.942374
3,12f8f999-a439-476c-b0f6-ab15318b6797,spark-worker-1.mavasbel.vpn.itam.mx,2026-01-06 13:54:12.045779,Jeffrey,Webb,cheryl16@example.net,917-486-3729,109.516686
4,0bfab5cb-b79a-4391-aba5-872aca6cdc13,spark-worker-2.mavasbel.vpn.itam.mx,2026-01-05 18:49:17.807191,Jessica,Park,christinasmith@example.net,8275063145,226.584842
...,...,...,...,...,...,...,...,...
9995,6ccc4a22-92a8-4059-997a-afe4e23ffb2f,spark-worker-1.mavasbel.vpn.itam.mx,1970-01-08 19:23:16.265615,Lucas,Kemp,robinsonsharon@example.org,452-439-2776,878.816036
9996,7e8d4d19-2752-4ef4-abb6-bd3b7aad257b,spark-worker-2.mavasbel.vpn.itam.mx,1970-01-07 08:57:07.396668,Sharon,Schmidt,james38@example.com,(572)901-4300,796.166936
9997,f71e4b3a-4541-427d-8c0f-b9a01adc3336,spark-worker-3.mavasbel.vpn.itam.mx,1970-01-05 16:40:21.475890,Alicia,Kelley,kwilliams@example.net,(307)827-4666,502.944628
9998,20cc29ae-9a29-46ca-9d0c-79a98ab7af1c,spark-worker-1.mavasbel.vpn.itam.mx,1970-01-05 15:30:41.134234,Hannah,Smith,bartletthelen@example.net,376-760-8649,655.902833



First 10 by count(first_name) desc:


Unnamed: 0,first_name,count
0,Michael,250
1,David,177
2,Jennifer,153
3,Robert,149
4,James,136
...,...,...
656,Carly,1
657,Mathew,1
658,Malik,1
659,Jaclyn,1


In [12]:
from IPython.display import Markdown, display

df_verify.createOrReplaceTempView("df_verify")
df_sparkql = spark.sql("""
    SELECT 
        first_name, 
        SUM(amount) as total_amount,
        COUNT(*) as first_name_count
    FROM df_verify
    GROUP BY first_name
    ORDER BY first_name_count DESC
""")
display(df_sparkql.toPandas())

Unnamed: 0,first_name,total_amount,first_name_count
0,Michael,119639.936271,250
1,David,87867.289249,177
2,Jennifer,80390.143495,153
3,Robert,73416.190915,149
4,James,66438.052658,136
...,...,...,...
656,Joann,803.718857,1
657,Francis,201.765654,1
658,Kaylee,444.540740,1
659,Sheryl,408.702268,1


In [13]:
from IPython.display import Markdown, display

df_sparkql = spark.sql(f"""
    SELECT 
        first_name, 
        SUM(amount) as total_amount,
        COUNT(*) as first_name_count
    FROM parquet.`{SPARK_DATADIR}/faker_vectorized.parquet`
    GROUP BY first_name
    ORDER BY first_name_count DESC
""")
display(df_sparkql.toPandas())

Unnamed: 0,first_name,total_amount,first_name_count
0,Michael,119639.936271,250
1,David,87867.289249,177
2,Jennifer,80390.143495,153
3,Robert,73416.190915,149
4,James,66438.052658,136
...,...,...,...
656,Kerri,459.042111,1
657,Malik,473.424686,1
658,Edgar,805.562951,1
659,Jaclyn,352.019146,1
