# Setup

In [1]:
SPARK_START_FROM_SCRATCH = True
DOCKER_INTERNAL_HOST = "host.docker.internal"
DOCKER_DNS = ["10.15.20.1"]

SPARK_JUPYTER_LAB_DOCKER_TAG = "spark-jupyter:python3"
SPARK_JOB_VENV_DOCKER_TAG = "spark-job-venv:python3"
SPARK_JOB_VENV_BUILD_DIR = "/opt/spark/venv-build"

SPARK_MASTER_NAME = "spark-master"
SPARK_MASTER_HOSTNAME = f"{SPARK_MASTER_NAME}.mavasbel.vpn.itam.mx"
SPARK_MASTER_IP = "10.15.20.2"
SPARK_MASTER_WUBUI_PORT = 6080
SPARK_MASTER_PORT = 6077

SPARK_TOTAL_WORKERS = 3
SPARK_WORKER_NAMES = [f"spark-worker-{i+1}" for i in range(SPARK_TOTAL_WORKERS)]
SPARK_WORKER_HOSTNAMES = [
    f"{SPARK_WORKER_NAMES[i]}.mavasbel.vpn.itam.mx" for i in range(SPARK_TOTAL_WORKERS)
]
SPARK_WORKER_IPS = ["10.15.20.2"] * SPARK_TOTAL_WORKERS
SPARK_WORKER_WEBUI_PORTS = [6080 + (i + 1) for i in range(SPARK_TOTAL_WORKERS)]

SPARK_WORKDIR = "/opt/spark/work-dir"

JUPYTER_LAB_NAME = "spark-jupyter"
JUPYTER_LAB_HOSTNAME = "spark-jupyter.mavasbel.vpn.itam.mx"
JUPYTER_LAB_IP = "10.15.20.2"
JUPYTER_LAB_PORT = 6888
JUPYTER_LAB_MONITOR_PORT = 4040
JUPYTER_LAB_TOKEN = ""

In [2]:
HADOOP_NAMENODE_HOSTNAME = "namenode.mavasbel.vpn.itam.mx"
HADOOP_NAMENODE_IP = "10.15.20.2"
HADOOP_NAMENODE_PORT = 8020

In [3]:
import os
from pathlib import Path

SPARK_DATADIR = Path(os.path.join(os.path.abspath(Path.cwd()), "data"))
SPARK_DATADIR.mkdir(parents=True, exist_ok=True)

In [4]:
!pip install faker

Collecting faker
  Downloading faker-40.1.0-py3-none-any.whl (2.0 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.0/2.0 MB[0m [31m641.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: faker
Successfully installed faker-40.1.0
[0m

##### Cleaning Spark context

In [5]:
from pyspark import SparkContext

# Forcefully kill any existing or "ghost" contexts
try:
    sc = SparkContext.getOrCreate()
    sc.stop()
    print("üßπ Ghost SparkContext cleaned up.")
except Exception:
    print("‚ú® No existing SparkContext to clean.")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/11 04:50:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


üßπ Ghost SparkContext cleaned up.


# Spark session

In [6]:
import sys
from pyspark.sql import SparkSession
from datetime import datetime

spark = (
    SparkSession.builder.master(
        f"spark://{SPARK_MASTER_HOSTNAME}:{SPARK_MASTER_PORT}"
    )
    .appName(f"SparkLab_{datetime.now().strftime('%Y-%m-%d_%H:%M:%S.%f')}")
    .config("spark.archives", f"{SPARK_WORKDIR}/spark_job_env.tar.gz#environment")
    .config("spark.driver.host", f"{JUPYTER_LAB_HOSTNAME}")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.driver.memory", "512m")
    .config("spark.executorEnv.PYSPARK_PYTHON", "./environment/bin/python3")
    .config("spark.executor.memory", "1G")
    .config(
        "spark.executorEnv.PYTHONPATH",
        f"./environment/lib/python{'.'.join(str(n) for n in sys.version_info[:2])}/site-packages",
    )
    .getOrCreate()
)

print("‚úÖ Spark Session is now active.")

‚úÖ Spark Session is now active.


# Data generation

In [7]:
total_rows = 10000
partitions = 10


def batch_generator(ids):
    import socket
    import random
    from faker import Faker

    node_name = socket.gethostname()
    faker = Faker()
    for _ in ids:
        yield (
            faker.uuid4(),
            node_name,
            faker.date_time(),
            faker.first_name(),
            faker.last_name(),
            faker.email(),
            faker.basic_phone_number(),
            random.random() * 1000.0,
        )

df_column_names = [
    "id",
    "worker",
    "timestamp",
    "first_name",
    "last_name",
    "email",
    "phone",
    "amount",
]
df_column_types = spark.createDataFrame(
    list(batch_generator(range(1))), schema=df_column_names
).schema
print(f"‚úÖ batch_generator schema: {df_column_types}")

‚úÖ batch_generator schema: StructType([StructField('id', StringType(), True), StructField('worker', StringType(), True), StructField('timestamp', TimestampType(), True), StructField('first_name', StringType(), True), StructField('last_name', StringType(), True), StructField('email', StringType(), True), StructField('phone', StringType(), True), StructField('amount', DoubleType(), True)])


In [8]:
from pyspark.sql import functions as F
from IPython.display import Markdown, display

df = spark.createDataFrame(
    list(batch_generator(range(total_rows))), df_column_names
).repartition(partitions)
df.write.mode("overwrite").csv(f"{SPARK_DATADIR}/faker.csv")
print(f"‚úÖ Created {SPARK_DATADIR}/faker.csv")

partition_stats = (
    df.withColumn("partition_id", F.spark_partition_id())
    .groupBy("worker", "partition_id")
    .count()
    .orderBy("worker", "partition_id")
)
# partition_stats.show()
display(partition_stats.toPandas())


                                                                                

‚úÖ Created /opt/spark/work-dir/data/faker.csv


                                                                                

Unnamed: 0,worker,partition_id,count
0,spark-jupyter.mavasbel.vpn.itam.mx,0,1002
1,spark-jupyter.mavasbel.vpn.itam.mx,1,1002
2,spark-jupyter.mavasbel.vpn.itam.mx,2,999
3,spark-jupyter.mavasbel.vpn.itam.mx,3,998
4,spark-jupyter.mavasbel.vpn.itam.mx,4,999
5,spark-jupyter.mavasbel.vpn.itam.mx,5,999
6,spark-jupyter.mavasbel.vpn.itam.mx,6,1000
7,spark-jupyter.mavasbel.vpn.itam.mx,7,999
8,spark-jupyter.mavasbel.vpn.itam.mx,8,1000
9,spark-jupyter.mavasbel.vpn.itam.mx,9,1002


In [9]:
from IPython.display import Markdown, display

rdd = spark.sparkContext.parallelize(range(total_rows), partitions).mapPartitions(
    batch_generator
)
df = rdd.toDF(df_column_names)
df.write.mode("overwrite").parquet(f"{SPARK_DATADIR}/faker.parquet")
print(f"‚úÖ Created {SPARK_DATADIR}/faker.parquet")

partition_stats = (
    df.withColumn("partition_id", F.spark_partition_id())
    .groupBy("worker", "partition_id")
    .count()
    .orderBy("worker", "partition_id")
)
# partition_stats.show()
display(partition_stats.toPandas())

                                                                                

‚úÖ Created /opt/spark/work-dir/data/faker.parquet


                                                                                

Unnamed: 0,worker,partition_id,count
0,spark-worker-1.mavasbel.vpn.itam.mx,1,1000
1,spark-worker-1.mavasbel.vpn.itam.mx,4,1000
2,spark-worker-1.mavasbel.vpn.itam.mx,6,1000
3,spark-worker-1.mavasbel.vpn.itam.mx,7,1000
4,spark-worker-2.mavasbel.vpn.itam.mx,2,1000
5,spark-worker-2.mavasbel.vpn.itam.mx,5,1000
6,spark-worker-3.mavasbel.vpn.itam.mx,0,1000
7,spark-worker-3.mavasbel.vpn.itam.mx,3,1000
8,spark-worker-3.mavasbel.vpn.itam.mx,8,1000
9,spark-worker-3.mavasbel.vpn.itam.mx,9,1000


In [11]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import pandas_udf
import pandas as pd


@pandas_udf(df_column_types)
def generate_batch_vectorized(batch_ser: pd.Series) -> pd.DataFrame:
    return pd.DataFrame(list(batch_generator(batch_ser)))


df: DataFrame = (
    spark.range(total_rows, numPartitions=partitions)
    .withColumn("data", generate_batch_vectorized("id"))
    .select("data.*")
)
# df.write.mode("overwrite").parquet(f"{SPARK_DATADIR}/faker_vectorized.parquet")
# df.coalesce(1).write.mode("overwrite").parquet(f"{SPARK_DATADIR}/faker_vectorized.parquet")
# pdf = df.toPandas()
# pdf.to_parquet(f"{SPARK_DATADIR}/faker_vectorized.parquet", index=False)
df.write.mode("overwrite").parquet(f"hdfs://{HADOOP_NAMENODE_HOSTNAME}:{HADOOP_NAMENODE_PORT}/spark/work-dir/faker_vectorized.parquet")
print(f"‚úÖ Created hdfs://{HADOOP_NAMENODE_HOSTNAME}:{HADOOP_NAMENODE_PORT}/spark/work-dir/faker_vectorized.parquet")


partition_stats = (
    df.withColumn("partition_id", F.spark_partition_id())
    .groupBy("worker", "partition_id")
    .count()
    .orderBy("worker", "partition_id")
)
# partition_stats.show()
display(partition_stats.toPandas())

                                                                                

‚úÖ Created hdfs://namenode.mavasbel.vpn.itam.mx:8020/spark/work-dir/faker_vectorized.parquet


                                                                                

Unnamed: 0,worker,partition_id,count
0,spark-worker-1.mavasbel.vpn.itam.mx,2,1000
1,spark-worker-1.mavasbel.vpn.itam.mx,5,1000
2,spark-worker-2.mavasbel.vpn.itam.mx,0,1000
3,spark-worker-2.mavasbel.vpn.itam.mx,3,1000
4,spark-worker-2.mavasbel.vpn.itam.mx,6,1000
5,spark-worker-2.mavasbel.vpn.itam.mx,7,1000
6,spark-worker-3.mavasbel.vpn.itam.mx,1,1000
7,spark-worker-3.mavasbel.vpn.itam.mx,4,1000
8,spark-worker-3.mavasbel.vpn.itam.mx,8,1000
9,spark-worker-3.mavasbel.vpn.itam.mx,9,1000


In [12]:
from IPython.display import Markdown, display
from pyspark.sql import functions as F

# Read it back and check the schema/count

# df_verify = spark.read.parquet(f"{SPARK_DATADIR}/faker_vectorized.parquet").repartition(partitions)
# pdf_verify = pd.read_parquet(f"{SPARK_DATADIR}/faker_vectorized.parquet")
# df_verify = spark.createDataFrame(pdf_verify).repartition(partitions)
df_verify = spark.read.parquet(f"hdfs://{HADOOP_NAMENODE_HOSTNAME}:{HADOOP_NAMENODE_PORT}/spark/work-dir/faker_vectorized.parquet").repartition(partitions)
print(f"Generated rows: {df_verify.count()}")

print("\nFirst 10 by timestamp desc:")
# df_verify.sort(F.col("timestamp").desc()).show(10)
display(df_verify.sort(F.col("timestamp").desc()).toPandas())

print("\nFirst 10 by count(first_name) desc:")
# df_verify.groupBy("first_name").count().sort(F.col("count").desc()).show(10)
display(df_verify.groupBy("first_name").count().sort(F.col("count").desc()).toPandas())

Generated rows: 10000

First 10 by timestamp desc:


Unnamed: 0,id,worker,timestamp,first_name,last_name,email,phone,amount
0,24fc9c0f-1fe7-4e1f-8e2d-2fb0ae7c27b9,spark-worker-1.mavasbel.vpn.itam.mx,2026-01-09 10:47:40.641212,Benjamin,Bowers,nathangreen@example.com,519-998-7406,99.234629
1,05e601af-ef9b-4e7d-963b-da3c88fd2fd3,spark-worker-2.mavasbel.vpn.itam.mx,2026-01-04 10:30:49.899540,Sean,Taylor,morriskara@example.org,(261)225-8887,771.357307
2,6edb4bae-cca6-45cb-80f6-3922c7aeb5e5,spark-worker-1.mavasbel.vpn.itam.mx,2026-01-03 23:28:34.928758,Lisa,Lawrence,victorbell@example.net,266-247-9796,191.040138
3,91693555-93ca-4ee3-9a77-b289131cd99d,spark-worker-2.mavasbel.vpn.itam.mx,2026-01-02 05:12:36.000366,Jose,Ramos,kevinrodgers@example.net,(999)925-7425,597.864049
4,8bbbcafa-3a63-4e37-a646-55f05ceddf4a,spark-worker-1.mavasbel.vpn.itam.mx,2026-01-01 14:18:50.473825,Mary,Salas,yjones@example.com,838-967-2564,770.445834
...,...,...,...,...,...,...,...,...
9995,bced262c-1fb3-44df-8cbf-c3b5c4a61511,spark-worker-3.mavasbel.vpn.itam.mx,1970-01-07 00:00:14.136774,Mariah,Forbes,velliott@example.com,(433)350-7799,850.051115
9996,64f61b82-fcb0-4abb-ab40-3495eb0168dc,spark-worker-3.mavasbel.vpn.itam.mx,1970-01-02 23:10:21.839516,Toni,Gray,bakerstacey@example.com,(226)806-7702,529.640863
9997,e41cc3bc-1c23-4ee5-bedf-81c7d4d25ed5,spark-worker-1.mavasbel.vpn.itam.mx,1970-01-01 23:30:35.796476,Courtney,Woods,jasonwiggins@example.org,512-411-7082,725.699249
9998,6c10981a-faec-4333-b681-7ffa771e8df7,spark-worker-1.mavasbel.vpn.itam.mx,1970-01-01 14:03:57.568477,John,Cervantes,gileskenneth@example.org,911-932-5697,306.497695



First 10 by count(first_name) desc:


Unnamed: 0,first_name,count
0,Michael,239
1,Jennifer,171
2,David,148
3,James,145
4,Robert,143
...,...,...
659,Sierra,1
660,Maxwell,1
661,Collin,1
662,Traci,1


In [13]:
from IPython.display import Markdown, display

df_verify.createOrReplaceTempView("df_verify")
df_sparkql = spark.sql("""
    SELECT 
        first_name, 
        SUM(amount) as total_amount,
        COUNT(*) as first_name_count
    FROM df_verify
    GROUP BY first_name
    ORDER BY first_name_count DESC
""")
display(df_sparkql.toPandas())

Unnamed: 0,first_name,total_amount,first_name_count
0,Michael,114965.325126,239
1,Jennifer,88140.957269,171
2,David,82401.532086,148
3,James,74006.206988,145
4,Robert,66294.051367,143
...,...,...,...
659,Stefanie,197.045255,1
660,Bob,997.628761,1
661,Sherri,449.384312,1
662,Tracie,916.210197,1


In [14]:
from IPython.display import Markdown, display

df_sparkql = spark.sql(f"""
    SELECT 
        first_name, 
        SUM(amount) as total_amount,
        COUNT(*) as first_name_count
    FROM parquet.`hdfs://{HADOOP_NAMENODE_HOSTNAME}:{HADOOP_NAMENODE_PORT}/spark/work-dir/faker_vectorized.parquet`
    GROUP BY first_name
    ORDER BY first_name_count DESC
""")
display(df_sparkql.toPandas())

Unnamed: 0,first_name,total_amount,first_name_count
0,Michael,114965.325126,239
1,Jennifer,88140.957269,171
2,David,82401.532086,148
3,James,74006.206988,145
4,Robert,66294.051367,143
...,...,...,...
659,Alejandra,533.396406,1
660,Clinton,792.338771,1
661,Colton,560.061666,1
662,Jody,730.414695,1
