In [2]:
import os, sys
from pyspark.sql import SparkSession, functions as F, types as T

In [3]:
from pyspark.sql import SparkSession, functions as F

def spark_session():
    """Create the SparkSession for the MySQL -> Delta-on-MinIO pipeline.

    A session that is still active in this Python process is stopped first so
    that the settings below are applied to a fresh context; ``getOrCreate()``
    would otherwise return the running session, and static settings such as
    executor resources do not take effect on an already-running context.

    MinIO connection details can be overridden through the environment
    variables ``MINIO_ENDPOINT``, ``MINIO_ACCESS_KEY`` and
    ``MINIO_SECRET_KEY``; when unset, the previous hard-coded values are used.

    Returns:
        SparkSession: session configured with the Delta extensions, the
        MySQL JDBC driver and the S3A connector pointing at MinIO.
    """
    # Honour the original intent ("stop any old session so new configs take
    # effect"): without this, getOrCreate() simply reuses the old session.
    # NOTE(review): package resolution (spark.jars.packages) happens when the
    # JVM is launched, so a kernel restart may still be needed for that one.
    stale = SparkSession.getActiveSession()
    if stale is not None:
        stale.stop()

    # Keep secrets out of the notebook where possible; defaults preserve the
    # values the notebook used before.
    endpoint = os.getenv("MINIO_ENDPOINT", "http://minio:9000")
    access_key = os.getenv("MINIO_ACCESS_KEY", "minioadmin")
    secret_key = os.getenv("MINIO_SECRET_KEY", "minioadmin")
    # Derive the SSL flag from the endpoint scheme so the two cannot disagree.
    ssl_enabled = "true" if endpoint.lower().startswith("https://") else "false"

    # NOTE(review): this notebook's environment reports Spark 3.5.0; confirm
    # that the hadoop-aws / aws-java-sdk-bundle versions below match the
    # Hadoop build bundled with that Spark distribution.
    packages = ",".join([
        # Delta
        "io.delta:delta-spark_2.12:3.1.0",
        # MySQL JDBC
        "mysql:mysql-connector-java:8.0.33",
        # S3A / MinIO (versions must match your Hadoop)
        "org.apache.hadoop:hadoop-aws:3.3.2",
        "com.amazonaws:aws-java-sdk-bundle:1.11.1026",
    ])

    return (
        SparkSession.builder
        .appName("MySQL_to_Delta_on_MinIO")
        .master("spark://spark-master:7077")
        .config("spark.jars.packages", packages)
        # Delta integration
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        # MinIO (S3A) configs
        .config("spark.hadoop.fs.s3a.endpoint", endpoint)
        .config("spark.hadoop.fs.s3a.access.key", access_key)
        .config("spark.hadoop.fs.s3a.secret.key", secret_key)
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", ssl_enabled)
        # Resources
        .config("spark.executor.cores", "2")
        .config("spark.executor.memory", "2g")
        .config("spark.executor.memoryOverhead", "512m")
        .getOrCreate()
    )

def read_from_mysql(spark):
    """Load the source MySQL table through JDBC and return it as a DataFrame.

    Connection settings can be supplied through the environment variables
    ``MYSQL_JDBC_URL``, ``MYSQL_TABLE``, ``MYSQL_USER`` and
    ``MYSQL_PASSWORD``. When a variable is unset the value previously
    hard-coded in this notebook is used, so existing runs keep working.

    Args:
        spark: object exposing ``read`` (a SparkSession in this notebook).

    Returns:
        Whatever ``spark.read.format("jdbc")...load()`` returns (a Spark
        DataFrame when ``spark`` is a SparkSession).
    """
    default_url = (
        "jdbc:mysql://mysql-business-only:3306/whole_corp"
        "?useUnicode=true&characterEncoding=utf8"
        "&serverTimezone=Asia/Taipei"
        "&useSSL=false&allowPublicKeyRetrieval=true"
    )
    # NOTE(review): the fallback credentials are kept only for backward
    # compatibility; prefer setting MYSQL_USER / MYSQL_PASSWORD and removing
    # the literals once the environment is configured.
    return (
        spark.read.format("jdbc")
        .option("url", os.getenv("MYSQL_JDBC_URL", default_url))
        .option("dbtable", os.getenv("MYSQL_TABLE", "whole_corp"))
        .option("user", os.getenv("MYSQL_USER", "root"))
        .option("password", os.getenv("MYSQL_PASSWORD", "!QAZ2wsx"))
        .option("driver", "com.mysql.cj.jdbc.Driver")
        .load()
    )


def bronze_to_silver(s):
    """Clean the Bronze table and overwrite the Silver Delta table with it.

    Steps:
      1. Read Bronze from ``BRONZE_PATH`` (default
         ``s3a://deltabucket/bronze/wholeCorp_delta``) as a Delta table.
      2. Coerce the monetary / count columns that are present to ``long``.
      3. Derive ``公司年齡`` from ``成立年份`` when that column exists.
      4. Trim every string column.
      5. Overwrite the Delta table at ``SILVER_PATH`` (default
         ``s3a://deltabucket/silver/wholeCorp_delta``).

    Args:
        s: the SparkSession used to read Bronze.
    """
    bronze_path = os.getenv("BRONZE_PATH", "s3a://deltabucket/bronze/wholeCorp_delta")
    # Bronze is written in Delta format, so it must be read through the Delta
    # log. Reading the directory as plain parquet would also pick up data
    # files left behind by earlier overwrites and return stale/duplicate rows.
    df = s.read.format("delta").load(bronze_path)

    # Coerce types
    string_cols = {name for name, dtype in df.dtypes if dtype == "string"}
    to_int = ["資本額", "實收資本總額", "員工", "年營收"]
    for c in to_int:
        if c not in df.columns:
            continue
        if c in string_cols:
            # Strip separators / currency marks but keep the decimal point;
            # dropping it would turn "100000.0" into 1000000.
            numeric = F.regexp_replace(F.col(c), r"[^\d.]", "").cast("double")
        else:
            # Already numeric: running a regex over its string form would
            # corrupt values such as 100000.0 or 2.8E7, so cast directly.
            numeric = F.col(c)
        df = df.withColumn(c, numeric.cast("long"))

    if "成立年份" in df.columns:
        # Company age = current year minus founding year.
        df = df.withColumn(
            "公司年齡",
            F.year(F.current_date()) - F.col("成立年份").cast("int"),
        )

    # Trim strings (one projection instead of one withColumn per column).
    df = df.select(*[
        F.trim(F.col(c)).alias(c) if t == "string" else F.col(c)
        for c, t in df.dtypes
    ])

    silver_path = os.getenv("SILVER_PATH", "s3a://deltabucket/silver/wholeCorp_delta")
    df.write.format("delta").mode("overwrite").save(silver_path)


def store_in_minio(df):
    """Overwrite the Bronze Delta table with ``df``.

    The destination is taken from the ``BRONZE_PATH`` environment variable
    (default ``s3a://deltabucket/bronze/wholeCorp_delta``) so that it always
    matches the location ``bronze_to_silver`` reads from.

    Args:
        df: object exposing ``write`` (a Spark DataFrame in this notebook).
    """
    bronze_path = os.getenv("BRONZE_PATH", "s3a://deltabucket/bronze/wholeCorp_delta")
    df.write.format("delta").mode("overwrite").save(bronze_path)

def main():
    """Run the pipeline end to end: MySQL -> Bronze (Delta) -> Silver (Delta).

    Failures are reported with a full traceback but not re-raised, matching
    the previous best-effort behaviour. The Spark session, if one was
    created, is always stopped exactly once.
    """
    import traceback

    # Bind the name up front so the cleanup below is safe even when session
    # creation itself fails.
    s = None
    try:
        s = spark_session()
        df = read_from_mysql(s)
        store_in_minio(df)
        bronze_to_silver(s)
    except Exception:
        # Keep the stack trace; the bare message alone is rarely enough to
        # diagnose a Spark/JDBC failure.
        traceback.print_exc()
    finally:
        if s is not None:
            try:
                s.stop()
            except Exception as stop_error:
                # Shutdown stays best-effort, but is no longer silent.
                print(f"Failed to stop Spark session: {stop_error}")
            
# Entry point: run the whole pipeline when this code is executed as a script.
# In an IPython/Jupyter kernel __name__ is also "__main__", so running this
# cell triggers the full MySQL -> Bronze -> Silver run.
if __name__ == "__main__":
    main()


In [9]:
# Debug aid: display the kernel's environment variables.
# NOTE(review): the rendered output is saved with the notebook and can expose
# secrets held in the environment; clear this output before sharing.
os.environ

environ{'PATH': '/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/spark/bin',
        'HOSTNAME': 'a5951c63236d',
        'PYSPARK_DRIVER_PYTHON_OPTS': '""',
        'PYSPARK_NO_DRIVER': 'true',
        'PYSPARK_SUBMIT_ARGS': '--jars /usr/local/spark/jars/delta-core_2.12-2.4.0.jar,/usr/local/spark/jars/delta-storage-2.4.0.jar pyspark-shell',
        'JUPYTER_ENABLE_LAB': 'yes',
        'DEBIAN_FRONTEND': 'noninteractive',
        'CONDA_DIR': '/opt/conda',
        'SHELL': '/bin/bash',
        'NB_USER': 'jovyan',
        'NB_UID': '1000',
        'NB_GID': '100',
        'LC_ALL': 'en_US.UTF-8',
        'LANG': 'en_US.UTF-8',
        'LANGUAGE': 'en_US.UTF-8',
        'HOME': '/home/jovyan',
        'JUPYTER_PORT': '8888',
        'XDG_CACHE_HOME': '/home/jovyan/.cache/',
        'APACHE_SPARK_VERSION': '3.5.0',
        'HADOOP_VERSION': '3',
        'SPARK_HOME': '/usr/local/spark',
        'SPARK_OPTS': '--driver-java-options=-Xms1024M --driver-

# Extract from MySQL

In [4]:
# s.read.format("jdbc") \
#  .option("url","jdbc:mysql://root:!QAZ2wsx@mysql-business-only:3306") \
#  .option("dbtable","(SELECT 1) AS t") \
#  .option("user","user").option("password","!QAZ2wsx") \
#  .option("driver","com.mysql.cj.jdbc.Driver") \
#  .load().show()

In [4]:
# Load the source table for interactive exploration.
# `s` is not defined at notebook scope (it only exists inside main(), which
# also stops it), so create the session here explicitly, and reuse
# read_from_mysql() instead of repeating the JDBC options and credentials
# inline (the inline copy also mixed user "root" in the URL with user "user"
# in the options).
s = spark_session()
df = read_from_mysql(s)

In [20]:
# Preview the first 10 rows of the two capital columns.
(
    df.select("資本額", "實收資本總額")
      .show(10)
)

+--------+------------+
|  資本額|實收資本總額|
+--------+------------+
|100000.0|        NULL|
|200000.0|        NULL|
| 80000.0|        NULL|
|200000.0|        NULL|
| 50000.0|        NULL|
|100000.0|        NULL|
|200000.0|        NULL|
|200000.0|        NULL|
| 80000.0|        NULL|
|100000.0|        NULL|
+--------+------------+
only showing top 10 rows



In [42]:
# Rows whose 資本額 is non-null and at most 1, shown as 資本額 + 1 next to the
# company name.
(
    df.where(df["資本額"] <= 1)
      .where(df["資本額"].isNotNull())
      .select(df["資本額"] + 1, "公司名稱")
      .show()
)

+------------+------------------+
|(資本額 + 1)|          公司名稱|
+------------+------------------+
|         2.0|      金來來工藝社|
|         2.0|          一安葯局|
|         2.0|  進興鏟裝機材料行|
|         2.0|          百盛車行|
|         2.0|          成發商行|
|         2.0|          宗成蛋行|
|         2.0|        正豐機械廠|
|         2.0|        益興皮鞋行|
|         2.0|巴特皮爾影像工作室|
|         2.0|多技美匠美髮工作室|
|         2.0|    仁森攝影工作室|
|         2.0|      均達汽車商行|
|         2.0|  耶斯列資訊工作室|
+------------+------------------+



In [44]:
# Rows that have a 實收資本總額 value, shown as 資本額 + 1 next to the company
# name.
(
    df.where(df["實收資本總額"].isNotNull())
      .select(df["資本額"] + 1, "公司名稱")
      .show()
)

+------------+------------------------------+
|(資本額 + 1)|                      公司名稱|
+------------+------------------------------+
|   3000001.0|      雅風築雲餐飲股份有限公司|
|   3000001.0|              妙品股份有限公司|
| 2.8000001E7|          馡曄國際股份有限公司|
| 2.0000001E7|          憲賣國際股份有限公司|
|    500001.0|              入味股份有限公司|
|   5800001.0|              田聖股份有限公司|
| 6.0000001E7|              柏悅股份有限公司|
|   1000001.0|      濠饗國際餐飲股份有限公司|
| 1.6860001E7|              古傳股份有限公司|
|   6000001.0|          專成國際股份有限公司|
| 1.5000001E7|          卡菲努努股份有限公司|
| 3.0000001E7|              展得股份有限公司|
| 1.2680001E7|    金根本文化創意股份有限公司|
|   2000001.0|          呷意小吃股份有限公司|
| 8.3000001E7|好味全餐飲管理顧問股份有限公司|
|   3000001.0|        金瑞德國際股份有限公司|
|   6000001.0|    大聯盛國際食品股份有限公司|
| 1.5100001E7|  密斯特杰飲食文化股份有限公司|
| 2.0000001E7|          得得國際股份有限公司|
|   7000001.0|          星恩國際股份有限公司|
+------------+------------------------------+
only showing top 20 rows



In [47]:
# Coerce types
# 資本額 is displayed above as a floating-point value (e.g. 100000.0, 2.8E7).
# Running a "digits only" regex over the string form of such a column removes
# the decimal point / exponent and corrupts the value (100000.0 -> 1000000),
# so only string columns are cleaned with a regex; numeric columns are cast
# directly.
to_int = ["資本額", "實收資本總額", "員工", "年營收"]
string_cols = {name for name, dtype in df.dtypes if dtype == "string"}
for c in to_int:
    if c not in df.columns:
        continue
    if c in string_cols:
        # Keep the decimal point so "100000.0" parses as 100000, not 1000000.
        numeric = F.regexp_replace(F.col(c), r"[^\d.]", "").cast("double")
    else:
        numeric = F.col(c)
    df = df.withColumn(c, numeric.cast("long"))

In [None]:
# NOTE(review): unfinished draft — this cell cannot run as written:
#   * `ddd` is not defined anywhere in the visible notebook;
#   * `.otherwise` is referenced but never called, so no fallback value is
#     supplied and a bound method (not a Column) is passed to withColumn;
#   * the column is spelled "成立年分" here but "成立年份" in
#     bronze_to_silver — confirm which spelling the table actually uses.
# Intended result: column "xx" holding 成立年分 when it is non-empty and the
# (missing) second condition holds — TODO confirm intent and complete.
df = df.withColumn(
    "xx",
    F.when((F.col("成立年分")!="")& (ddd), F.col('成立年分'))
    .otherwise
)