In [1]:
import os
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.classification import GBTClassifier, LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.tuning import TrainValidationSplit
from pyspark import StorageLevel

In [2]:
def spark_session(restart=False):
    """Build (or reuse) the Spark session used by this notebook.

    The session targets the standalone master at ``spark://spark-master:7077``
    and is configured with the Delta Lake extension, the MySQL JDBC driver and
    the S3A connector pointed at a MinIO endpoint.

    ``getOrCreate()`` hands back an already-running session when one exists;
    in that case launch-time settings (packages, executor memory, ...) from
    this function are not re-applied. Pass ``restart=True`` to stop the active
    session first so that changed settings take effect.

    The MinIO endpoint and credentials can be overridden with the
    ``S3A_ENDPOINT``, ``S3A_ACCESS_KEY`` and ``S3A_SECRET_KEY`` environment
    variables; the defaults are the values this notebook has always used.

    Args:
        restart: When True, stop the currently active SparkSession (if any)
            before building. Stopping a session invalidates DataFrames that
            were created from it. Defaults to False (reuse if present).

    Returns:
        The SparkSession returned by ``SparkSession.builder...getOrCreate()``.
    """
    if restart:
        # Stop any old session so new configs take effect in notebooks
        active = SparkSession.getActiveSession()
        if active is not None:
            active.stop()

    packages = ",".join([
        # Delta
        "io.delta:delta-spark_2.12:3.1.0",
        # MySQL JDBC
        "mysql:mysql-connector-java:8.0.33",
        # S3A / MinIO (versions must match your Hadoop)
        "org.apache.hadoop:hadoop-aws:3.3.2",
        "com.amazonaws:aws-java-sdk-bundle:1.11.1026",
    ])

    return (
        SparkSession.builder
        .appName("MySQL_to_Delta_on_MinIO")
        .master("spark://spark-master:7077")
        .config("spark.jars.packages", packages)
        # Delta integration
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        # MinIO (S3A) configs -- endpoint/credentials are env-overridable so real
        # secrets never have to be committed with the notebook
        .config("spark.hadoop.fs.s3a.endpoint", os.getenv("S3A_ENDPOINT", "http://minio:9000"))
        .config("spark.hadoop.fs.s3a.access.key", os.getenv("S3A_ACCESS_KEY", "minioadmin"))
        .config("spark.hadoop.fs.s3a.secret.key", os.getenv("S3A_SECRET_KEY", "minioadmin"))
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
        .config("spark.ui.port", "4040")                 # fix the port
        .config("spark.driver.bindAddress", "0.0.0.0")   # listen on all ifaces
        .config("spark.driver.host", "jupyter")          # OR "spark-master" – the container's DNS name
        .config("spark.ui.showConsoleProgress", "true")
        # Resources (an earlier attempt used 2 cores / 2g / 1536m overhead)
        .config("spark.executor.cores", "1")           # 1 task per executor (more stable for trees)
        .config("spark.executor.memory", "3g")
        .config("spark.executor.memoryOverhead", "1g")  # or omit in Standalone
        .config("spark.sql.shuffle.partitions", "50")
        .config("spark.local.dir", "/mnt/spark-tmp/local")  # scratch dir with more space, needed for CV runs
        .config("spark.network.timeout", "600s")
        .getOrCreate()
    )

# Notebook-wide session handle; the cells below read their Delta tables through `s`.
s = spark_session()

In [None]:

# One seed for the split, the GBT row subsampling and the train/validation split,
# so results do not depend on PySpark's default seeds.
SEED = 42

# Gold table (env-overridable location); the `features` column used for training
# is read from here.
gold = os.getenv("GOLD_PATH","s3a://deltabucket/gold/wholeCorp_delta")
df = s.read.format("delta").load(gold)

# Label from the `公司狀態` column: rows equal to "核准設立" get 1, everything else 0
# (adjust mapping as needed).
base = (
    s.read.format("delta")
    .load(os.getenv("SILVER_PATH","s3a://deltabucket/silver/wholeCorp_delta"))
    .select("統一編號","公司狀態")
)
# NOTE(review): this is a left join, so gold rows with no match in the silver table
# carry a null `公司狀態` and therefore fall into label 0 below. Duplicate `統一編號`
# values in the silver table would also duplicate rows here. Confirm both are intended.
data = df.join(base, "統一編號", "left")
data = data.withColumn("label", F.when(F.col("公司狀態")=="核准設立", 1).otherwise(0))

# Split
train, test = data.randomSplit([0.8, 0.2], seed=SEED)

# Keep the runtime shuffle width in line with the 50 partitions used below
# (also covers the case where a pre-existing session was reused).
s.conf.set("spark.sql.shuffle.partitions", "50")
train = (
    train.select("label","features")
    .repartition(50)
    .persist(StorageLevel.MEMORY_AND_DISK)
)
train.count()  # materialize the persisted training set before fitting

clf = GBTClassifier(labelCol="label", featuresCol="features",
                    maxDepth=4, maxIter=30, subsamplingRate=0.8,
                    maxBins=32, cacheNodeIds=True, seed=SEED)

# No hyper-parameter grid yet: a single (empty) param map, i.e. one fit with
# the settings on `clf`. CrossValidator could be swapped in here later.
paramGrid = ParamGridBuilder().build()

evaluator=BinaryClassificationEvaluator(labelCol="label",
                                        rawPredictionCol="rawPrediction",
                                        metricName="areaUnderPR")
tvs = TrainValidationSplit(estimator=clf,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           trainRatio=0.8, parallelism=2,
                           seed=SEED)

best = tvs.fit(train)



In [11]:
# Score the held-out split with the fitted model and report the evaluator's metric.
preds = best.transform(test)
auprc = evaluator.evaluate(preds)
print("AUPRC:", auprc)

# Persist the selected model for later scoring; the destination can be
# overridden through the STATUS_MODEL_URI environment variable, and any
# existing model at that location is overwritten.
out_uri = os.getenv("STATUS_MODEL_URI","s3a://deltabucket/models/status_gbt")
model_writer = best.bestModel.write().overwrite()
model_writer.save(out_uri)
