In [3]:
import os
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
import pyspark.pandas as ps

def spark_session():
    """Build (or reuse) the SparkSession for the MySQL -> Delta-on-MinIO jobs.

    The MinIO connection settings are read from the environment variables
    ``MINIO_ENDPOINT``, ``MINIO_ACCESS_KEY`` and ``MINIO_SECRET_KEY``. When a
    variable is unset, the previous hard-coded development value is used, so
    existing setups keep working. Set the variables for any non-local
    deployment instead of relying on the default credentials.

    Note: ``getOrCreate()`` hands back the already-running session when one
    exists, and this function does not stop it. To make changed settings take
    effect in a notebook, stop the old session (``spark.stop()``) or restart
    the kernel before calling this function again.

    Returns:
        The session produced by ``SparkSession.builder...getOrCreate()``.
    """
    minio_endpoint = os.getenv("MINIO_ENDPOINT", "http://minio:9000")
    minio_access_key = os.getenv("MINIO_ACCESS_KEY", "minioadmin")
    minio_secret_key = os.getenv("MINIO_SECRET_KEY", "minioadmin")

    return (
        SparkSession.builder
        .appName("MySQL_to_Delta_on_MinIO")
        .master("spark://spark-master:7077")
        .config("spark.jars.packages",
                ",".join([
                    # Delta
                    "io.delta:delta-spark_2.12:3.1.0",
                    # MySQL JDBC
                    "mysql:mysql-connector-java:8.0.33",
                    # S3A / MinIO (versions must match your Hadoop)
                    "org.apache.hadoop:hadoop-aws:3.3.2",
                    "com.amazonaws:aws-java-sdk-bundle:1.11.1026",
                ]))
        # Delta integration
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        # MinIO (S3A) configs
        .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
        .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
        .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
        # Resources
        .config("spark.executor.cores", "2")
        .config("spark.executor.memory", "2g")
        .config("spark.executor.memoryOverhead", "512m")
        .getOrCreate()
    )
import os
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
import pyspark.pandas as ps
from sparksession import spark_session

# Read Silver data
def read_silver(spark):
    """Load the Silver-layer Delta table through the given Spark session.

    The table location is taken from the ``SILVER_PATH`` environment variable
    and falls back to ``s3a://deltabucket/silver/wholeCorp_delta`` when the
    variable is not set.
    """
    default_location = "s3a://deltabucket/silver/wholeCorp_delta"
    location = os.getenv("SILVER_PATH", default_location)
    delta_reader = spark.read.format("delta")
    return delta_reader.load(location)

# Feature engineering: presence flags, log-scaled capital, TF-IDF text features, scaling
def vevtorize(df):
    """Build a scaled ``features`` vector for every row of the Silver table.

    Steps:
      1. Add 0/1 presence flags ``has_官網`` / ``has_電話`` and
         ``log_資本額`` (``log1p`` of ``資本額``).
      2. Copy ``類別_全`` into ``text_all`` / ``text_str`` (nulls become "")
         and run it through whitespace tokenizing, stop-word removal,
         hashed term frequencies and IDF to obtain ``tfidf``.
      3. Assemble ``tfidf`` + numeric columns + flags into ``features_raw``
         and standardize it into ``features``.

    ``log_實收資本總額`` is used as an extra numeric input only when the
    incoming frame already carries that column; it is not derived here.

    Args:
        df: DataFrame containing at least ``官網``, ``電話``, ``資本額`` and
            ``類別_全``.

    Returns:
        The input frame with the intermediate columns and the final
        ``features`` column appended.
    """
    def has(col):
        # 1 when the column holds a non-null, non-empty value, otherwise 0.
        return (F.col(col).isNotNull() & (F.length(F.col(col)) > 0)).cast("int")

    df = (df
          .withColumn("has_官網", has("官網"))
          .withColumn("has_電話", has("電話"))
          .withColumn("log_資本額", F.log1p(F.col("資本額")))
    )

    # Numeric inputs. Every one of them is null-filled, not just log_資本額:
    # VectorAssembler rejects null inputs by default (handleInvalid="error"),
    # so a pre-existing log_實收資本總額 column with nulls would otherwise
    # break the pipeline.
    num_cols = [c for c in ["log_資本額", "log_實收資本總額"] if c in df.columns]
    df = df.fillna({c: 0 for c in num_cols})

    # Text
    df = df.withColumn("text_all", F.col("類別_全"))
    df = df.withColumn("text_str", F.col("text_all").cast("string"))
    df = df.fillna({"text_str": ""})  # or drop: df = df.dropna(subset=["text_str"])

    tok = RegexTokenizer(inputCol="text_str", outputCol="tok",
                         pattern="\\s+", gaps=True, toLowercase=True)
    stop = StopWordsRemover(inputCol="tok", outputCol="tok_clean")
    tf = HashingTF(inputCol="tok_clean", outputCol="tf", numFeatures=1 << 18)
    idf = IDF(inputCol="tf", outputCol="tfidf")

    # NOTE: the categorical columns (縣市名稱, 區域名稱, 上市櫃_基本資料) are
    # not indexed / one-hot encoded yet, so they do not reach the vector.
    bin_cols = ["has_官網", "has_電話"]

    assembler = VectorAssembler(inputCols=["tfidf"] + num_cols + bin_cols,
                                outputCol="features_raw")
    scaler = StandardScaler(inputCol="features_raw", outputCol="features")

    pipe = Pipeline(stages=[tok, stop, tf, idf, assembler, scaler])
    model = pipe.fit(df)
    return model.transform(df)
    
def save_gold(out):
    """Write the identifier columns and the feature vector to the Gold table.

    Selects ``統一編號``, ``公司名稱`` and ``features`` from ``out`` and saves
    them as Delta in overwrite mode. The destination is the ``GOLD_PATH``
    environment variable, defaulting to
    ``s3a://deltabucket/gold/wholeCorp_delta``.
    """
    target = os.getenv("GOLD_PATH", "s3a://deltabucket/gold/wholeCorp_delta")
    gold_frame = out.select("統一編號", "公司名稱", "features")
    delta_writer = gold_frame.write.format("delta").mode("overwrite")
    delta_writer.save(target)

def main():
    """Run the Silver -> Gold feature job end to end.

    Creates the Spark session, reads the Silver table, builds the feature
    vectors and writes the Gold table. Any exception raised along the way is
    reported on stdout and not re-raised. If a session was created it is
    stopped exactly once, whether the job succeeded or failed.
    """
    s = None  # stays None when session creation itself fails
    try:
        s = spark_session()
        df = read_silver(s)
        out = vevtorize(df)
        save_gold(out)
    except Exception as e:
        print(f"Error in spark job: {e}")
    finally:
        # Single shutdown point for both the success and the failure path.
        if s is not None:
            s.stop()

In [8]:
# How many distinct 官網 values does the frame contain?
(
    df
    .select(F.countDistinct("官網"))
    .collect()
)

[Row(count(DISTINCT 官網)=32656)]

In [11]:
# Preview the distinct log_資本額 values (the output below stops at the top 20 rows).
(
    df
    .select("log_資本額")
    .distinct()
    .show()
)

+------------------+
|        log_資本額|
+------------------+
| 16.41820031748273|
|16.759949589762293|
|14.436087583323648|
| 18.72078534381011|
|17.196301163869315|
|13.104201443434462|
| 16.32103657658766|
|13.791589722477323|
|14.845523112142669|
|13.222914345709215|
| 16.90609840800616|
|15.452563831983857|
|15.292559510663105|
|16.120093753421386|
| 14.19394767861579|
| 9.169622538697624|
|17.491811255187805|
| 19.34493964944395|
|13.998665933701242|
|17.212997363947366|
+------------------+
only showing top 20 rows



In [39]:
from pyspark.sql import functions as F

inputs = ["log_資本額","log_實收資本總額","has_官網","has_聯絡人","has_電話","has_手機"]

# Count the nulls of every input column in one aggregation, so the data is
# scanned once instead of launching a separate filter/count job per column.
# F.when(...) without otherwise() yields null for non-matching rows, and
# F.count() skips nulls, so each aggregate is that column's null count.
null_counts = df.select(
    [F.count(F.when(F.col(c).isNull(), 1)).alias(c) for c in inputs]
).first()

for c in inputs:
    print(c, "nulls:", null_counts[c])


log_資本額 nulls: 261305
log_實收資本總額 nulls: 1218984
has_官網 nulls: 0
has_聯絡人 nulls: 0
has_電話 nulls: 0
has_手機 nulls: 0
