In [1]:
import os
from pyspark.sql import SparkSession
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans, KMeans
from scipy.sparse import csr_matrix, vstack
import numpy as np
import joblib

from sklearn.random_projection import SparseRandomProjection
from sklearn.neighbors import NearestNeighbors

from scipy.sparse import vstack as sp_vstack

from pyspark.sql import functions as F
from pyspark.sql.types import BinaryType, IntegerType, StructType, StructField, ArrayType, DoubleType

from pyspark.ml.linalg import SparseVector, DenseVector

import pandas as pd

import boto3

In [2]:
def spark_session():
    # Stop any old session so new configs take effect in notebooks
    return (
        SparkSession.builder
        .appName("MySQL_to_Delta_on_MinIO")
        .master("spark://spark-master:7077")
        .config("spark.jars.packages",
                ",".join([
                    # Delta
                    "io.delta:delta-spark_2.12:3.1.0",
                    # MySQL JDBC
                    # "mysql:mysql-connector-java:8.0.33",
                    # S3A / MinIO (versions must match your Hadoop)
                    "org.apache.hadoop:hadoop-aws:3.3.2",
                    "com.amazonaws:aws-java-sdk-bundle:1.11.1026",
                    "org.postgresql:postgresql:42.7.3"
                ]))
        # Delta integration
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        # MinIO (S3A) configs
        .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
        .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
        .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
        .config("spark.ui.port", "4040")                 # fix the port
        .config("spark.driver.bindAddress", "0.0.0.0")   # listen on all ifaces
        .config("spark.driver.host", "jupyter")          # OR "spark-master" – the container's DNS name
        .config("spark.ui.showConsoleProgress", "true")
        # Resources
        # .config("spark.executor.cores", "2")
        # .config("spark.executor.memory", "2g")
        # .config("spark.executor.memoryOverhead", "1536m")
        # .config("spark.network.timeout", "600s")
        .config("spark.executor.cores", "2")           # 1 task per executor (more stable for trees)
        .config("spark.executor.memory", "3g")
        .config("spark.executor.memoryOverhead", "1g")  # or omit in Standalone
        .config("spark.sql.shuffle.partitions", "50")
        .config("spark.local.dir", "/mnt/spark-tmp/local") # For giving it much more space to run CV
        .config("spark.network.timeout", "600s")
        .getOrCreate()
    )


In [3]:
import os
from pyspark.sql import SparkSession
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
from scipy.sparse import csr_matrix
import numpy as np
import joblib

from sklearn.random_projection import SparseRandomProjection
from sklearn.neighbors import NearestNeighbors

from scipy.sparse import vstack as sp_vstack

from pyspark.sql import functions as F
from pyspark.sql.types import BinaryType, IntegerType, StructType, StructField, ArrayType, DoubleType

from pyspark.ml.linalg import SparseVector, DenseVector

import pandas as pd


def read_gold(spark):
    global OUT

    GOLD = os.getenv("GOLD_PATH","s3a://deltabucket/gold/wholeCorp_delta")
    OUT = os.getenv("CLUSTER_PATH","s3a://deltabucket/gold/wholeCorp_clusters")

    return spark.read.format("delta").load(GOLD)

from pyspark.sql import Row
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans

def fit_predict(data_whole, sample_frac=0.005, sample_cap=5000, batch_size=1000):
    """
    Run SVD + MiniBatchKMeans on a manageable sample.
    Returns: Spark DataFrame [統一編號, cluster, Xr_vector]
    """

    # --- get feature size from one row
    first_vec = data_whole.limit(1).collect()[0][2]
    num_features = int(first_vec.size)

    # --- helper: Spark Row -> CSR batch
    def to_csr(rows):
        indptr = [0]; indices = []; vals = []
        for sv in rows:
            indices.extend(sv.indices.tolist())
            vals.extend(sv.values.tolist())
            indptr.append(indptr[-1] + len(sv.indices))
        return csr_matrix((np.array(vals, dtype=np.float64),
                           np.array(indices, dtype=np.int32),
                           np.array(indptr, dtype=np.int32)),
                          shape=(len(rows), num_features))

    # 1) collect a small sample for SVD training
    sample_rows = []
    for i, row in enumerate(data_whole.sample(False, sample_frac, seed=42).toLocalIterator()):
        sample_rows.append(row['features'])
        if i >= sample_cap:   # hard cap
            break

    X_sample = to_csr(sample_rows)
    print(f"Sample size for SVD: {X_sample.shape}")

    svd = TruncatedSVD(n_components=100, random_state=42).fit(X_sample)

    # --- Stage 2: incremental clustering
    kmeans = MiniBatchKMeans(
        n_clusters=15,
        random_state=42,
        batch_size=batch_size,
        verbose=1,
        n_init='auto'
    )

    ids, labels, Xr_vector = [], [], []
    batch_features, batch_ids = [], []

    for row in data_whole.select("統一編號", "features").toLocalIterator():
        batch_features.append(row["features"])
        batch_ids.append(row["統一編號"])
        if len(batch_features) >= batch_size:
            Xb = to_csr(batch_features)
            Xr = svd.transform(Xb)
            Xr_vector.extend(Xr)

            kmeans.partial_fit(Xr)
            preds = kmeans.predict(Xr)

            ids.extend(batch_ids)
            labels.extend(preds.tolist())
            batch_features.clear(); batch_ids.clear()

    # last leftover batch
    if batch_features:
        Xb = to_csr(batch_features)
        Xr = svd.transform(Xb)
        Xr_vector.extend(Xr)

        kmeans.partial_fit(Xr)
        preds = kmeans.predict(Xr)
        ids.extend(batch_ids)
        labels.extend(preds.tolist())

    # --- return as Spark DataFrame
    rows = [
        Row(
            統一編號=str(i),
            cluster=int(c),
            Xr_vector=[float(v) for v in x.tolist()]
        )
        for i, c, x in zip(ids, labels, Xr_vector)
    ]

    spark = data_whole.sparkSession
    sdf = spark.createDataFrame(rows)

    return sdf, svd, kmeans


def save_(pdf, spark, kmeans, svd):

    to_pg(spark, pdf)
    
    # s3 = boto3.client(
    #     "s3",
    #     endpoint_url = 'http://minio:9000',
    #     aws_access_key_id="minioadmin",
    #     aws_secret_access_key="minioadmin"
    # )

    # # save locally
    # joblib.dump(svd, "/tmp/svd.pkl")
    # joblib.dump(kmeans, "/tmp/kmeans.pkl")
    
    # # upload to MinIO
    # s3.upload_file("/tmp/svd.pkl", "deltabucket", "models/sk_svd.pkl")
    # s3.upload_file("/tmp/kmeans.pkl", "deltabucket", "models/sk_kmeans.pkl")
    
# from sparksession import spark_session
s = None
s = spark_session()
df = read_gold(s)


In [None]:

def fit_predict(data_whole, sample_frac=0.005, sample_cap=5000, batch_size=1000):
    """
    Run SVD + MiniBatchKMeans on a manageable sample.
    Returns: Spark DataFrame [統一編號, cluster, Xr_vector]
    """

    # --- get feature size from one row
    first_vec = data_whole.limit(1).collect()[0][2]
    num_features = int(first_vec.size)

    # --- helper: Spark Row -> CSR batch
    def to_csr(rows):
        indptr = [0]; indices = []; vals = []
        for sv in rows:
            indices.extend(sv.indices.tolist())
            vals.extend(sv.values.tolist())
            indptr.append(indptr[-1] + len(sv.indices))
        return csr_matrix((np.array(vals, dtype=np.float64),
                           np.array(indices, dtype=np.int32),
                           np.array(indptr, dtype=np.int32)),
                          shape=(len(rows), num_features))

    # 1) collect a small sample for SVD training
    sample_rows = []
    for i, row in enumerate(data_whole.sample(False, sample_frac, seed=42).toLocalIterator()):
        sample_rows.append(row['features'])
        if i >= sample_cap:   # hard cap
            break

    X_sample = to_csr(sample_rows)
    print(f"Sample size for SVD: {X_sample.shape}")

    svd = TruncatedSVD(n_components=100, random_state=42).fit(X_sample)

    # # --- Stage 2: incremental clustering
    # kmeans = MiniBatchKMeans(
    #     n_clusters=15,
    #     random_state=42,
    #     batch_size=batch_size,
    #     verbose=1,
    #     n_init='auto'
    # )

    ids, labels, Xr_vector = [], [], []
    batch_features, batch_ids = [], []

    for row in data_whole.select("統一編號", "features").toLocalIterator():
        batch_features.append(row["features"])
        batch_ids.append(row["統一編號"])
        
        if len(batch_features) >= batch_size:
            print(f"number {}{len(batch_features)}")
            Xb = to_csr(batch_features)
            Xr = svd.transform(Xb)
            Xr_vector.extend(Xr)

            # kmeans.partial_fit(Xr)
            # preds = kmeans.predict(Xr)

            # ids.extend(batch_ids)
            # labels.extend(preds.tolist())
            # batch_features.clear(); batch_ids.clear()

    # last leftover batch
    if batch_features:
        Xb = to_csr(batch_features)
        Xr = svd.transform(Xb)
        Xr_vector.extend(Xr)

        # kmeans.partial_fit(Xr)
        # preds = kmeans.predict(Xr)
        # ids.extend(batch_ids)
        # labels.extend(preds.tolist())

    # --- return as Spark DataFrame
    rows = [
        Row(
            統一編號=str(i),
            cluster=int(c),
            Xr_vector=[float(v) for v in x.tolist()]
        )
        for i, c, x in zip(ids, labels, Xr_vector)
    ]

    spark = data_whole.sparkSession
    sdf = spark.createDataFrame(rows)

    return sdf, svd, kmeans

sdf, svd, kmeans = fit_predict(df)


Sample size for SVD: (5001, 32771)


In [9]:
def to_pg(spark, sdf):
    pg_url = "jdbc:postgresql://pg_vector:5432/vector_db"
    pg_table = "wholecorp_clusters_vector"

    pg_properties = {
        "user": "postgres",
        "password": "infopower",
        "driver": "org.postgresql.Driver"
    }

    # Tune these knobs depending on DB size/resources
    num_partitions = 8     # how many parallel writers
    batch_size = 5000      # rows per batch insert

    (sdf.write
        .format("jdbc")
        .option("url", pg_url)
        .option("dbtable", pg_table)
        .option("user", pg_properties["user"])
        .option("password", pg_properties["password"])
        .option("driver", pg_properties["driver"])
        .option("batchsize", batch_size)
        .option("numPartitions", num_partitions)
        .option("truncate", True)       # ensures overwrite works cleanly
        .mode("overwrite")
        .save()
    )

    print(f"✅ Successfully wrote DataFrame to PostgreSQL table '{pg_table}'")

    # Verify readback
    read_df = (spark.read
               .format("jdbc")
               .option("url", pg_url)
               .option("dbtable", pg_table)
               .option("user", pg_properties["user"])
               .option("password", pg_properties["password"])
               .option("driver", pg_properties["driver"])
               .load())
    read_df.show(5)


to_pg(s, df)


IllegalArgumentException: Can't get JDBC type for struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.

In [9]:
def read_gold(spark):
    global OUT

    GOLD = os.getenv("GOLD_PATH","s3a://deltabucket/gold/wholeCorp_delta")
    OUT = os.getenv("CLUSTER_PATH","s3a://deltabucket/gold/wholeCorp_clusters_vector")

    return spark.read.format("delta").load(GOLD)

def save_(pdf, spark, kmeans, svd):

    spark.createDataFrame(pdf).write.format("delta").mode("overwrite").save(OUT)

    s3 = boto3.client(
        "s3",
        endpoint_url = 'http://minio:9000',
        aws_access_key_id="minioadmin",
        aws_secret_access_key="minioadmin"
    )

    # save locally
    joblib.dump(svd, "/tmp/svd.pkl")
    joblib.dump(kmeans, "/tmp/kmeans.pkl")
    
    # upload to MinIO
    s3.upload_file("/tmp/svd.pkl", "deltabucket", "models/sk_svd.pkl")
    s3.upload_file("/tmp/kmeans.pkl", "deltabucket", "models/sk_kmeans.pkl")

In [None]:
pdf["統一編號"] = pdf["統一編號"].astype(str)
pdf["Xr_vector"] = pdf["Xr_vector"].apply(
    lambda v: [float(x) for x in (v.tolist() if isinstance(v, np.ndarray) else v)]
)

schema = StructType([
    StructField("統一編號", StringType(), False),
    StructField("cluster", IntegerType(), True),
    StructField("Xr_vector", ArrayType(DoubleType()), True),
])

sdf = spark.createDataFrame(pdf.to_dict(orient="records"), schema=schema)
sdf.write.format("delta").mode("overwrite").save(OUT)

In [None]:
Xr /= (np.linalg.norm(Xr, axis=1, keepdims=True) + 1e-12)


# Metrics to measure

In [None]:
from sklearn.metrics import silhouette_score

wcss = []
for k in range(2,20):
    km = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(Xr)
    km.fit(Xr)
    wcss.append(km.inertia_)
    
    score = silhouette_score(Xr, km.labels_)
    print(f"k={k}, silhouette={score:0.3f}")

# 