In [1]:
import os
from pyspark.sql import SparkSession
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans, KMeans
from scipy.sparse import csr_matrix, vstack
import numpy as np
import joblib

from sklearn.random_projection import SparseRandomProjection
from sklearn.neighbors import NearestNeighbors

from scipy.sparse import vstack as sp_vstack

from pyspark.sql import functions as F
from pyspark.sql.types import BinaryType, IntegerType, StructType, StructField, ArrayType, DoubleType

from pyspark.ml.linalg import SparseVector, DenseVector

import pandas as pd

import boto3

In [2]:
def spark_session():
    # Stop any old session so new configs take effect in notebooks
    return (
        SparkSession.builder
        .appName("MySQL_to_Delta_on_MinIO")
        .master("spark://spark-master:7077")
        .config("spark.jars.packages",
                ",".join([
                    # Delta
                    "io.delta:delta-spark_2.12:3.1.0",
                    # MySQL JDBC
                    "mysql:mysql-connector-java:8.0.33",
                    # S3A / MinIO (versions must match your Hadoop)
                    "org.apache.hadoop:hadoop-aws:3.3.2",
                    "com.amazonaws:aws-java-sdk-bundle:1.11.1026",
                ]))
        # Delta integration
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        # MinIO (S3A) configs
        .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
        .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
        .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
        .config("spark.ui.port", "4040")                 # fix the port
        .config("spark.driver.bindAddress", "0.0.0.0")   # listen on all ifaces
        .config("spark.driver.host", "jupyter")          # OR "spark-master" – the container's DNS name
        .config("spark.ui.showConsoleProgress", "true")
        # Resources
        # .config("spark.executor.cores", "2")
        # .config("spark.executor.memory", "2g")
        # .config("spark.executor.memoryOverhead", "1536m")
        # .config("spark.network.timeout", "600s")
        .config("spark.executor.cores", "2")           # 1 task per executor (more stable for trees)
        .config("spark.executor.memory", "3g")
        .config("spark.executor.memoryOverhead", "1g")  # or omit in Standalone
        .config("spark.sql.shuffle.partitions", "50")
        .config("spark.local.dir", "/mnt/spark-tmp/local") # For giving it much more space to run CV
        .config("spark.network.timeout", "600s")
        .getOrCreate()
    )


In [3]:
import os
from pyspark.sql import SparkSession
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
from scipy.sparse import csr_matrix
import numpy as np
import joblib

from sklearn.random_projection import SparseRandomProjection
from sklearn.neighbors import NearestNeighbors

from scipy.sparse import vstack as sp_vstack

from pyspark.sql import functions as F
from pyspark.sql.types import BinaryType, IntegerType, StructType, StructField, ArrayType, DoubleType

from pyspark.ml.linalg import SparseVector, DenseVector

import pandas as pd


def read_gold(spark):
    global OUT

    GOLD = os.getenv("GOLD_PATH","s3a://deltabucket/gold/wholeCorp_delta")
    OUT = os.getenv("CLUSTER_PATH","s3a://deltabucket/gold/wholeCorp_clusters")

    return spark.read.format("delta").load(GOLD)


def fit_predict(data_whole):
    # --- get feature size from one row
    first_vec = data_whole.limit(1).collect()[0][2]
    num_features = int(first_vec.size)

    # --- helper: Spark Row -> CSR batch
    def to_csr(rows):
        indptr = [0]; indices = []; vals = []
        for sv in rows:
            indices.extend(sv.indices.tolist())
            vals.extend(sv.values.tolist())
            indptr.append(indptr[-1] + len(sv.indices))
        return csr_matrix((np.array(vals, dtype=np.float64),
                        np.array(indices, dtype=np.int32),
                        np.array(indptr, dtype=np.int32)),
                        shape=(len(rows), num_features
    ))

    # 1) collect a manageable sample from Spark
    sample_rows = []
    for i, row in enumerate(data_whole.sample(False, 0.02, seed=42).toLocalIterator()):  # ~2% example
        sample_rows.append(row['features'])
        if i >= 20000:      # cap by count if you like
            break

    X_sample = to_csr(sample_rows)            # CSR (n_sample, num_features)
    print(X_sample)
    
    svd = TruncatedSVD(n_components=100, random_state=42).fit(X_sample)

    # --- Stage 2: fit MiniBatchKMeans on reduced features
    kmeans = MiniBatchKMeans(n_clusters=15,
                            random_state=42,
                            batch_size=2000,
                            verbose=1,
                            n_init='auto')
    ids, labels = [], []
    batch_features, batch_ids = [], []

    Xr_vector = []
    for row in data_whole.select("統一編號","features").toLocalIterator():
        batch_features.append(row["features"])
        batch_ids.append(row["統一編號"])
        if len(batch_features) >= 2000:
            Xb = to_csr(batch_features)
            Xr = svd.transform(Xb)
            Xr_vector.extend(Xr)
            
            kmeans.partial_fit(Xr)
            preds = kmeans.predict(Xr)            # ndarray
            ids.extend(batch_ids)                  # flatten ids
            labels.extend(preds.tolist())          # flatten labels
            batch_features.clear(); batch_ids.clear()

    if batch_features:
        Xb = to_csr(batch_features)
        Xr = svd.transform(Xb)
        Xr_vector.extend(Xr)
        
        kmeans.partial_fit(Xr)
        preds = kmeans.predict(Xr)
        ids.extend(batch_ids)
        labels.extend(preds.tolist())

    pdf = pd.DataFrame({"統一編號": ids, "cluster": labels, "Xr_vector":Xr_vector})

    return pdf, svd, kmeans, Xr_vector


def save_(pdf, spark, kmeans, svd):

    spark.createDataFrame(pdf).write.format("delta").mode("overwrite").save(OUT)

    s3 = boto3.client(
        "s3",
        endpoint_url = 'http://minio:9000',
        aws_access_key_id="minioadmin",
        aws_secret_access_key="minioadmin"
    )

    # save locally
    joblib.dump(svd, "/tmp/svd.pkl")
    joblib.dump(kmeans, "/tmp/kmeans.pkl")
    
    # upload to MinIO
    s3.upload_file("/tmp/svd.pkl", "deltabucket", "models/sk_svd.pkl")
    s3.upload_file("/tmp/kmeans.pkl", "deltabucket", "models/sk_kmeans.pkl")

# from sparksession import spark_session
s = None
s = spark_session()
df = read_gold(s)
pdf, svd, kmeans, Xr_vector = fit_predict(df)


  (0, 11356)	37.40266267386384
  (0, 32768)	2.2077088848113995
  (0, 32769)	0.0
  (0, 32770)	0.0
  (1, 677)	104.23746404352207
  (1, 32768)	2.440939021179923
  (1, 32769)	0.0
  (1, 32770)	0.0
  (2, 31190)	9.459186080170428
  (2, 32768)	2.324323910935715
  (2, 32769)	0.0
  (2, 32770)	0.0
  (3, 31190)	9.459186080170428
  (3, 32768)	2.440939021179923
  (3, 32769)	0.0
  (3, 32770)	0.0
  (4, 31190)	9.459186080170428
  (4, 32768)	2.4716128155207024
  (4, 32769)	0.0
  (4, 32770)	0.0
  (5, 31190)	9.459186080170428
  (5, 32768)	1.8203246226517424
  (5, 32769)	0.0
  (5, 32770)	0.0
  (6, 360)	14.22791115141308
  :	:
  (19998, 13986)	12.850234088562502
  (19998, 14467)	15.735769353841357
  (19998, 14981)	26.37058174062191
  (19998, 16433)	11.706956967466958
  (19998, 19253)	24.41822558547225
  (19998, 20139)	27.5295722953701
  (19998, 20652)	37.158925234059694
  (19998, 21722)	16.107109732966386
  (19998, 24285)	25.166452785139843
  (19998, 25492)	12.700286825449568
  (19998, 26099)	22.68272157790

In [9]:
def read_gold(spark):
    global OUT

    GOLD = os.getenv("GOLD_PATH","s3a://deltabucket/gold/wholeCorp_delta")
    OUT = os.getenv("CLUSTER_PATH","s3a://deltabucket/gold/wholeCorp_clusters_vector")

    return spark.read.format("delta").load(GOLD)

def save_(pdf, spark, kmeans, svd):

    spark.createDataFrame(pdf).write.format("delta").mode("overwrite").save(OUT)

    s3 = boto3.client(
        "s3",
        endpoint_url = 'http://minio:9000',
        aws_access_key_id="minioadmin",
        aws_secret_access_key="minioadmin"
    )

    # save locally
    joblib.dump(svd, "/tmp/svd.pkl")
    joblib.dump(kmeans, "/tmp/kmeans.pkl")
    
    # upload to MinIO
    s3.upload_file("/tmp/svd.pkl", "deltabucket", "models/sk_svd.pkl")
    s3.upload_file("/tmp/kmeans.pkl", "deltabucket", "models/sk_kmeans.pkl")

In [None]:
pdf["統一編號"] = pdf["統一編號"].astype(str)
pdf["Xr_vector"] = pdf["Xr_vector"].apply(
    lambda v: [float(x) for x in (v.tolist() if isinstance(v, np.ndarray) else v)]
)

schema = StructType([
    StructField("統一編號", StringType(), False),
    StructField("cluster", IntegerType(), True),
    StructField("Xr_vector", ArrayType(DoubleType()), True),
])

sdf = spark.createDataFrame(pdf.to_dict(orient="records"), schema=schema)
sdf.write.format("delta").mode("overwrite").save(OUT)

In [8]:
save_(pdf, s, kmeans, svd)


PySparkTypeError: [CANNOT_INFER_TYPE_FOR_FIELD] Unable to infer the type of the field `Xr_vector`.

In [None]:
[vector.tolist() for i in Xr_vector for vector in i]

In [4]:
main()

NameError: name 'main' is not defined

In [None]:
# --- get feature size from one row
first_vec = data_whole.limit(1).collect()[0][2]
num_features = int(first_vec.size)

# --- helper: Spark Row -> CSR batch
def to_csr(rows):
    indptr = [0]; indices = []; vals = []
    for sv in rows:
        indices.extend(sv.indices.tolist())
        vals.extend(sv.values.tolist())
        indptr.append(indptr[-1] + len(sv.indices))
    return csr_matrix((np.array(vals, dtype=np.float64),
                    np.array(indices, dtype=np.int32),
                    np.array(indptr, dtype=np.int32)),
                    shape=(len(rows), num_features
))

# 1) collect a manageable sample from Spark
sample_rows = []
for i, row in enumerate(data_whole.sample(False, 0.02, seed=42).toLocalIterator()):  # ~2% example
    sample_rows.append(row['features'])
    if i >= 20000:      # cap by count if you like
        break

X_sample = to_csr(sample_rows)            # CSR (n_sample, num_features)
svd = TruncatedSVD(n_components=100, random_state=42).fit(X_sample)

# --- Stage 2: fit MiniBatchKMeans on reduced features
kmeans = MiniBatchKMeans(n_clusters=15,
                        random_state=42,
                        batch_size=2000,
                        verbose=1,
                        n_init='auto')
ids, labels = [], []
batch_features, batch_ids = [], []

for row in data_whole.select("統一編號","features").toLocalIterator():
    batch_features.append(row["features"])
    batch_ids.append(row["統一編號"])
    if len(batch_features) >= 2000:
        Xb = to_csr(batch_features)
        Xr = svd.transform(Xb)

        Xr /= (np.linalg.norm(Xr, axis=1, keepdims=True) + 1e-12)
        
        kmeans.partial_fit(Xr)

        preds = kmeans.predict(Xr)            # ndarray
        ids.extend(batch_ids)                  # flatten ids
        labels.extend(preds.tolist())          # flatten labels
        batch_features.clear(); batch_ids.clear()

if batch_features:
    Xb = to_csr(batch_features)
    Xr = svd.transform(Xb)
    kmeans.partial_fit(Xr)
    preds = kmeans.predict(Xr)
    ids.extend(batch_ids)
    labels.extend(preds.tolist())

pdf = pd.DataFrame({"統一編號": ids, "cluster": labels})



In [8]:
df.limit(1).collect()[0]["features"]


SparseVector(262147, {63958: 9.4622, 262144: 2.4409, 262145: 0.0, 262146: 0.0})

In [None]:
Xr /= (np.linalg.norm(Xr, axis=1, keepdims=True) + 1e-12)


# Metrics to measure

In [None]:
from sklearn.metrics import silhouette_score

wcss = []
for k in range(2,20):
    km = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(Xr)
    km.fit(Xr)
    wcss.append(km.inertia_)
    
    score = silhouette_score(Xr, km.labels_)
    print(f"k={k}, silhouette={score:0.3f}")

# 