In [1]:
import os
from pyspark.sql import SparkSession
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
from scipy.sparse import csr_matrix, vstack
import numpy as np
import joblib

from sklearn.random_projection import SparseRandomProjection


In [2]:
def spark_session():
    # Stop any old session so new configs take effect in notebooks
    return (
        SparkSession.builder
        .appName("MySQL_to_Delta_on_MinIO")
        .master("spark://spark-master:7077")
        .config("spark.jars.packages",
                ",".join([
                    # Delta
                    "io.delta:delta-spark_2.12:3.1.0",
                    # MySQL JDBC
                    "mysql:mysql-connector-java:8.0.33",
                    # S3A / MinIO (versions must match your Hadoop)
                    "org.apache.hadoop:hadoop-aws:3.3.2",
                    "com.amazonaws:aws-java-sdk-bundle:1.11.1026",
                ]))
        # Delta integration
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        # MinIO (S3A) configs
        .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
        .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
        .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
        .config("spark.ui.port", "4040")                 # fix the port
        .config("spark.driver.bindAddress", "0.0.0.0")   # listen on all ifaces
        .config("spark.driver.host", "jupyter")          # OR "spark-master" – the container's DNS name
        .config("spark.ui.showConsoleProgress", "true")
        # Resources
        # .config("spark.executor.cores", "2")
        # .config("spark.executor.memory", "2g")
        # .config("spark.executor.memoryOverhead", "1536m")
        # .config("spark.network.timeout", "600s")
        .config("spark.executor.cores", "1")           # 1 task per executor (more stable for trees)
        .config("spark.executor.memory", "3g")
        .config("spark.executor.memoryOverhead", "1g")  # or omit in Standalone
        .config("spark.sql.shuffle.partitions", "50")
        .config("spark.local.dir", "/mnt/spark-tmp/local") # For giving it much more space to run CV
        .config("spark.network.timeout", "600s")
        .getOrCreate()
    )

spark = spark_session()

In [4]:
# --- Spark session (reuse yours)
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()
GOLD = os.getenv("GOLD_PATH","s3a://deltabucket/gold/wholeCorp_delta")

data = spark.read.format("delta").load(GOLD).select("features")

# --- get feature size from one row
first_vec = data.limit(1).collect()[0][0]
num_features = int(first_vec.size)

# --- helper: Spark Row -> CSR batch
def to_csr(rows):
    indptr = [0]; indices = []; vals = []
    for r in rows:
        sv = r["features"]
        indices.extend(sv.indices.tolist())
        vals.extend(sv.values.tolist())
        indptr.append(indptr[-1] + len(sv.indices))
    return csr_matrix((np.array(vals, dtype=np.float64),
                       np.array(indices, dtype=np.int32),
                       np.array(indptr, dtype=np.int32)),
                      shape=(len(rows), num_features))

# Initialize with target dim; "fit" only checks input dimension
srp = SparseRandomProjection(n_components=100, dense_output=False, random_state=42)

# do a one-shot "fit" on a tiny dummy batch to establish input dim
tiny = to_csr(data.limit(1).collect())
srp.fit(tiny)

# Stream: reduce then cluster
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(n_init='auto', n_clusters=10, batch_size=2000, random_state=42, verbose=1)

batch = []
for row in data.toLocalIterator():
    batch.append(row)
    if len(batch) >= 2000:
        Xb = to_csr(batch)
        Xr = srp.transform(Xb)            # fast, memory-light
        kmeans.partial_fit(Xr)
        batch.clear()
if batch:
    Xb = to_csr(batch); Xr = srp.transform(Xb); kmeans.partial_fit(Xr)

  super()._check_params_vs_input(X, default_n_init=3)


[MiniBatchKMeans] Reassigning 9 cluster centers.
[MiniBatchKMeans] Reassigning 1 cluster centers.
[MiniBatchKMeans] Reassigning 1 cluster centers.
[MiniBatchKMeans] Reassigning 1 cluster centers.


In [5]:
print("Cluster centers shape:", kmeans.cluster_centers_.shape)

# --- Save both
joblib.dump(srp, "svd_100.joblib")
joblib.dump(kmeans, "kmeans_10.joblib")

Cluster centers shape: (10, 100)


['kmeans_10.joblib']

In [12]:
from sklearn.neighbors import NearestNeighbors

# Stream: build reduced matrix in chunks (use memmap if big)
chunks = []
batch = []
for row in data.toLocalIterator():
    batch.append(row)
    if len(batch) >= 2000:
        Xb = to_csr(batch)
        Xr = srp.transform(Xb)          # or svd.transform(Xb)
        chunks.append(Xr)
        batch.clear()
if batch:
    Xb = to_csr(batch); chunks.append(srp.transform(Xb))

X_all = np.vstack(chunks)               # if too big, use np.memmap on disk
nn = NearestNeighbors(n_neighbors=10, metric="cosine").fit(X_all)

# --- Example: query nearest neighbors for company i=123
distances, indices = nn.kneighbors(X_all[123].reshape(1,-1))
print("Nearest neighbor indices:", indices, "distances:", distances)

ValueError: setting an array element with a sequence.