# PySpark Compatibility Tests

## PCA
From: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA

In [1]:
# Backend toggle: True runs this section against stock pyspark.ml,
# False runs it against spark_rapids_ml.
PYSPARK = False
# Inverse of PYSPARK; guards the cells that print spark_rapids_ml-only attributes.
SPARK_RAPIDS_ML = not PYSPARK

# shutil is used further down to remove previously saved estimators/models.
import shutil

In [2]:
# Bind PCA / PCAModel to the implementation selected by the PYSPARK toggle so the
# remaining cells of this section are the same for either backend.
if PYSPARK:
    from pyspark.ml.feature import PCA, PCAModel
else:
    from spark_rapids_ml.feature import PCA, PCAModel

# The vector constructors come from pyspark regardless of the selected backend.
from pyspark.ml.linalg import Vectors

In [3]:
# Three 5-dimensional feature vectors (one sparse, two dense). Each is wrapped in a
# 1-tuple so that it forms a single-column row.
data = [
    (vector,)
    for vector in (
        Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),
        Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),
        Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),
    )
]

data

[(SparseVector(5, {1: 1.0, 3: 7.0}),),
 (DenseVector([2.0, 0.0, 3.0, 4.0, 5.0]),),
 (DenseVector([4.0, 0.0, 0.0, 6.0, 7.0]),)]

In [4]:
# NOTE(review): `spark` is not defined anywhere in this notebook — it is presumably the
# SparkSession injected by the kernel/launcher; confirm before running on a fresh kernel.
df = spark.createDataFrame(data,["features"])
# Print the rows, then echo the schema as the cell's value.
df.show(); df.schema

                                                                                

+--------------------+
|            features|
+--------------------+
| (5,[1,3],[1.0,7.0])|
|[2.0,0.0,3.0,4.0,...|
|[4.0,0.0,0.0,6.0,...|
+--------------------+



StructType([StructField('features', VectorUDT(), True)])

In [5]:
pca = PCA()
print(pca.explainParams())

inputCol: input column name. (undefined)
inputCols: input column names. (undefined)
k: the number of principal components (undefined)
outputCol: output column name. (default: PCA_819d7faeb0cc__output)


In [6]:
# Configure the estimator through the Spark-style `k` param.
pca = PCA(k=2, inputCol="features")
# Alternative constructor calls using the cuML-style `n_components` keyword (the name
# that `k` maps to in `_param_mapping()` below). They are not executed here; presumably
# kept for manual experimentation.
# pca = PCA(k=2, inputCol="features", n_components=3)
# pca = PCA(inputCol="features", n_components=3)
pca.setOutputCol("pca_features")

PCA_2932417e77a8

In [7]:
print(pca.explainParams())

inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
k: the number of principal components (current: 2)
outputCol: output column name. (default: PCA_2932417e77a8__output, current: pca_features)


In [8]:
# spark_rapids_ml only: show the Spark-param -> cuML-param name mapping and the
# cuML parameter dictionary currently held by the estimator.
if SPARK_RAPIDS_ML:
    print(pca._param_mapping())
    print(pca.cuml_params)

{'k': 'n_components'}
{'n_components': 2, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}


In [9]:
# Change k after construction; the next two cells show the new value in both
# explainParams() and cuml_params.
pca.setK(3)

PCA_2932417e77a8

In [10]:
print(pca.explainParams())

inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
k: the number of principal components (current: 3)
outputCol: output column name. (default: PCA_2932417e77a8__output, current: pca_features)


In [11]:
if SPARK_RAPIDS_ML:
    print(pca.cuml_params)

{'n_components': 3, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}


In [12]:
# Restore k=2 before fitting; the reference values in the cells below are for two components.
pca.setK(2)

PCA_2932417e77a8

In [13]:
model = pca.fit(df)

                                                                                

In [14]:
model.getK()
# 2

2

In [15]:
model.setOutputCol("output")

PCAModel_cd703ff75496

In [16]:
print(model.explainParams())

inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
k: the number of principal components (current: 2)
outputCol: output column name. (default: PCA_2932417e77a8__output, current: output)


In [17]:
if SPARK_RAPIDS_ML:
    print(model.cuml_params)

{'n_components': 2, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}


In [18]:
model.transform(df).collect()[0].output
# PySpark reference: DenseVector([1.648..., -4.013...])
# The recorded spark_rapids_ml output below is a plain list whose first component has the
# opposite sign — presumably the usual sign ambiguity of principal components; TODO confirm.

                                                                                

[-1.6485728230896184, -4.013282697765595]

In [19]:
model.explainedVariance
# DenseVector([0.794..., 0.205...])

DenseVector([0.7944, 0.2056])

In [20]:
model.pc
# PySpark reference:
# DenseMatrix(5, 2, [-0.4486, 0.133, -0.1252, 0.2165, -0.8477, -0.2842, -0.0562, 0.7636, -0.5653, -0.1156], 0)
# The recorded spark_rapids_ml output below negates the first five values and leaves the
# last five unchanged; the trailing flag is shown as False instead of 0.

DenseMatrix(5, 2, [0.4486, -0.133, 0.1252, -0.2165, 0.8477, -0.2842, -0.0562, 0.7636, -0.5653, -0.1156], False)

In [21]:
# Base directory for artifacts written by this section, and the save location
# of the PCA estimator beneath it.
temp_path = "/tmp"
pcaPath = f"{temp_path}/pca"

In [22]:
shutil.rmtree(pcaPath, ignore_errors=True)

In [23]:
pca.save(pcaPath)

In [24]:
loadedPca = PCA.load(pcaPath)
loadedPca.getK() == pca.getK()
# True

True

In [25]:
# confirm saved estimator cuml_params
if SPARK_RAPIDS_ML:
    print(pca.cuml_params)
    print(loadedPca.cuml_params)

{'n_components': 2, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}
{'n_components': 2, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}


In [26]:
modelPath = temp_path + "/pca-model"
shutil.rmtree(modelPath, ignore_errors=True)

In [27]:
model.save(modelPath)

In [28]:
loadedModel = PCAModel.load(modelPath)
loadedModel.pc == model.pc
# True

True

In [29]:
# confirm saved model cuml_params
if SPARK_RAPIDS_ML:
    print(model.cuml_params)
    print(loadedModel.cuml_params)

{'n_components': 2, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}
{'n_components': 2, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}


In [30]:
loadedModel.explainedVariance == model.explainedVariance
# True

True

In [31]:
loadedModel.transform(df).take(1) == model.transform(df).take(1)
# True

                                                                                

True

## KMeans
From: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans

In [32]:
# Backend toggle repeated at the start of the KMeans section (presumably so the section
# can be run independently of the PCA section above).
PYSPARK = False
# Inverse of PYSPARK; guards the spark_rapids_ml-only cells.
SPARK_RAPIDS_ML = not PYSPARK

# Used below to remove previously saved estimators/models.
import shutil

In [33]:
if PYSPARK:
    from pyspark.ml.clustering import KMeans, KMeansModel
else:
    from spark_rapids_ml.clustering import KMeans, KMeansModel

In [34]:
from pyspark.ml.linalg import Vectors

In [35]:
# Four 2-D points forming two well-separated pairs, each row carrying a constant 2.0
# in its second field.
data = [
    (Vectors.dense(point), 2.0)
    for point in ([0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0])
]

In [36]:
# The second column is spelled "weighCol" (sic); a later cell passes this exact string to
# setWeightCol, so the spelling must stay in sync — presumably it mirrors the upstream example.
# The frame is collapsed to a single partition.
df = spark.createDataFrame(data, ["features", "weighCol"]).repartition(1)
# Print the rows, then echo the schema as the cell's value.
df.show(); df.schema

+---------+--------+
| features|weighCol|
+---------+--------+
|[0.0,0.0]|     2.0|
|[1.0,1.0]|     2.0|
|[9.0,8.0]|     2.0|
|[8.0,9.0]|     2.0|
+---------+--------+



StructType([StructField('features', VectorUDT(), True), StructField('weighCol', DoubleType(), True)])

In [37]:
kmeans = KMeans()

In [38]:
print(kmeans.explainParams())

distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. (default: euclidean)
featuresCol: features column name. (default: features)
featuresCols: features column names for multi-column input. (undefined)
initMode: The initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2)
maxIter: max number of iterations (>= 0). (default: 20)
predictionCol: prediction column name. (default: prediction)
seed: random seed. (default: 1909113551)
tol: the convergence tolerance for iterative algorithms (>= 0). (default: 0.0001)
weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0. (undefined)


In [39]:
if SPARK_RAPIDS_ML:
    print(kmeans._param_mapping())
    print(kmeans.cuml_params)

{'distanceMeasure': None, 'initMode': 'init', 'k': 'n_clusters', 'initSteps': '', 'maxIter': 'max_iter', 'seed': 'random_state', 'tol': 'tol', 'weightCol': None}
{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1909113551, 'init': 'k-means||', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}


In [40]:
# Rebuild the estimator with an explicit k, seed and iteration cap.
kmeans = KMeans(k=2)
kmeans.setSeed(1)
kmeans.setMaxIter(10)

# The weight column is only set for stock PySpark. The spark_rapids_ml param mapping
# printed earlier maps 'weightCol' to None — presumably meaning it is unsupported; TODO confirm.
if PYSPARK:
    kmeans.setWeightCol("weighCol")

In [41]:
print(kmeans.explainParams())

distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. (default: euclidean)
featuresCol: features column name. (default: features)
featuresCols: features column names for multi-column input. (undefined)
initMode: The initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 2)
maxIter: max number of iterations (>= 0). (default: 20, current: 10)
predictionCol: prediction column name. (default: prediction)
seed: random seed. (default: 1909113551, current: 1)
tol: the convergence tolerance for iterative algorithms (>= 0). (default: 0.0001)
weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0. (undefined)


In [42]:
if SPARK_RAPIDS_ML:
    print(kmeans.cuml_params)

{'n_clusters': 2, 'max_iter': 10, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'k-means||', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}


In [43]:
kmeans.getMaxIter()
# 10

10

In [44]:
# Unset the explicit maxIter so it falls back to its default (20); the following cells
# show the default again in both explainParams() and cuml_params.
kmeans.clear(kmeans.maxIter)

In [45]:
print(kmeans.explainParams())

distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. (default: euclidean)
featuresCol: features column name. (default: features)
featuresCols: features column names for multi-column input. (undefined)
initMode: The initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 2)
maxIter: max number of iterations (>= 0). (default: 20)
predictionCol: prediction column name. (default: prediction)
seed: random seed. (default: 1909113551, current: 1)
tol: the convergence tolerance for iterative algorithms (>= 0). (default: 0.0001)
weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0. (undefined)


In [46]:
if SPARK_RAPIDS_ML:
    print(kmeans.cuml_params)

{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'k-means||', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}


In [47]:
kmeans.setFeaturesCol("features")

KMeans_fc49c5bd5da0

In [48]:
model = kmeans.fit(df)

                                                                                

In [49]:
model.getDistanceMeasure()
# 'euclidean'
# Note: this is not used in spark_rapids_ml (may be implied)

'euclidean'

In [50]:
model.setPredictionCol("newPrediction")

KMeansModel_243a3f48468e

In [51]:
print(model.explainParams())

distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. (default: euclidean)
featuresCol: features column name. (default: features, current: features)
featuresCols: features column names for multi-column input. (undefined)
initMode: The initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 2)
maxIter: max number of iterations (>= 0). (default: 20)
predictionCol: prediction column name. (default: prediction, current: newPrediction)
seed: random seed. (default: 1909113551, current: 1)
tol: the convergence tolerance for iterative algorithms (>= 0). (default: 0.0001)
weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0. (undefined)


In [52]:
if SPARK_RAPIDS_ML:
    print(model.cuml_params)

{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'k-means||', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}


In [53]:
# Single-vector prediction is exercised only on stock PySpark.
if PYSPARK:
    # A bare expression nested inside an `if` block is not echoed by the notebook,
    # so the result is printed explicitly (as the LinearRegression predict cell does).
    print(model.predict(df.head().features))
    # 0
else:
    # spark_rapids_ml raises:
    # NotImplementedError: 'predict' method is not supported, use 'transform' instead.
    pass

In [54]:
centers = model.clusterCenters()
len(centers)
# 2

2

In [55]:
centers
# PySpark reference: [array([0.5, 0.5]), array([8.5, 8.5])]
# The recorded spark_rapids_ml run returns the same two centers in the opposite order.

[array([8.5, 8.5]), array([0.5, 0.5])]

In [56]:
# Build the transformed frame; only the PySpark branch narrows it to two columns.
if PYSPARK:
    transformed = model.transform(df).select("features", "newPrediction")
else:
    # Selecting the columns failed under spark_rapids_ml with the error recorded below.
    # NOTE(review): the recorded output of the next cell does show a `features` column,
    # so this error may come from an older version — TODO confirm.
    # AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `features` cannot be resolved. Did you mean one of the following? [`prediction`].;
    # 'Project ['features, 'newPrediction]
    # +- MapInPandas _transform_udf(weighCol#1, features#29)#35, [prediction#36]
    #    +- Project [weighCol#1, features#29]
    #       +- Project [cuml_values_c3BhcmtjdW1sCg==#26, weighCol#1, UDF(cuml_values_c3BhcmtjdW1sCg==#26) AS features#29]
    #          +- Project [features#0 AS cuml_values_c3BhcmtjdW1sCg==#26, weighCol#1]
    #             +- Repartition 1, true
    #                +- LogicalRDD [features#0, weighCol#1], false
    transformed = model.transform(df)

# Materialize the rows for the pairwise prediction checks in the following cells.
rows = transformed.collect()

In [57]:
transformed = model.transform(df)
transformed.show()
# PySpark reference output:
# +---------+--------+-------------+
# | features|weighCol|newPrediction|
# +---------+--------+-------------+
# |[0.0,0.0]|     2.0|            0|
# |[1.0,1.0]|     2.0|            0|
# |[9.0,8.0]|     2.0|            1|
# |[8.0,9.0]|     2.0|            1|
# +---------+--------+-------------+
# In the recorded spark_rapids_ml run the cluster ids are swapped (1,1,0,0) and `weighCol`
# comes first; the rows are still grouped into the same two pairs.

+--------+----------+-------------+
|weighCol|  features|newPrediction|
+--------+----------+-------------+
|     2.0|[0.0, 0.0]|            1|
|     2.0|[1.0, 1.0]|            1|
|     2.0|[9.0, 8.0]|            0|
|     2.0|[8.0, 9.0]|            0|
+--------+----------+-------------+



In [58]:
rows[0].newPrediction == rows[1].newPrediction
# True

True

In [59]:
rows[2].newPrediction == rows[3].newPrediction
# True

True

In [60]:
model.hasSummary
# PySpark reference: True
# The recorded spark_rapids_ml run returns False; the summary cells that follow are
# therefore guarded by `if PYSPARK:`.

False

In [61]:
# The training summary is only inspected on stock PySpark.
if PYSPARK:
    summary = model.summary
    # Expressions nested inside an `if` block are not echoed by the notebook, so print.
    print(summary.k)
    # 2

In [62]:
# Cluster sizes from the training summary (stock PySpark only).
if PYSPARK:
    # Printed explicitly: an expression nested inside an `if` block is not echoed.
    print(summary.clusterSizes)
    # [2, 2]

In [63]:
# Training cost from the training summary (stock PySpark only).
if PYSPARK:
    # Printed explicitly: an expression nested inside an `if` block is not echoed.
    print(summary.trainingCost)
    # 4.0

In [64]:
# Save location for the KMeans estimator; any copy left over from a previous run
# is removed first (a missing directory is ignored).
temp_path = "/tmp"
kmeans_path = f"{temp_path}/kmeans"
shutil.rmtree(kmeans_path, ignore_errors=True)

In [65]:
kmeans.save(kmeans_path)

In [66]:
kmeans2 = KMeans.load(kmeans_path)
kmeans2.getK()
# 2

2

In [67]:
# confirm saved estimator cuml_params
if SPARK_RAPIDS_ML:
    print(kmeans.cuml_params)
    print(kmeans2.cuml_params)

{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'k-means||', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}
{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'k-means||', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}


In [68]:
model_path = temp_path + "/kmeans_model"
shutil.rmtree(model_path, ignore_errors=True)

In [69]:
model.save(model_path)

In [70]:
model2 = KMeansModel.load(model_path)

In [71]:
# confirm saved model cuml_params
if SPARK_RAPIDS_ML:
    print(model.cuml_params)
    print(model2.cuml_params)

{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'k-means||', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}
{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'k-means||', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}


In [72]:
model2.hasSummary
# False

False

In [73]:
model.clusterCenters()[0] == model2.clusterCenters()[0]
# array([ True,  True], dtype=bool)

array([ True,  True])

In [74]:
model.clusterCenters()[1] == model2.clusterCenters()[1]
# array([ True,  True], dtype=bool)

array([ True,  True])

In [75]:
model.transform(df).take(1) == model2.transform(df).take(1)
# True

                                                                                

True

In [76]:
model.transform(df).take(1)
# PySpark reference: [Row(features=DenseVector([0.0, 0.0]), weighCol=2.0, newPrediction=0)]
# In the recorded spark_rapids_ml run `weighCol` comes first, `features` is a plain list,
# and the cluster id is 1 instead of 0.

                                                                                

[Row(weighCol=2.0, features=[0.0, 0.0], newPrediction=1)]

In [77]:
df.take(1)
# [Row(features=DenseVector([0.0, 0.0]), weighCol=2.0)]

[Row(features=DenseVector([0.0, 0.0]), weighCol=2.0)]

## LinearRegression

From: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.regression.LinearRegression.html#pyspark.ml.regression.LinearRegression

In [78]:
PYSPARK = False
SPARK_RAPIDS_ML = not PYSPARK

import shutil

In [79]:
if PYSPARK:
    from pyspark.ml.regression import LinearRegression, LinearRegressionModel
else:
    from spark_rapids_ml.regression import LinearRegression, LinearRegressionModel

from pyspark.ml.linalg import Vectors

In [80]:
# Note: spark_rapids_ml.regression.LinearRegression doesn't support datasets with only one feature, so padding dataset
df = spark.createDataFrame([
    (1.0, 2.0, Vectors.dense(1.0, 0.0)),
    (0.0, 2.0, Vectors.sparse(2, [], []))], ["label", "weight", "features"])

df.show(); df.schema

+-----+------+---------+
|label|weight| features|
+-----+------+---------+
|  1.0|   2.0|[1.0,0.0]|
|  0.0|   2.0|(2,[],[])|
+-----+------+---------+



StructType([StructField('label', DoubleType(), True), StructField('weight', DoubleType(), True), StructField('features', VectorUDT(), True)])

In [81]:
# Build the estimator; the weight column is only passed to stock PySpark.
if PYSPARK:
    lr = LinearRegression(regParam=0.0, solver="normal", weightCol="weight")
else:
    # solver="normal" is value-mapped to the cuML solver "eig" (visible in the
    # cuml_params printed below).
    # `weightCol` is explicitly not supported, so it is omitted here.
    lr = LinearRegression(regParam=0.0, solver="normal")

lr.setMaxIter(5)
lr.getMaxIter()
# 5

5

In [82]:
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)
featuresCol: features column name. (default: features)
featuresCols: features column names for multi-column input. (undefined)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (defau

In [83]:
if SPARK_RAPIDS_ML:
    print(lr.cuml_params)

{'algorithm': 'eig', 'fit_intercept': True, 'normalize': True, 'verbose': False, 'alpha': 0.0, 'solver': 'eig', 'loss': 'squared_loss', 'l1_ratio': 0.0, 'max_iter': 5, 'tol': 1e-06, 'shuffle': True}


In [84]:
lr.setRegParam(0.1)
lr.getRegParam()
# 0.1

0.1

In [85]:
lr.setRegParam(0.0)

LinearRegression_d43ae6912d79

In [86]:
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)
featuresCol: features column name. (default: features)
featuresCols: features column names for multi-column input. (undefined)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (defau

In [87]:
if SPARK_RAPIDS_ML:
    print(lr.cuml_params)

{'algorithm': 'eig', 'fit_intercept': True, 'normalize': True, 'verbose': False, 'alpha': 0.0, 'solver': 'eig', 'loss': 'squared_loss', 'l1_ratio': 0.0, 'max_iter': 5, 'tol': 1e-06, 'shuffle': True}


In [88]:
model = lr.fit(df)

                                                                                

In [89]:
model.setFeaturesCol("features")
model.setPredictionCol("newPrediction")
model.getMaxIter()
# 5

5

In [90]:
model.getMaxBlockSizeInMB()
# 0.0

0.0

In [91]:
# Note: spark_rapids_ml.regression.LinearRegression doesn't support datasets with only one
# feature, so the test vector is padded with a second feature, as the training data was.
test0 = spark.createDataFrame([(Vectors.dense(1.0, 1.0),)], ["features"])

In [92]:
# Single-vector prediction is only exercised on stock PySpark; the next cell performs the
# equivalent check through transform() for both backends.
if PYSPARK:
    print(abs(model.predict(test0.head().features) - (1.0)) < 0.001)
    # True
else:
    # spark_rapids_ml raises:
    # NotImplementedError: 'predict' method is not supported, use 'transform' instead.
    pass

In [93]:
abs(model.transform(test0).head().newPrediction - (1.0)) < 0.001
# True

True

In [94]:
abs(model.coefficients[0] - 1.0) < 0.001
# True

True

In [95]:
model.coefficients
# DenseVector([1.0, 0.0])

DenseVector([1.0, 0.0])

In [96]:
abs(model.intercept - 0.0) < 0.001
# True

True

In [97]:
# Save location for the LinearRegression estimator; any copy left over from a
# previous run is removed first (a missing directory is ignored).
temp_path = "/tmp"
lr_path = f"{temp_path}/lr"
shutil.rmtree(lr_path, ignore_errors=True)

In [98]:
lr.save(lr_path)

In [99]:
lr2 = LinearRegression.load(lr_path)
lr2.getMaxIter()
# 5

5

In [100]:
model_path = temp_path + "/lr_model"
shutil.rmtree(model_path, ignore_errors=True)

In [101]:
model.save(model_path)

In [102]:
model2 = LinearRegressionModel.load(model_path)
model.coefficients[0] == model2.coefficients[0]
# True

True

In [103]:
model.intercept == model2.intercept
# True

True

In [104]:
model.transform(test0).take(1) == model2.transform(test0).take(1)
# True

                                                                                

True

In [105]:
model.numFeatures
# 2

2