# PySpark Compatibility Tests

## PCA
From: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA

In [1]:
# Backend switch for this section: True exercises the stock pyspark.ml classes,
# False exercises the spark_rapids_ml equivalents (see the conditional import
# in the next cell).
PYSPARK = False
# Inverse of PYSPARK; guards the cells that read spark_rapids_ml-only
# attributes such as `cuml_params`.
SPARK_RAPIDS_ML = not PYSPARK

import shutil

In [2]:
# Pick the PCA implementation under test; both branches bind the same two
# names, so the remaining cells are backend-agnostic.
if not PYSPARK:
    from spark_rapids_ml.feature import PCA, PCAModel
else:
    from pyspark.ml.feature import PCA, PCAModel

from pyspark.ml.linalg import Vectors

In [3]:
# Three 5-dimensional feature vectors (one sparse, two dense). Each one is
# wrapped in a 1-tuple so it becomes a single-column row.
data = [
    (vector,)
    for vector in (
        Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),
        Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),
        Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),
    )
]

data

[(SparseVector(5, {1: 1.0, 3: 7.0}),),
 (DenseVector([2.0, 0.0, 3.0, 4.0, 5.0]),),
 (DenseVector([4.0, 0.0, 0.0, 6.0, 7.0]),)]

In [4]:
# NOTE: `spark` is not created anywhere in this notebook; it is assumed to be
# a SparkSession supplied by the kernel/launcher.
df = spark.createDataFrame(data, schema=["features"])
df.show()
df.schema

                                                                                

+--------------------+
|            features|
+--------------------+
| (5,[1,3],[1.0,7.0])|
|[2.0,0.0,3.0,4.0,...|
|[4.0,0.0,0.0,6.0,...|
+--------------------+



StructType([StructField('features', VectorUDT(), True)])

In [5]:
# Default-constructed estimator: list every param before anything is set, as a
# baseline for the explainParams() calls further down.
pca = PCA()
print(pca.explainParams())

inputCol: input column name. (undefined)
inputCols: input column names. (undefined)
k: the number of principal components (undefined)
num_workers: (cuML) number of Spark CuML workers, where each CuML worker corresponds to one Spark task. (default: 1)
outputCol: output column name. (default: PCA_10f6851bdaeb__output)
outputCols: output column names. (undefined)


In [6]:
# Configure via the constructor (k, inputCol) and via a setter (outputCol).
pca = PCA(k=2, inputCol="features")
# Disabled alternative constructor calls. `n_components` is the cuML name that
# `k` maps to (see the _param_mapping() output further down).
# NOTE(review): presumably experiments with passing the cuML-native kwarg
# directly -- confirm whether they are still needed.
# pca = PCA(k=2, inputCol="features", n_components=3)
# pca = PCA(inputCol="features", n_components=3)
pca.setOutputCol("pca_features")

PCA_75286c798362

In [7]:
# inputCol, k and outputCol should now be reported with "current" values.
print(pca.explainParams())

inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
k: the number of principal components (current: 2)
num_workers: (cuML) number of Spark CuML workers, where each CuML worker corresponds to one Spark task. (default: 1)
outputCol: output column name. (default: PCA_75286c798362__output, current: pca_features)
outputCols: output column names. (undefined)


In [8]:
# spark_rapids_ml only: print the Spark-param -> cuML-param name mapping and
# the estimator's cuML parameter dict (n_components is expected to match k).
if SPARK_RAPIDS_ML:
    print(pca._param_mapping())
    print(pca.cuml_params)

{'k': 'n_components'}
{'n_components': 2, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}


In [9]:
# Change k through the setter; the next two cells check that both the Spark
# param and the cuML `n_components` value follow.
pca.setK(3)

PCA_75286c798362

In [10]:
# k should now read "current: 3".
print(pca.explainParams())

inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
k: the number of principal components (current: 3)
num_workers: (cuML) number of Spark CuML workers, where each CuML worker corresponds to one Spark task. (default: 1)
outputCol: output column name. (default: PCA_75286c798362__output, current: pca_features)
outputCols: output column names. (undefined)


In [11]:
# spark_rapids_ml only: n_components should have followed k to 3.
if SPARK_RAPIDS_ML:
    print(pca.cuml_params)

{'n_components': 3, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}


In [12]:
# Restore k=2 so the fit below produces the two components that the expected
# values quoted in the later cells refer to.
pca.setK(2)

PCA_75286c798362

In [13]:
# Fit PCA on the three-row DataFrame built above.
model = pca.fit(df)

                                                                                

In [14]:
# The fitted model should report the estimator's k.
model.getK()
# Expected: 2

2

In [15]:
# Rename the output column on the fitted model (the estimator was configured
# with "pca_features"); later cells read the result through `.output`.
model.setOutputCol("output")

PCAModel_d59102329b2f

In [16]:
# outputCol should now read "current: output".
print(model.explainParams())

inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
k: the number of principal components (current: 2)
num_workers: (cuML) number of Spark CuML workers, where each CuML worker corresponds to one Spark task. (default: 1)
outputCol: output column name. (default: PCA_75286c798362__output, current: output)
outputCols: output column names. (undefined)


In [17]:
# spark_rapids_ml only: cuML params carried by the fitted model.
if SPARK_RAPIDS_ML:
    print(model.cuml_params)

{'n_components': 2, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}


In [18]:
# Project the first row onto the two principal components.
model.transform(df).collect()[0].output
# Expected per the PySpark example: DenseVector([1.648..., -4.013...])
# NOTE: the recorded spark_rapids_ml output is a plain list whose first
# coordinate is negated; the first column of `model.pc` (shown further down)
# is sign-flipped in the same way.

                                                                                

[-1.6485728230896184, -4.013282697765595]

In [19]:
# Explained variance per retained component (the recorded values sum to 1.0).
model.explainedVariance
# Expected: DenseVector([0.794..., 0.205...])

DenseVector([0.7944, 0.2056])

In [20]:
# Principal-components matrix; the DenseMatrix dims are 5 x 2 (5 input
# features, k=2 components).
model.pc
# DenseMatrix(5, 2, [-0.4486, 0.133, -0.1252, 0.2165, -0.8477, -0.2842, -0.0562, 0.7636, -0.5653, -0.1156], 0)
# NOTE: in the recorded output the first five values are negated relative to
# this reference; the last five match.

DenseMatrix(5, 2, [0.4486, -0.133, 0.1252, -0.2165, 0.8477, -0.2842, -0.0562, 0.7636, -0.5653, -0.1156], False)

In [21]:
# Base directory for the save/load round-trips in this section.
temp_path = "/tmp"
# Target path for the persisted PCA estimator.
pcaPath = f"{temp_path}/pca"

In [22]:
# Remove anything left at the path by a previous run so the save below writes
# to a fresh location; a missing directory is ignored.
shutil.rmtree(pcaPath, ignore_errors=True)

In [23]:
# Persist the estimator (not the fitted model) with its current params.
pca.save(pcaPath)

In [24]:
# Reload the estimator and confirm that k survived the round-trip.
loadedPca = PCA.load(pcaPath)
loadedPca.getK() == pca.getK()
# Expected: True

True

In [25]:
# spark_rapids_ml only: the original and the reloaded estimator should print
# identical cuML params.
if SPARK_RAPIDS_ML:
    print(pca.cuml_params)
    print(loadedPca.cuml_params)

{'n_components': 2, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}
{'n_components': 2, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}


In [26]:
# Target path for the persisted PCA model; cleared first so the save below
# writes to a fresh location (a missing directory is ignored).
modelPath = temp_path + "/pca-model"
shutil.rmtree(modelPath, ignore_errors=True)

In [27]:
# Persist the fitted model.
model.save(modelPath)

In [28]:
# Reload the model and compare its principal components with the in-memory one.
loadedModel = PCAModel.load(modelPath)
loadedModel.pc == model.pc
# Expected: True

True

In [29]:
# spark_rapids_ml only: the original and the reloaded model should print
# identical cuML params.
if SPARK_RAPIDS_ML:
    print(model.cuml_params)
    print(loadedModel.cuml_params)

{'n_components': 2, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}
{'n_components': 2, 'svd_solver': 'auto', 'verbose': False, 'whiten': False}


In [30]:
# The explained variance should also survive the save/load round-trip.
loadedModel.explainedVariance == model.explainedVariance
# Expected: True

True

In [31]:
# End-to-end check: both models should produce the same first transformed row.
loadedModel.transform(df).take(1) == model.transform(df).take(1)
# Expected: True

                                                                                

True

## KMeans
From: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans

In [32]:
# Backend switch for the KMeans section (same meaning as in the PCA section):
# True -> pyspark.ml, False -> spark_rapids_ml.
# NOTE(review): presumably repeated here so the section can be run on its own
# -- confirm.
PYSPARK = False
# Inverse of PYSPARK; guards the cells that read spark_rapids_ml-only attributes.
SPARK_RAPIDS_ML = not PYSPARK

import shutil

In [33]:
# Pick the KMeans implementation under test; both branches bind the same two
# names, so the remaining cells are backend-agnostic.
if PYSPARK:
    from pyspark.ml.clustering import KMeans, KMeansModel
else:
    from spark_rapids_ml.clustering import KMeans, KMeansModel

In [34]:
from pyspark.ml.linalg import Vectors

In [35]:
# Four 2-D points, each paired with a constant weight of 2.0.
data = [
    (Vectors.dense(point), 2.0)
    for point in ([0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0])
]

In [36]:
# Build the input as a single partition. The column name "weighCol" (sic) is
# kept as spelled because the estimator setup below refers to it by this name.
# NOTE: `spark` is not created in this notebook; it is assumed to be provided
# by the kernel/launcher.
df = spark.createDataFrame(data, ["features", "weighCol"]).repartition(1)
df.show(); df.schema

+---------+--------+
| features|weighCol|
+---------+--------+
|[0.0,0.0]|     2.0|
|[1.0,1.0]|     2.0|
|[9.0,8.0]|     2.0|
|[8.0,9.0]|     2.0|
+---------+--------+



StructType([StructField('features', VectorUDT(), True), StructField('weighCol', DoubleType(), True)])

In [37]:
# Default-constructed estimator, used to inspect the default params below.
kmeans = KMeans()



In [38]:
# Baseline: every param should be at its default.
print(kmeans.explainParams())

distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. (default: euclidean)
featuresCol: features column name. (default: features)
featuresCols: features column names for multi-column input. (undefined)
initMode: The initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (default: 0.0)
maxIter: max number of iterations (>= 0). (default: 20)
num_workers: (cuML) number of Spark CuML workers, where ea

In [39]:
# spark_rapids_ml only: print the Spark-param -> cuML-param name mapping and
# the default cuML parameter dict.
if SPARK_RAPIDS_ML:
    print(kmeans._param_mapping())
    print(kmeans.cuml_params)

{'distanceMeasure': '', 'k': 'n_clusters', 'initSteps': '', 'maxIter': 'max_iter', 'seed': 'random_state', 'tol': 'tol', 'weightCol': None}
{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1909113551, 'init': 'scalable-k-means++', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}


In [40]:
# Re-create the estimator with an explicit k, a fixed seed and a lower
# iteration cap.
kmeans = KMeans(k=2)
kmeans.setSeed(1)
kmeans.setMaxIter(10)

# The weight column is only configured for stock PySpark.
# NOTE(review): the _param_mapping() output above maps 'weightCol' to None,
# presumably meaning spark_rapids_ml does not support it -- confirm.
if PYSPARK:
    kmeans.setWeightCol("weighCol")



In [41]:
# k and maxIter should now show "current" values (2 and 10).
print(kmeans.explainParams())

distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. (default: euclidean)
featuresCol: features column name. (default: features)
featuresCols: features column names for multi-column input. (undefined)
initMode: The initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 2)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (default: 0.0)
maxIter: max number of iterations (>= 0). (default: 20, current: 10)
num_workers: (cuML) number of Spa

In [42]:
# spark_rapids_ml only: max_iter and random_state should reflect the setters
# above (10 and 1).
if SPARK_RAPIDS_ML:
    print(kmeans.cuml_params)

{'n_clusters': 2, 'max_iter': 10, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'scalable-k-means++', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}


In [43]:
# Read back the iteration cap set above.
kmeans.getMaxIter()
# Expected: 10

10

In [44]:
# Unset the explicit maxIter; the next two cells check that the Spark param
# and the cuML `max_iter` both fall back to the default (20).
kmeans.clear(kmeans.maxIter)

In [45]:
# maxIter should be back to "default: 20" with no "current" value.
print(kmeans.explainParams())

distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. (default: euclidean)
featuresCol: features column name. (default: features)
featuresCols: features column names for multi-column input. (undefined)
initMode: The initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 2)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (default: 0.0)
maxIter: max number of iterations (>= 0). (default: 20)
num_workers: (cuML) number of Spark CuML worke

In [46]:
# spark_rapids_ml only: max_iter should have reverted to 20 while
# random_state keeps the seed set earlier.
if SPARK_RAPIDS_ML:
    print(kmeans.cuml_params)

{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'scalable-k-means++', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}


In [47]:
# Explicitly set featuresCol to the same name as its default ("features").
kmeans.setFeaturesCol("features")

KMeans_01693d3ca150

In [48]:
# Fit KMeans on the four-point, single-partition DataFrame built above.
model = kmeans.fit(df)

[Stage 28:>                                                         (0 + 1) / 1]



                                                                                

In [49]:
# The fitted model should report the default distance measure.
model.getDistanceMeasure()
# Expected: 'euclidean'
# Note: this is not used in spark_rapids_ml (may be implied); the
# _param_mapping() output above maps 'distanceMeasure' to an empty string.

'euclidean'

In [50]:
# Rename the prediction column; later cells read it as `newPrediction`.
model.setPredictionCol("newPrediction")

KMeansModel_415f60cb48d0

In [51]:
# List the fitted model's params.
print(model.explainParams())

distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. (default: euclidean)
featuresCol: features column name. (default: features, current: features)
featuresCols: features column names for multi-column input. (undefined)
initMode: The initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 2)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (default: 0.0)
maxIter: max number of iterations (>= 0). (default: 20)
num_workers: (cuML) number 

In [52]:
# spark_rapids_ml only: cuML params carried by the fitted model.
if SPARK_RAPIDS_ML:
    print(model.cuml_params)

{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'scalable-k-means++', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}


In [53]:
# Single-vector `predict` is exercised for stock PySpark only.
# Under spark_rapids_ml the call raises:
#   NotImplementedError: 'predict' method is not supported, use 'transform' instead.
if PYSPARK:
    # Expected result: 0
    model.predict(df.head().features)

In [54]:
# One center per cluster is expected (k=2).
centers = model.clusterCenters()
len(centers)
# Expected: 2

2

In [55]:
# Show the cluster centers.
centers
# Expected per the PySpark example: [array([0.5, 0.5]), array([8.5, 8.5])]
# NOTE: the recorded spark_rapids_ml output holds the same two centers in the
# opposite order and as plain lists rather than numpy arrays.

[[8.5, 8.5], [0.5, 0.5]]

In [56]:
# Collect the transformed rows; the cells below compare rows[i].newPrediction.
if PYSPARK:
    transformed = model.transform(df).select("features", "newPrediction")
else:
    # The select() used in the PySpark branch failed under spark_rapids_ml when
    # this cell was written:
    # AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `features` cannot be resolved. Did you mean one of the following? [`prediction`].;
    # 'Project ['features, 'newPrediction]
    # +- MapInPandas _transform_udf(weighCol#1, features#29)#35, [prediction#36]
    #    +- Project [weighCol#1, features#29]
    #       +- Project [cuml_values_c3BhcmtjdW1sCg==#26, weighCol#1, UDF(cuml_values_c3BhcmtjdW1sCg==#26) AS features#29]
    #          +- Project [features#0 AS cuml_values_c3BhcmtjdW1sCg==#26, weighCol#1]
    #             +- Repartition 1, true
    #                +- LogicalRDD [features#0, weighCol#1], false
    # NOTE(review): the transform() output recorded in the next cell does
    # contain `features` and `newPrediction`, so this workaround may be stale
    # -- confirm.
    transformed = model.transform(df)

rows = transformed.collect()

In [57]:
# Display the full transform output (this rebinds `transformed` without the
# column selection applied in the PySpark branch of the previous cell).
transformed = model.transform(df)
transformed.show()
# Expected per the PySpark example:
# +---------+--------+-------------+
# | features|weighCol|newPrediction|
# +---------+--------+-------------+
# |[0.0,0.0]|     2.0|            0|
# |[1.0,1.0]|     2.0|            0|
# |[9.0,8.0]|     2.0|            1|
# |[8.0,9.0]|     2.0|            1|
# +---------+--------+-------------+
# NOTE: the recorded spark_rapids_ml output has the columns in a different
# order and the two cluster ids swapped; the grouping of the points is the same.

+--------+----------+-------------+
|weighCol|  features|newPrediction|
+--------+----------+-------------+
|     2.0|[0.0, 0.0]|            1|
|     2.0|[1.0, 1.0]|            1|
|     2.0|[9.0, 8.0]|            0|
|     2.0|[8.0, 9.0]|            0|
+--------+----------+-------------+



In [58]:
# The first two points should land in the same cluster, whatever its id is.
rows[0].newPrediction == rows[1].newPrediction
# Expected: True

True

In [59]:
# The last two points should land in the same cluster as well.
rows[2].newPrediction == rows[3].newPrediction
# Expected: True

True

In [60]:
# Whether the fitted model carries a training summary.
model.hasSummary
# Expected per the PySpark example: True
# NOTE: the recorded spark_rapids_ml output is False; the summary cells below
# are therefore guarded by PYSPARK.

False

In [61]:
# PySpark only: `summary` is bound here and reused by the next two cells.
if PYSPARK:
    summary = model.summary
    summary.k
    # Expected: 2

In [62]:
# PySpark only: relies on `summary` bound in the previous cell.
if PYSPARK:
    summary.clusterSizes
    # Expected: [2, 2]

In [63]:
# PySpark only: relies on `summary` bound two cells earlier.
if PYSPARK:
    summary.trainingCost
    # Expected: 4.0

In [64]:
# Scratch location for the KMeans estimator round-trip; anything left there by
# a previous run is removed first (a missing directory is ignored).
temp_path = "/tmp"
kmeans_path = f"{temp_path}/kmeans"
shutil.rmtree(kmeans_path, ignore_errors=True)

In [65]:
# Persist the estimator (not the fitted model) with its current params.
kmeans.save(kmeans_path)

In [66]:
# Reload the estimator and confirm that k survived the round-trip.
kmeans2 = KMeans.load(kmeans_path)
kmeans2.getK()
# Expected: 2



2

In [67]:
# spark_rapids_ml only: the original and the reloaded estimator should print
# identical cuML params.
if SPARK_RAPIDS_ML:
    print(kmeans.cuml_params)
    print(kmeans2.cuml_params)

{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'scalable-k-means++', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}
{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'scalable-k-means++', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}


In [68]:
# Target path for the persisted KMeans model; cleared first so the save below
# writes to a fresh location (a missing directory is ignored).
model_path = temp_path + "/kmeans_model"
shutil.rmtree(model_path, ignore_errors=True)

In [69]:
# Persist the fitted model.
model.save(model_path)

In [70]:
# Reload the fitted model for the comparisons below.
model2 = KMeansModel.load(model_path)



In [71]:
# spark_rapids_ml only: the original and the reloaded model should print
# identical cuML params.
if SPARK_RAPIDS_ML:
    print(model.cuml_params)
    print(model2.cuml_params)

{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'scalable-k-means++', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}
{'n_clusters': 2, 'max_iter': 20, 'tol': 0.0001, 'verbose': False, 'random_state': 1, 'init': 'scalable-k-means++', 'n_init': 1, 'oversampling_factor': 2.0, 'max_samples_per_batch': 32768}


In [72]:
# A reloaded model is not expected to carry a training summary.
model2.hasSummary
# Expected: False

False

In [73]:
# The first cluster center should survive the save/load round-trip.
model.clusterCenters()[0] == model2.clusterCenters()[0]
# Expected per the PySpark example: array([ True,  True], dtype=bool)
# NOTE: the recorded spark_rapids_ml output is a single True, consistent with
# the centers being displayed as plain lists earlier in this section.

True

In [74]:
# Same round-trip check for the second cluster center.
model.clusterCenters()[1] == model2.clusterCenters()[1]
# Expected per the PySpark example: array([ True,  True], dtype=bool)
# NOTE: the recorded spark_rapids_ml output is a single True.

True

In [75]:
# End-to-end check: both models should produce the same first transformed row.
model.transform(df).take(1) == model2.transform(df).take(1)
# Expected: True

                                                                                

True

In [76]:
# Inspect the first transformed row.
model.transform(df).take(1)
# Expected per the PySpark example:
# [Row(features=DenseVector([0.0, 0.0]), weighCol=2.0, newPrediction=0)]
# NOTE: the recorded spark_rapids_ml row differs in field order, shows
# `features` as a plain list, and carries the other cluster id.

                                                                                

[Row(weighCol=2.0, features=[0.0, 0.0], newPrediction=1)]

In [77]:
# The input DataFrame itself is unchanged by transform(): `features` is still
# a DenseVector here.
df.take(1)
# Expected: [Row(features=DenseVector([0.0, 0.0]), weighCol=2.0)]

[Row(features=DenseVector([0.0, 0.0]), weighCol=2.0)]

## LinearRegression

From: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.regression.LinearRegression.html#pyspark.ml.regression.LinearRegression

In [78]:
# Backend switch for the LinearRegression section (same meaning as in the
# earlier sections): True -> pyspark.ml, False -> spark_rapids_ml.
# NOTE(review): presumably repeated here so the section can be run on its own
# -- confirm.
PYSPARK = False
# Inverse of PYSPARK; guards the cells that read spark_rapids_ml-only attributes.
SPARK_RAPIDS_ML = not PYSPARK

import shutil

In [79]:
# Pick the LinearRegression implementation under test; both branches bind the
# same two names, so the remaining cells are backend-agnostic.
if PYSPARK:
    from pyspark.ml.regression import LinearRegression, LinearRegressionModel
else:
    from spark_rapids_ml.regression import LinearRegression, LinearRegressionModel

from pyspark.ml.linalg import Vectors

In [80]:
# Two-row training set with a 1-dimensional feature vector per row (one dense,
# one empty sparse), plus a label and a weight column.
# NOTE: `spark` is not created in this notebook; it is assumed to be provided
# by the kernel/launcher.
df = spark.createDataFrame(
    data=[
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], [])),
    ],
    schema=["label", "weight", "features"],
)

df.show()
df.schema

+-----+------+---------+
|label|weight| features|
+-----+------+---------+
|  1.0|   2.0|    [1.0]|
|  0.0|   2.0|(1,[],[])|
+-----+------+---------+



StructType([StructField('label', DoubleType(), True), StructField('weight', DoubleType(), True), StructField('features', VectorUDT(), True)])

In [81]:
# Build the estimator; only the PySpark branch passes the weight column.
if PYSPARK:
    lr = LinearRegression(regParam=0.0, solver="normal", weightCol="weight")
else:
    # 'solver: normal' gets value mapped to 'solver: eig'
    # `weightCol` is explicitly not supported
    lr = LinearRegression(regParam=0.0, solver="normal")

# Set and read back the iteration cap.
lr.setMaxIter(5)
lr.getMaxIter()
# Expected: 5



5

In [82]:
# List the estimator's params after construction.
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)
featuresCol: features column name. (default: features)
featuresCols: features column names for multi-column input. (undefined)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (defau

In [83]:
# spark_rapids_ml only: the cuML params should show the mapped solver ('eig')
# and max_iter=5 from the setter above.
if SPARK_RAPIDS_ML:
    print(lr.cuml_params)

{'algorithm': 'eig', 'fit_intercept': True, 'normalize': True, 'verbose': False, 'alpha': 0.0, 'solver': 'eig', 'loss': 'squared_loss', 'l1_ratio': 0.0, 'max_iter': 5, 'tol': 1e-06, 'shuffle': True}


In [84]:
# Set and read back the regularization parameter.
lr.setRegParam(0.1)
lr.getRegParam()
# Expected: 0.1

0.1

In [85]:
# Put regParam back to the 0.0 the estimator was constructed with, before fitting.
lr.setRegParam(0.0)

LinearRegression_c616df9ba5ab

In [86]:
# Re-list the params after the regParam round-trip above.
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)
featuresCol: features column name. (default: features)
featuresCols: features column names for multi-column input. (undefined)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (defau

In [87]:
# spark_rapids_ml only: cuML params after resetting regParam.
# NOTE(review): 'alpha' is presumably the cuML counterpart of regParam -- confirm.
if SPARK_RAPIDS_ML:
    print(lr.cuml_params)

{'algorithm': 'eig', 'fit_intercept': True, 'normalize': True, 'verbose': False, 'alpha': 0.0, 'solver': 'eig', 'loss': 'squared_loss', 'l1_ratio': 0.0, 'max_iter': 5, 'tol': 1e-06, 'shuffle': True}


In [88]:
# Known failure when this notebook was recorded with spark_rapids_ml; fit()
# aborts with the error quoted verbatim below (see the recorded traceback):
# RuntimeError: exception occured! file=/workspace/.conda-bld/work/cpp/src/glm/ols_mg.cu line=78: olsFit: no algorithm with this id has been implemented
# NOTE(review): the cuml_params printed above contain 'algorithm': 'eig';
# whether that is the value the solver rejects is not established here -- confirm.
model = lr.fit(df)

23/02/09 14:41:53 WARN TaskSetManager: Lost task 0.0 in stage 60.0 (TID 164) (192.168.86.223 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/leey/dev/leey/spark-cuml/src/spark_rapids_ml/core.py", line 411, in _train_udf
    logger.info("Cuml fit complete")
  File "/home/leey/dev/leey/spark-cuml/src/spark_rapids_ml/regression.py", line 228, in _linear_regression_fit
    linear_regression.fit(
  File "/home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/internals/api_decorators.py", line 415, in inner
    return func(*args, **kwargs)
  File "base_mg.pyx", line 90, in cuml.linear_model.base_mg.MGFitMixin.fit
  File "/home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/internals/api_decorators.py", line 415, in inner
    return func(*args, **kwargs)
  File "linear_regression_mg.pyx", line 94, in cuml.linear_model.linear_regression_mg.LinearRegressionMG._fit
RuntimeError: exception occured! fi

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Could not recover from a failed barrier ResultStage. Most recent failure reason: Stage failed because barrier task ResultTask(60, 0) finished unsuccessfully.
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/leey/dev/leey/spark-cuml/src/spark_rapids_ml/core.py", line 411, in _train_udf
    logger.info("Cuml fit complete")
  File "/home/leey/dev/leey/spark-cuml/src/spark_rapids_ml/regression.py", line 228, in _linear_regression_fit
    linear_regression.fit(
  File "/home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/internals/api_decorators.py", line 415, in inner
    return func(*args, **kwargs)
  File "base_mg.pyx", line 90, in cuml.linear_model.base_mg.MGFitMixin.fit
  File "/home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/internals/api_decorators.py", line 415, in inner
    return func(*args, **kwargs)
  File "linear_regression_mg.pyx", line 94, in cuml.linear_model.linear_regression_mg.LinearRegressionMG._fit
RuntimeError: exception occured! file=/workspace/.conda-bld/work/cpp/src/glm/ols_mg.cu line=78: olsFit: no algorithm with this id has been implemented
Obtained 61 stack frames
#0 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/common/../../../../libcuml++.so(_ZN4raft9exception18collect_call_stackEv+0x3b) [0x7f9884b1118b]
#1 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/common/../../../../libcuml++.so(_ZN4raft9exceptionC2ENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE+0x61) [0x7f9884b118e1]
#2 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/common/../../../../libcuml++.so(_ZN2ML3OLS3opg8fit_implIdEEvRN4raft8handle_tERSt6vectorIPN8MLCommon6Matrix4DataIT_EESaISC_EERNS8_14PartDescriptorESF_PSA_SI_bbiPP11CUstream_stib+0x324) [0x7f988553e314]
#3 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/common/../../../../libcuml++.so(_ZN2ML3OLS3opg8fit_implIdEEvRN4raft8handle_tERSt6vectorIPN8MLCommon6Matrix4DataIT_EESaISC_EERNS8_14PartDescriptorESF_PSA_SI_bbib+0x138) [0x7f988553ecc8]
#4 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/linear_model/linear_regression_mg.cpython-39-x86_64-linux-gnu.so(+0x1e1b6) [0x7f983c0641b6]
#5 in python3(PyObject_Call+0x157) [0x561eda9c1997]
#6 in python3(_PyEval_EvalFrameDefault+0x407d) [0x561eda9a490d]
#7 in python3(+0x12a8b7) [0x561eda99f8b7]
#8 in python3(+0x14c198) [0x561eda9c1198]
#9 in python3(PyVectorcall_Call+0x87) [0x561eda9c1b77]
#10 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/linear_model/base_mg.cpython-39-x86_64-linux-gnu.so(+0x1728f) [0x7f98341c228f]
#11 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/linear_model/base_mg.cpython-39-x86_64-linux-gnu.so(+0x1daca) [0x7f98341c8aca]
#12 in python3(PyObject_Call+0x157) [0x561eda9c1997]
#13 in python3(_PyEval_EvalFrameDefault+0x407d) [0x561eda9a490d]
#14 in python3(+0x12a8b7) [0x561eda99f8b7]
#15 in python3(+0x14c0ff) [0x561eda9c10ff]
#16 in python3(_PyEval_EvalFrameDefault+0x4c51) [0x561eda9a54e1]
#17 in python3(+0x12a8b7) [0x561eda99f8b7]
#18 in python3(_PyFunction_Vectorcall+0xb9) [0x561eda9b1e09]
#19 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x561eda9a0c4f]
#20 in python3(+0x1538f4) [0x561eda9c88f4]
#21 in python3(+0x18ca23) [0x561edaa01a23]
#22 in python3(+0x18ca23) [0x561edaa01a23]
#23 in python3(_PyEval_EvalFrameDefault+0x932) [0x561eda9a11c2]
#24 in python3(+0x1538f4) [0x561eda9c88f4]
#25 in python3(_PyEval_EvalFrameDefault+0x932) [0x561eda9a11c2]
#26 in python3(+0x1538f4) [0x561eda9c88f4]
#27 in python3(_PyEval_EvalFrameDefault+0x932) [0x561eda9a11c2]
#28 in python3(+0x13d113) [0x561eda9b2113]
#29 in python3(_PyEval_EvalFrameDefault+0x4c51) [0x561eda9a54e1]
#30 in python3(+0x12a8b7) [0x561eda99f8b7]
#31 in python3(_PyFunction_Vectorcall+0xb9) [0x561eda9b1e09]
#32 in python3(_PyEval_EvalFrameDefault+0x66e) [0x561eda9a0efe]
#33 in python3(+0x12a8b7) [0x561eda99f8b7]
#34 in python3(_PyFunction_Vectorcall+0xb9) [0x561eda9b1e09]
#35 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x561eda9a0c4f]
#36 in python3(+0x12a8b7) [0x561eda99f8b7]
#37 in python3(_PyFunction_Vectorcall+0xb9) [0x561eda9b1e09]
#38 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x561eda9a0c4f]
#39 in python3(+0x13d113) [0x561eda9b2113]
#40 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x561eda9a0c4f]
#41 in python3(+0x12a8b7) [0x561eda99f8b7]
#42 in python3(_PyFunction_Vectorcall+0xb9) [0x561eda9b1e09]
#43 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x561eda9a0c4f]
#44 in python3(+0x12a8b7) [0x561eda99f8b7]
#45 in python3(_PyEval_EvalCodeWithName+0x47) [0x561eda99f577]
#46 in python3(PyEval_EvalCodeEx+0x39) [0x561eda99f529]
#47 in python3(PyEval_EvalCode+0x1b) [0x561edaa5acdb]
#48 in python3(+0x1ea76d) [0x561edaa5f76d]
#49 in python3(+0x13d79d) [0x561eda9b279d]
#50 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x561eda9a0c4f]
#51 in python3(+0x12a8b7) [0x561eda99f8b7]
#52 in python3(_PyFunction_Vectorcall+0xb9) [0x561eda9b1e09]
#53 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x561eda9a0c4f]
#54 in python3(+0x12a8b7) [0x561eda99f8b7]
#55 in python3(_PyFunction_Vectorcall+0xb9) [0x561eda9b1e09]
#56 in python3(+0x207a5b) [0x561edaa7ca5b]
#57 in python3(Py_RunMain+0xcc) [0x561edaa7bf9c]
#58 in python3(Py_BytesMain+0x39) [0x561edaa4e979]
#59 in /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7f99517220b3]
#60 in python3(+0x1d9881) [0x561edaa4e881]


	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:554)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:118)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:507)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.hasNext(SerDeUtil.scala:86)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.foreach(SerDeUtil.scala:80)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:320)
	at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:727)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:433)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2079)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:267)

	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2789)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2725)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2724)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2724)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:2162)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2916)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2262)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2283)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2302)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2327)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1019)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1018)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)


In [None]:
# Configure the fitted model's input column and the name of its prediction column.
model.setFeaturesCol("features")
model.setPredictionCol("newPrediction")
# The last expression is what the cell displays.
model.getMaxIter()
# Expected: 5

In [None]:
# Read back the maxBlockSizeInMB param from the fitted model.
model.getMaxBlockSizeInMB()
# Expected: 0.0

In [None]:
# Single-row probe for the fitted model. The value must be -1.0: the checks that use
# test0 expect a prediction of about -1.0 while the model is expected to have
# coefficient ~1.0 and intercept ~0.0, so an input of +1.0 could never satisfy them.
# This also matches the upstream pyspark LinearRegression example.
test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])

In [None]:
# Call predict() directly on the feature vector of test0's first row and check
# that the result is within 0.001 of -1.0.
abs(model.predict(test0.head().features) - (-1.0)) < 0.001
# Expected: True

In [None]:
# Run test0 through transform() and read the "newPrediction" field of the first
# output row; it should be within 0.001 of -1.0.
abs(model.transform(test0).head().newPrediction - (-1.0)) < 0.001
# Expected: True

In [None]:
# The first fitted coefficient should be within 0.001 of 1.0.
abs(model.coefficients[0] - 1.0) < 0.001
# Expected: True

In [None]:
# The fitted intercept should be within 0.001 of 0.0.
abs(model.intercept - 0.0) < 0.001
# Expected: True

In [None]:
# Probe with a sparse vector of size 1 whose index 0 holds 1.0, then check that the
# transformed row's "newPrediction" is within 0.001 of 1.0.
test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
abs(model.transform(test1).head().newPrediction - 1.0) < 0.001
# Expected: True

In [None]:
# Re-point the estimator's featuresCol param at a column named "vector".
lr.setParams(featuresCol="vector")

In [None]:
# Scratch location for the persisted estimator. Any directory left over from a
# previous run is removed first; a missing directory is not an error.
temp_path = "/tmp"
lr_path = f"{temp_path}/lr"
shutil.rmtree(lr_path, ignore_errors=True)

In [None]:
# Persist the estimator `lr` at lr_path.
lr.save(lr_path)

In [None]:
# Reload the estimator from lr_path and display its maxIter param.
lr2 = LinearRegression.load(lr_path)
lr2.getMaxIter()
# Expected: 5

In [None]:
# Scratch location for the persisted fitted model; remove leftovers from earlier runs.
model_path = f"{temp_path}/lr_model"
shutil.rmtree(model_path, ignore_errors=True)

In [None]:
# Persist the fitted model at model_path.
model.save(model_path)

In [None]:
# Reload the fitted model and compare its first coefficient with the in-memory
# model's using exact equality.
model2 = LinearRegressionModel.load(model_path)
model.coefficients[0] == model2.coefficients[0]
# Expected: True

In [None]:
# The reloaded model's intercept should equal the in-memory model's exactly.
model.intercept == model2.intercept
# Expected: True

In [None]:
# Both models should produce the same first output row for test0.
model.transform(test0).take(1) == model2.transform(test0).take(1)
# Expected: True

In [None]:
# Display the number of features reported by the fitted model.
model.numFeatures
# Expected: 1

In [None]:
# Clear the target directory of the PMML export (model_path with a "_2" suffix).
shutil.rmtree(f"{model_path}_2", ignore_errors=True)

In [None]:
# Export the fitted model in "pmml" format next to the native save (suffix "_2").
model.write().format("pmml").save(f"{model_path}_2")

## LinearRegression (custom)

In [89]:
# Backend toggle for this section: PYSPARK=True selects the stock pyspark.ml
# implementation, PYSPARK=False selects spark_rapids_ml (see the conditional import below).
PYSPARK = False
# Always the logical opposite of PYSPARK.
SPARK_RAPIDS_ML = not PYSPARK

import shutil

In [90]:
import numpy as np

from pyspark.ml.functions import array_to_vector
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import array, col

In [91]:
# Bind LinearRegression / LinearRegressionModel to the selected backend; both
# packages expose the classes under the same names.
if PYSPARK:
    from pyspark.ml.regression import LinearRegression, LinearRegressionModel
else:
    from spark_rapids_ml.regression import LinearRegression, LinearRegressionModel

In [92]:
# Feature matrix: 8 samples (rows) x 2 features (columns).
X = np.array([
    [-0.20515826, 1.4940791],
    [0.12167501, 0.7610377],
    [1.4542735, 0.14404356],
    [-0.85409576, 0.3130677],
    [2.2408931, 0.978738],
    [-0.1513572, 0.95008844],
    [-0.9772779, 1.867558],
    [0.41059852, -0.10321885],
])

In [93]:
# Label vector: one target value per row of X.
y = np.array([
    2.0374513,
    22.403986,
    139.4456,
    -76.19584,
    225.72075,
    -0.6784152,
    -65.54835,
    37.30829,
])

In [94]:
# Names of the feature columns and of the label column.
feature_cols = ["c0", "c1"]
label_col = "label_col"
# One-element list holding the DDL schema string (it is comma-joined where the
# DataFrame is created). It is derived from the names above so the schema cannot
# drift from feature_cols / label_col; every column is declared as float.
schema = [", ".join(f"{name} float" for name in [*feature_cols, label_col])]

In [95]:
# Display the column-name settings as a tuple.
feature_cols, label_col

(['c0', 'c1'], 'label_col')

In [96]:
# Display the schema definition.
schema

['c0 float, c1 float, label_col float']

In [97]:
# Append the labels to X as a trailing column and build the Spark DataFrame from
# plain Python rows. reshape(-1, 1) lets NumPy infer the row count from y, so the
# cell keeps working if the number of samples changes.
df = spark.createDataFrame(
    np.concatenate((X, y.reshape(-1, 1)), axis=1).tolist(),
    ",".join(schema),
)

In [98]:
# Print the freshly created DataFrame.
df.show()

                                                                                

+-----------+-----------+----------+
|         c0|         c1| label_col|
+-----------+-----------+----------+
|-0.20515826|  1.4940791| 2.0374513|
| 0.12167501|  0.7610377| 22.403986|
|  1.4542735| 0.14404356|  139.4456|
|-0.85409576|  0.3130677| -76.19584|
|  2.2408931|   0.978738| 225.72075|
| -0.1513572| 0.95008844|-0.6784152|
| -0.9772779|   1.867558| -65.54835|
| 0.41059852|-0.10321885|  37.30829|
+-----------+-----------+----------+



In [99]:
# Pack the individual feature columns into a single array column named
# "features", then drop the original per-feature columns.
df = (
    df.withColumn("features", array(*feature_cols))
    .drop(*feature_cols)
)
df.show()
df.schema

+----------+--------------------+
| label_col|            features|
+----------+--------------------+
| 2.0374513|[-0.20515826, 1.4...|
| 22.403986|[0.12167501, 0.76...|
|  139.4456|[1.4542735, 0.144...|
| -76.19584|[-0.85409576, 0.3...|
| 225.72075|[2.2408931, 0.978...|
|-0.6784152|[-0.1513572, 0.95...|
| -65.54835|[-0.9772779, 1.86...|
|  37.30829|[0.41059852, -0.1...|
+----------+--------------------+



StructType([StructField('label_col', FloatType(), True), StructField('features', ArrayType(FloatType(), True), False)])

In [100]:
if PYSPARK:
    # requires VectorUDT: convert the array column to a vector under a temporary
    # name, drop the array column, then rename the vector column back to "features".
    df = (
        df.withColumn("features_vec", array_to_vector("features"))
        .drop("features")
        .withColumnRenamed("features_vec", "features")
    )
    df.show()
    print(df.schema)

In [101]:
# Create the estimator with default params (the class comes from the backend selected above).
lr = LinearRegression()



In [102]:
# Configure the estimator: input column, no regularization, and the label column.
lr.setFeaturesCol("features")
lr.setRegParam(0.0)
# Last expression: its return value is what the cell displays.
lr.setLabelCol("label_col")

LinearRegression_a13e102e88d6

In [103]:
# Print every param of the estimator with its doc, default and current value.
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)
featuresCol: features column name. (default: features, current: features)
featuresCols: features column names for multi-column input. (undefined)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label_col)
loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on s

In [104]:
# cuml_params is only inspected when the spark_rapids_ml backend is selected.
if SPARK_RAPIDS_ML:
    print(lr.cuml_params)

{'algorithm': 'eig', 'fit_intercept': True, 'normalize': True, 'verbose': False, 'alpha': 0.0, 'solver': 'eig', 'loss': 'squared_loss', 'l1_ratio': 0.0, 'max_iter': 100, 'tol': 1e-06, 'shuffle': True}


In [105]:
# Fit the estimator on df and keep the result as lr_model.
lr_model = lr.fit(df)

[Stage 70:>                                                         (0 + 1) / 1]



                                                                                

In [106]:
# Display the fitted coefficients.
lr_model.coefficients
# Reference values for comparison (presumably from a pyspark.ml run — TODO confirm):
# [94.46689350900762,14.33532962562045]

[94.46691131591797, 14.33534049987793]