# PySpark Compatibility Tests

For testing against the revision just before the package was renamed: [3bb8c8b64d0d10c720882f8a2110c20eaff23907](https://gitlab-master.nvidia.com/nvspark/spark-cuml/-/tree/3bb8c8b64d0d10c720882f8a2110c20eaff23907)

## PCA
From: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA

In [1]:
# Backend toggle for the notebook: True runs the cells against stock
# pyspark.ml, False runs them against the sparkcuml implementations.
PYSPARK = False
# Convenience inverse of PYSPARK.
SPARK_RAPIDS_ML = not PYSPARK

import pandas as pd
import shutil

In [2]:
if PYSPARK:
    from pyspark.ml.feature import PCA, PCAModel
else:
    from sparkcuml.feature import PCA, PCAModel

from pyspark.ml.linalg import Vectors

In [3]:
# Three 5-element input rows for the PCA example.
if PYSPARK:
    data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
            (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
            (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
else:
    # Sparse vectors not supported in spark-rapids-ml, so the first row would be
    # the dense form of Vectors.sparse(5, [(1, 1.0), (3, 7.0)]):
    # data = [(Vectors.dense([0.0, 1.0, 0.0, 7.0, 0.0]),),
    #         (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
    #         (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]

    # Vectors not supported in this revision of code, so the same values are
    # given as plain lists of floats.
    data = [[0.0, 1.0, 0.0, 7.0, 0.0],
            [2.0, 0.0, 3.0, 4.0, 5.0],
            [4.0, 0.0, 0.0, 6.0, 7.0]]

# Trailing expression: displays the selected input.
data

[[0.0, 1.0, 0.0, 7.0, 0.0],
 [2.0, 0.0, 3.0, 4.0, 5.0],
 [4.0, 0.0, 0.0, 6.0, 7.0]]

In [4]:
# Build a Spark DataFrame with a single "features" column from `data`.
if PYSPARK:
    df = spark.createDataFrame(data, ["features"])
else:
    # Each row of `data` is already a plain list of floats, so it can be used
    # directly as one cell of the "features" column; no temporary DataFrame
    # and .values.tolist() round-trip is needed.
    pdf = pd.DataFrame({"features": data})
    df = spark.createDataFrame(pdf)

df.show()
# Trailing expression: displays the resulting schema.
df.schema

                                                                                

+--------------------+
|            features|
+--------------------+
|[0.0, 1.0, 0.0, 7...|
|[2.0, 0.0, 3.0, 4...|
|[4.0, 0.0, 0.0, 6...|
+--------------------+



StructType([StructField('features', ArrayType(DoubleType(), True), True)])

In [5]:
# Request two principal components computed from the "features" column.
if PYSPARK:
    pca = PCA(k=2, inputCol="features")
else:
    # The sparkcuml constructor rejects Spark's 'k' and takes 'n_components' instead:
    # pca = PCA(k=2, inputCol="features")    # ValueError: Unsupported param 'k'.
    pca = PCA(n_components=2, inputCol="features")

In [6]:
pca.setOutputCol("pca_features")

PCA_60986327fc73

In [7]:
# Note: cuML parameters exposed as Spark ML Params
print(pca.explainParams())

inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
n_components: Refer to CUML doc of cuml.decomposition.pca.PCA for this param n_components (default: None, current: 2)
num_workers: The number of Spark CUML workers. Each CUML worker corresponds to one spark task. (default: 1)
outputCol: output column name. (default: PCA_60986327fc73__output, current: pca_features)
outputCols: output column names. (undefined)
svd_solver: Refer to CUML doc of cuml.decomposition.pca.PCA for this param svd_solver (default: auto)
verbose: Refer to CUML doc of cuml.decomposition.pca.PCA for this param verbose (default: False)
whiten: Refer to CUML doc of cuml.decomposition.pca.PCA for this param whiten (default: False)


In [8]:
# setK() is accepted by the sparkcuml estimator; the output below shows that it
# changes the current value of n_components.
pca.setK(3)
print(pca.explainParams())

inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
n_components: Refer to CUML doc of cuml.decomposition.pca.PCA for this param n_components (default: None, current: 3)
num_workers: The number of Spark CUML workers. Each CUML worker corresponds to one spark task. (default: 1)
outputCol: output column name. (default: PCA_60986327fc73__output, current: pca_features)
outputCols: output column names. (undefined)
svd_solver: Refer to CUML doc of cuml.decomposition.pca.PCA for this param svd_solver (default: auto)
verbose: Refer to CUML doc of cuml.decomposition.pca.PCA for this param verbose (default: False)
whiten: Refer to CUML doc of cuml.decomposition.pca.PCA for this param whiten (default: False)


In [9]:
pca.setK(2)
print(pca.explainParams())

inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
n_components: Refer to CUML doc of cuml.decomposition.pca.PCA for this param n_components (default: None, current: 2)
num_workers: The number of Spark CUML workers. Each CUML worker corresponds to one spark task. (default: 1)
outputCol: output column name. (default: PCA_60986327fc73__output, current: pca_features)
outputCols: output column names. (undefined)
svd_solver: Refer to CUML doc of cuml.decomposition.pca.PCA for this param svd_solver (default: auto)
verbose: Refer to CUML doc of cuml.decomposition.pca.PCA for this param verbose (default: False)
whiten: Refer to CUML doc of cuml.decomposition.pca.PCA for this param whiten (default: False)


In [10]:
model = pca.fit(df)

                                                                                

In [11]:
# Spark-only check of the fitted model's k.
# Note (sparkcuml): no getter for 'n_components', hence no else branch.
if PYSPARK:
    # Printed explicitly: a bare expression nested inside an `if` block is not
    # echoed by the notebook.
    print(model.getK())
    # 2

In [12]:
model.setOutputCol("output")

PCAModel_64bd8de451d8

In [13]:
print(model.explainParams())

inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
n_components: Refer to CUML doc of cuml.decomposition.pca.PCA for this param n_components (default: None, current: 2)
outputCol: output column name. (default: PCA_60986327fc73__output, current: output)
outputCols: output column names. (undefined)
svd_solver: Refer to CUML doc of cuml.decomposition.pca.PCA for this param svd_solver (default: auto, current: auto)
verbose: Refer to CUML doc of cuml.decomposition.pca.PCA for this param verbose (default: False, current: False)
whiten: Refer to CUML doc of cuml.decomposition.pca.PCA for this param whiten (default: False, current: False)


In [14]:
# First transformed row, read from the column set via setOutputCol("output").
model.transform(df).collect()[0].output
# Spark docs show: DenseVector([1.648..., -4.013...])
# Note: the result is different because cuML applies implicit mean normalization, whereas Spark does not

                                                                                

[-4.790376837878261, -0.5239389022984442]

In [15]:
# The attribute is spelled differently by the two libraries.
if PYSPARK:
    print(model.explainedVariance)
else:
    print(model.explained_variance)

# Spark docs show: DenseVector([0.794..., 0.205...])
# NOTE(review): the sparkcuml values printed below are on a different scale;
# 18.006 / (18.006 + 4.660) is about 0.794, so they look like unnormalized
# variances rather than ratios -- confirm.

[18.00624707305587, 4.660419593610799]


In [16]:
# Principal components of the fitted model.
model.pc
# Spark docs show: DenseMatrix(5, 2, [-0.4486, 0.133, -0.1252, 0.2165, -0.8477, -0.2842, -0.0562, 0.7636, -0.5653, -0.1156], 0)
# In the sparkcuml output below the components appear as two lists of five
# values, and the first component has the opposite sign (0.4486 vs -0.4486).

[[0.4485917207506905,
  -0.133019857453954,
  0.12523156359767595,
  -0.21650756651938066,
  0.847651293112682],
 [-0.28423808042763415,
  -0.056211552389868726,
  0.7636264781646386,
  -0.565295877910818,
  -0.11560340512131573]]

In [17]:
# Scratch location used for the save/load round-trip of the PCA estimator.
temp_path = "/tmp"
pcaPath = f"{temp_path}/pca"

In [18]:
shutil.rmtree(pcaPath, ignore_errors=True)

In [19]:
pca.save(pcaPath)

In [20]:
# Reload the estimator that was saved to pcaPath.
loadedPca = PCA.load(pcaPath)

if PYSPARK:
    # Printed explicitly: a bare comparison nested inside an `if` block is not
    # echoed by the notebook.
    print(loadedPca.getK() == pca.getK())
    # True

In [21]:
modelPath = temp_path + "/pca-model"
shutil.rmtree(modelPath, ignore_errors=True)

In [22]:
model.save(modelPath)

In [23]:
loadedModel = PCAModel.load(modelPath)
loadedModel.pc == model.pc
# True

True

In [24]:
if PYSPARK:
    print(loadedModel.explainedVariance == model.explainedVariance)
    # True
else:
    print(loadedModel.explained_variance == model.explained_variance)

True


In [25]:
loadedModel.transform(df).take(1) == model.transform(df).take(1)
# True

                                                                                

True

## KMeans
From: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans

In [26]:
if PYSPARK:
    from pyspark.ml.clustering import KMeans, KMeansModel
else:
    from sparkcuml.clustering import KMeans, KMeansModel

In [27]:
from pyspark.ml.linalg import Vectors

In [28]:
# Four 2-D points, each paired with a weight of 2.0. The weight column is
# spelled "weighCol" in both branches.
if PYSPARK:
    data = [(Vectors.dense([0.0, 0.0]), 2.0), (Vectors.dense([1.0, 1.0]), 2.0),
            (Vectors.dense([9.0, 8.0]), 2.0), (Vectors.dense([8.0, 9.0]), 2.0)]
    df = spark.createDataFrame(data, ["features", "weighCol"])
else:
    # Vectors not supported in this revision of code; same points as plain lists
    data = [([0.0, 0.0], 2.0),
            ([1.0, 1.0], 2.0),
            ([9.0, 8.0], 2.0),
            ([8.0, 9.0], 2.0)]
    pdf = pd.DataFrame(data)
    # NOTE(review): repartition(1) presumably matches the single worker
    # (num_workers default: 1) -- confirm.
    df = spark.createDataFrame(pdf, ["features", "weighCol"]).repartition(1)

df.show(); df.schema

+----------+--------+
|  features|weighCol|
+----------+--------+
|[0.0, 0.0]|     2.0|
|[1.0, 1.0]|     2.0|
|[9.0, 8.0]|     2.0|
|[8.0, 9.0]|     2.0|
+----------+--------+



StructType([StructField('features', ArrayType(DoubleType(), True), True), StructField('weighCol', DoubleType(), True)])

In [29]:
# Two clusters, fixed seed, at most 10 iterations.
if PYSPARK:
    kmeans = KMeans(k=2)
    kmeans.setSeed(1)
    kmeans.setMaxIter(10)
    kmeans.setWeightCol("weighCol")
else:
    # cuML-style parameter names are passed to the constructor. Unlike the Spark
    # branch, no weight column is configured here.
    kmeans = KMeans(n_clusters=2, random_state=1, max_iter=10, inputCol="features", outputCol="prediction")

In [30]:
print(kmeans.explainParams())

init: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param init (default: scalable-k-means++)
inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
max_iter: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param max_iter (default: 300, current: 10)
max_samples_per_batch: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param max_samples_per_batch (default: 32768)
n_clusters: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param n_clusters (default: 8, current: 2)
n_init: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param n_init (default: 1)
num_workers: The number of Spark CUML workers. Each CUML worker corresponds to one spark task. (default: 1)
outputCol: output column name. (default: KMeans_4d99fa7d922c__output, current: prediction)
outputCols: output column names. (undefined)
oversampling_factor: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param oversampling_factor (default: 2.0)
rand

In [31]:
# Read back the configured iteration limit (Spark only).
if PYSPARK:
    # Printed explicitly: a bare expression nested inside an `if` block is not
    # echoed by the notebook.
    print(kmeans.getMaxIter())
    # 10
else:
    # AttributeError: 'KMeans' object has no attribute 'getMaxIter'
    pass

In [32]:
if PYSPARK:
    kmeans.clear(kmeans.maxIter)
else:
    # AttributeError: 'KMeans' object has no attribute 'maxIter'
    pass

In [33]:
print(kmeans.explainParams())

init: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param init (default: scalable-k-means++)
inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
max_iter: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param max_iter (default: 300, current: 10)
max_samples_per_batch: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param max_samples_per_batch (default: 32768)
n_clusters: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param n_clusters (default: 8, current: 2)
n_init: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param n_init (default: 1)
num_workers: The number of Spark CUML workers. Each CUML worker corresponds to one spark task. (default: 1)
outputCol: output column name. (default: KMeans_4d99fa7d922c__output, current: prediction)
outputCols: output column names. (undefined)
oversampling_factor: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param oversampling_factor (default: 2.0)
rand

In [34]:
model = kmeans.fit(df)

                                                                                

In [35]:
# Distance measure of the fitted model (Spark only).
if PYSPARK:
    # Printed explicitly: a bare expression nested inside an `if` block is not
    # echoed by the notebook.
    print(model.getDistanceMeasure())
    # euclidean
else:
    # AttributeError: 'KMeansModel' object has no attribute 'getDistanceMeasure'
    pass

In [36]:
model.setPredictionCol("newPrediction")

KMeansModel_acb1db2605b1

In [37]:
print(model.explainParams())

init: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param init (default: scalable-k-means++, current: scalable-k-means++)
inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
max_iter: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param max_iter (default: 300, current: 10)
max_samples_per_batch: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param max_samples_per_batch (default: 32768, current: 32768)
n_clusters: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param n_clusters (default: 8, current: 2)
n_init: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param n_init (default: 1, current: 1)
outputCol: output column name. (default: KMeans_4d99fa7d922c__output, current: newPrediction)
outputCols: output column names. (undefined)
oversampling_factor: Refer to CUML doc of cuml.cluster.kmeans.KMeans for this param oversampling_factor (default: 2.0, current: 2.0)
random_state: Refer to CUML doc of cum

In [38]:
# Single-vector prediction for the first row (Spark only).
if PYSPARK:
    # Printed explicitly: a bare expression nested inside an `if` block is not
    # echoed by the notebook.
    print(model.predict(df.head().features))
    # 0
else:
    # AttributeError: 'KMeansModel' object has no attribute 'predict'
    pass

In [39]:
if PYSPARK:
    centers = model.clusterCenters()
else:
    centers = model.cluster_centers_

len(centers)
# 2

2

In [40]:
centers
# Spark docs show: [array([0.5, 0.5]), array([8.5, 8.5])]
# The sparkcuml output below holds the same two centers as nested lists, in the
# opposite order.

[[8.5, 8.5], [0.5, 0.5]]

In [41]:
# Assign every input row to a cluster.
if PYSPARK:
    transformed = model.transform(df).select("features", "newPrediction")
else:
    # No select() here: per the output below, the sparkcuml transform result
    # only contains the "newPrediction" column.
    transformed = model.transform(df)

# Keep the collected rows for the pairwise checks in the following cells.
rows = transformed.collect()
transformed.show()

+-------------+
|newPrediction|
+-------------+
|            1|
|            1|
|            0|
|            0|
+-------------+



In [42]:
rows[0].newPrediction == rows[1].newPrediction
# True

True

In [43]:
rows[2].newPrediction == rows[3].newPrediction
# True

True

In [44]:
# Training summary availability (Spark only).
if PYSPARK:
    # Printed explicitly: a bare expression nested inside an `if` block is not
    # echoed by the notebook.
    print(model.hasSummary)
    # True

In [45]:
# Number of clusters reported by the training summary (Spark only).
if PYSPARK:
    summary = model.summary
    # Printed explicitly: a bare expression nested inside an `if` block is not
    # echoed by the notebook.
    print(summary.k)
    # 2

In [46]:
# Cluster sizes reported by the training summary (Spark only).
if PYSPARK:
    # Printed explicitly: a bare expression nested inside an `if` block is not
    # echoed by the notebook.
    print(summary.clusterSizes)
    # [2, 2]

In [47]:
# Training cost reported by the training summary (Spark only).
if PYSPARK:
    # Printed explicitly: a bare expression nested inside an `if` block is not
    # echoed by the notebook.
    print(summary.trainingCost)
    # 4.0

In [48]:
temp_path = "/tmp"
kmeans_path = temp_path + "/kmeans"
shutil.rmtree(kmeans_path, ignore_errors=True)

In [49]:
kmeans.save(kmeans_path)

In [50]:
# Reload the estimator that was saved to kmeans_path.
kmeans2 = KMeans.load(kmeans_path)

if PYSPARK:
    # Printed explicitly: a bare expression nested inside an `if` block is not
    # echoed by the notebook.
    print(kmeans2.getK())
    # 2
else:
    # AttributeError: 'KMeans' object has no attribute 'getK'
    pass

In [51]:
model_path = temp_path + "/kmeans_model"
shutil.rmtree(model_path, ignore_errors=True)

In [52]:
model.save(model_path)

In [53]:
model2 = KMeansModel.load(model_path)

In [54]:
# A reloaded model is expected to carry no training summary (Spark only).
if PYSPARK:
    # Printed explicitly: a bare expression nested inside an `if` block is not
    # echoed by the notebook.
    print(model2.hasSummary)
    # False

In [55]:
# First cluster center must survive the save/load round-trip.
# Both branches print, so the check is visible whichever backend is selected.
if PYSPARK:
    print(model.clusterCenters()[0] == model2.clusterCenters()[0])
    # [ True  True]
else:
    print(model.cluster_centers_[0] == model2.cluster_centers_[0])

True


In [56]:
# Second cluster center must survive the save/load round-trip.
# Both branches print, so the check is visible whichever backend is selected.
if PYSPARK:
    print(model.clusterCenters()[1] == model2.clusterCenters()[1])
    # [ True  True]
else:
    print(model.cluster_centers_[1] == model2.cluster_centers_[1])

True


In [57]:
model.transform(df).take(1) == model2.transform(df).take(1)
# True

                                                                                

True

In [58]:
model.transform(df).take(1)
# Spark docs show: [Row(features=DenseVector([0.0, 0.0]), weighCol=2.0, newPrediction=0)]
# The sparkcuml row below carries only newPrediction, and the first row's
# cluster label is 1 rather than 0.

                                                                                

[Row(newPrediction=1)]

## LinearRegression

From: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.regression.LinearRegression.html#pyspark.ml.regression.LinearRegression

In [59]:
PYSPARK = False
SPARK_RAPIDS_ML = not PYSPARK

import pandas as pd
import shutil

In [60]:
if PYSPARK:
    from pyspark.ml.regression import LinearRegression, LinearRegressionModel
else:
    from sparkcuml.regression import LinearRegression, LinearRegressionModel

from pyspark.ml.linalg import Vectors

In [61]:
if PYSPARK:
    df = spark.createDataFrame([
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"])
else:
    # Vectors not supported in this revision of code
    data = [(1.0, 2.0, [1.0]),
            (0.0, 2.0, [0.0])]
    pdf = pd.DataFrame(data)
    df = spark.createDataFrame(pdf, ["label", "weight", "features"]).repartition(1)

df.show(); df.schema

+-----+------+--------+
|label|weight|features|
+-----+------+--------+
|  1.0|   2.0|   [1.0]|
|  0.0|   2.0|   [0.0]|
+-----+------+--------+



StructType([StructField('label', DoubleType(), True), StructField('weight', DoubleType(), True), StructField('features', ArrayType(DoubleType(), True), True)])

In [62]:
if PYSPARK:
    lr = LinearRegression(regParam=0.0, solver="normal", weightCol="weight")
else:
    # need to use cuML 'solver' options here
    lr = LinearRegression(regParam=0.0, solver="eig", max_iter=5, inputCol="features")

In [63]:
# Set and read back the iteration limit (Spark only).
if PYSPARK:
    lr.setMaxIter(5)
    # Printed explicitly: a bare expression nested inside an `if` block is not
    # echoed by the notebook.
    print(lr.getMaxIter())
    # 5
else:
    # Note: no cuML parameter setters/getters
    pass

In [64]:
print(lr.explainParams())

algorithm: Refer to CUML doc of cuml.linear_model.linear_regression.LinearRegression, cuml.linear_model.ridge.Ridge, cuml.solvers.cd.CD for this param algorithm (default: eig)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
fit_intercept: Refer to CUML doc of cuml.linear_model.linear_regression.LinearRegression, cuml.linear_model.ridge.Ridge, cuml.solvers.cd.CD for this param fit_intercept (default: True)
inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
labelCol: label column name. (default: label)
loss: Refer to CUML doc of cuml.linear_model.linear_regression.LinearRegression, cuml.linear_model.ridge.Ridge, cuml.solvers.cd.CD for this param loss (default: squared_loss)
max_iter: Refer to CUML doc of cuml.linear_model.linear_regression.LinearRegression, cuml.linear_model.ridge.Ridge, cuml.solvers.cd.CD for this param max_iter (def

In [65]:
lr.setRegParam(0.1)
lr.getRegParam()
# 0.1

0.1

In [66]:
lr.setRegParam(0.0)

LinearRegression_d12dbeb95e6a

In [67]:
print(lr.explainParams())

algorithm: Refer to CUML doc of cuml.linear_model.linear_regression.LinearRegression, cuml.linear_model.ridge.Ridge, cuml.solvers.cd.CD for this param algorithm (default: eig)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
fit_intercept: Refer to CUML doc of cuml.linear_model.linear_regression.LinearRegression, cuml.linear_model.ridge.Ridge, cuml.solvers.cd.CD for this param fit_intercept (default: True)
inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
labelCol: label column name. (default: label)
loss: Refer to CUML doc of cuml.linear_model.linear_regression.LinearRegression, cuml.linear_model.ridge.Ridge, cuml.solvers.cd.CD for this param loss (default: squared_loss)
max_iter: Refer to CUML doc of cuml.linear_model.linear_regression.LinearRegression, cuml.linear_model.ridge.Ridge, cuml.solvers.cd.CD for this param max_iter (def

In [68]:
# NOTE(review): with PYSPARK = False this fit fails (traceback below: "olsFit:
# no algorithm with this id has been implemented"). `model` therefore keeps its
# previous binding, and the remaining cells of this section were never executed
# (they are marked In [None]).
# The two-feature fit in the Scratch section succeeds; among other differences,
# the single-feature input used here may be what the cuML multi-GPU OLS solver
# rejects -- confirm.
model = lr.fit(df)

23/02/01 13:18:32 WARN TaskSetManager: Lost task 0.0 in stage 59.0 (TID 168) (192.168.86.223 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/leey/dev/nvspark/spark-cuml/src/sparkcuml/core.py", line 447, in _train_udf
    logger.info("Cuml fit complete")
  File "/home/leey/dev/nvspark/spark-cuml/src/sparkcuml/regression.py", line 167, in _linear_regression_fit
    linear_regression.fit(
  File "/home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/internals/api_decorators.py", line 415, in inner
    return func(*args, **kwargs)
  File "base_mg.pyx", line 90, in cuml.linear_model.base_mg.MGFitMixin.fit
  File "/home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/internals/api_decorators.py", line 415, in inner
    return func(*args, **kwargs)
  File "linear_regression_mg.pyx", line 94, in cuml.linear_model.linear_regression_mg.LinearRegressionMG._fit
RuntimeError: exception occured! file=/wo

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Could not recover from a failed barrier ResultStage. Most recent failure reason: Stage failed because barrier task ResultTask(59, 0) finished unsuccessfully.
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/leey/dev/nvspark/spark-cuml/src/sparkcuml/core.py", line 447, in _train_udf
    logger.info("Cuml fit complete")
  File "/home/leey/dev/nvspark/spark-cuml/src/sparkcuml/regression.py", line 167, in _linear_regression_fit
    linear_regression.fit(
  File "/home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/internals/api_decorators.py", line 415, in inner
    return func(*args, **kwargs)
  File "base_mg.pyx", line 90, in cuml.linear_model.base_mg.MGFitMixin.fit
  File "/home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/internals/api_decorators.py", line 415, in inner
    return func(*args, **kwargs)
  File "linear_regression_mg.pyx", line 94, in cuml.linear_model.linear_regression_mg.LinearRegressionMG._fit
RuntimeError: exception occured! file=/workspace/.conda-bld/work/cpp/src/glm/ols_mg.cu line=78: olsFit: no algorithm with this id has been implemented
Obtained 61 stack frames
#0 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/common/../../../../libcuml++.so(_ZN4raft9exception18collect_call_stackEv+0x3b) [0x7f1589b9218b]
#1 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/common/../../../../libcuml++.so(_ZN4raft9exceptionC2ENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE+0x61) [0x7f1589b928e1]
#2 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/common/../../../../libcuml++.so(_ZN2ML3OLS3opg8fit_implIdEEvRN4raft8handle_tERSt6vectorIPN8MLCommon6Matrix4DataIT_EESaISC_EERNS8_14PartDescriptorESF_PSA_SI_bbiPP11CUstream_stib+0x324) [0x7f158a5bf314]
#3 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/common/../../../../libcuml++.so(_ZN2ML3OLS3opg8fit_implIdEEvRN4raft8handle_tERSt6vectorIPN8MLCommon6Matrix4DataIT_EESaISC_EERNS8_14PartDescriptorESF_PSA_SI_bbib+0x138) [0x7f158a5bfcc8]
#4 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/linear_model/linear_regression_mg.cpython-39-x86_64-linux-gnu.so(+0x1e1b6) [0x7f15403101b6]
#5 in python3(PyObject_Call+0x157) [0x5627e9e63997]
#6 in python3(_PyEval_EvalFrameDefault+0x407d) [0x5627e9e4690d]
#7 in python3(+0x12a8b7) [0x5627e9e418b7]
#8 in python3(+0x14c198) [0x5627e9e63198]
#9 in python3(PyVectorcall_Call+0x87) [0x5627e9e63b77]
#10 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/linear_model/base_mg.cpython-39-x86_64-linux-gnu.so(+0x1728f) [0x7f15402b828f]
#11 in /home/leey/miniconda3/envs/rapids-22.10/lib/python3.9/site-packages/cuml/linear_model/base_mg.cpython-39-x86_64-linux-gnu.so(+0x1daca) [0x7f15402beaca]
#12 in python3(PyObject_Call+0x157) [0x5627e9e63997]
#13 in python3(_PyEval_EvalFrameDefault+0x407d) [0x5627e9e4690d]
#14 in python3(+0x12a8b7) [0x5627e9e418b7]
#15 in python3(+0x14c0ff) [0x5627e9e630ff]
#16 in python3(_PyEval_EvalFrameDefault+0x4c51) [0x5627e9e474e1]
#17 in python3(+0x12a8b7) [0x5627e9e418b7]
#18 in python3(_PyFunction_Vectorcall+0xb9) [0x5627e9e53e09]
#19 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x5627e9e42c4f]
#20 in python3(+0x1538f4) [0x5627e9e6a8f4]
#21 in python3(+0x18ca23) [0x5627e9ea3a23]
#22 in python3(+0x18ca23) [0x5627e9ea3a23]
#23 in python3(_PyEval_EvalFrameDefault+0x932) [0x5627e9e431c2]
#24 in python3(+0x1538f4) [0x5627e9e6a8f4]
#25 in python3(_PyEval_EvalFrameDefault+0x932) [0x5627e9e431c2]
#26 in python3(+0x1538f4) [0x5627e9e6a8f4]
#27 in python3(_PyEval_EvalFrameDefault+0x932) [0x5627e9e431c2]
#28 in python3(+0x13d113) [0x5627e9e54113]
#29 in python3(_PyEval_EvalFrameDefault+0x4c51) [0x5627e9e474e1]
#30 in python3(+0x12a8b7) [0x5627e9e418b7]
#31 in python3(_PyFunction_Vectorcall+0xb9) [0x5627e9e53e09]
#32 in python3(_PyEval_EvalFrameDefault+0x66e) [0x5627e9e42efe]
#33 in python3(+0x12a8b7) [0x5627e9e418b7]
#34 in python3(_PyFunction_Vectorcall+0xb9) [0x5627e9e53e09]
#35 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x5627e9e42c4f]
#36 in python3(+0x12a8b7) [0x5627e9e418b7]
#37 in python3(_PyFunction_Vectorcall+0xb9) [0x5627e9e53e09]
#38 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x5627e9e42c4f]
#39 in python3(+0x13d113) [0x5627e9e54113]
#40 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x5627e9e42c4f]
#41 in python3(+0x12a8b7) [0x5627e9e418b7]
#42 in python3(_PyFunction_Vectorcall+0xb9) [0x5627e9e53e09]
#43 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x5627e9e42c4f]
#44 in python3(+0x12a8b7) [0x5627e9e418b7]
#45 in python3(_PyEval_EvalCodeWithName+0x47) [0x5627e9e41577]
#46 in python3(PyEval_EvalCodeEx+0x39) [0x5627e9e41529]
#47 in python3(PyEval_EvalCode+0x1b) [0x5627e9efccdb]
#48 in python3(+0x1ea76d) [0x5627e9f0176d]
#49 in python3(+0x13d79d) [0x5627e9e5479d]
#50 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x5627e9e42c4f]
#51 in python3(+0x12a8b7) [0x5627e9e418b7]
#52 in python3(_PyFunction_Vectorcall+0xb9) [0x5627e9e53e09]
#53 in python3(_PyEval_EvalFrameDefault+0x3bf) [0x5627e9e42c4f]
#54 in python3(+0x12a8b7) [0x5627e9e418b7]
#55 in python3(_PyFunction_Vectorcall+0xb9) [0x5627e9e53e09]
#56 in python3(+0x207a5b) [0x5627e9f1ea5b]
#57 in python3(Py_RunMain+0xcc) [0x5627e9f1df9c]
#58 in python3(Py_BytesMain+0x39) [0x5627e9ef0979]
#59 in /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7f16567900b3]
#60 in python3(+0x1d9881) [0x5627e9ef0881]


	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:554)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:118)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:507)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.hasNext(SerDeUtil.scala:86)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.foreach(SerDeUtil.scala:80)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:320)
	at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:727)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:433)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2079)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:267)

	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2789)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2725)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2724)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2724)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:2162)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2916)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2262)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2283)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2302)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2327)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1019)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1018)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)


In [None]:
# Configure the fitted model's columns and read back maxIter (Spark only).
if PYSPARK:
    model.setFeaturesCol("features")
    model.setPredictionCol("newPrediction")
    # Printed explicitly: a bare expression nested inside an `if` block is not
    # echoed by the notebook.
    print(model.getMaxIter())
    # 5

In [None]:
model.getMaxBlockSizeInMB()
# 0.0

In [None]:
# Single-row test input with feature value -1.0.
# The following cells expect a prediction of -1.0 while the fitted model is
# checked to have coefficient ~1.0 and intercept ~0.0, which only holds for an
# input of -1.0 (an input of 1.0 would predict 1.0, as the test1 cell expects).
if PYSPARK:
    test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
else:
    # Vectors are not used with sparkcuml in this notebook; pass a plain list.
    test0 = spark.createDataFrame([([-1.0],)], ["features"])

test0.show()

In [None]:
abs(model.predict(test0.head().features) - (-1.0)) < 0.001
# True

In [None]:
# NOTE(review): "newPrediction" is only configured through setPredictionCol()
# inside an `if PYSPARK:` cell earlier in this section, so with PYSPARK = False
# this attribute lookup presumably fails -- confirm.
abs(model.transform(test0).head().newPrediction - (-1.0)) < 0.001
# True

In [None]:
abs(model.coefficients[0] - 1.0) < 0.001
# True

In [None]:
abs(model.intercept - 0.0) < 0.001
# True

In [None]:
# Single-row test input with feature value 1.0; the expected prediction is 1.0.
if PYSPARK:
    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
else:
    # Sparse vectors are noted earlier in this notebook as unsupported in
    # spark-rapids-ml, so the same value is passed as a plain list, mirroring
    # how test0 is built for this backend.
    test1 = spark.createDataFrame([([1.0],)], ["features"])
abs(model.transform(test1).head().newPrediction - 1.0) < 0.001
# True

In [None]:
lr.setParams(featuresCol="vector")

In [None]:
temp_path = "/tmp"
lr_path = temp_path + "/lr"
shutil.rmtree(lr_path, ignore_errors=True)

In [None]:
lr.save(lr_path)

In [None]:
lr2 = LinearRegression.load(lr_path)
lr2.getMaxIter()
# 5

In [None]:
model_path = temp_path + "/lr_model"
shutil.rmtree(model_path, ignore_errors=True)

In [None]:
model.save(model_path)

In [None]:
model2 = LinearRegressionModel.load(model_path)
model.coefficients[0] == model2.coefficients[0]
# True

In [None]:
model.intercept == model2.intercept
# True

In [None]:
model.transform(test0).take(1) == model2.transform(test0).take(1)
# True

In [None]:
model.numFeatures
# 1

In [None]:
shutil.rmtree(model_path + "_2", ignore_errors=True)

In [None]:
model.write().format("pmml").save(model_path + "_2")

# Scratch

In [69]:
import numpy as np

from pyspark.sql.functions import array
from sparkcuml.regression import LinearRegression, LinearRegressionModel

In [70]:
X = np.array(
    [[-0.20515826,  1.4940791 ],
     [ 0.12167501,  0.7610377 ],
     [ 1.4542735,   0.14404356],
     [-0.85409576,  0.3130677 ],
     [ 2.2408931,   0.978738  ],
     [-0.1513572,   0.95008844],
     [-0.9772779,   1.867558  ],
     [ 0.41059852, -0.10321885]]
)

In [71]:
y = np.array([2.0374513, 22.403986, 139.4456, -76.19584, 225.72075, -0.6784152, -65.54835, 37.30829])

In [72]:
m, n = X.shape
m, n

(8, 2)

In [73]:
# One generated column name per feature (c0, c1, ...), plus the label column.
feature_cols = [f"c{i}" for i in range(n)]
label_col = "label_col"

# "<name> float" entries for every column, label last. The label entry is
# derived from `label_col` so the name is defined in exactly one place.
schema = [f"{c} float" for c in (*feature_cols, label_col)]

In [74]:
feature_cols, label_col

(['c0', 'c1'], 'label_col')

In [75]:
schema

['c0 float', 'c1 float', 'label_col float']

In [76]:
# Append the label vector y as the last column next to the features in X, hand
# Spark plain Python rows, and describe the columns with the comma-joined
# `schema` entries.
df = spark.createDataFrame(
    np.column_stack((X, y)).tolist(),
    schema=",".join(schema),
)

In [77]:
df.show()

                                                                                

+-----------+-----------+----------+
|         c0|         c1| label_col|
+-----------+-----------+----------+
|-0.20515826|  1.4940791| 2.0374513|
| 0.12167501|  0.7610377| 22.403986|
|  1.4542735| 0.14404356|  139.4456|
|-0.85409576|  0.3130677| -76.19584|
|  2.2408931|   0.978738| 225.72075|
| -0.1513572| 0.95008844|-0.6784152|
| -0.9772779|   1.867558| -65.54835|
| 0.41059852|-0.10321885|  37.30829|
+-----------+-----------+----------+



In [78]:
# Pack the per-feature columns into a single array column named "features" and
# drop the originals; the label column is kept (see the output of the next cell).
df = df.withColumn("features", array(*feature_cols)).drop(*feature_cols)

In [79]:
df.show()

+----------+--------------------+
| label_col|            features|
+----------+--------------------+
| 2.0374513|[-0.20515826, 1.4...|
| 22.403986|[0.12167501, 0.76...|
|  139.4456|[1.4542735, 0.144...|
| -76.19584|[-0.85409576, 0.3...|
| 225.72075|[2.2408931, 0.978...|
|-0.6784152|[-0.1513572, 0.95...|
| -65.54835|[-0.9772779, 1.86...|
|  37.30829|[0.41059852, -0.1...|
+----------+--------------------+



In [80]:
lr = LinearRegression()

In [81]:
lr.setRegParam(0.0)
lr.setFeaturesCol("features")
lr.setLabelCol("label_col")

LinearRegression_c904beda5155

In [82]:
print(lr.explainParams())

algorithm: Refer to CUML doc of cuml.linear_model.linear_regression.LinearRegression, cuml.linear_model.ridge.Ridge, cuml.solvers.cd.CD for this param algorithm (default: eig)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
fit_intercept: Refer to CUML doc of cuml.linear_model.linear_regression.LinearRegression, cuml.linear_model.ridge.Ridge, cuml.solvers.cd.CD for this param fit_intercept (default: True)
inputCol: input column name. (current: features)
inputCols: input column names. (undefined)
labelCol: label column name. (default: label, current: label_col)
loss: Refer to CUML doc of cuml.linear_model.linear_regression.LinearRegression, cuml.linear_model.ridge.Ridge, cuml.solvers.cd.CD for this param loss (default: squared_loss)
max_iter: Refer to CUML doc of cuml.linear_model.linear_regression.LinearRegression, cuml.linear_model.ridge.Ridge, cuml.solvers.cd.CD for this

In [83]:
lr_model = lr.fit(df)

                                                                                

In [84]:
lr_model.coef

[94.46688842773438, 14.3353271484375]