# PySpark Compatibility Tests

## PCA
From: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.PCA.html#pyspark.ml.feature.PCA

In [None]:
# Backend switch for this section: PYSPARK = True selects the stock pyspark.ml
# classes, False selects spark_rapids_ml. SPARK_RAPIDS_ML is always the
# negation and guards the cells that print spark_rapids_ml-only attributes.
PYSPARK = False
SPARK_RAPIDS_ML = not PYSPARK

import shutil

In [None]:
if PYSPARK:
    from pyspark.ml.feature import PCA, PCAModel
else:
    from spark_rapids_ml.feature import PCA, PCAModel

from pyspark.ml.linalg import Vectors

In [None]:
# Three rows, each a 1-tuple holding a 5-dimensional feature vector; the first
# row is written in sparse form (non-zeros at indices 1 and 3).
data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]

# Echo the raw rows as the cell output.
data

In [None]:
# Wrap the rows in a single-column DataFrame named "features", print its
# contents, and leave the schema as the cell's displayed value.
df = spark.createDataFrame(data, ["features"])
df.show()
df.schema

In [None]:
# Default-constructed estimator: print the documentation of all its params
# before any of them is set explicitly.
pca = PCA()
print(pca.explainParams())

In [None]:
# Configure through both constructor kwargs and a setter. The setter call is
# the last expression, so its return value is what the cell displays.
pca = PCA(k=2, inputCol="features")
pca.setOutputCol("pca_features")

In [None]:
print(pca.explainParams())

In [None]:
if SPARK_RAPIDS_ML:
    # spark_rapids_ml-only introspection.
    # NOTE(review): _param_mapping() is a private API — presumably the
    # Spark-param -> cuML-param name map; confirm against spark_rapids_ml.
    print(pca._param_mapping())
    print(pca.cuml_params)

In [None]:
# Change k after construction; the following cells print explainParams() and
# cuml_params again so the update can be inspected.
pca.setK(3)

In [None]:
print(pca.explainParams())

In [None]:
if SPARK_RAPIDS_ML:
    print(pca.cuml_params)

In [None]:
# Restore k=2 so the fit below matches the expected outputs quoted in the
# later cells (2 principal components).
pca.setK(2)

In [None]:
model = pca.fit(df)

In [None]:
model.getK()
# 2

In [None]:
model.setOutputCol("output")

In [None]:
print(model.explainParams())

In [None]:
if SPARK_RAPIDS_ML:
    print(model.cuml_params)

In [None]:
# Project the data and show the first row's "output" column; the expected
# value below is the one quoted in the pyspark PCA example linked above.
model.transform(df).collect()[0].output
# DenseVector([1.648..., -4.013...])

In [None]:
model.explainedVariance
# DenseVector([0.794..., 0.205...])

In [None]:
# Principal-components matrix; the expected result is 5 rows (input
# dimensions) by 2 columns (k).
model.pc
# DenseMatrix(5, 2, [-0.4486, 0.133, -0.1252, 0.2165, -0.8477, -0.2842, -0.0562, 0.7636, -0.5653, -0.1156], 0)

In [None]:
# Persist the unfitted estimator; overwrite() allows the cell to be re-run
# when the path already exists.
pcaPath = "/tmp/pca"
pca.write().overwrite().save(pcaPath)

In [None]:
# Round-trip check: the reloaded estimator must report the same k as the
# one that was saved.
loadedPca = PCA.load(pcaPath)
loadedPca.getK() == pca.getK()
# True

In [None]:
# Confirm the saved estimator's cuml_params: print the original and the
# reloaded estimator's values one after the other for comparison.
if SPARK_RAPIDS_ML:
    print(pca.cuml_params)
    print(loadedPca.cuml_params)

In [None]:
# Persist the fitted model to its own path, separate from the estimator's.
modelPath = "/tmp/pca-model"
model.write().overwrite().save(modelPath)

In [None]:
# Round-trip check: the reloaded model's principal components must equal
# the in-memory model's.
loadedModel = PCAModel.load(modelPath)
loadedModel.pc == model.pc
# True

In [None]:
# Confirm the saved model's cuml_params: print the original and the
# reloaded model's values one after the other for comparison.
if SPARK_RAPIDS_ML:
    print(model.cuml_params)
    print(loadedModel.cuml_params)

In [None]:
loadedModel.explainedVariance == model.explainedVariance
# True

In [None]:
# The reloaded model must produce the same first output row as the
# in-memory model.
loadedModel.transform(df).take(1) == model.transform(df).take(1)
# True

## KMeans
From: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.clustering.KMeans.html#pyspark.ml.clustering.KMeans

In [None]:
# Backend switch for the KMeans section: PYSPARK = True selects
# pyspark.ml.clustering, False selects spark_rapids_ml.clustering.
# SPARK_RAPIDS_ML is always the negation.
PYSPARK = False
SPARK_RAPIDS_ML = not PYSPARK

import shutil

In [None]:
if PYSPARK:
    from pyspark.ml.clustering import KMeans, KMeansModel
else:
    from spark_rapids_ml.clustering import KMeans, KMeansModel

In [None]:
from pyspark.ml.linalg import Vectors

In [None]:
# Four 2-D points in two well-separated pairs (near the origin and near
# (8.5, 8.5)), each paired with a constant weight of 2.0.
data = [(Vectors.dense([0.0, 0.0]), 2.0), (Vectors.dense([1.0, 1.0]), 2.0),
        (Vectors.dense([9.0, 8.0]), 2.0), (Vectors.dense([8.0, 9.0]), 2.0)]

In [None]:
# Two-column DataFrame (features + weight) collapsed into a single partition;
# print the rows, then leave the schema as the cell's displayed value.
df = spark.createDataFrame(data, ["features", "weighCol"]).repartition(1)
df.show()
df.schema

In [None]:
kmeans = KMeans()

In [None]:
print(kmeans.explainParams())

In [None]:
if SPARK_RAPIDS_ML:
    print(kmeans._param_mapping())
    print(kmeans.cuml_params)

In [None]:
# Two clusters, fixed seed, and an explicit iteration cap.
kmeans = KMeans(k=2)
kmeans.setSeed(1)
kmeans.setMaxIter(10)

# The weight column is only set for the pyspark run; the spark_rapids_ml
# run leaves it unset.
if PYSPARK:
    kmeans.setWeightCol("weighCol")

In [None]:
print(kmeans.explainParams())

In [None]:
if SPARK_RAPIDS_ML:
    print(kmeans.cuml_params)

In [None]:
kmeans.getMaxIter()
# 10

In [None]:
# Drop the explicitly-set maxIter; the next cells print the params and
# cuml_params again so the effect can be inspected.
kmeans.clear(kmeans.maxIter)

In [None]:
print(kmeans.explainParams())

In [None]:
if SPARK_RAPIDS_ML:
    print(kmeans.cuml_params)

In [None]:
kmeans.setFeaturesCol("features")

In [None]:
model = kmeans.fit(df)

In [None]:
model.getDistanceMeasure()
# 'euclidean'
# Note: this is not used in spark_rapids_ml (may be implied)

In [None]:
model.setPredictionCol("newPrediction")

In [None]:
print(model.explainParams())

In [None]:
if SPARK_RAPIDS_ML:
    print(model.cuml_params)

In [None]:
if PYSPARK:
    # An expression nested inside an `if` is not echoed by the notebook, so
    # print the prediction explicitly to make the expected value checkable.
    print(model.predict(df.head().features))
    # 0
else:
    # NotImplementedError: 'predict' method is not supported, use 'transform' instead.
    pass

In [None]:
centers = model.clusterCenters()
len(centers)
# 2

In [None]:
centers
# [array([0.5, 0.5]), array([8.5, 8.5])]

In [None]:
if PYSPARK:
    transformed = model.transform(df).select("features", "newPrediction")
else:
    # With spark_rapids_ml the same select() was recorded as failing with the
    # error below, so the unprojected transform output is used instead.
    # AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `features` cannot be resolved. Did you mean one of the following? [`prediction`].;
    # 'Project ['features, 'newPrediction]
    # +- MapInPandas _transform_udf(weighCol#1, features#29)#35, [prediction#36]
    #    +- Project [weighCol#1, features#29]
    #       +- Project [cuml_values_c3BhcmtjdW1sCg==#26, weighCol#1, UDF(cuml_values_c3BhcmtjdW1sCg==#26) AS features#29]
    #          +- Project [features#0 AS cuml_values_c3BhcmtjdW1sCg==#26, weighCol#1]
    #             +- Repartition 1, true
    #                +- LogicalRDD [features#0, weighCol#1], false
    transformed = model.transform(df)

# Materialize the rows once; later cells compare their predictions pairwise.
rows = transformed.collect()

In [None]:
# Re-run transform without any column projection and display the full
# result. Expected table:
transformed = model.transform(df)
transformed.show()
# +---------+--------+-------------+
# | features|weighCol|newPrediction|
# +---------+--------+-------------+
# |[0.0,0.0]|     2.0|            0|
# |[1.0,1.0]|     2.0|            0|
# |[9.0,8.0]|     2.0|            1|
# |[8.0,9.0]|     2.0|            1|
# +---------+--------+-------------+

In [None]:
# The first two rows (the points near the origin) must share a cluster id.
rows[0].newPrediction == rows[1].newPrediction
# True

In [None]:
# The last two rows (the points near (8.5, 8.5)) must share a cluster id.
rows[2].newPrediction == rows[3].newPrediction
# True

In [None]:
model.hasSummary
# True

In [None]:
# The training summary is only inspected for the pyspark run.
if PYSPARK:
    summary = model.summary
    # An expression nested inside an `if` is not echoed by the notebook, so
    # print it explicitly to make the expected value checkable.
    print(summary.k)
    # 2

In [None]:
if PYSPARK:
    # An expression nested inside an `if` is not echoed by the notebook, so
    # print it explicitly to make the expected value checkable.
    print(summary.clusterSizes)
    # [2, 2]

In [None]:
if PYSPARK:
    # An expression nested inside an `if` is not echoed by the notebook, so
    # print it explicitly to make the expected value checkable.
    print(summary.trainingCost)
    # 4.0

In [None]:
# Persist the unfitted estimator; overwrite() allows the cell to be re-run
# when the path already exists.
kmeans_path = "/tmp/kmeans"
kmeans.write().overwrite().save(kmeans_path)

In [None]:
# Round-trip check: the reloaded estimator must still report k=2.
kmeans2 = KMeans.load(kmeans_path)
kmeans2.getK()
# 2

In [None]:
# Confirm the saved estimator's cuml_params: print the original and the
# reloaded estimator's values one after the other for comparison.
if SPARK_RAPIDS_ML:
    print(kmeans.cuml_params)
    print(kmeans2.cuml_params)

In [None]:
# Persist the fitted KMeans model to its own path.
model_path = "/tmp/kmeans_model"
model.write().overwrite().save(model_path)

In [None]:
model2 = KMeansModel.load(model_path)

In [None]:
# Confirm the saved model's cuml_params: print the original and the
# reloaded model's values one after the other for comparison.
if SPARK_RAPIDS_ML:
    print(model.cuml_params)
    print(model2.cuml_params)

In [None]:
# A model reloaded from disk is expected to carry no training summary.
model2.hasSummary
# False

In [None]:
# Element-wise comparison of the first center before and after the
# save/load round trip; the result is a boolean array, not a single bool.
model.clusterCenters()[0] == model2.clusterCenters()[0]
# array([ True,  True], dtype=bool)

In [None]:
# Same element-wise round-trip comparison for the second center.
model.clusterCenters()[1] == model2.clusterCenters()[1]
# array([ True,  True], dtype=bool)

In [None]:
model.transform(df).take(1) == model2.transform(df).take(1)
# True

In [None]:
model.transform(df).take(1)
# [Row(features=DenseVector([0.0, 0.0]), weighCol=2.0, newPrediction=0)]

In [None]:
df.take(1)
# [Row(features=DenseVector([0.0, 0.0]), weighCol=2.0)]

## LinearRegression

From: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.regression.LinearRegression.html#pyspark.ml.regression.LinearRegression

In [None]:
# Backend switch for the LinearRegression section: PYSPARK = True selects
# pyspark.ml.regression, False selects spark_rapids_ml.regression.
# SPARK_RAPIDS_ML is always the negation.
PYSPARK = False
SPARK_RAPIDS_ML = not PYSPARK

import shutil

In [None]:
if PYSPARK:
    from pyspark.ml.regression import LinearRegression, LinearRegressionModel
else:
    from spark_rapids_ml.regression import LinearRegression, LinearRegressionModel

from pyspark.ml.linalg import Vectors

In [None]:
# Note: spark_rapids_ml.regression.LinearRegression doesn't support datasets
# with only one feature, so each feature vector is padded to two dimensions.
df = spark.createDataFrame(
    [
        (1.0, 2.0, Vectors.dense(1.0, 0.0)),
        (0.0, 2.0, Vectors.sparse(2, [], [])),
    ],
    ["label", "weight", "features"],
)

# Print the rows, then leave the schema as the cell's displayed value.
df.show()
df.schema

In [None]:
if PYSPARK:
    lr = LinearRegression(regParam=0.0, solver="normal", weightCol="weight")
else:
    # 'solver: normal' gets value-mapped to 'solver: eig'.
    # `weightCol` is explicitly not supported, so it is omitted here.
    lr = LinearRegression(regParam=0.0, solver="normal")

# Set the iteration cap and read it back as the cell's displayed value.
lr.setMaxIter(5)
lr.getMaxIter()
# 5

In [None]:
print(lr.explainParams())

In [None]:
if SPARK_RAPIDS_ML:
    print(lr.cuml_params)

In [None]:
# Setter/getter round trip on regParam; the next cell resets it to 0.0
# before fitting.
lr.setRegParam(0.1)
lr.getRegParam()
# 0.1

In [None]:
lr.setRegParam(0.0)

In [None]:
print(lr.explainParams())

In [None]:
if SPARK_RAPIDS_ML:
    print(lr.cuml_params)

In [None]:
model = lr.fit(df)

In [None]:
# Column setters are exercised on the fitted model; the getter is expected
# to return the maxIter of 5 that was set on the estimator.
model.setFeaturesCol("features")
model.setPredictionCol("newPrediction")
model.getMaxIter()
# 5

In [None]:
model.getMaxBlockSizeInMB()
# 0.0

In [None]:
# Note: spark_rapids_ml.regression.LinearRegression doesn't support datasets with only one feature, so padding dataset
test0 = spark.createDataFrame([(Vectors.dense(1.0, 1.0),)], ["features"])

In [None]:
if PYSPARK:
    # Single-vector prediction should be within 0.001 of 1.0; printed
    # because the expression sits inside the `if`.
    print(abs(model.predict(test0.head().features) - (1.0)) < 0.001)
    # True
else:
    # NotImplementedError: 'predict' method is not supported, use 'transform' instead.
    pass

In [None]:
# DataFrame path (used by both backends): the prediction for test0 must be
# within 0.001 of 1.0.
abs(model.transform(test0).head().newPrediction - (1.0)) < 0.001
# True

In [None]:
# The first coefficient must be within 0.001 of 1.0 (tolerance instead of
# exact float equality).
abs(model.coefficients[0] - 1.0) < 0.001
# True

In [None]:
model.coefficients
# DenseVector([1.0, 0.0])

In [None]:
# The intercept must be within 0.001 of 0.0.
abs(model.intercept - 0.0) < 0.001
# True

In [None]:
# Persist the unfitted estimator; overwrite() allows the cell to be re-run
# when the path already exists.
lr_path = "/tmp/lr"
lr.write().overwrite().save(lr_path)

In [None]:
# Round-trip check: the reloaded estimator must still report maxIter=5.
lr2 = LinearRegression.load(lr_path)
lr2.getMaxIter()
# 5

In [None]:
# Persist the fitted LinearRegression model to its own path.
model_path = "/tmp/lr_model"
model.write().overwrite().save(model_path)

In [None]:
# Round-trip check: the first coefficient must be exactly equal after
# save/load.
model2 = LinearRegressionModel.load(model_path)
model.coefficients[0] == model2.coefficients[0]
# True

In [None]:
model.intercept == model2.intercept
# True

In [None]:
# The reloaded model must produce the same first output row as the
# in-memory model.
model.transform(test0).take(1) == model2.transform(test0).take(1)
# True

In [None]:
model.numFeatures
# 2