# Linear Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
import time

### Create synthetic dataset

In [None]:
# Size of the synthetic problem and the element type it is cast to.
n_rows, n_cols = 50000, 3000
dtype = "float32"

# Generate a reproducible (fixed random_state) noisy regression problem,
# then cast both the feature matrix and the labels to `dtype`.
X, y = make_regression(
    n_samples=n_rows,
    n_features=n_cols,
    noise=10,
    random_state=1,
)
X, y = X.astype(dtype), y.astype(dtype)

### Convert dataset to Spark DataFrame

In [None]:
# Build a two-column pandas frame: "features" holds one row of X per entry
# (list(X) splits the 2-D array along its first axis) and "label" holds y.
pd_data = pd.DataFrame({"features": list(X), "label": y})
# Hand the pandas frame to the active Spark session (`spark` is expected to be
# defined by the notebook environment; it is not created in this file).
df = spark.createDataFrame(pd_data)

In [None]:
# Print the schema of the Spark DataFrame holding the synthetic data.
df.printSchema()

### We will use this function to build both the Spark RAPIDS ML (GPU) and Spark ML (CPU) linear estimator objects, demonstrating the common API, and verify they yield similar results on our synthetic dataset

Configure for Lasso regularization. Note that Spark RAPIDS ML and Spark ML currently optimize different objectives for certain input and regularization scenarios. For Lasso without standardization, the objectives are equivalent, so we use that configuration here. See the documentation for `LinearRegression` for more information.

In [None]:
def build_linear_regression_estimator(estimator_class):
    """Create a linear-regression estimator configured for Lasso.

    The same settings are applied to whichever class is passed in, so this
    one helper can configure any estimator exposing the setters used below.

    Args:
        estimator_class: Zero-argument callable (normally an estimator class)
            whose instances provide ``setTol``, ``setFeaturesCol``,
            ``setLabelCol``, ``setRegParam``, ``setElasticNetParam``,
            ``setMaxIter`` and ``setStandardization``, each returning the
            estimator to continue configuring.

    Returns:
        The configured estimator, i.e. the value returned by the last setter.
    """
    estimator = estimator_class()

    # Very small tolerance; presumably so the iteration cap set below,
    # rather than the convergence test, ends the fit.
    estimator = estimator.setTol(1.0e-20)

    # Input columns.
    estimator = estimator.setFeaturesCol("features")
    estimator = estimator.setLabelCol("label")

    # Lasso: regularization strength 0.05 with elasticNetParam 1.0.
    estimator = estimator.setRegParam(0.05)
    estimator = estimator.setElasticNetParam(1.0)

    estimator = estimator.setMaxIter(10)

    # Standardization is switched off deliberately (see the notebook text on
    # when the GPU and CPU objectives are equivalent).
    estimator = estimator.setStandardization(False)
    return estimator

## Spark RAPIDS ML (GPU)

In [None]:
# Spark RAPIDS ML (GPU) estimator class; a later cell also uses this name to
# reload the saved estimator.
from spark_rapids_ml.regression import LinearRegression
# Configure the GPU estimator with the shared builder function.
gpu_linear_reg = build_linear_regression_estimator(LinearRegression)

Spark RAPIDS ML estimators can be persisted and reloaded in the same way as Spark ML estimators.

In [None]:
# Path where the (not yet fitted) GPU estimator is saved and reloaded from.
estimator_path = "/tmp/spark-rapids-ml-linear-reg-estimator"

In [None]:
# Save the estimator to estimator_path in overwrite mode, then load it back
# into a separate object that is used for fitting.
gpu_linear_reg.write().overwrite().save(estimator_path)
gpu_linear_reg_loaded = LinearRegression.load(estimator_path)

### Fit

In [None]:
# Time the GPU fit. perf_counter() is a monotonic, high-resolution clock meant
# for measuring intervals, so the result is not distorted by system clock
# adjustments the way a time.time() difference can be.
start_time = time.perf_counter()
gpu_model = gpu_linear_reg_loaded.fit(df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [None]:
# Coefficients of the fitted GPU model.
coefs = gpu_model.coefficients

Notice the presence of 0's in the trained model coefficients displayed in the next cell. This is a well-known property of Lasso: sparsity increases as the `regParam` regularization penalty increases.

In [None]:
# Display the first ten coefficients.
coefs[0:10]

### Transform

In [None]:
# Path where the fitted GPU model is saved and reloaded from.
model_path = "/tmp/spark-rapids-ml-linear-reg-model"

In [None]:
# Save the fitted GPU model to model_path in overwrite mode.
gpu_model.write().overwrite().save(model_path)

In [None]:
# Load the saved model back from model_path, using the reader obtained from
# the in-memory model instance.
gpu_model_loaded = gpu_model.read().load(model_path)

In [None]:
# Display the first ten coefficients of the reloaded model.
gpu_model_loaded.coefficients[0:10]

In [None]:
# Display the intercept of the fitted GPU model (the in-memory model, not the
# reloaded copy).
gpu_model.intercept

In [None]:
# Set the reloaded model's prediction column name to "transformed" and apply
# the model to df.
transformed_df = gpu_model_loaded.setPredictionCol("transformed").transform(df)

In [None]:
# Print the schema of the GPU transform result.
transformed_df.printSchema()

In [None]:
# Count the rows of the GPU transform result.
transformed_df.count()

In [None]:
# Show 10 rows of the GPU transform result.
transformed_df.show(10)

## Spark ML (CPU)

In [None]:
# Import the Spark ML (CPU) estimator under an alias so it does not rebind the
# name `LinearRegression`, which earlier cells use for the Spark RAPIDS ML
# class; without the alias, re-running those cells after this one would
# silently pick up the CPU class instead.
from pyspark.ml.regression import LinearRegression as SparkLinearRegression
# Configure the CPU estimator with the same shared builder function.
cpu_linear_reg = build_linear_regression_estimator(SparkLinearRegression)

Convert the array SQL type to the VectorUDT type expected by Spark ML. (Note: in addition to the array-type DataFrame used above, Spark RAPIDS ML also accepts VectorUDT DataFrames, as well as a scalar-column format - see the docs.)

In [None]:
from pyspark.ml.functions import array_to_vector

In [None]:
# Replace the "features" column with its array_to_vector() conversion (keeping
# the same column name) and carry the "label" column through unchanged.
vector_df = df.select(
    array_to_vector(df["features"]).alias("features"),
    "label",
)

### Fit

In [None]:
# Time the CPU fit. perf_counter() is a monotonic, high-resolution clock meant
# for measuring intervals, so the result is not distorted by system clock
# adjustments the way a time.time() difference can be.
start_time = time.perf_counter()
cpu_model = cpu_linear_reg.fit(vector_df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [None]:
# Display the first ten coefficients of the fitted CPU model.
cpu_model.coefficients[0:10]

### Transform

In [None]:
# Set the CPU model's prediction column name to "transformed" and apply the
# model to the vector-typed DataFrame.
cpu_transformed = cpu_model.setPredictionCol("transformed").transform(vector_df)

In [None]:
# Show 10 rows of the CPU transform result.
cpu_transformed.show(10)

In [None]:
# Display the intercept of the fitted CPU model.
cpu_model.intercept