# Logistic Regression Classification

In [None]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import time

### Create synthetic dataset

The number of classes in the dataset is set to 2 below.  Larger values for `n_classes` are also supported.

In [None]:
n_classes = 2

In [None]:
# Size and element type of the synthetic dataset.
n_rows = 50000
n_cols = 300
dtype = 'float32'

# A third of the columns are informative and another third are redundant;
# the number of classes comes from `n_classes`, set in an earlier cell.
X, y = make_classification(
    n_samples=n_rows,
    n_features=n_cols,
    n_informative=n_cols // 3,
    n_redundant=n_cols // 3,
    n_classes=n_classes,
    random_state=1,
)

# Cast features and labels to the chosen dtype.
X, y = X.astype(dtype), y.astype(dtype)

# Keep 20% of the rows aside as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

### Convert dataset to Spark DataFrame

In [None]:
# Build one pandas row per sample: `list(X_train)` splits the 2-D array into
# per-row arrays, so each "features" cell holds that sample's feature values.
pd_data_train = pd.DataFrame({"features": list(X_train), "label": y_train})
pd_data_test = pd.DataFrame({"features": list(X_test), "label": y_test})
# NOTE(review): `spark` is not defined in the visible cells -- presumably the
# SparkSession supplied by the notebook runtime; confirm.
df_train = spark.createDataFrame(pd_data_train)
df_test = spark.createDataFrame(pd_data_test)

In [None]:
df_train.printSchema()

### Classifier builder
We will use this function to build both the Spark RAPIDS ML (GPU) and Spark ML (CPU) logistic regression classifier objects, demonstrating the common API, and verify that they yield similar performance on our synthetic dataset. NOTE: GPU LogisticRegression does not yet support `standardization=True`.

In [None]:
def build_lr_classifier(estimator_class):
    """Build a logistic regression estimator with this notebook's settings.

    Args:
        estimator_class: Estimator class to instantiate. It must accept a
            ``standardization`` keyword argument and expose the chained
            setters used below.

    Returns:
        The object returned by the final setter call (``setTol``).
    """
    # Standardization is switched off at construction time.
    classifier = estimator_class(standardization=False)

    # Input columns.
    classifier = classifier.setFeaturesCol("features")
    classifier = classifier.setLabelCol("label")

    # Regularization and optimizer settings.
    classifier = classifier.setRegParam(0.001)
    classifier = classifier.setElasticNetParam(0.5)
    classifier = classifier.setMaxIter(200)
    classifier = classifier.setTol(1.0e-30)
    return classifier

## Spark RAPIDS ML (GPU)

In [None]:
from spark_rapids_ml.classification import LogisticRegression
gpu_lr_classifier = build_lr_classifier(LogisticRegression)

Spark RAPIDS ML estimators can be persisted and reloaded in the same way as Spark ML estimators.

In [None]:
estimator_path = "/tmp/spark-rapids-ml-lr-classifier-estimator"

In [None]:
gpu_lr_classifier.write().overwrite().save(estimator_path)
gpu_lr_classifier_loaded = LogisticRegression.load(estimator_path)

### Fit

In [None]:
start_time = time.time()
gpu_model = gpu_lr_classifier_loaded.fit(df_train)
print(f"Fit took: {time.time() - start_time} sec")

In [None]:
gpu_model.coefficients[0:10] if gpu_model.numClasses <= 2 else gpu_model.coefficientMatrix.toArray()[:,0:10]

In [None]:
gpu_model.numClasses

### Transform

In [None]:
model_path = "/tmp/spark-rapids-ml-lr-classifier-model"

In [None]:
gpu_model.write().overwrite().save(model_path)

In [None]:
gpu_model_loaded = gpu_model.read().load(model_path)

In [None]:
gpu_model_loaded.coefficients[0:10] if gpu_model_loaded.numClasses <= 2 else gpu_model_loaded.coefficientMatrix.toArray()[:,0:10]

In [None]:
gpu_model_loaded.numClasses

In [None]:
transformed_df = gpu_model_loaded.setPredictionCol("prediction").setProbabilityCol("probability").transform(df_test)

In [None]:
transformed_df.printSchema()

In [None]:
transformed_df.count()

In [None]:
transformed_df.select("features","label","prediction","probability").sort("features").show(10)

Check the AUC (or the log loss, when there are more than two classes) of the GPU-trained model on the test set.

In [None]:
# Choose the evaluator and the reported metric from the number of classes:
# AUC for up to two classes, log loss otherwise.
if gpu_model_loaded.numClasses <= 2:
    evaluator = (
        BinaryClassificationEvaluator()
        .setRawPredictionCol("probability")
        .setLabelCol("label")
    )
    print(f"auc: {evaluator.evaluate(transformed_df)}")
else:
    evaluator = (
        MulticlassClassificationEvaluator()
        .setProbabilityCol("probability")
        .setLabelCol("label")
        .setMetricName("logLoss")
    )
    print(f"logLoss: {evaluator.evaluate(transformed_df)}")

## Spark ML (CPU)

In [None]:
from pyspark.ml.classification import LogisticRegression
cpu_lr_classifier = build_lr_classifier(LogisticRegression)

Convert the array SQL type to the VectorUDT expected by Spark ML. (Note: Spark RAPIDS ML also accepts VectorUDT DataFrames, in addition to the array-type DataFrame used above and a scalar-column format; see the docs.)

In [None]:
from pyspark.ml.functions import array_to_vector

In [None]:
vector_df_train = df_train.select(array_to_vector(df_train.features).alias("features"),"label")

### Fit

In [None]:
start_time = time.time()
cpu_model = cpu_lr_classifier.fit(vector_df_train)
print(f"Fit took: {time.time() - start_time} sec")

In [None]:
cpu_model.coefficients[0:10] if cpu_model.numClasses <= 2 else cpu_model.coefficientMatrix.toArray()[:,0:10]

In [None]:
cpu_model.numClasses

### Transform

In [None]:
vector_df_test = df_test.select(array_to_vector(df_test.features).alias("features"),"label")

In [None]:
cpu_transformed_df = cpu_model.setPredictionCol("prediction").setProbabilityCol("probability").transform(vector_df_test)

In [None]:
cpu_transformed_df.select("features","label","prediction","probability").sort("features").show(10)

Test set AUCs of GPU model above and CPU model below are comparable.

In [None]:
# Reuses the `evaluator` object created in the GPU section (it was selected
# there from `gpu_model_loaded.numClasses`), so both models are scored with
# the same evaluator configuration.
if cpu_model.numClasses <= 2:
    print(f"auc: {evaluator.evaluate(cpu_transformed_df)}")
else:
    print(f"logLoss: {evaluator.evaluate(cpu_transformed_df)}")

## Sparse Vectors

Standardization needs to be false for now. Will be fixed in 24.02.

In [None]:
# prepare dataframe

from pyspark.ml.feature import CountVectorizer, RegexTokenizer
from pyspark.sql import Row
from sklearn.datasets import fetch_20newsgroups

# Load the training subset of the 20 newsgroups dataset.
try:
    twenty_train = fetch_20newsgroups(subset="train", shuffle=True, random_state=42)
except Exception as err:
    # Stop here with the underlying cause attached: carrying on after a
    # failed fetch would only fail later with a NameError on `twenty_train`.
    raise RuntimeError("Error fetching 20 newsgroup dataset") from err
X = twenty_train.data
y = twenty_train.target.tolist()

# One Row per document: its label, a unit weight, and the raw text.
data = [
    Row(
        label=label,
        weight=1.0,
        text=text,
    )
    for text, label in zip(X, y)
]

# Turn the list of Rows (label, weight, text) into a Spark DataFrame.
df = spark.createDataFrame(data)

# convert text to sparse vector
# Step 1: tokenize the "text" column into a "tokens" column.
tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens")
df = tokenizer.transform(df)
# Step 2: fit a CountVectorizer on the tokens and add its output as "features".
cv = CountVectorizer(inputCol="tokens", outputCol="features")
cv_model = cv.fit(df)
df = cv_model.transform(df)

# 80/20 split with a fixed seed. This rebinds `df_train` / `df_test`, which
# earlier cells used for the dense synthetic dataset.
df_train, df_test = df.randomSplit([0.8, 0.2], seed=0)

In [None]:
def sparse_vectors_compat(EstimatorClass):
    """Fit a logistic regression on ``df_train`` and score it on ``df_test``.

    Both DataFrames are read from the notebook's global scope.

    Args:
        EstimatorClass: Logistic regression estimator class to instantiate
            with the keyword arguments listed below.

    Returns:
        A tuple ``(lr, model, fit_time, test_logLoss)``: the estimator, the
        fitted model, the wall-clock seconds spent in ``fit``, and the
        ``logLoss`` metric computed on the transformed test DataFrame.
    """
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    estimator_kwargs = dict(
        regParam=0.01,
        maxIter=100,
        fitIntercept=True,
        standardization=False,
        featuresCol="features",
        labelCol="label",
    )
    lr = EstimatorClass(**estimator_kwargs)

    # Time only the call to fit().
    fit_started = time.time()
    model = lr.fit(df_train)
    fit_time = time.time() - fit_started

    scored_test_df = model.transform(df_test)

    # Point the evaluator at the model's own output and label columns.
    evaluator = (
        MulticlassClassificationEvaluator()
        .setPredictionCol(model.getPredictionCol())
        .setProbabilityCol(model.getProbabilityCol())
        .setLabelCol(model.getLabelCol())
    )
    evaluator.setMetricName("logLoss")

    test_logLoss = evaluator.evaluate(scored_test_df)
    return (lr, model, fit_time, test_logLoss)

In [None]:
from spark_rapids_ml.classification import LogisticRegression as GPULR
gpu_lr, gpu_model, gpu_fit_time, gpu_test_logLoss = sparse_vectors_compat(GPULR)
print(f"GPU fit took: {gpu_fit_time} sec")
print(f"GPU training objective: {gpu_model.objective}")
print(f"GPU test logLoss: {gpu_test_logLoss}")

In [None]:
from pyspark.ml.classification import LogisticRegression as CPULR
cpu_lr, cpu_model, cpu_fit_time, cpu_test_logLoss = sparse_vectors_compat(CPULR)
print(f"CPU fit took: {cpu_fit_time} sec")
print(f"CPU training objective: {cpu_model.summary.objectiveHistory[-1]}")
print(f"CPU test logLoss: {cpu_test_logLoss}")