In [1]:
!pip install pyspark



In [2]:
!pip install recommenders



In [3]:
!pip install papermill



In [4]:
!pip install scrapbook

Collecting scrapbook
  Downloading scrapbook-0.5.0-py3-none-any.whl (34 kB)
Installing collected packages: scrapbook
Successfully installed scrapbook-0.5.0


In [7]:
import os
import sys


import pyspark
from pyspark.ml import PipelineModel
from pyspark.ml.feature import FeatureHasher
import papermill as pm
import scrapbook as sb

from recommenders.utils.notebook_utils import is_databricks
from recommenders.utils.spark_utils import start_or_get_spark
from recommenders.datasets.criteo import load_spark_df
from recommenders.datasets.spark_splitters import spark_random_split

# Setup MML Spark (SynapseML).
# Alternative, currently unused: the recommenders helper
#   start_or_get_spark(packages=[MMLSPARK_PACKAGE], repositories=[MMLSPARK_REPO])
# with both constants imported from recommenders.utils.spark_utils.
# Here the session is built directly, with explicit SynapseML Maven coordinates.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:0.9.4")
    .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
    .getOrCreate()
)

# NOTE(review): the synapse imports are kept after the session is created —
# presumably the package only becomes importable once Spark has resolved the
# jar configured above; confirm before moving them to the top import block.
import synapse.ml
from synapse.ml.lightgbm import LightGBMClassifier
from synapse.ml.train import ComputeModelStatistics

# Record the interpreter and PySpark versions used for this run.
print("System version: {}".format(sys.version))
print("PySpark version: {}".format(pyspark.version.__version__))

System version: 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]
PySpark version: 3.2.0


In [8]:
# Which Criteo dataset variant to load; forwarded as `size` to load_spark_df.
DATA_SIZE = "sample"
# LightGBM settings; each is forwarded to the matching LightGBMClassifier
# argument (numLeaves, numIterations, learningRate, featureFraction,
# earlyStoppingRound) when the estimator is constructed.
NUM_LEAVES = 32
NUM_ITERATIONS = 50
LEARNING_RATE = 0.1
FEATURE_FRACTION = 0.8
EARLY_STOPPING_ROUND = 10

# Destination path for the saved pipeline; only referenced by the
# (currently commented-out) model persistence cell.
MODEL_NAME = 'lightgbm_criteo.mml'


In [11]:
# No Databricks utilities handle is supplied to the loader.
dbutils = None

# Load the Criteo data of the configured size as a Spark DataFrame.
raw_data = load_spark_df(spark=spark, size=DATA_SIZE, dbutils=dbutils)

# Preview: bring only the first two rows to the driver and display them.
raw_preview = raw_data.limit(2).toPandas()
raw_preview.head()

100%|██████████| 8.58k/8.58k [00:00<00:00, 11.3kKB/s]


Unnamed: 0,label,int00,int01,int02,int03,int04,int05,int06,int07,int08,int09,int10,int11,int12,cat00,cat01,cat02,cat03,cat04,cat05,cat06,cat07,cat08,cat09,cat10,cat11,cat12,cat13,cat14,cat15,cat16,cat17,cat18,cat19,cat20,cat21,cat22,cat23,cat24,cat25
0,0,1,1,5,0,1382,4,15,2,181,1,2,,2,68fd1e64,80e26c9b,fb936136,7b4723c4,25c83c98,7e0ccccf,de7995b8,1f89b562,a73ee510,a8cd5504,b2cb9c98,37c9c164,2824a5f6,1adce6ef,8ba8b39a,891b62e7,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2,0,44,1,102,8,2,2,4,1,1,,4,68fd1e64,f0cf0024,6f67f7e5,41274cd7,25c83c98,fe6b92e5,922afcc0,0b153874,a73ee510,2b53e5fb,4f1b46f3,623049e6,d7020589,b28479f6,e6c5b5cd,c92f3b61,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655


In [12]:
# Random train/test split with ratio=0.8 and a fixed seed.
raw_train, raw_test = spark_random_split(raw_data, ratio=0.8, seed=42)

# Every column except the target is hashed into the 'features' column.
columns = [name for name in raw_data.columns if name != 'label']
feature_processor = FeatureHasher(inputCols=columns, outputCol='features')

# Apply the same hasher to both splits.
train, test = (
    feature_processor.transform(split) for split in (raw_train, raw_test)
)

In [13]:
# Collect the estimator settings in one mapping, then build the classifier
# from it; tunable values come from the configuration constants.
lgbm_params = {
    "labelCol": "label",
    "featuresCol": "features",
    "objective": "binary",
    "isUnbalance": True,
    "boostingType": "gbdt",
    "boostFromAverage": True,
    "baggingSeed": 42,
    "numLeaves": NUM_LEAVES,
    "numIterations": NUM_ITERATIONS,
    "learningRate": LEARNING_RATE,
    "featureFraction": FEATURE_FRACTION,
    "earlyStoppingRound": EARLY_STOPPING_ROUND,
}

# NOTE(review): earlyStoppingRound is set, but no validation indicator column
# is configured here — confirm that early stopping actually takes effect.
lgbm = LightGBMClassifier(**lgbm_params)

In [14]:
# Train the LightGBM classifier on the hashed training split.
model = lgbm.fit(train)


In [15]:
# Score the held-out test split with the fitted model.
predictions = model.transform(test)

In [16]:
# Configure the metric computation one setting at a time.
# NOTE(review): only the scored-labels column ("prediction") is configured for
# the AUC metric — confirm whether a scores/probability column should be set too.
evaluator = ComputeModelStatistics()
evaluator = evaluator.setScoredLabelsCol("prediction")
evaluator = evaluator.setLabelCol("label")
evaluator = evaluator.setEvaluationMetric("AUC")

# Compute the statistics over the scored test set.
result = evaluator.transform(predictions)

# Pull the single AUC value back to the driver, then print the full table.
auc_row = result.select("AUC").collect()[0]
auc = auc_row[0]
result.show()

+---------------+------------------+
|evaluation_type|               AUC|
+---------------+------------------+
| Classification|0.6565337613455959|
+---------------+------------------+



In [17]:
# Record the AUC in the notebook via scrapbook under the name "auc".
sb.glue("auc", auc)

In [18]:
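# Model persistence is currently disabled. Uncomment the two lines below to
# bundle the feature hasher and the fitted model into one PipelineModel and
# save it to MODEL_NAME (overwriting any existing copy).
# pipeline = PipelineModel(stages=[feature_processor, model])
# pipeline.write().overwrite().save(MODEL_NAME)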

In [19]:
# Stop the Spark session created earlier in this notebook.
spark.stop()