In [0]:
# Load the October 2019 e-commerce event file as a DataFrame.
# "header" takes column names from the first row; "inferSchema" asks Spark
# to derive column types from the data instead of reading everything as text.
csv_options = {
    "header": "true",
    "inferSchema": "true",
}

df = (
    spark.read
    .format("csv")
    .options(**csv_options)
    .load("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")
)

In [0]:
from pyspark.sql import functions as F

# Earliest and latest event_time in the file, used to pick a sensible
# feature/label cutoff date.
time_bounds = df.select(
    F.min("event_time"),
    F.max("event_time"),
)
time_bounds.show()

+-------------------+-------------------+
|    min(event_time)|    max(event_time)|
+-------------------+-------------------+
|2019-10-01 00:00:00|2019-10-31 23:59:59|
+-------------------+-------------------+



In [0]:
# Boundary between the feature window (event_time <= cutoff) and the label
# window (event_time > cutoff).
# NOTE(review): a date-only string compared against a timestamp column is
# presumably read as midnight at the start of that day — confirm the
# event_time dtype if the intent is to include all of the 20th in features.
cutoff: str = "2019-10-20"

In [0]:
# Build one feature row per user from events at or before the cutoff only,
# so the features never see the period the label is derived from.
in_feature_window = F.col("event_time") <= cutoff
is_view = F.col("event_type") == "view"
is_cart = F.col("event_type") == "cart"

feature_data = df.where(in_feature_window)

features_df = (
    feature_data
    .groupBy("user_id")
    .agg(
        # Every event of any type for this user.
        F.count("*").alias("total_events"),
        # Flag matching rows with 1 (others 0) and add them up.
        F.sum(F.when(is_view, 1).otherwise(0)).alias("views"),
        F.sum(F.when(is_cart, 1).otherwise(0)).alias("carts"),
        # Number of different products the user interacted with.
        F.countDistinct("product_id").alias("unique_products"),
        # Mean price across the user's events.
        F.avg("price").alias("avg_price"),
    )
)

In [0]:
# Derive the target from events strictly after the cutoff: a user is labelled
# 1 if any of their post-cutoff events is a purchase, otherwise 0.
in_label_window = F.col("event_time") > cutoff
is_purchase = F.col("event_type") == "purchase"

label_data = df.where(in_label_window)

label_df = (
    label_data
    .groupBy("user_id")
    .agg(
        # max over 0/1 flags acts as a per-user "any purchase" indicator.
        F.max(F.when(is_purchase, 1).otherwise(0)).alias("purchased")
    )
)

In [0]:
# Attach the label to every user that has features. The left join keeps users
# with no events after the cutoff; their missing label is filled with 0
# (no purchase observed).
training_data = (
    features_df
    .join(label_df, on="user_id", how="left")
    .na.fill({"purchased": 0})
)

In [0]:
# Number of users per label value, to see how imbalanced the target is.
label_distribution = training_data.groupBy("purchased").count()
label_distribution.show()

+---------+-------+
|purchased|  count|
+---------+-------+
|        1|  92199|
|        0|2040071|
+---------+-------+



In [0]:
# Random 80/20 split of users into train and test; the fixed seed keeps the
# split request the same from run to run.
split_weights = [0.8, 0.2]
train, test = training_data.randomSplit(split_weights, seed=42)

# Report the size of each split.
for split_label, split_df in (("Train:", train), ("Test:", test)):
    print(split_label, split_df.count())

Train: 1706532
Test: 425738


In [0]:
# Label counts for the train split, then the test split, to check that both
# have a similar share of purchasers.
for split_df in (train, test):
    split_df.groupBy("purchased").count().show()

+---------+-------+
|purchased|  count|
+---------+-------+
|        1|  73720|
|        0|1632812|
+---------+-------+

+---------+------+
|purchased| count|
+---------+------+
|        1| 18479|
|        0|407259|
+---------+------+



In [0]:
# Count training rows per label value and derive the negative:positive ratio,
# which is later used as the weight for positive rows.
class_counts = train.groupBy("purchased").count().collect()

# Map label value -> row count. A label that is absent from the split simply
# has no entry, so look it up with a default instead of indexing a filtered
# list (which would raise an opaque IndexError).
counts_by_label = {r['purchased']: r['count'] for r in class_counts}

neg = counts_by_label.get(0, 0)
pos = counts_by_label.get(1, 0)

# With no positives the ratio is undefined (division by zero); with no
# negatives it would be 0 and zero out every positive row's weight. Fail
# with a clear message in either case.
if pos == 0 or neg == 0:
    raise ValueError(
        "Cannot compute class imbalance ratio: "
        f"train split has {neg} negative and {pos} positive rows"
    )

ratio = neg / pos

print("Imbalance ratio:", ratio)

Imbalance ratio: 22.1488334237656


In [0]:
# Per-row training weight: positive rows (purchased == 1) get the
# negative:positive ratio, all other rows get 1.0, so both classes carry
# comparable total weight during fitting.
is_positive = F.col("purchased") == 1
weight_expr = F.when(is_positive, ratio).otherwise(1.0)

train = train.withColumn("class_weight", weight_expr)

In [0]:
from pyspark.ml.feature import VectorAssembler

# Per-user numeric columns that are packed into the model's feature vector.
feature_cols = [
    "total_events",
    "views",
    "carts",
    "unique_products",
    "avg_price"
]

# Combines the columns above into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)

# This cell rebinds `train`/`test` to the assembled frames. VectorAssembler
# refuses to transform a frame that already has its output column, so running
# the cell a second time would fail. Dropping the output column first makes
# the cell safe to re-run; drop() does nothing when the column is absent.
# NOTE(review): avg_price is an average and could be null for a user whose
# prices are all missing; VectorAssembler's default handling of invalid
# values is to raise — confirm the data has no such users.
output_col = assembler.getOutputCol()

train = assembler.transform(train.drop(output_col))
test = assembler.transform(test.drop(output_col))

In [0]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled feature vector, predicting the
# "purchased" label. Rows are weighted by "class_weight" so the minority
# class is not drowned out by the majority class.
lr_params = {
    "labelCol": "purchased",
    "featuresCol": "features",
    "weightCol": "class_weight",
}
lr = LogisticRegression(**lr_params)

# Fit on the training split only.
model = lr.fit(train)

In [0]:
# Score the held-out test split with the fitted model.
predictions = model.transform(test)

# Peek at the true label next to the predicted probabilities and class.
preview_cols = ["purchased", "probability", "prediction"]
predictions.select(*preview_cols).show(5)

+---------+--------------------+----------+
|purchased|         probability|prediction|
+---------+--------------------+----------+
|        0|[0.63560667708261...|       0.0|
|        0|[0.62770274649042...|       0.0|
|        0|[0.63112820976346...|       0.0|
|        0|[0.55481310282516...|       0.0|
|        0|[0.60799528595077...|       0.0|
+---------+--------------------+----------+
only showing top 5 rows


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate ranking quality on the test predictions. The metric and score
# column are spelled out here; they are the evaluator's defaults, so the
# reported number is the area under the ROC curve.
# NOTE(review): positives are only a small share of the test split (see the
# per-split label counts), so area under the precision-recall curve may be
# worth reporting alongside this.
evaluator = BinaryClassificationEvaluator(
    labelCol="purchased",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

auc = evaluator.evaluate(predictions)
print("AUC:", auc)

AUC: 0.7683547067112431
