In [2]:

from pyspark.sql import SparkSession


# Session handle used by every later cell; the application is named "CTR".
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("CTR")
    .getOrCreate()
)





from pyspark.sql.types import StructField, StringType, StructType, IntegerType

# Explicit schema for the training CSV, in file column order.
# Only `click` and `hour` are read as integers; every other column is read as
# a string. All fields are declared nullable.
_INTEGER_FIELDS = ("click", "hour")
_FIELD_NAMES = (
    "id", "click", "hour", "C1", "banner_pos",
    "site_id", "site_domain", "site_category",
    "app_id", "app_domain", "app_category",
    "device_id", "device_ip", "device_model", "device_type", "device_conn_type",
    "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21",
)

schema = StructType([
    StructField(
        name,
        IntegerType() if name in _INTEGER_FIELDS else StringType(),
        True,
    )
    for name in _FIELD_NAMES
])



# Read the training CSV with the explicit schema (the file has a header row),
# discard the columns that are not used as model inputs, and rename the
# target column `click` to `label`.
df = (
    spark.read.csv("desktop/train.csv", schema=schema, header=True)
    .drop('id', 'hour', 'device_id', 'device_ip')
    .withColumnRenamed("click", "label")
)

# 70/30 train/test split with a fixed seed (42).
df_train, df_test = df.randomSplit([0.7, 0.3], 42)

# Mark both splits for caching; they are reused by the later cells.
df_train.cache()
df_test.cache()

# Every remaining column except the label is used as a categorical feature.
categorical = [column for column in df_train.columns if column != 'label']
print(categorical)



['C1', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_model', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']


In [3]:

# Columns whose interaction term is appended to the formula.
cat_inter = ['C14', 'C15']

# R-style formula: all feature columns joined with '+', followed by one
# ':'-joined interaction term built from `cat_inter`.
concat = '+'.join(categorical)
interaction = ':'.join(cat_inter)
formula = f"label ~ {concat}+{interaction}"

print(formula)

label ~ C1+banner_pos+site_id+site_domain+site_category+app_id+app_domain+app_category+device_model+device_type+device_conn_type+C14+C15+C16+C17+C18+C19+C20+C21+C14:C15


In [4]:

from pyspark.ml.feature import RFormula
# Feature builder driven by the formula string; writes the assembled vector to
# the "features" column and the target to "label". handleInvalid is "keep".
interactor = RFormula(
    formula=formula,
    featuresCol="features",
    labelCol="label",
    handleInvalid="keep",
)

# Preview: fit on the training split and show the first rows of the
# resulting "features" column.
(
    interactor
    .fit(df_train)
    .transform(df_train)
    .select("features")
    .show()
)


23/06/26 15:07:27 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/06/26 15:07:28 WARN DAGScheduler: Broadcasting large task binary with size 1227.2 KiB


+--------------------+
|            features|
+--------------------+
|(55004,[5,7,3758,...|
|(55004,[5,7,792,4...|
|(55004,[5,7,792,4...|
|(55004,[5,7,792,4...|
|(55004,[5,7,792,4...|
|(55004,[5,7,792,4...|
|(55004,[5,7,792,4...|
|(55004,[5,7,792,4...|
|(55004,[5,7,792,4...|
|(55004,[5,7,792,4...|
|(55004,[5,7,792,4...|
|(55004,[5,7,1322,...|
|(55004,[5,7,1322,...|
|(55004,[5,7,1539,...|
|(55004,[5,7,14,45...|
|(55004,[5,7,14,45...|
|(55004,[5,7,14,45...|
|(55004,[5,7,14,45...|
|(55004,[5,7,14,45...|
|(55004,[5,7,14,45...|
+--------------------+
only showing top 20 rows



23/06/26 17:20:17 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 974522 ms exceeds timeout 120000 ms
23/06/26 17:20:17 WARN SparkContext: Killing executors is not supported by current scheduler.


In [None]:

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Logistic regression capped at 20 iterations, with both regularization
# parameters set to zero.
classifier = LogisticRegression(maxIter=20, regParam=0.000, elasticNetParam=0.000)

# Two-stage pipeline: formula-based feature construction, then the classifier.
stages = [interactor, classifier]
pipeline = Pipeline(stages=stages)

# Fit on the training split and score the held-out split.
model = pipeline.fit(df_train)
predictions = model.transform(df_test)

# The predictions are used twice below (show + evaluate), so cache them.
predictions.cache()
predictions.show()

# Area under the ROC curve, computed from the "rawPrediction" column.
ev = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", metricName = "areaUnderROC")
print(ev.evaluate(predictions))

# Shut the Spark session down once the evaluation has been printed.
spark.stop()