In [0]:
train_df = spark.read.load("/Volumes/dai/phase2/silver/train_df")
display(train_df.head(5))

CustomerID,total_spent,total_transactions,total_quantity,last_purchase_date,is_high_valued,class_weight,features
,1447682.1199996774,135080,269562,2011-12-09,1,2.418694690265487,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1447682.1199996774"",""135080.0"",""269562.0""]}"
12346.0,0.0,2,0,2011-01-18,0,0.6302969155376189,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.0"",""2.0"",""0.0""]}"
12347.0,4309.999999999997,182,2458,2011-12-07,1,2.418694690265487,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""4309.999999999997"",""182.0"",""2458.0""]}"
12348.0,1797.24,31,2341,2011-09-25,0,0.6302969155376189,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1797.24"",""31.0"",""2341.0""]}"
12349.0,1757.55,73,631,2011-11-21,0,0.6302969155376189,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1757.55"",""73.0"",""631.0""]}"


In [0]:
test_df = spark.read.load("/Volumes/dai/phase2/silver/test_df")

In [0]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(
    featuresCol="features",
    labelCol="is_high_valued",
    weightCol="class_weight"
)

In [0]:
from pyspark.ml.tuning import ParamGridBuilder

paramGrid_lr = (ParamGridBuilder()
    .addGrid(lr.regParam, [0.0, 0.01, 0.1])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build())

## Performing HyperParameter Tuning for Logistic Regression

Training and evaluation is also done in the process

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

reg_params = [0.0, 0.01, 0.1]
elastic_net_params = [0.0, 0.5, 1.0]

best_model = None
best_score = float('-inf')
best_params = None

for reg in reg_params:
    for enet in elastic_net_params:
        lr = LogisticRegression(featuresCol="features", labelCol="is_high_valued",
                               regParam=reg, elasticNetParam=enet)
        model = lr.fit(train_df)
        predictions = model.transform(test_df)
        evaluator = BinaryClassificationEvaluator(labelCol="is_high_valued", metricName="areaUnderROC")
        score = evaluator.evaluate(predictions)
        if score > best_score:
            best_score = score
            best_model = model
            best_params = {'regParam': reg, 'elasticNetParam': enet}

print(f"Best params: {best_params}, Best ROC AUC: {best_score}")

Best params: {'regParam': 0.0, 'elasticNetParam': 0.0}, Best ROC AUC: 1.0


## Performing HyperParameter Tuning for Logistic Regression

Training and evaluation is also done in the process

In [0]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

num_trees = [10, 30, 45, 60]
max_depths = [5, 10, 20]

best_rf_model = None
best_rf_score = float('-inf')
best_rf_params = None

for ntrees in num_trees:
    for depth in max_depths:
        rf = RandomForestClassifier(
            featuresCol="features",
            labelCol="is_high_valued",
            numTrees=ntrees,
            maxDepth=depth
        )
        rf_model = rf.fit(train_df)
        rf_predictions = rf_model.transform(test_df)
        evaluator = BinaryClassificationEvaluator(labelCol="is_high_valued", metricName="areaUnderROC")
        rf_score = evaluator.evaluate(rf_predictions)
        if rf_score > best_rf_score:
            best_rf_score = rf_score
            best_rf_model = rf_model
            best_rf_params = {'numTrees': ntrees, 'maxDepth': depth}

print(f"Best params: {best_rf_params}, Best ROC AUC: {best_rf_score}")

Best params: {'numTrees': 45, 'maxDepth': 20}, Best ROC AUC: 0.9996125832945917
