##### In this session, we will see how to select the best model for production by comparing parameters and metrics. We will also look at model versioning as the parameters change.

> ## Welcome To MLflow Tracking
---

In [0]:
# Read the prepared train and test splits from the silver volume,
# using a fresh reader for each path.
train_df, test_df = (
    spark.read.load(split_path)
    for split_path in (
        "/Volumes/dai/phase2/silver/train_df",
        "/Volumes/dai/phase2/silver/test_df",
    )
)

In [0]:
# Show the first ten training rows as a quick sanity check of the loaded data.
preview_rows = train_df.head(10)
display(preview_rows)

CustomerID,total_spent,total_transactions,total_quantity,last_purchase_date,is_high_valued,class_weight,features
,1447682.1199996774,135080,269562,2011-12-09,1,2.418694690265487,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1447682.1199996774"",""135080.0"",""269562.0""]}"
12346.0,0.0,2,0,2011-01-18,0,0.6302969155376189,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.0"",""2.0"",""0.0""]}"
12347.0,4309.999999999997,182,2458,2011-12-07,1,2.418694690265487,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""4309.999999999997"",""182.0"",""2458.0""]}"
12348.0,1797.24,31,2341,2011-09-25,0,0.6302969155376189,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1797.24"",""31.0"",""2341.0""]}"
12349.0,1757.55,73,631,2011-11-21,0,0.6302969155376189,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1757.55"",""73.0"",""631.0""]}"
12350.0,334.40000000000003,17,197,2011-02-02,0,0.6302969155376189,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""334.40000000000003"",""17.0"",""197.0""]}"
12352.0,1545.4100000000003,95,470,2011-11-03,0,0.6302969155376189,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1545.4100000000005"",""95.0"",""470.0""]}"
12353.0,89.0,4,20,2011-05-19,0,0.6302969155376189,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""89.0"",""4.0"",""20.0""]}"
12355.0,459.4,13,240,2011-05-09,0,0.6302969155376189,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""459.4"",""13.0"",""240.0""]}"
12356.0,2811.4300000000007,59,1591,2011-11-17,1,2.418694690265487,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""2811.4300000000007"",""59.0"",""1591.0""]}"


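> ##### As on the previous day, the features used for training must be combined into a single vector. The preview above already shows an assembled `features` column, so the assembler cell below is kept commented out. I will use a `RandomForestClassifier` here.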

In [0]:
# from pyspark.ml.feature import VectorAssembler

# assembler = VectorAssembler(
#     inputCols=["total_spent", "total_transactions", "total_quantity"],
#     outputCol="features"
# )

# train_df = assembler.transform(train_df)
# test_df = assembler.transform(test_df)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Shared evaluator for every run in this notebook: compares the model's
# rawPrediction column against the is_high_valued label using area under ROC.
evaluator_settings = {
    "labelCol": "is_high_valued",
    "rawPredictionCol": "rawPrediction",
    "metricName": "areaUnderROC",
}
evaluator = BinaryClassificationEvaluator(**evaluator_settings)


In [0]:
import mlflow
import mlflow.spark
from pyspark.ml.classification import RandomForestClassifier

In [0]:
# Select the MLflow experiment that all runs started below are recorded under.
experiment_path = "/Workspace/Users/kalyanmistcse@gmail.com/Databricks 14 Day AI Challenge - 02/Day - 07"
mlflow.set_experiment(experiment_path)

In [0]:
# Train one RandomForest baseline inside a tracked MLflow run.
with mlflow.start_run():
    num_trees = 46
    max_depth = 7

    # Single source of truth for the tuned hyperparameters: the same dict
    # configures the classifier and is what gets logged, so the two cannot drift.
    rf_params = {"numTrees": num_trees, "maxDepth": max_depth}

    # Log the configuration BEFORE training so that a run which fails during
    # fit/evaluate still records which settings were attempted.
    mlflow.log_params(rf_params)

    rf = RandomForestClassifier(
        featuresCol="features",
        labelCol="is_high_valued",
        weightCol="class_weight",
        **rf_params,
    )

    model = rf.fit(train_df)

    # Score the test split and compute the metric configured on `evaluator`.
    predictions = model.transform(test_df)
    auc = evaluator.evaluate(predictions)

    mlflow.log_metric("AUC-ROC", auc)

    # Store the fitted Spark model under this run.
    mlflow.spark.log_model(model, "random_forest_model", dfs_tmpdir="/Volumes/dai/phase2/silver/tmp")

    # Optional, left disabled: attach an extra file to the run.
    # mlflow.log_artifact("/dbfs/FileStore/tables/RF.png")

# To reload the logged model later (substitute the real run id):
# loaded_model = mlflow.spark.load_model("runs:/<run_id>/random_forest_model")

print("AUC:", auc)




AUC: 0.9999429070118346


In [0]:
# Coarse sweep over numTrees: every value gets its own MLflow run so the
# runs can be compared side by side in the MLflow UI.
max_depth = 7  # held constant for the whole sweep

for i in [12, 24, 35, 19, 88]:
    with mlflow.start_run():
        num_trees = i

        # Single source of truth for the tuned hyperparameters: the same dict
        # configures the classifier and is what gets logged, so the two cannot drift.
        rf_params = {"numTrees": num_trees, "maxDepth": max_depth}

        # Log the configuration BEFORE training so that a run which fails during
        # fit/evaluate still records which settings were attempted.
        mlflow.log_params(rf_params)

        rf = RandomForestClassifier(
            featuresCol="features",
            labelCol="is_high_valued",
            weightCol="class_weight",
            **rf_params,
        )

        model = rf.fit(train_df)

        # Score the test split and compute the metric configured on `evaluator`.
        predictions = model.transform(test_df)
        auc = evaluator.evaluate(predictions)

        mlflow.log_metric("AUC-ROC", auc)

        # Store the fitted Spark model under this run.
        mlflow.spark.log_model(model, "random_forest_model", dfs_tmpdir="/Volumes/dai/phase2/silver/tmp")

        # Optional, left disabled: attach an extra file to the run.
        # mlflow.log_artifact("/dbfs/FileStore/tables/RF.png")

    # To reload a logged model later (substitute the real run id):
    # loaded_model = mlflow.spark.load_model("runs:/<run_id>/random_forest_model")

    # Printed once per iteration, after the run has been closed.
    print("AUC:", auc)




AUC: 0.9999429070118346




AUC: 0.9999592192941675




AUC: 0.9999102824471686




AUC: 0.9999021263060022




AUC: 0.9999347508706681


### By comparing the models in the MLflow UI, I found that models with `numTrees` in the range 20-30 give the best `AUC-ROC` score

In [0]:
# Fine sweep over numTrees = 20..29: one tagged MLflow run per value.
max_depth = 7  # held constant for the whole sweep

for i in range(20, 30):
    with mlflow.start_run() as run:
        num_trees = i

        # Single source of truth for the tuned hyperparameters: the same dict
        # configures the classifier and is what gets logged, so the two cannot drift.
        rf_params = {"numTrees": num_trees, "maxDepth": max_depth}

        # Log the configuration and the identifying tag BEFORE training so that
        # a run which fails during fit/evaluate can still be identified.
        mlflow.log_params(rf_params)
        mlflow.set_tag("unique_run_id", f"rf_{num_trees}_trees_{max_depth}_depth_{run.info.run_id}")

        rf = RandomForestClassifier(
            featuresCol="features",
            labelCol="is_high_valued",
            weightCol="class_weight",
            **rf_params,
        )

        model = rf.fit(train_df)

        # Score the test split and compute the metric configured on `evaluator`.
        predictions = model.transform(test_df)
        auc = evaluator.evaluate(predictions)

        mlflow.log_metric("AUC-ROC", auc)

        # Store the fitted Spark model under this run.
        mlflow.spark.log_model(model, "random_forest_model", dfs_tmpdir="/Volumes/dai/phase2/silver/tmp")

    # Printed once per iteration, after the run has been closed.
    print("AUC:", auc)



AUC: 0.9998531894590031




AUC: 0.9999429070118345




AUC: 0.9996900666356734




AUC: 0.9997879403296713




AUC: 0.9999592192941675




AUC: 0.9998531894590031




AUC: 0.9999265947295015




AUC: 0.9998939701648356




AUC: 0.9998776578825026




AUC: 0.9999347508706681


![image](/Workspace/Users/kalyanmistcse@gmail.com/Databricks 14 Day AI Challenge - 02/Day - 07/image.webp)
---

### As you can see in the above image, with `numTrees` = 24, I am getting the best performance