In [None]:
import findspark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
# pyspark.ml.regression provides no MultilayerPerceptronRegressor; importing that
# name raises ImportError, so it is not imported here. FMRegressor (Spark >= 3.0)
# is the regressor trained on the scaled features instead.
from pyspark.ml.regression import (
    DecisionTreeRegressor,
    FMRegressor,
    GBTRegressor,
    LinearRegression,
    RandomForestRegressor,
)
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr

In [3]:
# Let findspark locate the local Spark installation for this kernel.
# NOTE(review): the pyspark imports in the first cell run before this call, so
# it presumably only matters for environment setup — confirm it is still needed.
findspark.init()

In [4]:
# Create the Spark session (or reuse a running one) used by every later cell.
spark = (
    SparkSession.builder
    .appName("RegressionModels")
    .getOrCreate()
)

In [5]:
# Show the session object as this cell's output.
spark

In [12]:
# Machine-specific location of the delivery dataset; edit when running elsewhere.
DELIVERY_CSV_PATH = r"C:\Users\Robyi\Documents\Data Science Dataset\delivery.csv"

# The first row holds the column names; column types are inferred from the data.
df = spark.read.csv(DELIVERY_CSV_PATH, header=True, inferSchema=True)

In [13]:
# Preview the first 20 rows of the raw data.
df.show(n=20)

+--------+-----------+-------+-------------+-----------+------------+--------------------+----------------------+-----------------+
|Order_ID|Distance_km|Weather|Traffic_Level|Time_of_Day|Vehicle_Type|Preparation_Time_min|Courier_Experience_yrs|Delivery_Time_min|
+--------+-----------+-------+-------------+-----------+------------+--------------------+----------------------+-----------------+
|     522|       7.93|  Windy|          Low|  Afternoon|     Scooter|                  12|                   1.0|               43|
|     738|      16.42|  Clear|       Medium|    Evening|        Bike|                  20|                   2.0|               84|
|     741|       9.52|  Foggy|          Low|      Night|     Scooter|                  28|                   1.0|               59|
|     661|       7.44|  Rainy|       Medium|  Afternoon|     Scooter|                   5|                   1.0|               37|
|     412|      19.03|  Clear|          Low|    Morning|        Bike|       

In [14]:
# Remove rows that are exact duplicates across all columns.
df = df.drop_duplicates()

In [15]:
# Discard every row that has a null in at least one column.
df = df.na.drop(how="any")

In [16]:
# Numeric predictors fed to the models. They are kept in a named list so the
# feature-importance report at the end of the notebook can label each vector
# position by name. The categorical columns visible in the preview (Weather,
# Traffic_Level, Time_of_Day, Vehicle_Type) are not part of the feature vector.
feature_columns = ["Distance_km", "Preparation_Time_min", "Courier_Experience_yrs"]

# Pack the predictors into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df)

In [17]:
# Scale the assembled vector into "features_scaled".
# NOTE(review): the scaler is fitted on the whole DataFrame, i.e. before the
# train/test split, so test rows influence the scaling statistics — consider
# fitting on the training split only.
Standarization = StandardScaler(inputCol="features", outputCol="features_scaled")
scaler_model = Standarization.fit(df)
df = scaler_model.transform(df)

In [18]:
# Preview the first 20 rows, now including the "features" and "features_scaled" columns.
df.show(n=20)

+--------+-----------+-------+-------------+-----------+------------+--------------------+----------------------+-----------------+----------------+--------------------+
|Order_ID|Distance_km|Weather|Traffic_Level|Time_of_Day|Vehicle_Type|Preparation_Time_min|Courier_Experience_yrs|Delivery_Time_min|        features|     features_scaled|
+--------+-----------+-------+-------------+-----------+------------+--------------------+----------------------+-----------------+----------------+--------------------+
|     573|       8.23|  Snowy|          Low|    Evening|        Bike|                  17|                   3.0|               56| [8.23,17.0,3.0]|[1.44675765980526...|
|     676|      19.66|  Clear|          Low|    Morning|         Car|                  19|                   6.0|               70|[19.66,19.0,6.0]|[3.45604563691027...|
|     120|      11.02|  Clear|         High|    Morning|     Scooter|                  22|                   2.0|               70|[11.02,22.0,2.0]|[1

In [19]:
# 80/20 hold-out split with a fixed seed.
train_fraction, test_fraction = 0.8, 0.2
train, test = df.randomSplit([train_fraction, test_fraction], seed=42)

In [None]:
# Linear regression on the unscaled feature vector.
lr = LinearRegression(featuresCol="features", labelCol="Delivery_Time_min")

# 3 x 3 grid: regularisation strength x elastic-net mixing
# (0.0 = pure L2, 1.0 = pure L1).
paramGrid_lr = (ParamGridBuilder()
                .addGrid(lr.regParam, [0.01, 0.1, 0.5])
                .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
                .build())

# 5-fold cross-validation scored with RegressionEvaluator's default metric.
# This cell only configures the search; no cv_lr.fit(...) call appears in the
# notebook cells shown.
cv_lr = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid_lr,
    evaluator=RegressionEvaluator(labelCol="Delivery_Time_min"),
    numFolds=5,
    seed=42,  # fixed so fold assignment is repeatable, like the train/test split
)

In [None]:
# Single regression tree on the unscaled feature vector.
dt = DecisionTreeRegressor(featuresCol="features", labelCol="Delivery_Time_min")

# 3 x 3 grid: tree depth x minimum number of rows required in each child node.
paramGrid_dt = (ParamGridBuilder()
                .addGrid(dt.maxDepth, [3, 5, 10])
                .addGrid(dt.minInstancesPerNode, [1, 5, 10])
                .build())

# 5-fold cross-validation scored with RegressionEvaluator's default metric.
# This cell only configures the search; no cv_dt.fit(...) call appears in the
# notebook cells shown.
cv_dt = CrossValidator(
    estimator=dt,
    estimatorParamMaps=paramGrid_dt,
    evaluator=RegressionEvaluator(labelCol="Delivery_Time_min"),
    numFolds=5,
    seed=42,  # fixed so fold assignment is repeatable, like the train/test split
)

In [None]:
# Random forest on the unscaled feature vector. The forest samples rows and
# features randomly, so the estimator gets an explicit seed as well.
rf = RandomForestRegressor(featuresCol="features", labelCol="Delivery_Time_min", seed=42)

# 3 x 3 grid: number of trees x maximum depth of each tree.
paramGrid_rf = (ParamGridBuilder()
                .addGrid(rf.numTrees, [10, 50, 100])
                .addGrid(rf.maxDepth, [5, 10, 15])
                .build())

# 5-fold cross-validation scored with RegressionEvaluator's default metric.
# This cell only configures the search; no cv_rf.fit(...) call appears in the
# notebook cells shown.
cv_rf = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid_rf,
    evaluator=RegressionEvaluator(labelCol="Delivery_Time_min"),
    numFolds=5,
    seed=42,  # fixed so fold assignment is repeatable, like the train/test split
)

In [None]:
# Gradient-boosted trees on the unscaled feature vector; seeded so repeated
# runs of the notebook train the same ensemble.
gbt = GBTRegressor(featuresCol="features", labelCol="Delivery_Time_min", seed=42)

# 3 x 3 grid: depth of each tree x number of boosting iterations.
paramGrid_gbt = (ParamGridBuilder()
                .addGrid(gbt.maxDepth, [3, 5, 10])
                .addGrid(gbt.maxIter, [10, 50, 100])
                .build())

# 5-fold cross-validation scored with RegressionEvaluator's default metric.
cv_gbt = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGrid_gbt,
    evaluator=RegressionEvaluator(labelCol="Delivery_Time_min"),
    numFolds=5,
    seed=42,  # fixed so fold assignment is repeatable, like the train/test split
)

# Runs the full search (9 grid points x 5 folds) on the training split; the
# later cells read the winning model from gbt_model.
gbt_model = cv_gbt.fit(train)

In [None]:
# Spark ML has no multilayer-perceptron regressor. MultilayerPerceptronClassifier
# is a classifier whose output layer enumerates class indices, so it cannot be
# fitted to the continuous Delivery_Time_min label. A factorization-machine
# regressor (Spark >= 3.0) is used instead: like the intended network it is
# trained by gradient descent, so it is fed the standardised features and its
# step size is tuned.
fm = FMRegressor(featuresCol="features_scaled", labelCol="Delivery_Time_min", seed=42)

# 2 x 2 grid: size of the latent interaction factors x optimiser step size.
paramGrid = (ParamGridBuilder()
             .addGrid(fm.factorSize, [4, 8])
             .addGrid(fm.stepSize, [0.01, 0.1])
             .build())

# Evaluator scoring each fold by RMSE against the true delivery time.
evaluator = RegressionEvaluator(
    labelCol="Delivery_Time_min", predictionCol="prediction", metricName="rmse"
)

# This cell only configures the search; call crossval.fit(train) to run it.
crossval = CrossValidator(
    estimator=fm,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,  # fixed so fold assignment is repeatable, like the train/test split
)

In [None]:
# Take the model selected by the cross-validated GBT grid search and print it.
best_model = gbt_model.bestModel
print(f"Best Model: {best_model}")

In [None]:
# Score the held-out test split with the selected model.
predictions = best_model.transform(test)

In [None]:
# Both evaluators compare the "prediction" column against the true delivery time.
_eval_columns = dict(labelCol="Delivery_Time_min", predictionCol="prediction")

# Root-mean-squared error on the test predictions.
evaluator_rmse = RegressionEvaluator(metricName="rmse", **_eval_columns)
rmse = evaluator_rmse.evaluate(predictions)

# Coefficient of determination on the test predictions.
evaluator_r2 = RegressionEvaluator(metricName="r2", **_eval_columns)
r2 = evaluator_r2.evaluate(predictions)

print(f"Best Model RMSE: {rmse:.4f}")
print(f"Best Model R-squared: {r2:.4f}")

In [None]:
# Print the heading followed by the selected model's parameter description.
print(
    "Best Parameters for Gradient Boosting Regressor:",
    best_model.explainParams(),
    sep="\n",
)

In [None]:
# Compare predicted and actual delivery time for the first 10 test rows.
predicted_vs_actual = predictions.select(["prediction", "Delivery_Time_min"])
predicted_vs_actual.show(n=10)

In [None]:
# Residual = actual minus predicted delivery time (positive: model under-predicted).
predictions = predictions.withColumn("residual", expr("Delivery_Time_min - prediction"))

In [None]:
# Show actual value, prediction and residual for the first 10 test rows.
residual_view = predictions.select(["Delivery_Time_min", "prediction", "residual"])
residual_view.show(n=10)

In [None]:
# Importance vector of the selected GBT model, one entry per input feature.
feature_importances = best_model.featureImportances

# Take the feature names from the assembler that built the "features" vector,
# so each label always matches its position in the importance vector.
feature_columns = assembler.getInputCols()

# toArray() yields a dense array, so zero-importance features are printed too.
for idx, importance in enumerate(feature_importances.toArray()):
    print(f"Feature {feature_columns[idx]}: {importance:.4f}")