In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [0]:
# Obtain the Spark session for this notebook; getOrCreate() returns the
# already-active session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("IoT_Traffic_Model_Testing")
    .getOrCreate()
)

# Source table for the whole notebook: every later cell derives its data from `df`.
df = spark.table("iot_edge_computing_public_management_cleaned")

In [0]:
# Column holding the target; it is index-encoded into "label" by the pipeline.
label_col = "accident_hotspot"

# Input columns fed to the VectorAssembler, in the order they appear in the
# assembled "features" vector.
feature_cols = [
    "vehicle_speed_kmph",
    "latitude",
    "longitude",
    "hour_of_day",
    "day_of_week",
]

In [0]:
# Packs the columns listed in `feature_cols` into the single vector column
# "features" that the classifier reads.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
)

# Encodes the target column as a numeric index in "label".
# By default StringIndexer orders labels by descending frequency, so index 0.0
# is the most common class in the data the indexer is fitted on.
label_indexer = StringIndexer(
    inputCol=label_col,
    outputCol="label",
)

In [0]:
# Hold out roughly 20% of the rows for evaluation; the fixed seed keeps the
# split repeatable for the same input DataFrame.
train_df, test_df = df.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

In [0]:
# Random forest estimator (unfitted). It reads the assembled "features" vector
# and the indexed "label" column produced by the earlier pipeline stages.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=50,   # size of the ensemble
    maxDepth=8,    # depth cap per tree
    seed=42,       # fixed seed for repeatable training
)

In [0]:
# Stage order matters: the label is indexed and the features are assembled
# before the classifier sees the data.
pipeline = Pipeline(
    stages=[
        label_indexer,
        assembler,
        rf,
    ]
)

# Fit every stage on the training split only; the result is a PipelineModel
# whose last stage is the trained random forest.
model = pipeline.fit(train_df)


In [0]:
# Run the fitted pipeline over the held-out split; this adds the indexed
# "label" column and the model's "prediction" column.
predictions = model.transform(test_df)

# Show identifying columns next to the predicted and true label indices.
display(
    predictions.select(
        "sensor_id",
        "vehicle_speed_kmph",
        "latitude",
        "longitude",
        "prediction",
        "label",
    )
)

sensor_id,vehicle_speed_kmph,latitude,longitude,prediction,label
Vehicle_9,34,38.23071478990284,-73.82381311916097,0.0,0.0
Vehicle_2,66,36.26349187270031,-77.03314129864162,0.0,0.0
Vehicle_4,100,38.55062888574194,-118.37285438026656,0.0,0.0
Vehicle_2,38,39.10920356232421,-79.91782381292626,0.0,0.0
Vehicle_4,103,37.44630600905868,-95.60845402669736,0.0,0.0
Vehicle_5,79,39.18091248953248,-91.42794442727732,0.0,0.0
Vehicle_6,60,39.39085482236847,-77.37719618182187,0.0,1.0
Vehicle_8,24,39.5665924532276,-93.17240022552484,0.0,0.0
Vehicle_1,64,36.82673946815028,-104.44921741855654,0.0,1.0
Vehicle_5,59,34.852504826472114,-103.96538290741704,0.0,0.0


In [0]:
# Evaluate the held-out predictions.
#
# Both evaluators compare the indexed "label" column with "prediction".
# Note that metricName="f1" is the class-frequency-weighted F1, so on an
# imbalanced target both numbers below can look healthy even when the model
# never predicts the minority class.
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)

accuracy = evaluator_acc.evaluate(predictions)
f1_score = evaluator_f1.evaluate(predictions)

print(f"✅ Model Accuracy: {accuracy:.4f}")
print(f"✅ Model F1 Score: {f1_score:.4f}")

# Confusion counts (one row per observed label/prediction pair). A missing
# prediction value for a label that exists means the model never predicts that
# class, which the two aggregate scores above cannot reveal on their own.
display(
    predictions.groupBy("label", "prediction")
    .count()
    .orderBy("label", "prediction")
)

✅ Model Accuracy: 0.8090
✅ Model F1 Score: 0.7236


In [0]:
# Column names for the hand-made rows, in the same order as the tuple fields.
new_schema = ["vehicle_speed_kmph", "latitude", "longitude", "hour_of_day", "day_of_week"]

# Two synthetic sensor readings: (speed, lat, lon, hour, day).
# NOTE(review): these coordinates (lat ~19-29, lon ~72-77) are far from the
# rows shown in the test-set sample output (lat ~34-39, negative longitudes),
# so they may lie outside the range the model was trained on — confirm.
new_data = [
    (90, 19.0760, 72.8777, 14, 4),
    (35, 28.6139, 77.2090, 9, 2),
]

new_df = spark.createDataFrame(new_data, new_schema)

In [0]:
# The new rows carry no "accident_hotspot" column, so instead of running the
# whole fitted pipeline the features are assembled directly and only the final
# pipeline stage (the trained classifier) is applied.
new_features = assembler.transform(new_df)
new_predictions = (
    model.stages[-1]  # last stage of the fitted pipeline
    .transform(new_features)
)

In [0]:
# Show the inputs that identify each synthetic reading next to the predicted
# label index ("prediction" is the indexed class, not the original label value).
display(
    new_predictions.select(
        "vehicle_speed_kmph",
        "latitude",
        "longitude",
        "prediction",
    )
)

print("✅ Model successfully tested with new IoT sensor data.")

vehicle_speed_kmph,latitude,longitude,prediction
90,19.076,72.8777,0.0
35,28.6139,77.209,0.0


✅ Model successfully tested with new IoT sensor data.
