## Environment Setup and Configuration

In [1]:
from controller import PipelineController
from data_ingestion import DataIngestion
from preprocessing import Preprocessor
from local_model_manager import LocalModelManager
from prediction_manager import PredictionManager
from evaluation import Evaluator
from utilities import show_compact

# Pipeline configuration handed to PipelineController in this cell.
config = {
    "databricks_data_path": "/mnt/2025-team6/fulldataset_ECG5000.csv",
    "local_data_path": "/fulldataset_ECG5000.csv",  # This is relative to the project root
    "data_percentage": 0.5,  # Load 50% of the data for testing/iteration
    "local_model_config": {
        "num_partitions": 2,
        # NOTE(review): "n_splitters" holds the literal string "num_partitions", not the number
        # above — presumably resolved inside LocalModelManager; confirm.
        # NOTE(review): the ensemble printed later in this notebook reports random_state=123,
        # so verify that these model_params are actually applied to the model.
        "model_params": {"n_splitters": "num_partitions", "max_depth": 2, "random_state": 42}
    }
}

# Instantiate the pipeline controller from the config and bring up its Spark session.
# NOTE(review): _setup_spark() is underscore-prefixed, i.e. presumably a private hook —
# confirm whether a public entry point exists.
controller = PipelineController(config)
controller._setup_spark()

# Dump the effective Spark configuration and the controller's ingestion config.
print("Spark session info:", controller.spark.sparkContext.getConf().getAll(), sep="\n")
print("Ingestion Config:", controller.ingestion_config, sep="\n")

# The evaluator is attached by hand; later cells use it for timing and metrics.
controller.evaluator = Evaluator()


Using local Spark session.
Current working directory (project root): d:\repos\BigData-main\BigData-1\code\src
Spark session info:
[('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDi

## Data Ingestion

In [2]:
# Build the ingestion stage from the controller's Spark session and ingestion config.
controller.ingestion = DataIngestion(controller.spark, controller.ingestion_config)
# The "ingestion" timer brackets only load_data(); keep start_timer/record_time
# immediately around that call so the recorded duration stays meaningful.
controller.evaluator.start_timer("ingestion")
df_raw = controller.ingestion.load_data()
controller.evaluator.record_time("ingestion")
# Print the schema of the loaded frame for inspection.
print("\nRaw Data Schema:")
df_raw.printSchema()

Data Path is d:\repos\BigData-main\BigData-1/fulldataset_ECG5000.csv and loading 50.0% of data ++++++++++++++++++++++

Data size is :2485



2025-04-13 19:10:24,873 - evaluation - INFO - ingestion took 4.04s



Raw Data Schema:
root
 |-- label: integer (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: double (nullable = true)
 |-- _c6: double (nullable = true)
 |-- _c7: double (nullable = true)
 |-- _c8: double (nullable = true)
 |-- _c9: double (nullable = true)
 |-- _c10: double (nullable = true)
 |-- _c11: double (nullable = true)
 |-- _c12: double (nullable = true)
 |-- _c13: double (nullable = true)
 |-- _c14: double (nullable = true)
 |-- _c15: double (nullable = true)
 |-- _c16: double (nullable = true)
 |-- _c17: double (nullable = true)
 |-- _c18: double (nullable = true)
 |-- _c19: double (nullable = true)
 |-- _c20: double (nullable = true)
 |-- _c21: double (nullable = true)
 |-- _c22: double (nullable = true)
 |-- _c23: double (nullable = true)
 |-- _c24: double (nullable = true)
 |-- _c25: double (nullable = true)
 |-- _c26: double (nullable = true)
 |-- _c27: d

## Preprocessing

In [3]:
# The preprocessor is constructed with an empty config dict.
controller.preprocessor = Preprocessor(config={})
# The "preprocessing" timer brackets only run_preprocessing() on the raw frame.
controller.evaluator.start_timer("preprocessing")
df_preproc = controller.preprocessor.run_preprocessing(df_raw)
controller.evaluator.record_time("preprocessing")
print("\nSample of Preprocessed Data:")
# Compact preview: in the saved output this shows 5 rows with the first and last
# three columns and the middle columns elided.
show_compact(df_preproc, num_rows=5, num_cols=3)

2025-04-13 19:10:29,122 - evaluation - INFO - preprocessing took 4.24s



Sample of Preprocessed Data:
_c1 | _c2 | _c3 | ... | _c139 | _c140 | label
0.4812468208445704 | 0.29627268292525955 | 0.10610580916728928 | ... | 0.7715435872746609 | 0.34503209866662515 | 1
0.6437916205754234 | 0.5953278761279595 | 0.35074416035524136 | ... | 0.8260735772068618 | 0.556802983178861 | 1
0.5499645692470915 | 0.41409135597853736 | 0.1297395913599849 | ... | 0.7901192932078601 | 0.39044852764086846 | 1
0.6135705088911871 | 0.5346905271482562 | 0.24744883094892559 | ... | 0.7540852506473523 | 0.3773993232713295 | 1
0.5828642308887235 | 0.5111218200769114 | 0.23239215809205377 | ... | 0.5766544592026659 | 0.27056986897459895 | 1


## Train Local Models

In [4]:
# Configure the local model manager from the "local_model_config" section
# of the controller's config (None when that section is absent).
local_model_cfg = controller.config.get("local_model_config", None)
controller.model_manager = LocalModelManager(config=local_model_cfg)

# The "training" timer brackets only the ensemble training call.
controller.evaluator.start_timer("training")
ensemble = controller.model_manager.train_ensemble(df_preproc)
controller.evaluator.record_time("training")

# Report how many trees the manager holds, or that training produced nothing.
print("\nTrained ensemble details: ")
if not ensemble:
    print("No ensemble was created.")
else:
    print("Ensemble is build with", len(controller.model_manager.trees), "trees.")

2025-04-13 19:10:40,860 - evaluation - INFO - training took 11.36s



Trained ensemble details: 
Ensemble is build with 2 trees.


## Generate Predictions

In [5]:

# Wrap the trained ensemble in a PredictionManager bound to the Spark session.
controller.predictor = PredictionManager(controller.spark, ensemble)
# The "prediction" timer brackets only generate_predictions().
controller.evaluator.start_timer("prediction")
# NOTE(review): predictions are generated on df_preproc, the same frame passed to
# train_ensemble — unless a train/test split happens inside the managers, any metrics
# computed from predictions_df are training-set metrics; confirm.
predictions_df = controller.predictor.generate_predictions(df_preproc)
controller.evaluator.record_time("prediction")
print("\nPredictions Summary:")
# Row count per predicted class.
predictions_df.groupBy("prediction").count().show()

2025-04-13 19:10:41,340 - evaluation - INFO - prediction took 0.47s



Predictions Summary:
+----------+-----+
|prediction|count|
+----------+-----+
|         1| 1524|
|         3|   15|
|         5|    2|
|         4|   38|
|         2|  906|
+----------+-----+



In [6]:
# Peek at the first five rows of predicted vs. true labels, without truncating values.
predictions_df.select("prediction", "label").show(5, truncate=False)

+----------+-----+
|prediction|label|
+----------+-----+
|1         |1    |
|1         |1    |
|1         |1    |
|1         |1    |
|1         |1    |
+----------+-----+
only showing top 5 rows



## Evaluation

In [7]:
# Show the repr of the trained ensemble object (its constructor parameters in the saved output).
print(ensemble)

ProximityForest(n_jobs=-1, n_trees=2, random_state=123)


In [8]:
import json 
# Compute the evaluation report from the predictions and the trained ensemble.
# NOTE(review): in the saved run this call fails with a Py4JJavaError caused by a
# Python worker socket "TimeoutError: timed out", so `report` is not assigned by this
# cell's recorded execution and the lines below never ran.
report = controller.evaluator.log_metrics(predictions_df, ensemble=ensemble)
print("\nFinal Evaluation Report:")
# Pretty-print the report as indented JSON.
print(json.dumps(report, indent=2))

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 21.0 failed 1 times, most recent failure: Lost task 0.0 in stage 21.0 (TID 30) (razvan.petru1-everest.nord executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1225, in main
    eval_type = read_int(infile)
                ^^^^^^^^^^^^^^^^
  File "C:\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 594, in read_int
    length = stream.read(4)
             ^^^^^^^^^^^^^^
  File "C:\Users\Petru\anaconda3\envs\bigdata_env\Lib\socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
TimeoutError: timed out

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:181)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:181)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1225, in main
    eval_type = read_int(infile)
                ^^^^^^^^^^^^^^^^
  File "C:\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 594, in read_int
    length = stream.read(4)
             ^^^^^^^^^^^^^^
  File "C:\Users\Petru\anaconda3\envs\bigdata_env\Lib\socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
TimeoutError: timed out

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:181)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


In [None]:
# Print the raw evaluation report dict.
# NOTE(review): the saved output of this cell looks stale — its '_meta' timestamp
# (17:23:11) and timings differ from the 19:10 run logged in the cells above, and the
# cell has no execution count; re-run after the evaluation cell succeeds.
print(report)

{'performance': {'accuracy': 0.892116, 'error_rate': 0.107884, 'precision': 0.887378, 'recall': 0.892116, 'f1_score': 0.878523}, 'timing': {'ingestion_time': 0.1536, 'preprocessing_time': 1.4343, 'training_time': 4.9439, 'prediction_time': 0.2858}, 'complexity': {'num_trees': 2, 'avg_depth': 9.0, 'avg_leaves': 35.5, 'avg_splits': 27.0}, '_meta': {'decimal_precision': 8, 'timestamp': '2025-04-13 17:23:11'}}


In [None]:
from pathlib import Path

from visualization import plot_confusion_matrix
import numpy as np

# Plot and save the confusion matrix when the evaluation report provides one;
# otherwise say so explicitly instead of silently producing no output.
if "confusion_matrix" in report:
    # Get class names from confusion matrix dimensions
    # NOTE(review): names are 0-based ("Class 0", ...), while the predictions shown
    # earlier in this notebook use labels 1-5 — confirm the matrix row order/labels
    # against the Evaluator before relying on these tick labels.
    num_classes = len(report["confusion_matrix"])
    class_names = [f"Class {i}" for i in range(num_classes)]

    # Make sure the output directory exists so saving the PDF cannot fail on a
    # fresh checkout.
    save_path = "pdf_result/sconfusion_matrix.pdf"
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)

    # Create plot that shows AND saves
    plot_confusion_matrix(
        np.array(report["confusion_matrix"]),
        class_names,
        save_path=save_path,  # Save to file
        show=True             # Display on screen
    )
else:
    # The report has no confusion matrix (e.g. the evaluator did not compute one).
    print(
        "No 'confusion_matrix' entry in the report; skipping the plot. "
        "Available keys:", list(report)
    )