In [2]:
# data_ingestion.py

"""
data_ingestion.py

This module shall handle loading raw data and returns a Spark DataFrame for subsequent stages of the pipeline.
"""

from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
import os
# THIS WAS HERE IN BEN'S DODGY COMMIT

class DataIngestion:
    """
    Responsible for:       
        1. Loading data.
        2. Performing basic validations or sanity checks.
        3. Returning the loaded data in a Spark-dataframe format.
    """

    def __init__(self, spark: SparkSession, config: dict):
        """
        Set up Spark session and configuration.

        Args:
            spark (SparkSession): Spark session to be used for reading data.
            config (dict): Dictionary containing settings for data paths,
                           file formats, and any other parameters.
        """
        self.spark = spark
        config.setdefault("data_percentage", 1.0)
        self.config = config


    def load_data(self):
        """Load data using file path from config."""
        data_path = self.config["data_path"]
        data_percentage = self.config["data_percentage"]
        
        print(f"Data Path is {data_path} and loading {data_percentage}% of data ++++++++++++++++++++++")

        columns = [f"_c{i}" for i in range(1, 141)]
        schema = StructType([
            StructField("label", IntegerType(), True)
        ] + [
            StructField(col, DoubleType(), True) for col in columns
        ])
        df = self.spark.read.csv(
            data_path, 
            header=True, 
            schema=schema, 
            sep=","
        )
        # df = df.sample(fraction=data_percentage)     
        print(f"\nData size is :{df.count()}\n")
        self.validate_data(df)  # Check if data is valid.
        return df
    

    def validate_data(self, df):
        """Make sure the data is not empty."""
        # fetch a single row and check if the list is empty
        if len(df.take(1)) == 0:
            raise Exception("Data is empty!")
        
    def get_sample_data(self, df, fraction=0.1):
        """Return a small random sample of the data for testing."""
        return df.sample(fraction=fraction)

In [3]:
from pyspark.sql import SparkSession

if "DATABRICKS_RUNTIME_VERSION" in os.environ:
    try:
        spark  # Try to use the existing 'spark' session.
        print("\nUsing Databricks Spark session.")
    except NameError:
        spark = SparkSession.builder.getOrCreate()
        print("\nDatabricks: Created Spark session!")
    config_DataIngestion = {
        "data_path": "/mnt/2025-team6/fulldataset_ECG5000.csv",
        "data_percentage": 0.005  # % Percentage of data to load for SW development
    }
else:
    # If not in Databricks, we are running locally.
    # Try to get an active Spark session; if none, create one.
    spark = SparkSession.getActiveSession()
    if spark is None:
        spark = SparkSession.builder \
            .appName("TestPipeline") \
            .master("local[*]") \
            .getOrCreate()
        print("\nLocal: Created Spark session!")
    else:
        print("\nLocal: Spark session already active!")


Local: Created Spark session!


In [4]:
current_dir = os.getcwd()
# Go up two levels to reach the project root
project_root = os.path.abspath(os.path.join(current_dir, "..", ".."))

config_DataIngestion = {
    # "data_path": project_root + "/ECG5000/*.tsv" # IF WE DECIDE TO USE .tsv FILES
    "data_path": project_root + "/fulldataset_ECG5000.csv", # IF WE DECIDE TO USE .csv FILES
    "data_percentage": 0.005  # % Percentage of data to load for SW development
    }   

ingestion = DataIngestion(spark, config_DataIngestion)

In [5]:
config_DataIngestion['data_path']

'c:\\Users\\benat\\OneDrive\\0. MSc MLiS\\0. GitHub Repositories\\BigDataProject_repo/fulldataset_ECG5000.csv'

In [6]:
df_full = ingestion.load_data()
# df.show()
df_full.count()

df_subset = ingestion.get_sample_data(df_full, fraction=0.1)
df_subset.count()

Data Path is c:\Users\benat\OneDrive\0. MSc MLiS\0. GitHub Repositories\BigDataProject_repo/fulldataset_ECG5000.csv and loading 0.005% of data ++++++++++++++++++++++

Data size is :5000



501

In [7]:
import pyspark.sql.functions as F

In [8]:
df = df_full
feature_cols = [col for col in df.columns if col != "label"]
feature_cols[:5]

['_c1', '_c2', '_c3', '_c4', '_c5']

In [20]:
col_exprs = []
for col in feature_cols:
    col_exprs.append(F.min(col))
    col_exprs.append(F.max(col))

col_exprs[:5]

[Column<'min(_c1)'>,
 Column<'max(_c1)'>,
 Column<'min(_c2)'>,
 Column<'max(_c2)'>,
 Column<'min(_c3)'>]

In [21]:
df_stats_rows = df.select(*col_exprs)

print(df_stats_rows.first())
print(df_stats_rows.show(5, truncate=False))

Row(min(_c1)=-6.7294987, max(_c1)=4.9664142, min(_c2)=-7.0903741, max(_c2)=3.4796887, min(_c3)=-5.1324592, max(_c3)=2.660597, min(_c4)=-5.3632414, max(_c4)=1.8997982, min(_c5)=-5.3757152, max(_c5)=2.1470148, min(_c6)=-5.3301935, max(_c6)=1.6143748, min(_c7)=-4.7822402, max(_c7)=1.8687283, min(_c8)=-4.3112877, max(_c8)=1.804251, min(_c9)=-4.071361, max(_c9)=1.68373, min(_c10)=-4.1424756, max(_c10)=1.8986267, min(_c11)=-4.0405259, max(_c11)=2.0330067, min(_c12)=-3.7245377, max(_c12)=2.1598611, min(_c13)=-3.264576, max(_c13)=2.3908666, min(_c14)=-3.2223453, max(_c14)=2.433236, min(_c15)=-3.2634838, max(_c15)=2.1603562, min(_c16)=-3.2192115, max(_c16)=2.1257583, min(_c17)=-3.2021869, max(_c17)=2.5667622, min(_c18)=-3.1475037, max(_c18)=2.0997268, min(_c19)=-3.4599173, max(_c19)=2.1175937, min(_c20)=-4.1021017, max(_c20)=2.0762789, min(_c21)=-4.3112405, max(_c21)=2.0531319, min(_c22)=-4.0998883, max(_c22)=2.0917938, min(_c23)=-3.6149732, max(_c23)=2.0032581, min(_c24)=-3.8222981, max(_c24)=