In [33]:
import pickle
import pandas as pd
import numpy as np
from pyspark.sql import DataFrame
from aeon.classification.distance_based import ProximityTree, ProximityForest
import logging

from pyspark.sql import SparkSession
import os
from pyspark.sql import SparkSession
from data_ingestion import DataIngestion
from preprocessing import Preprocessor
from prediction_manager import PredictionManager
from local_model_manager import LocalModelManager
from evaluation import Evaluator
from utilities import show_compact
import time
import json
from random import sample
from dtaidistance import dtw

## ---


In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session named "GenericRDD" on the local[*] master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("GenericRDD")
    .getOrCreate()
)

# The SparkContext is the handle used below to create RDDs.
sc = spark.sparkContext

In [3]:
# Toy person records (one dict per row) used to demonstrate basic RDD maps.
data = [
    {"ID": person_id, "Name": name, "Age": age, "Capacity": capacity}
    for person_id, name, age, capacity in (
        (1, "Alice", 25, 50),
        (2, "Bob", 30, 30),
        (3, "Charlie", 35, 70),
        (4, "David", 40, 20),
        (5, "Eve", 45, 40),
    )
]

# Turn the in-memory list of person records into an RDD via the SparkContext.
rdd = sc.parallelize(data)

In [4]:
# collect() brings every row back to the driver as a Python list before printing it.
print(rdd.collect())

[{'ID': 1, 'Name': 'Alice', 'Age': 25, 'Capacity': 50}, {'ID': 2, 'Name': 'Bob', 'Age': 30, 'Capacity': 30}, {'ID': 3, 'Name': 'Charlie', 'Age': 35, 'Capacity': 70}, {'ID': 4, 'Name': 'David', 'Age': 40, 'Capacity': 20}, {'ID': 5, 'Name': 'Eve', 'Age': 45, 'Capacity': 40}]


In [None]:
# Derive a "Score" field (twice the row's Capacity); each output row is a new
# dict, so the rows of the source RDD are left untouched.
new_rdd = rdd.map(lambda record: dict(record, Score=record["Capacity"] * 2))

# Bring the transformed rows back to the driver and print them.
print(new_rdd.collect())

[{'ID': 1, 'Name': 'Alice', 'Age': 25, 'Capacity': 50, 'Score': 100}, {'ID': 2, 'Name': 'Bob', 'Age': 30, 'Capacity': 30, 'Score': 60}, {'ID': 3, 'Name': 'Charlie', 'Age': 35, 'Capacity': 70, 'Score': 140}, {'ID': 4, 'Name': 'David', 'Age': 40, 'Capacity': 20, 'Score': 40}, {'ID': 5, 'Name': 'Eve', 'Age': 45, 'Capacity': 40, 'Score': 80}]


In [8]:
# Tag each row with a binary "Label": 1 when Capacity is above 40, otherwise 0.
labeled_rdd = rdd.map(
    lambda record: {**record, "Label": int(record["Capacity"] > 40)}
)

# Print one labelled row per line.
for row in labeled_rdd.collect():
    print(row)

{'ID': 1, 'Name': 'Alice', 'Age': 25, 'Capacity': 50, 'Label': 1}
{'ID': 2, 'Name': 'Bob', 'Age': 30, 'Capacity': 30, 'Label': 0}
{'ID': 3, 'Name': 'Charlie', 'Age': 35, 'Capacity': 70, 'Label': 1}
{'ID': 4, 'Name': 'David', 'Age': 40, 'Capacity': 20, 'Label': 0}
{'ID': 5, 'Name': 'Eve', 'Age': 45, 'Capacity': 40, 'Label': 0}


# ---

In [None]:
# Toy labelled time series: two five-point series for each of the labels 1-4.
data = [
    {"label": label, "time_series": series}
    for label, series in (
        (1, [1.0, 2.1, 3.2, 4.3, 5.4]),
        (2, [2.0, 3.1, 4.2, 5.3, 6.4]),
        (3, [3.0, 4.1, 5.2, 6.3, 7.4]),
        (4, [4.0, 5.1, 6.2, 7.3, 8.4]),
        (1, [1.5, 2.6, 3.7, 4.8, 5.9]),
        (2, [2.5, 3.6, 4.7, 5.8, 6.9]),
        (3, [3.5, 4.6, 5.7, 6.8, 7.9]),
        (4, [4.5, 5.6, 6.7, 7.8, 8.9]),
    )
]

# Turn the labelled time-series records into an RDD.
# NOTE(review): this rebinds the name `rdd`, replacing whatever RDD it referred to before.
rdd = sc.parallelize(data)

In [16]:
# Put all rows into a single partition — presumably so the per-partition steps
# further down operate on the whole dataset at once (TODO confirm intent).
# NOTE(review): `rdd` is rebound to a transformation of itself, so this cell
# relies on `rdd` already existing in the kernel.
rdd = rdd.repartition(1)
rdd.getNumPartitions()

1

In [24]:
# Draw a sample without replacement, keeping each row with probability 0.5.
# The fixed seed is meant to make the draw repeatable across runs.
lst = rdd.sample(False, 0.5, seed=42).collect()

# A fraction-based sample of so few rows can come back empty, so guard the
# lookup instead of letting lst[0] raise IndexError.
lst[0]['time_series'] if lst else None

[2.5, 3.6, 4.7, 5.8, 6.9]

In [32]:
def sample_and_add_column(iterator):
    """Attach one randomly chosen exemplar series to every row of a partition.

    Written to be passed to ``RDD.mapPartitions``. One row is drawn at random
    from the partition and its ``"time_series"`` value is stored under the
    ``"exemplar"`` key of every row in that partition.

    Args:
        iterator: Iterator over the partition's rows. Rows must be mappings,
            and the randomly drawn row must contain a ``"time_series"`` key.

    Returns:
        An iterator over new dicts, one per input row and in the same order,
        each extended with ``"exemplar"``. All rows share the same exemplar
        object. An empty partition yields an empty iterator.
    """
    partition_data = list(iterator)
    if not partition_data:
        # mapPartitions may hand over an empty partition; sample() would raise
        # ValueError when asked for one item from an empty list.
        return iter([])
    sampled_element = sample(partition_data, 1)[0]['time_series']
    return iter([{**row, "exemplar": sampled_element} for row in partition_data])

# Run sample_and_add_column once per partition of `rdd`.
rdd_with_sampled_column = rdd.mapPartitions(sample_and_add_column)

# Collect the updated RDD to the driver and print one row per line
for row in rdd_with_sampled_column.collect():
    print(row)

{'label': 4, 'time_series': [4.0, 5.1, 6.2, 7.3, 8.4], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9]}
{'label': 2, 'time_series': [2.5, 3.6, 4.7, 5.8, 6.9], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9]}
{'label': 2, 'time_series': [2.0, 3.1, 4.2, 5.3, 6.4], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9]}
{'label': 3, 'time_series': [3.0, 4.1, 5.2, 6.3, 7.4], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9]}
{'label': 1, 'time_series': [1.5, 2.6, 3.7, 4.8, 5.9], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9]}
{'label': 3, 'time_series': [3.5, 4.6, 5.7, 6.8, 7.9], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9]}
{'label': 1, 'time_series': [1.0, 2.1, 3.2, 4.3, 5.4], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9]}
{'label': 4, 'time_series': [4.5, 5.6, 6.7, 7.8, 8.9], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9]}


In [35]:
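# Superseded first draft, kept for reference only. It indexed the partition list
# with string keys (partition_data['time_series']), which raises TypeError; the
# working version below reads the fields row by row instead.
# def calc_dtw_distance(iterator):
#     partition_data = list(iterator)
#     time_series = partition_data['time_series']
#     exemplar = partition_data['exemplar']
#     dtw_distance = dtw.distance(time_series, exemplar)
#     return iter([{**row, "dtw_distance": dtw_distance} for row in partition_data])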

def calc_dtw_distance(iterator):
    """Add the DTW distance between each row's series and its exemplar.

    Written to be passed to ``RDD.mapPartitions``. Rows are processed one at a
    time as the partition iterator is consumed, so the partition is never
    copied into intermediate lists.

    Args:
        iterator: Iterator over the partition's rows. Each row must be a
            mapping with ``"time_series"`` and ``"exemplar"`` entries.

    Yields:
        A new dict per input row, in the same order, extended with a
        ``"dtw_distance"`` key holding ``dtw.distance(time_series, exemplar)``.
        The input rows themselves are not modified.
    """
    for row in iterator:
        yield {
            **row,
            "dtw_distance": dtw.distance(row['time_series'], row['exemplar']),
        }

# Run calc_dtw_distance once per partition of the exemplar-annotated RDD.
rdd_with_dtw = rdd_with_sampled_column.mapPartitions(calc_dtw_distance)

# Collect the result to the driver and print one row per line.
for row in rdd_with_dtw.collect():
    print(row)

{'label': 4, 'time_series': [4.0, 5.1, 6.2, 7.3, 8.4], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9], 'dtw_distance': 4.085339643163099}
{'label': 2, 'time_series': [2.5, 3.6, 4.7, 5.8, 6.9], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9], 'dtw_distance': 1.42828568570857}
{'label': 2, 'time_series': [2.0, 3.1, 4.2, 5.3, 6.4], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9], 'dtw_distance': 1.118033988749895}
{'label': 3, 'time_series': [3.0, 4.1, 5.2, 6.3, 7.4], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9], 'dtw_distance': 2.2671568097509267}
{'label': 1, 'time_series': [1.5, 2.6, 3.7, 4.8, 5.9], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9], 'dtw_distance': 0.0}
{'label': 3, 'time_series': [3.5, 4.6, 5.7, 6.8, 7.9], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9], 'dtw_distance': 3.1208973068654466}
{'label': 1, 'time_series': [1.0, 2.1, 3.2, 4.3, 5.4], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9], 'dtw_distance': 1.118033988749895}
{'label': 4, 'time_series': [4.5, 5.6, 6.7, 7.8, 8.9], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9], 'dtw_distance': 