In [1]:
import pickle
import pandas as pd
import numpy as np
from pyspark.sql import DataFrame
from aeon.classification.distance_based import ProximityTree, ProximityForest
import logging

from pyspark.sql import SparkSession
import os
from pyspark.sql import SparkSession
from data_ingestion import DataIngestion
from preprocessing import Preprocessor
from prediction_manager import PredictionManager
from local_model_manager import LocalModelManager
from evaluation import Evaluator
from utilities import show_compact
import time
import json
from random import sample
from dtaidistance import dtw

## ---


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("GenericRDD").getOrCreate()

# Access the SparkContext
sc = spark.sparkContext

# ---

In [3]:
data = [
    {"label": 1, "time_series": [1.0, 2.1, 3.2, 4.3, 5.4]},
    {"label": 2, "time_series": [2.0, 3.1, 4.2, 5.3, 6.4]},
    {"label": 3, "time_series": [3.0, 4.1, 5.2, 6.3, 7.4]},
    {"label": 4, "time_series": [4.0, 5.1, 6.2, 7.3, 8.4]},
    {"label": 1, "time_series": [1.5, 2.6, 3.7, 4.8, 5.9]},
    {"label": 2, "time_series": [2.5, 3.6, 4.7, 5.8, 6.9]},
    {"label": 3, "time_series": [3.5, 4.6, 5.7, 6.8, 7.9]},
    {"label": 4, "time_series": [4.5, 5.6, 6.7, 7.8, 8.9]}
]

rdd = sc.parallelize(data)

In [4]:
rdd = rdd.repartition(2)
rdd.getNumPartitions()

2

In [66]:
def print_partition_rows(index, iterator):
    # Add partition index to each row
    return [(index, row) for row in iterator]

# Use mapPartitionsWithIndex to include partition index
partitioned_rdd = rdd.mapPartitionsWithIndex(print_partition_rows)

# Collect and print the rows along with their partition index
for partition_index, row in partitioned_rdd.collect():
    print(f"Partition {partition_index}: {row}")

Partition 0: {'label': 3, 'time_series': [3.0, 4.1, 5.2, 6.3, 7.4]}
Partition 0: {'label': 4, 'time_series': [4.0, 5.1, 6.2, 7.3, 8.4]}
Partition 0: {'label': 1, 'time_series': [1.5, 2.6, 3.7, 4.8, 5.9]}
Partition 0: {'label': 2, 'time_series': [2.5, 3.6, 4.7, 5.8, 6.9]}
Partition 0: {'label': 3, 'time_series': [3.5, 4.6, 5.7, 6.8, 7.9]}
Partition 1: {'label': 1, 'time_series': [1.0, 2.1, 3.2, 4.3, 5.4]}
Partition 1: {'label': 2, 'time_series': [2.0, 3.1, 4.2, 5.3, 6.4]}
Partition 1: {'label': 4, 'time_series': [4.5, 5.6, 6.7, 7.8, 8.9]}


# adding exemplar column

In [5]:
def sample_and_add_column(iterator):
    partition_data = list(iterator)
    sampled_element = sample(partition_data, 1)[0]['time_series']
    return iter([{**row, "exemplar": sampled_element} for row in partition_data])

rdd_with_sampled_column = rdd.mapPartitions(sample_and_add_column)

# Collect and print the updated RDD
for row in rdd_with_sampled_column.collect():
    print(row)

{'label': 3, 'time_series': [3.0, 4.1, 5.2, 6.3, 7.4], 'exemplar': [4.0, 5.1, 6.2, 7.3, 8.4]}
{'label': 4, 'time_series': [4.0, 5.1, 6.2, 7.3, 8.4], 'exemplar': [4.0, 5.1, 6.2, 7.3, 8.4]}
{'label': 1, 'time_series': [1.5, 2.6, 3.7, 4.8, 5.9], 'exemplar': [4.0, 5.1, 6.2, 7.3, 8.4]}
{'label': 2, 'time_series': [2.5, 3.6, 4.7, 5.8, 6.9], 'exemplar': [4.0, 5.1, 6.2, 7.3, 8.4]}
{'label': 3, 'time_series': [3.5, 4.6, 5.7, 6.8, 7.9], 'exemplar': [4.0, 5.1, 6.2, 7.3, 8.4]}
{'label': 1, 'time_series': [1.0, 2.1, 3.2, 4.3, 5.4], 'exemplar': [1.0, 2.1, 3.2, 4.3, 5.4]}
{'label': 2, 'time_series': [2.0, 3.1, 4.2, 5.3, 6.4], 'exemplar': [1.0, 2.1, 3.2, 4.3, 5.4]}
{'label': 4, 'time_series': [4.5, 5.6, 6.7, 7.8, 8.9], 'exemplar': [1.0, 2.1, 3.2, 4.3, 5.4]}


# calculating DTW distance using time series and exemplar columns

In [6]:
# def calc_dtw_distance(iterator):
#     partition_data = list(iterator)
#     time_series = partition_data['time_series']
#     exemplar = partition_data['exemplar']
#     dtw_distance = dtw.distance(time_series, exemplar)
#     return iter([{**row, "dtw_distance": dtw_distance} for row in partition_data])

def calc_dtw_distance(iterator):
    partition_data = list(iterator)
    updated_rows = []
    
    for row in partition_data:
        time_series = row['time_series']
        exemplar = row['exemplar']
        
        dtw_distance = dtw.distance(time_series, exemplar)
        
        updated_row = {**row, "dtw_distance": dtw_distance}
        updated_rows.append(updated_row)
    return iter(updated_rows)

rdd_with_dtw = rdd_with_sampled_column.mapPartitions(calc_dtw_distance)
for row in rdd_with_dtw.collect():
    print(row)

{'label': 3, 'time_series': [3.0, 4.1, 5.2, 6.3, 7.4], 'exemplar': [4.0, 5.1, 6.2, 7.3, 8.4], 'dtw_distance': 1.42828568570857}
{'label': 4, 'time_series': [4.0, 5.1, 6.2, 7.3, 8.4], 'exemplar': [4.0, 5.1, 6.2, 7.3, 8.4], 'dtw_distance': 0.0}
{'label': 1, 'time_series': [1.5, 2.6, 3.7, 4.8, 5.9], 'exemplar': [4.0, 5.1, 6.2, 7.3, 8.4], 'dtw_distance': 4.085339643163099}
{'label': 2, 'time_series': [2.5, 3.6, 4.7, 5.8, 6.9], 'exemplar': [4.0, 5.1, 6.2, 7.3, 8.4], 'dtw_distance': 2.2671568097509267}
{'label': 3, 'time_series': [3.5, 4.6, 5.7, 6.8, 7.9], 'exemplar': [4.0, 5.1, 6.2, 7.3, 8.4], 'dtw_distance': 1.118033988749895}
{'label': 1, 'time_series': [1.0, 2.1, 3.2, 4.3, 5.4], 'exemplar': [1.0, 2.1, 3.2, 4.3, 5.4], 'dtw_distance': 0.0}
{'label': 2, 'time_series': [2.0, 3.1, 4.2, 5.3, 6.4], 'exemplar': [1.0, 2.1, 3.2, 4.3, 5.4], 'dtw_distance': 1.42828568570857}
{'label': 4, 'time_series': [4.5, 5.6, 6.7, 7.8, 8.9], 'exemplar': [1.0, 2.1, 3.2, 4.3, 5.4], 'dtw_distance': 6.28331122896200

# WORKS FOR ANY NUM OF PARTITIONS AND EXEMPLARS

In [7]:
def create_sample_and_add_column_function(num_exemplars):
    def sample_and_add_column(iterator):
        partition_data = list(iterator)
        exemplars = []
        for row in sample(partition_data, min(num_exemplars, len(partition_data))):
            exemplars.append(row['time_series'])
        return iter([{**row, "exemplars": exemplars} for row in partition_data])
    return sample_and_add_column

# Example usage
num_exemplars = 2
sample_and_add_column = create_sample_and_add_column_function(num_exemplars)
rdd_with_exemplar_column = rdd.mapPartitions(sample_and_add_column)
for row in rdd_with_exemplar_column.collect():
    print(row)

print(f'\nrdd num partitions: {rdd_with_exemplar_column.getNumPartitions()}')

{'label': 3, 'time_series': [3.0, 4.1, 5.2, 6.3, 7.4], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]]}
{'label': 4, 'time_series': [4.0, 5.1, 6.2, 7.3, 8.4], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]]}
{'label': 1, 'time_series': [1.5, 2.6, 3.7, 4.8, 5.9], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]]}
{'label': 2, 'time_series': [2.5, 3.6, 4.7, 5.8, 6.9], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]]}
{'label': 3, 'time_series': [3.5, 4.6, 5.7, 6.8, 7.9], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]]}
{'label': 1, 'time_series': [1.0, 2.1, 3.2, 4.3, 5.4], 'exemplars': [[1.0, 2.1, 3.2, 4.3, 5.4], [2.0, 3.1, 4.2, 5.3, 6.4]]}
{'label': 2, 'time_series': [2.0, 3.1, 4.2, 5.3, 6.4], 'exemplars': [[1.0, 2.1, 3.2, 4.3, 5.4], [2.0, 3.1, 4.2, 5.3, 6.4]]}
{'label': 4, 'time_series': [4.5, 5.6, 6.7, 7.8, 8.9], 'exemplars': [[1.0, 2.1, 3.2, 4.3, 5.4], [2.0, 3.1, 4.2, 5.3, 6.4]]}

rdd num

In [8]:
def calc_dtw_distance(iterator):
    partition_data = list(iterator)
    updated_rows = []
    
    for row in partition_data:
        time_series = row['time_series']
        exemplars = row['exemplars']
        
        # Calculate DTW distances for each exemplar
        dtw_distances = [dtw.distance(time_series, exemplar) for exemplar in exemplars]
        
        # Add each DTW distance as a separate column
        updated_row = {**row}
        for i, dtw_distance in enumerate(dtw_distances):
            updated_row[f"dtw_distance_exemplar_{i+1}"] = dtw_distance
        
        updated_rows.append(updated_row)
    
    return iter(updated_rows)

# Example usage
rdd_with_dtw = rdd_with_exemplar_column.mapPartitions(calc_dtw_distance)
for row in rdd_with_dtw.collect():
    print(row)

print(f'\nrdd num partitions: {rdd_with_dtw.getNumPartitions()}')

{'label': 3, 'time_series': [3.0, 4.1, 5.2, 6.3, 7.4], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]], 'dtw_distance_exemplar_1': 1.42828568570857, 'dtw_distance_exemplar_2': 2.2671568097509267}
{'label': 4, 'time_series': [4.0, 5.1, 6.2, 7.3, 8.4], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]], 'dtw_distance_exemplar_1': 0.0, 'dtw_distance_exemplar_2': 4.085339643163099}
{'label': 1, 'time_series': [1.5, 2.6, 3.7, 4.8, 5.9], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]], 'dtw_distance_exemplar_1': 4.085339643163099, 'dtw_distance_exemplar_2': 0.0}
{'label': 2, 'time_series': [2.5, 3.6, 4.7, 5.8, 6.9], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]], 'dtw_distance_exemplar_1': 2.2671568097509267, 'dtw_distance_exemplar_2': 1.42828568570857}
{'label': 3, 'time_series': [3.5, 4.6, 5.7, 6.8, 7.9], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]], 'dtw_distance_exemplar_1': 1.11803398874

In [None]:
# not sure if this is needed

def assign_closest_exemplar(iterator):
    partition_data = list(iterator)

    for row in partition_data:
        # Check if there are DTW distances for exemplars
        exemplar_distances = {key: value for key, value in row.items() if key.startswith("dtw_distance_exemplar_")}
        
        if exemplar_distances:
            # Find the exemplar with the smallest DTW distance
            closest_exemplar = min(exemplar_distances, key=exemplar_distances.get)
            
            # Assign the closest exemplar to the row
            row["closest exemplar"] = closest_exemplar

    return iter(partition_data)

# Example usage
rdd_with_classification = rdd_with_dtw.mapPartitions(assign_closest_exemplar)
for row in rdd_with_classification.collect():
    print(row)

{'label': 3, 'time_series': [3.0, 4.1, 5.2, 6.3, 7.4], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]], 'dtw_distance_exemplar_1': 1.42828568570857, 'dtw_distance_exemplar_2': 2.2671568097509267, 'closest exemplar': 'dtw_distance_exemplar_1'}
{'label': 4, 'time_series': [4.0, 5.1, 6.2, 7.3, 8.4], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]], 'dtw_distance_exemplar_1': 0.0, 'dtw_distance_exemplar_2': 4.085339643163099, 'closest exemplar': 'dtw_distance_exemplar_1'}
{'label': 1, 'time_series': [1.5, 2.6, 3.7, 4.8, 5.9], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]], 'dtw_distance_exemplar_1': 4.085339643163099, 'dtw_distance_exemplar_2': 0.0, 'closest exemplar': 'dtw_distance_exemplar_2'}
{'label': 2, 'time_series': [2.5, 3.6, 4.7, 5.8, 6.9], 'exemplars': [[4.0, 5.1, 6.2, 7.3, 8.4], [1.5, 2.6, 3.7, 4.8, 5.9]], 'dtw_distance_exemplar_1': 2.2671568097509267, 'dtw_distance_exemplar_2': 1.42828568570857, 'closest exemplar': 'dt

In [70]:
def calculate_partition_gini(iterator):
    labels = [row['label'] for row in iterator]

    label_counts_dict = {}
    for label in labels:
        if label in label_counts_dict:
            label_counts_dict[label] += 1
        else:
            label_counts_dict[label] = 1
    
    total = sum(label_counts_dict.values())
    proportion_sqrd_values = [(count / total) ** 2 for count in label_counts_dict.values()]
    gini_impurity = 1 - sum(proportion_sqrd_values)
    
    return iter([gini_impurity])

In [None]:
# Example usage
gini_rdd = rdd_with_classification.mapPartitions(calculate_partition_gini)

# Collect and print the Gini impurity for each partition
i=0
for gini in gini_rdd.collect():
    print(f'gini impurity of partition {i+1}: {gini}')
    i+=1

gini impurity of partition 1: 0.72
gini impurity of partition 2: 0.6666666666666667


### trying splitting code

In [111]:
tsdata = [
    {'label': 1, 'time_series': [1.2, 2.4, 3.6, 4.8, 6.0], 'closest_exemplar': 'exemplar_1'},
    {'label': 2, 'time_series': [2.1, 3.3, 4.5, 5.7, 6.9], 'closest_exemplar': 'exemplar_2'},
    {'label': 3, 'time_series': [0.5, 1.5, 2.5, 3.5, 4.5], 'closest_exemplar': 'exemplar_1'},
    {'label': 2, 'time_series': [3.0, 3.8, 4.6, 5.4, 6.2], 'closest_exemplar': 'exemplar_2'},
    {'label': 1, 'time_series': [1.0, 1.8, 2.6, 3.4, 4.2], 'closest_exemplar': 'exemplar_1'},
    {'label': 4, 'time_series': [5.5, 6.6, 7.7, 8.8, 9.9], 'closest_exemplar': 'exemplar_2'},
    {'label': 3, 'time_series': [2.0, 2.5, 3.0, 3.5, 4.0], 'closest_exemplar': 'exemplar_1'},
    {'label': 4, 'time_series': [6.1, 6.2, 6.3, 6.4, 6.5], 'closest_exemplar': 'exemplar_2'},
    {'label': 1, 'time_series': [0.9, 1.8, 2.7, 3.6, 4.5], 'closest_exemplar': 'exemplar_1'},
    {'label': 2, 'time_series': [3.3, 4.1, 4.9, 5.7, 6.5], 'closest_exemplar': 'exemplar_2'}
]

ts_rdd = sc.parallelize(tsdata)
ts_rdd = ts_rdd.repartition(1)

print(f'ts_rdd num partitions: {ts_rdd.getNumPartitions()}')

ts_rdd_gini = ts_rdd.mapPartitions(calculate_partition_gini)
# Collect and print the Gini impurity for each partition
i=0
for gini in ts_rdd_gini.collect():
    print(f'gini impurity of partition {i+1}: {gini}')
    i+=1

ts_rdd num partitions: 1
gini impurity of partition 1: 0.74


In [105]:
closestto1_yes = [row['label'] for row in tsdata if row['closest_exemplar'] == 'exemplar_1']
closestto1_no = [row['label'] for row in tsdata if row['closest_exemplar'] != 'exemplar_1']

print(closestto1_yes)
print(closestto1_no)

[1, 3, 1, 3, 1]
[2, 2, 4, 4, 2]


In [106]:
def split_rdd_by_closest_exemplar(rdd, exemplar_name):
    yes_rdd = rdd.filter(lambda row: row['closest_exemplar'] == exemplar_name)
    no_rdd = rdd.filter(lambda row: row['closest_exemplar'] != exemplar_name)
    return yes_rdd, no_rdd

In [107]:
# Example usage
yes_rdd, no_rdd = split_rdd_by_closest_exemplar(ts_rdd, 'exemplar_1')

# Collect and print the results
print("yes_rdd:")
for row in yes_rdd.collect():
    print(row)
print("\nno_rdd:")
for row in no_rdd.collect():
    print(row)

yes_rdd:
{'label': 1, 'time_series': [1.2, 2.4, 3.6, 4.8, 6.0], 'closest_exemplar': 'exemplar_1'}
{'label': 3, 'time_series': [0.5, 1.5, 2.5, 3.5, 4.5], 'closest_exemplar': 'exemplar_1'}
{'label': 1, 'time_series': [1.0, 1.8, 2.6, 3.4, 4.2], 'closest_exemplar': 'exemplar_1'}
{'label': 3, 'time_series': [2.0, 2.5, 3.0, 3.5, 4.0], 'closest_exemplar': 'exemplar_1'}
{'label': 1, 'time_series': [0.9, 1.8, 2.7, 3.6, 4.5], 'closest_exemplar': 'exemplar_1'}

no_rdd:
{'label': 2, 'time_series': [2.1, 3.3, 4.5, 5.7, 6.9], 'closest_exemplar': 'exemplar_2'}
{'label': 2, 'time_series': [3.0, 3.8, 4.6, 5.4, 6.2], 'closest_exemplar': 'exemplar_2'}
{'label': 4, 'time_series': [5.5, 6.6, 7.7, 8.8, 9.9], 'closest_exemplar': 'exemplar_2'}
{'label': 4, 'time_series': [6.1, 6.2, 6.3, 6.4, 6.5], 'closest_exemplar': 'exemplar_2'}
{'label': 2, 'time_series': [3.3, 4.1, 4.9, 5.7, 6.5], 'closest_exemplar': 'exemplar_2'}


In [109]:
print(f'yes_rdd gini impurity: {yes_rdd.mapPartitions(calculate_partition_gini).collect()}')
print(f'no_rdd gini impurity: {no_rdd.mapPartitions(calculate_partition_gini).collect()}')

yes_rdd gini impurity: [0.48]
no_rdd gini impurity: [0.48]


In [None]:
class RDDProcessor:
    def print_partition_rows(self, index, iterator):
        # Add partition index to each row
        return [(index, row) for row in iterator]

    def sample_and_add_column(self, iterator):
        # Add an exemplar column by sampling one element from the partition
        partition_data = list(iterator)
        sampled_element = sample(partition_data, 1)[0]['time_series']
        return iter([{**row, "exemplar": sampled_element} for row in partition_data])

    def create_sample_and_add_column_function(self, num_exemplars):
        # Create a function to add multiple exemplars
        def sample_and_add_column(iterator):
            partition_data = list(iterator)
            exemplars = []
            for row in sample(partition_data, min(num_exemplars, len(partition_data))):
                exemplars.append(row['time_series'])
            return iter([{**row, "exemplars": exemplars} for row in partition_data])
        return sample_and_add_column

    def calc_dtw_distance(self, iterator, dtw_distance_func):
        # Calculate DTW distance for each row
        partition_data = list(iterator)
        updated_rows = []
        for row in partition_data:
            time_series = row['time_series']
            exemplars = row['exemplars']
            dtw_distances = [dtw_distance_func(time_series, exemplar) for exemplar in exemplars]
            updated_row = {**row}
            for i, dtw_distance in enumerate(dtw_distances):
                updated_row[f"dtw_distance_exemplar_{i+1}"] = dtw_distance
            updated_rows.append(updated_row)
        return iter(updated_rows)

    def assign_closest_exemplar(self, iterator):
        # Assign the closest exemplar based on DTW distance
        partition_data = list(iterator)
        for row in partition_data:
            exemplar_distances = {key: value for key, value in row.items() if key.startswith("dtw_distance_exemplar_")}
            if exemplar_distances:
                closest_exemplar = min(exemplar_distances, key=exemplar_distances.get)
                row["closest exemplar"] = closest_exemplar
        return iter(partition_data)

    def calculate_partition_gini(self, iterator):
        # Calculate Gini impurity for a partition
        labels = [row['label'] for row in iterator]
        label_counts_dict = {}
        for label in labels:
            if label in label_counts_dict:
                label_counts_dict[label] += 1
            else:
                label_counts_dict[label] = 1
        total = sum(label_counts_dict.values())
        proportion_sqrd_values = [(count / total) ** 2 for count in label_counts_dict.values()]
        gini_impurity = 1 - sum(proportion_sqrd_values)
        return iter([gini_impurity])

    def split_rdd_by_closest_exemplar(self, rdd, exemplar_name):
        # Split RDD based on the closest exemplar
        yes_rdd = rdd.filter(lambda row: row['closest_exemplar'] == exemplar_name)
        no_rdd = rdd.filter(lambda row: row['closest_exemplar'] != exemplar_name)
        return yes_rdd, no_rdd

In [118]:
# Initialize the processor
processor = RDDProcessor()

# Example: Repartition and print partition rows
partitioned_rdd = rdd.mapPartitionsWithIndex(processor.print_partition_rows)
for partition_index, row in partitioned_rdd.collect():
    print(f"Partition {partition_index}: {row}")

# Example: Add exemplar column
rdd_with_sampled_column = rdd.mapPartitions(processor.sample_and_add_column)
for row in rdd_with_sampled_column.collect():
    print(row)

# Broadcast the dtw.distance function
dtw_distance_broadcast = sc.broadcast(dtw.distance)
# Initialize the processor
processor = RDDProcessor()
# Example: Calculate DTW distances
rdd_with_dtw = rdd_with_sampled_column.mapPartitions(
    lambda iterator: processor.calc_dtw_distance(iterator, dtw_distance_broadcast.value)
)

# Collect and print the results
for row in rdd_with_dtw.collect():
    print(row)
# Example: Assign closest exemplar
rdd_with_classification = rdd_with_dtw.mapPartitions(processor.assign_closest_exemplar)
for row in rdd_with_classification.collect():
    print(row)

# Example: Calculate Gini impurity
gini_rdd = rdd_with_classification.mapPartitions(processor.calculate_partition_gini)
for gini in gini_rdd.collect():
    print(f"Gini impurity: {gini}")

Partition 0: {'label': 4, 'time_series': [4.0, 5.1, 6.2, 7.3, 8.4]}
Partition 0: {'label': 2, 'time_series': [2.5, 3.6, 4.7, 5.8, 6.9]}
Partition 0: {'label': 2, 'time_series': [2.0, 3.1, 4.2, 5.3, 6.4]}
Partition 1: {'label': 3, 'time_series': [3.0, 4.1, 5.2, 6.3, 7.4]}
Partition 1: {'label': 1, 'time_series': [1.5, 2.6, 3.7, 4.8, 5.9]}
Partition 1: {'label': 3, 'time_series': [3.5, 4.6, 5.7, 6.8, 7.9]}
Partition 1: {'label': 1, 'time_series': [1.0, 2.1, 3.2, 4.3, 5.4]}
Partition 1: {'label': 4, 'time_series': [4.5, 5.6, 6.7, 7.8, 8.9]}
{'label': 4, 'time_series': [4.0, 5.1, 6.2, 7.3, 8.4], 'exemplar': [4.0, 5.1, 6.2, 7.3, 8.4]}
{'label': 2, 'time_series': [2.5, 3.6, 4.7, 5.8, 6.9], 'exemplar': [4.0, 5.1, 6.2, 7.3, 8.4]}
{'label': 2, 'time_series': [2.0, 3.1, 4.2, 5.3, 6.4], 'exemplar': [4.0, 5.1, 6.2, 7.3, 8.4]}
{'label': 3, 'time_series': [3.0, 4.1, 5.2, 6.3, 7.4], 'exemplar': [1.5, 2.6, 3.7, 4.8, 5.9]}
{'label': 1, 'time_series': [1.5, 2.6, 3.7, 4.8, 5.9], 'exemplar': [1.5, 2.6, 3.

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 94.0 failed 1 times, most recent failure: Lost task 0.0 in stage 94.0 (TID 155) (BenAtkinson-Dell-Inspiron3505.Home executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Users\benat\miniconda3\envs\bigdata_env\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 1247, in main
    process()
  File "C:\Users\benat\miniconda3\envs\bigdata_env\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 1237, in process
    out_iter = func(split_index, iterator)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\benat\miniconda3\envs\bigdata_env\Lib\site-packages\pyspark\rdd.py", line 5434, in pipeline_func
    return func(split, prev_func(split, iterator))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\benat\miniconda3\envs\bigdata_env\Lib\site-packages\pyspark\rdd.py", line 840, in func
    return f(iterator)
           ^^^^^^^^^^^
  File "C:\Users\benat\AppData\Local\Temp\ipykernel_20836\1884868875.py", line 20, in <lambda>
TypeError: RDDProcessor.calc_dtw_distance() takes 2 positional arguments but 3 were given

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1049)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:842)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:195)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at jdk.internal.reflect.GeneratedMethodAccessor44.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Users\benat\miniconda3\envs\bigdata_env\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 1247, in main
    process()
  File "C:\Users\benat\miniconda3\envs\bigdata_env\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 1237, in process
    out_iter = func(split_index, iterator)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\benat\miniconda3\envs\bigdata_env\Lib\site-packages\pyspark\rdd.py", line 5434, in pipeline_func
    return func(split, prev_func(split, iterator))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\benat\miniconda3\envs\bigdata_env\Lib\site-packages\pyspark\rdd.py", line 840, in func
    return f(iterator)
           ^^^^^^^^^^^
  File "C:\Users\benat\AppData\Local\Temp\ipykernel_20836\1884868875.py", line 20, in <lambda>
TypeError: RDDProcessor.calc_dtw_distance() takes 2 positional arguments but 3 were given

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1049)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more
