# UCDD evaluation on the airlines dataset

## Accept and preprocess the airlines dataset

In [28]:
import csv

import numpy as np
import pandas as pd
import seaborn as sb

In [29]:
from eval_helpers import accepting
from category_encoders import TargetEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler


# Module-level batch containers; this cell only creates them empty.
airlines_exclude_reference_batches = []
airlines_exclude_testing_batches = []
airlines_onehot_reference_batches = []
airlines_onehot_testing_batches = []
airlines_target_reference_batches = []
airlines_target_testing_batches = []

# Row position at which the dataset is cut: rows before it are the reference data,
# rows from it onwards are the testing data.
REFERENCE_END_ROW = 179794


def _scale_with_reference_fit(ref_features, test_features):
    """Min-max scale both splits with a scaler fitted on the reference split only.

    Returns the scaled reference array, the scaled testing array and the fitted scaler.
    """
    fitted_scaler = MinMaxScaler()
    fitted_scaler.fit(ref_features)
    return (
        fitted_scaler.transform(ref_features),
        fitted_scaler.transform(test_features),
        fitted_scaler,
    )


df = pd.read_csv("../Datasets_concept_drift/real_world_data/airline_dataset.csv")

# 'Delay' is used as the label; 'Unnamed: 0' is dropped from the features as well.
X = df.drop(columns=['Unnamed: 0', 'Delay'])
y = df["Delay"]

print('number of unique airlines', df['Airline'].nunique())

X_ref = X[:REFERENCE_END_ROW]
X_test = X[REFERENCE_END_ROW:]
y_ref = y[:REFERENCE_END_ROW]
y_test = y[REFERENCE_END_ROW:]

# NOTE(review): presumably returns (numeric columns, categorical columns) as DataFrames
# that keep the original row index - confirm in eval_helpers.accepting.
df_x_ref_num, df_x_ref_cat = accepting.divide_numeric_categorical(X_ref)
df_x_test_num, df_x_test_cat = accepting.divide_numeric_categorical(X_test)

# The encoded categorical frames are re-labelled with these indices so that the joins
# with the numeric frames align row by row.
ref_index = df_x_ref_cat.index
test_index = df_x_test_cat.index

# --- Variant 1: "exclude" - numeric columns only, categorical columns are left out.
X_ref_exclude, X_test_exclude, scaler = _scale_with_reference_fit(
    df_x_ref_num.to_numpy(), df_x_test_num.to_numpy()
)

# --- Variant 2: "target" - categorical columns target-encoded using the reference labels.
encoder = TargetEncoder()
encoder.fit(df_x_ref_cat, y_ref)
df_x_ref_cat_transformed = pd.DataFrame(encoder.transform(df_x_ref_cat)).set_index(ref_index)
df_x_test_cat_transformed = pd.DataFrame(encoder.transform(df_x_test_cat)).set_index(test_index)
# lsuffix only takes effect when a numeric and an encoded column share a name.
X_ref_target, X_test_target, scaler = _scale_with_reference_fit(
    df_x_ref_num.join(df_x_ref_cat_transformed, lsuffix='_num').to_numpy(),
    df_x_test_num.join(df_x_test_cat_transformed, lsuffix='_num').to_numpy(),
)

# --- Variant 3: "onehot" - categorical columns one-hot encoded.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old name;
# try the new spelling first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoder.fit(df_x_ref_cat)
df_x_ref_cat_transformed = pd.DataFrame(encoder.transform(df_x_ref_cat)).set_index(ref_index)
df_x_test_cat_transformed = pd.DataFrame(encoder.transform(df_x_test_cat)).set_index(test_index)
X_ref_onehot, X_test_onehot, scaler = _scale_with_reference_fit(
    df_x_ref_num.join(df_x_ref_cat_transformed, lsuffix='_num').to_numpy(),
    df_x_test_num.join(df_x_test_cat_transformed, lsuffix='_num').to_numpy(),
)

# Labels as column vectors of shape (n_samples, 1).
y_ref = y_ref.to_numpy().reshape((len(y_ref.index), 1))
y_test = y_test.to_numpy().reshape((len(y_test.index), 1))

print('\nreference data shape')
print('exclude')
print(X_ref_exclude.shape)
print('onehot')
print(X_ref_onehot.shape)
print('target')
print(X_ref_target.shape)

print('\nreference labels shape')
print(y_ref.shape)

print('\ntesting data shape')
print('exclude')
print(X_test_exclude.shape)
print('onehot')
print(X_test_onehot.shape)
print('target')
print(X_test_target.shape)

print('\ntesting labels shape')
print(y_test.shape)

number of unique airlines 18





reference data shape
exclude
(179794, 3)
onehot
(179794, 21)
target
(179794, 4)

reference labels shape
(179794, 1)

testing data shape
exclude
(359589, 3)
onehot
(359589, 21)
target
(359589, 4)

testing labels shape
(359589, 1)


## Split the data into batches (size=17000)

In [30]:
from eval_helpers import helpers
import importlib
importlib.reload(helpers)

# Cut every encoded feature matrix into batches of 17000 rows. The same label arrays are
# passed to each call; only the label batches returned by the first (exclude) calls are
# kept, the later ones are discarded into `_`. The call order is kept as-is because the
# helper prints while it runs.
X_test_batches_exclude, y_test_batches = helpers.split_to_fixed_size_batches(
    X_test_exclude, y_test, batch_size=17000
)
X_ref_batches_exclude, y_ref_batches = helpers.split_to_fixed_size_batches(
    X_ref_exclude, y_ref, batch_size=17000
)
X_test_batches_onehot, _ = helpers.split_to_fixed_size_batches(
    X_test_onehot, y_test, batch_size=17000
)
X_ref_batches_onehot, _ = helpers.split_to_fixed_size_batches(
    X_ref_onehot, y_ref, batch_size=17000
)
X_test_batches_target, _ = helpers.split_to_fixed_size_batches(
    X_test_target, y_test, batch_size=17000
)
X_ref_batches_target, _ = helpers.split_to_fixed_size_batches(
    X_ref_target, y_ref, batch_size=17000
)

# Report how many batches each split produced.
print('number of resulting testing batches')
for batch_kind, batch_list in (
    ('exclude', X_test_batches_exclude),
    ('onehot', X_test_batches_onehot),
    ('target', X_test_batches_target),
    ('labels', y_test_batches),
):
    print(batch_kind)
    print(len(batch_list))

print('number of resulting reference batches')
for batch_kind, batch_list in (
    ('exclude', X_ref_batches_exclude),
    ('onehot', X_ref_batches_onehot),
    ('target', X_ref_batches_target),
    ('labels', y_ref_batches),
):
    print(batch_kind)
    print(len(batch_list))

[[0.79736303 0.46256123 0.05496183]
 [0.8766001  0.46256123 0.05801527]
 [0.89247312 0.46256123 0.22748092]
 ...
 [0.12839222 0.563331   0.22137405]
 [0.15245776 0.563331   0.24427481]
 [0.20302099 0.563331   0.08396947]]
(17000, 3)
[[0.0343062  0.00349895 0.3129771 ]
 [0.19930876 0.00349895 0.3389313 ]
 [0.30709165 0.0069979  0.2519084 ]
 ...
 [0.42524322 0.88523443 0.18320611]
 [0.28046595 0.88523443 0.12977099]
 [0.38082437 0.88523443 0.17251908]]
(17000, 3)
[[0.79736303 0.46256123 0.05496183 ... 0.         0.         0.        ]
 [0.8766001  0.46256123 0.05801527 ... 0.         0.         0.        ]
 [0.89247312 0.46256123 0.22748092 ... 0.         0.         0.        ]
 ...
 [0.12839222 0.563331   0.22137405 ... 0.         0.         0.        ]
 [0.15245776 0.563331   0.24427481 ... 0.         0.         0.        ]
 [0.20302099 0.563331   0.08396947 ... 0.         0.         0.        ]]
(17000, 21)
[[0.0343062  0.00349895 0.3129771  ... 0.         0.         0.        ]
 [0.1

## Imports

In [31]:
# Project modules used by the experiment cells below.
import importlib
from eval_helpers import ucdd_eval_real_world
from eval_helpers import kmeans_verbose_helpers
from core import ucdd_supported_parameters as spms
from core import ucdd
# Re-execute the already imported project modules so that edits made to their source
# files are picked up without restarting the kernel. The last reload is the cell's final
# expression, so the returned module object is shown as the cell output.
importlib.reload(ucdd_eval_real_world)
importlib.reload(ucdd)
importlib.reload(kmeans_verbose_helpers)

something


<module 'eval_helpers.kmeans_verbose_helpers' from 'C:\\Users\\jpohl\\PycharmProjects\\clustering-drift-detection\\ucdd_improved\\eval_helpers\\kmeans_verbose_helpers.py'>

## Experiments with target encoding

### Find suitable KMeans clustering parameters

In [25]:
# Run the verbose KMeans helper on the target-encoded reference batches. The first
# argument is the path prefix of the text files it writes; the returned summary is
# displayed as the cell output.
kmeans_verbose_helpers.write_kmeans_results_ucdd_helper(
    'airlines_kmeans_target/airlines_kmeans_target_output',
    X_ref_batches_target,
    n_init=100,
    max_iter=500,
    tol=0,
    random_state=1053,
)


filename airlines_kmeans_target/airlines_kmeans_target_output0.txt
random state: 1053
filename airlines_kmeans_target/airlines_kmeans_target_output1.txt
random state: 1053
filename airlines_kmeans_target/airlines_kmeans_target_output2.txt
random state: 1053
filename airlines_kmeans_target/airlines_kmeans_target_output3.txt
random state: 1053
filename airlines_kmeans_target/airlines_kmeans_target_output4.txt
random state: 1053
filename airlines_kmeans_target/airlines_kmeans_target_output5.txt
random state: 1053
filename airlines_kmeans_target/airlines_kmeans_target_output6.txt
random state: 1053
filename airlines_kmeans_target/airlines_kmeans_target_output7.txt
random state: 1053
filename airlines_kmeans_target/airlines_kmeans_target_output8.txt
random state: 1053
filename airlines_kmeans_target/airlines_kmeans_target_output9.txt
random state: 1053
{'total_max_iterations': 42, 'total_min_init_inertia': 4637.488648502701, 'total_max_init_inertia': 12548.678845293836, 'total_min_final_ine

### Use UCDD directly to obtain inspectable results

In [32]:
from core import ucdd
import importlib
importlib.reload(ucdd)

# Settings shared by both seeded runs on the target-encoded batches.
ucdd_settings_target = {
    'additional_check': True,
    'n_init': 100,
    'max_iter': 42000,
    'tol': 0,
    'reference_label_batches': y_ref_batches,
    'testing_label_batches': y_test_batches,
}

all_2d_drifts_target, all_2d_cluster_classif_accs_target = [], []

# Run UCDD once per seed and keep both returned structures of every run.
for random_state in (0, 100):
    drifts_2d_arr_target, cluster_classif_accs_2d_arr_target = (
        ucdd.all_drifting_batches_parallel_all_info(
            X_ref_batches_target,
            X_test_batches_target,
            random_state=random_state,
            **ucdd_settings_target,
        )
    )
    all_2d_drifts_target.append(drifts_2d_arr_target)
    all_2d_cluster_classif_accs_target.append(cluster_classif_accs_2d_arr_target)

# Persist both result lists in one dict so a later cell can reload them from disk.
result_dict_target = {
    'all_2d_drifts_target': all_2d_drifts_target,
    'all_2d_cluster_classif_accs_target': all_2d_cluster_classif_accs_target,
}
np.save('airlines_stats_target.npy', result_dict_target)

random_state
0
pool opened
random_state
100
pool opened


In [33]:
# Reload the statistics written to 'airlines_stats_target.npy'. The file holds a pickled
# dict, hence allow_pickle=True and .item(); only load files produced by this notebook,
# because unpickling untrusted data can execute arbitrary code.
result_dict_target = np.load('airlines_stats_target.npy', allow_pickle=True).item()
all_2d_drifts_target = result_dict_target['all_2d_drifts_target']
print('total number of drift detections:', len(X_ref_batches_target) * len(X_test_batches_target))
print('number of drift detection differences in the two runs:')
# Convert to arrays before comparing so that `!=` is evaluated element-wise even if the
# stored results are nested lists (list != list would collapse to a single bool).
num_diffs = np.sum(np.asarray(all_2d_drifts_target[0]) != np.asarray(all_2d_drifts_target[1]))
print(num_diffs)

total number of drift detections: 210
number of drift detection differences in the two runs:
0


### Use the selected KMeans parameters for the evaluation

In [5]:
import csv
import importlib
import random
import time

# Import through the same `core` package path as the other cells of this notebook, so
# that one module object is imported and reloaded everywhere.
from core import ucdd

importlib.reload(ucdd)

# # because of time constraints, only use 20% of all reference batches
# print('number of reference batches')
# print(len(X_ref_batches_target))
# num_ref_batches_to_use = int(0.2 * len(X_ref_batches_target))
#
# random.seed(0)
# X_ref_batches_target_choice = random.sample(X_ref_batches_target, num_ref_batches_to_use)
# print('number of batches in the choice')
# print(len(X_ref_batches_target_choice))
# print(X_ref_batches_target_choice)

# Run the drift detection on the target-encoded batches once per seed and collect the
# result of every run; the whole loop is timed.
target_detection_list = []
start_time = time.time()
for first_random_state in [0, 100]:
    drifts_detected = ucdd.all_drifting_batches_parallel(
        X_ref_batches_target,
        X_test_batches_target,
        # NOTE(review): presumably the share of reference batches that must signal
        # drift for a testing batch to be flagged - confirm in core.ucdd.
        min_ref_batches_drift=0.3,
        additional_check=True,
        n_init=100,
        max_iter=42000,
        tol=0,
        random_state=first_random_state
    )
    target_detection_list.append(drifts_detected)

print('target detection list')
print(target_detection_list)
print('execution time')
print(time.time() - start_time)
# The csv module requires files to be opened with newline=''; without it every row is
# followed by an empty line on Windows.
with open('airlines_target.csv', 'w', newline='') as f:
    wrtr = csv.writer(f)
    wrtr.writerows(target_detection_list)

random_state
0
random_state
100
target detection list
[[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False]]
execution time
229.92048907279968


## Experiments with onehot encoding

### Find suitable KMeans clustering parameters

In [26]:
# Run the verbose KMeans helper on the one-hot-encoded reference batches. The first
# argument is the path prefix of the text files it writes; the returned summary is
# displayed as the cell output.
kmeans_verbose_helpers.write_kmeans_results_ucdd_helper(
    'airlines_kmeans_onehot/airlines_kmeans_onehot_output',
    X_ref_batches_onehot,
    n_init=100,
    max_iter=500,
    tol=0,
    random_state=1053,
)


filename airlines_kmeans_onehot/airlines_kmeans_onehot_output0.txt
random state: 1053
filename airlines_kmeans_onehot/airlines_kmeans_onehot_output1.txt
random state: 1053
filename airlines_kmeans_onehot/airlines_kmeans_onehot_output2.txt
random state: 1053
filename airlines_kmeans_onehot/airlines_kmeans_onehot_output3.txt
random state: 1053
filename airlines_kmeans_onehot/airlines_kmeans_onehot_output4.txt
random state: 1053
filename airlines_kmeans_onehot/airlines_kmeans_onehot_output5.txt
random state: 1053
filename airlines_kmeans_onehot/airlines_kmeans_onehot_output6.txt
random state: 1053
filename airlines_kmeans_onehot/airlines_kmeans_onehot_output7.txt
random state: 1053
filename airlines_kmeans_onehot/airlines_kmeans_onehot_output8.txt
random state: 1053
filename airlines_kmeans_onehot/airlines_kmeans_onehot_output9.txt
random state: 1053
{'total_max_iterations': 16, 'total_min_init_inertia': 51567.30607214207, 'total_max_init_inertia': 71575.79111583055, 'total_min_final_iner

### Use UCDD directly to obtain inspectable results

In [34]:
from core import ucdd
import importlib
importlib.reload(ucdd)

# Settings shared by both seeded runs on the one-hot-encoded batches.
ucdd_settings_onehot = {
    'additional_check': True,
    'n_init': 100,
    'max_iter': 16000,
    'tol': 0,
    'reference_label_batches': y_ref_batches,
    'testing_label_batches': y_test_batches,
}

all_2d_drifts_onehot, all_2d_cluster_classif_accs_onehot = [], []

# Run UCDD once per seed and keep both returned structures of every run.
for random_state in (0, 100):
    drifts_2d_arr_onehot, cluster_classif_accs_2d_arr_onehot = (
        ucdd.all_drifting_batches_parallel_all_info(
            X_ref_batches_onehot,
            X_test_batches_onehot,
            random_state=random_state,
            **ucdd_settings_onehot,
        )
    )
    all_2d_drifts_onehot.append(drifts_2d_arr_onehot)
    all_2d_cluster_classif_accs_onehot.append(cluster_classif_accs_2d_arr_onehot)

# Persist both result lists in one dict so a later cell can reload them from disk.
result_dict_onehot = {
    'all_2d_drifts_onehot': all_2d_drifts_onehot,
    'all_2d_cluster_classif_accs_onehot': all_2d_cluster_classif_accs_onehot,
}
np.save('airlines_stats_onehot.npy', result_dict_onehot)

random_state
0
pool opened
random_state
100
pool opened


In [35]:
# Reload the statistics written to 'airlines_stats_onehot.npy'. The file holds a pickled
# dict, hence allow_pickle=True and .item(); only load files produced by this notebook,
# because unpickling untrusted data can execute arbitrary code.
result_dict_onehot = np.load('airlines_stats_onehot.npy', allow_pickle=True).item()
all_2d_drifts_onehot = result_dict_onehot['all_2d_drifts_onehot']
print('total number of drift detections:', len(X_ref_batches_onehot) * len(X_test_batches_onehot))
print('number of drift detection differences in the two runs:')
# Convert to arrays before comparing so that `!=` is evaluated element-wise even if the
# stored results are nested lists (list != list would collapse to a single bool).
num_diffs = np.sum(np.asarray(all_2d_drifts_onehot[0]) != np.asarray(all_2d_drifts_onehot[1]))
print(num_diffs)

total number of drift detections: 210
number of drift detection differences in the two runs:
0


### Use the selected KMeans parameters for the evaluation

In [6]:
import csv
import importlib
import random
import time

# Import through the same `core` package path as the other cells of this notebook, so
# that one module object is imported and reloaded everywhere.
from core import ucdd

importlib.reload(ucdd)

# # because of time constraints, only use 20% of all reference batches
# print('number of reference batches')
# print(len(X_ref_batches_onehot))
# num_ref_batches_to_use = int(0.2 * len(X_ref_batches_onehot))
#
# random.seed(0)
# X_ref_batches_onehot_choice = random.sample(X_ref_batches_onehot, num_ref_batches_to_use)
# print('number of batches in the choice')
# print(len(X_ref_batches_onehot_choice))
# print(X_ref_batches_onehot_choice)

# Run the drift detection on the one-hot-encoded batches once per seed and collect the
# result of every run; the whole loop is timed.
onehot_detection_list = []
start_time = time.time()
for first_random_state in [0, 100]:
    drifts_detected = ucdd.all_drifting_batches_parallel(
        X_ref_batches_onehot,
        X_test_batches_onehot,
        # NOTE(review): presumably the share of reference batches that must signal
        # drift for a testing batch to be flagged - confirm in core.ucdd.
        min_ref_batches_drift=0.3,
        additional_check=True,
        n_init=100,
        max_iter=16000,
        tol=0,
        random_state=first_random_state
    )
    onehot_detection_list.append(drifts_detected)

print('onehot detection list')
print(onehot_detection_list)
print('execution time')
print(time.time() - start_time)
# The csv module requires files to be opened with newline=''; without it every row is
# followed by an empty line on Windows.
with open('airlines_onehot.csv', 'w', newline='') as f:
    wrtr = csv.writer(f)
    wrtr.writerows(onehot_detection_list)


random_state
0
random_state
100
onehot detection list
[[False, False, False, False, False, False, True, True, False, True, False, False, False, True, True, True, False, False, False, False, False], [False, False, False, False, False, False, True, True, False, True, False, False, False, True, True, True, False, False, False, False, False]]
execution time
282.94530868530273


## Experiments with excluded categories

### Find suitable KMeans clustering parameters

In [27]:
# Run the verbose KMeans helper on the numeric-only ("exclude") reference batches. The
# first argument is the path prefix of the text files it writes; the returned summary is
# displayed as the cell output.
kmeans_verbose_helpers.write_kmeans_results_ucdd_helper(
    'airlines_kmeans_exclude/airlines_kmeans_exclude_output',
    X_ref_batches_exclude,
    n_init=100,
    max_iter=500,
    tol=0,
    random_state=1053,
)


filename airlines_kmeans_exclude/airlines_kmeans_exclude_output0.txt
random state: 1053
filename airlines_kmeans_exclude/airlines_kmeans_exclude_output1.txt
random state: 1053
filename airlines_kmeans_exclude/airlines_kmeans_exclude_output2.txt
random state: 1053
filename airlines_kmeans_exclude/airlines_kmeans_exclude_output3.txt
random state: 1053
filename airlines_kmeans_exclude/airlines_kmeans_exclude_output4.txt
random state: 1053
filename airlines_kmeans_exclude/airlines_kmeans_exclude_output5.txt
random state: 1053
filename airlines_kmeans_exclude/airlines_kmeans_exclude_output6.txt
random state: 1053
filename airlines_kmeans_exclude/airlines_kmeans_exclude_output7.txt
random state: 1053
filename airlines_kmeans_exclude/airlines_kmeans_exclude_output8.txt
random state: 1053
filename airlines_kmeans_exclude/airlines_kmeans_exclude_output9.txt
random state: 1053
{'total_max_iterations': 24, 'total_min_init_inertia': 2193.374855109041, 'total_max_init_inertia': 8673.080383545612, '

### Use UCDD directly to obtain inspectable results

In [17]:
from core import ucdd
import importlib
importlib.reload(ucdd)

all_2d_drifts_exclude = []
all_2d_cluster_classif_accs_exclude = []

# Run UCDD once per seed on the numeric-only ("exclude") batches and keep both returned
# structures of every run.
for random_state in [0, 100]:
    drifts_2d_arr_exclude, cluster_classif_accs_2d_arr_exclude = ucdd.all_drifting_batches_parallel_all_info(
        X_ref_batches_exclude,
        X_test_batches_exclude,
        additional_check=True,
        # Same KMeans settings as the exclude evaluation cell (n_init=100, max_iter=24000),
        # so the saved statistics describe the configuration that is actually evaluated.
        # The previous quick-run values (n_init=5, max_iter=20) capped KMeans below the
        # 24 iterations reported by the parameter search.
        n_init=100,
        max_iter=24000,
        tol=0,
        random_state=random_state,
        reference_label_batches=y_ref_batches,
        testing_label_batches=y_test_batches
    )
    all_2d_drifts_exclude.append(drifts_2d_arr_exclude)
    all_2d_cluster_classif_accs_exclude.append(cluster_classif_accs_2d_arr_exclude)

# Persist both result lists in one dict so a later cell can reload them from disk.
result_dict_exclude = {
    'all_2d_drifts_exclude': all_2d_drifts_exclude,
    'all_2d_cluster_classif_accs_exclude': all_2d_cluster_classif_accs_exclude
}
np.save('airlines_stats_exclude.npy', result_dict_exclude)

random_state
0
pool opened
random_state
100
pool opened


In [18]:
# Reload the statistics written to 'airlines_stats_exclude.npy'. The file holds a pickled
# dict, hence allow_pickle=True and .item(); only load files produced by this notebook,
# because unpickling untrusted data can execute arbitrary code.
result_dict_exclude = np.load('airlines_stats_exclude.npy', allow_pickle=True).item()
all_2d_drifts_exclude = result_dict_exclude['all_2d_drifts_exclude']
print('total number of drift detections:', len(X_ref_batches_exclude) * len(X_test_batches_exclude))
print('number of drift detection differences in the two runs:')
# Convert to arrays before comparing so that `!=` is evaluated element-wise even if the
# stored results are nested lists (list != list would collapse to a single bool).
num_diffs = np.sum(np.asarray(all_2d_drifts_exclude[0]) != np.asarray(all_2d_drifts_exclude[1]))
print(num_diffs)


total number of drift detections: 210
number of drift detection differences in the two runs:
0


### Use the selected KMeans parameters for the evaluation

In [7]:
import csv
import importlib
import random
import time

# Import through the same `core` package path as the other cells of this notebook, so
# that one module object is imported and reloaded everywhere.
from core import ucdd

importlib.reload(ucdd)

# # because of time constraints, only use 20% of all reference batches
# print('number of reference batches')
# print(len(X_ref_batches_exclude))
# num_ref_batches_to_use = int(0.2 * len(X_ref_batches_exclude))
#
# random.seed(0)
# X_ref_batches_exclude_choice = random.sample(X_ref_batches_exclude, num_ref_batches_to_use)
# print('number of batches in the choice')
# print(len(X_ref_batches_exclude_choice))
# print(X_ref_batches_exclude_choice)

# Run the drift detection on the numeric-only ("exclude") batches once per seed and
# collect the result of every run; the whole loop is timed.
exclude_detection_list = []
start_time = time.time()
for first_random_state in [0, 100]:
    drifts_detected = ucdd.all_drifting_batches_parallel(
        X_ref_batches_exclude,
        X_test_batches_exclude,
        # NOTE(review): presumably the share of reference batches that must signal
        # drift for a testing batch to be flagged - confirm in core.ucdd.
        min_ref_batches_drift=0.3,
        additional_check=True,
        n_init=100,
        max_iter=24000,
        tol=0,
        random_state=first_random_state
    )
    exclude_detection_list.append(drifts_detected)

print('exclude detection list')
print(exclude_detection_list)
print('execution time')
print(time.time() - start_time)
# The csv module requires files to be opened with newline=''; without it every row is
# followed by an empty line on Windows.
with open('airlines_exclude.csv', 'w', newline='') as f:
    wrtr = csv.writer(f)
    wrtr.writerows(exclude_detection_list)

random_state
0
random_state
100
exclude detection list
[[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]]
execution time
177.9049210548401
