# Evaluation of UCDD on AGRAW2

## Imports

In [15]:
import numpy as np
import pandas as pd
import sklearn

## AGRAW2 dataset locations

In [16]:
abrupt_agraw2_path = '../Datasets_concept_drift/synthetic_data/abrupt_drift/agraw2_1_abrupt_drift_0_noise_balanced.arff'
gradual_agraw2_paths = [
    '../Datasets_concept_drift/synthetic_data/gradual_drift/agraw2_1_gradual_drift_0_noise_balanced_05.arff',
    '../Datasets_concept_drift/synthetic_data/gradual_drift/agraw2_1_gradual_drift_0_noise_balanced_1.arff',
    '../Datasets_concept_drift/synthetic_data/gradual_drift/agraw2_1_gradual_drift_0_noise_balanced_5.arff',
    '../Datasets_concept_drift/synthetic_data/gradual_drift/agraw2_1_gradual_drift_0_noise_balanced_10.arff',
    '../Datasets_concept_drift/synthetic_data/gradual_drift/agraw2_1_gradual_drift_0_noise_balanced_20.arff'
]

all_agraw2_data_paths = [abrupt_agraw2_path] + gradual_agraw2_paths

## Accept and preprocess AGRAW2 datasets

In [17]:
import accepting
from sklearn.preprocessing import MinMaxScaler
from category_encoders import TargetEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split


agraw2_exclude_reference_batches = {}
agraw2_exclude_testing_batches = {}
agraw2_onehot_reference_batches = {}
agraw2_onehot_testing_batches = {}
agraw2_target_reference_batches = {}
agraw2_target_testing_batches = {}


# agraw2 with categories excluded
for file_path in all_agraw2_data_paths:
    df_x, df_y = accepting.get_clean_df(file_path)
    df_y = pd.DataFrame(LabelEncoder().fit_transform(df_y))

    df_x_ref, df_x_test, df_y_ref, df_y_test = sklearn.model_selection.train_test_split(
        df_x, df_y, test_size=0.7, shuffle=False)
    
    df_x_ref_num, df_x_ref_cat = accepting.divide_numeric_categorical(df_x_ref)
    df_x_test_num, df_x_test_cat = accepting.divide_numeric_categorical(df_x_test)
    
    reference_data = df_x_ref_num.to_numpy()
    testing_data = df_x_test_num.to_numpy()
    scaler = MinMaxScaler()
    scaler.fit(reference_data)
    reference_data = scaler.transform(reference_data)
    testing_data = scaler.transform(testing_data)
    
    num_ref_batches = 3
    num_test_batches = 7
    ref_batches = np.array_split(reference_data, num_ref_batches)
    test_batches = np.array_split(testing_data, num_test_batches)
    
    agraw2_exclude_reference_batches[file_path] = ref_batches
    agraw2_exclude_testing_batches[file_path] = test_batches
    
print('agraw2 exclude')
print(agraw2_exclude_reference_batches)
print(agraw2_exclude_testing_batches)

# agraw2 with categories onehot encoded
for file_path in all_agraw2_data_paths:
    df_x, df_y = accepting.get_clean_df(file_path)
    df_y = pd.DataFrame(LabelEncoder().fit_transform(df_y))

    df_x_ref, df_x_test, df_y_ref, df_y_test = sklearn.model_selection.train_test_split(
        df_x, df_y, test_size=0.7, shuffle=False)
    
    df_x_ref_num, df_x_ref_cat = accepting.divide_numeric_categorical(df_x_ref)
    df_x_test_num, df_x_test_cat = accepting.divide_numeric_categorical(df_x_test)
    
    ref_index = df_x_ref_cat.index
    test_index = df_x_test_cat.index
    encoder = OneHotEncoder(sparse=False)
    encoder.fit(df_x_ref_cat)
    df_x_ref_cat_transformed = pd.DataFrame(encoder.transform(df_x_ref_cat))
    df_x_test_cat_transformed = pd.DataFrame(encoder.transform(df_x_test_cat))
    df_x_ref_cat_transformed.set_index(ref_index, inplace=True)
    df_x_test_cat_transformed.set_index(test_index, inplace=True)
    
    reference_data = df_x_ref_num.join(df_x_ref_cat_transformed, lsuffix='_num').to_numpy()
    testing_data = df_x_test_num.join(df_x_test_cat_transformed, lsuffix='_num').to_numpy()
    scaler = MinMaxScaler()
    scaler.fit(reference_data)
    reference_data = scaler.transform(reference_data)
    testing_data = scaler.transform(testing_data)
    
    num_ref_batches = 3
    num_test_batches = 7
    ref_batches = np.array_split(reference_data, num_ref_batches)
    test_batches = np.array_split(testing_data, num_test_batches)
    
    agraw2_onehot_reference_batches[file_path] = ref_batches
    agraw2_onehot_testing_batches[file_path] = test_batches
    
print('agraw2 onehot')
print(agraw2_onehot_reference_batches)
print(agraw2_onehot_testing_batches)

# agraw2 with categories target encoded
for file_path in all_agraw2_data_paths:
    df_x, df_y = accepting.get_clean_df(file_path)
    df_y = pd.DataFrame(LabelEncoder().fit_transform(df_y))

    df_x_ref, df_x_test, df_y_ref, df_y_test = sklearn.model_selection.train_test_split(
        df_x, df_y, test_size=0.7, shuffle=False)
    
    df_x_ref_num, df_x_ref_cat = accepting.divide_numeric_categorical(df_x_ref)
    df_x_test_num, df_x_test_cat = accepting.divide_numeric_categorical(df_x_test)
    
    ref_index = df_x_ref_cat.index
    test_index = df_x_test_cat.index
    encoder = TargetEncoder()
    encoder.fit(df_x_ref_cat, df_y_ref)
    df_x_ref_cat_transformed = pd.DataFrame(encoder.transform(df_x_ref_cat))
    df_x_test_cat_transformed = pd.DataFrame(encoder.transform(df_x_test_cat))
    df_x_ref_cat_transformed.set_index(ref_index, inplace=True)
    df_x_test_cat_transformed.set_index(test_index, inplace=True)
    
    reference_data = df_x_ref_num.join(df_x_ref_cat_transformed, lsuffix='_num').to_numpy()
    testing_data = df_x_test_num.join(df_x_test_cat_transformed, lsuffix='_num').to_numpy()
    scaler = MinMaxScaler()
    scaler.fit(reference_data)
    reference_data = scaler.transform(reference_data)
    testing_data = scaler.transform(testing_data)
    
    num_ref_batches = 3
    num_test_batches = 7
    ref_batches = np.array_split(reference_data, num_ref_batches)
    test_batches = np.array_split(testing_data, num_test_batches)
    
    agraw2_target_reference_batches[file_path] = ref_batches
    agraw2_target_testing_batches[file_path] = test_batches
    
print('agraw2 target')
print(agraw2_target_reference_batches)
print(agraw2_target_testing_batches)

df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995  138438.140611      0.000000  50.0  b'level2'   b'car8'  b'zipcode6'   
99996   90804.449088      0.000000  77.0  b'level4'  b'car13'  b'zipcode7'   
99997  106569.365368      0.000000  51.0  b'level1'   b'car8'  b'zipcode6'   
99998   84356.498251      0.000000  36.0  b'level1'  b'car18'  b'zipcode5'   
99999   87473.968095      0.000000  54.0  b'level0'  b'car11'  b'zipcode3'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995   39396.704989  18582.273595  72.0  b'level1'  b'car16'  b'zipcode5'   
99996  138438.140611      0.000000  50.0  b'level2'   b'car8'  b'zipcode6'   
99997   90804.449088      0.000000  77.0  b'level4'  b'car13'  b'zipcode7'   
99998  106569.365368      0.000000  51.0  b'level1'   b'car8'  b'zipcode6'   
99999   84356.498251      0.000000  36.0  b'level1'  b'car18'  b'zipcode5'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995  111359.652359      0.000000  61.0  b'level1'  b'car15'  b'zipcode1'   
99996  147593.942041      0.000000  59.0  b'level4'  b'car17'  b'zipcode6'   
99997  123948.303642      0.000000  66.0  b'level4'  b'car11'  b'zipcode4'   
99998   71333.696488  11799.103316  58.0  b'level1'   b'car5'  b'zipcode3'   
99999   39396.704989  18582.273595  72.0  b'level1'  b'car16'  b'zipcode5'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995   71983.840341  19299.479268  45.0  b'level2'   b'car7'  b'zipcode8'   
99996   67069.529360  46815.593925  30.0  b'level3'  b'car14'  b'zipcode5'   
99997  104177.660368      0.000000  48.0  b'level1'   b'car9'  b'zipcode5'   
99998  143771.939648      0.000000  71.0  b'level3'  b'car20'  b'zipcode6'   
99999   36163.251675  59471.494372  53.0  b'level0'  b'car18'  b'zipcode5'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995   68222.567612  33141.251698  41.0  b'level2'   b'car8'  b'zipcode1'   
99996   50428.438359  59324.405616  26.0  b'level4'   b'car1'  b'zipcode2'   
99997  112914.978798      0.000000  41.0  b'level1'   b'car4'  b'zipcode4'   
99998   90178.866228      0.000000  60.0  b'level1'  b'car13'  b'zipcode6'   
99999   37082.251910  22468.199196  43.0  b'level2'  b'car20'  b'zipcode1'   

             hvalue  hyears           loan      class  
0   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995   80507.066152      0.000000  59.0  b'level1'   b'car8'  b'zipcode7'   
99996   43953.844702  23112.195013  36.0  b'level1'  b'car19'  b'zipcode6'   
99997  127958.679326      0.000000  59.0  b'level2'   b'car8'  b'zipcode7'   
99998   58150.729498  54772.811468  66.0  b'level3'  b'car15'  b'zipcode8'   
99999  120616.756315      0.000000  59.0  b'level2'   b'car3'  b'zipcode3'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995  138438.140611      0.000000  50.0  b'level2'   b'car8'  b'zipcode6'   
99996   90804.449088      0.000000  77.0  b'level4'  b'car13'  b'zipcode7'   
99997  106569.365368      0.000000  51.0  b'level1'   b'car8'  b'zipcode6'   
99998   84356.498251      0.000000  36.0  b'level1'  b'car18'  b'zipcode5'   
99999   87473.968095      0.000000  54.0  b'level0'  b'car11'  b'zipcode3'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995   39396.704989  18582.273595  72.0  b'level1'  b'car16'  b'zipcode5'   
99996  138438.140611      0.000000  50.0  b'level2'   b'car8'  b'zipcode6'   
99997   90804.449088      0.000000  77.0  b'level4'  b'car13'  b'zipcode7'   
99998  106569.365368      0.000000  51.0  b'level1'   b'car8'  b'zipcode6'   
99999   84356.498251      0.000000  36.0  b'level1'  b'car18'  b'zipcode5'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995  111359.652359      0.000000  61.0  b'level1'  b'car15'  b'zipcode1'   
99996  147593.942041      0.000000  59.0  b'level4'  b'car17'  b'zipcode6'   
99997  123948.303642      0.000000  66.0  b'level4'  b'car11'  b'zipcode4'   
99998   71333.696488  11799.103316  58.0  b'level1'   b'car5'  b'zipcode3'   
99999   39396.704989  18582.273595  72.0  b'level1'  b'car16'  b'zipcode5'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995   71983.840341  19299.479268  45.0  b'level2'   b'car7'  b'zipcode8'   
99996   67069.529360  46815.593925  30.0  b'level3'  b'car14'  b'zipcode5'   
99997  104177.660368      0.000000  48.0  b'level1'   b'car9'  b'zipcode5'   
99998  143771.939648      0.000000  71.0  b'level3'  b'car20'  b'zipcode6'   
99999   36163.251675  59471.494372  53.0  b'level0'  b'car18'  b'zipcode5'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995   68222.567612  33141.251698  41.0  b'level2'   b'car8'  b'zipcode1'   
99996   50428.438359  59324.405616  26.0  b'level4'   b'car1'  b'zipcode2'   
99997  112914.978798      0.000000  41.0  b'level1'   b'car4'  b'zipcode4'   
99998   90178.866228      0.000000  60.0  b'level1'  b'car13'  b'zipcode6'   
99999   37082.251910  22468.199196  43.0  b'level2'  b'car20'  b'zipcode1'   

             hvalue  hyears           loan      class  
0   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995   80507.066152      0.000000  59.0  b'level1'   b'car8'  b'zipcode7'   
99996   43953.844702  23112.195013  36.0  b'level1'  b'car19'  b'zipcode6'   
99997  127958.679326      0.000000  59.0  b'level2'   b'car8'  b'zipcode7'   
99998   58150.729498  54772.811468  66.0  b'level3'  b'car15'  b'zipcode8'   
99999  120616.756315      0.000000  59.0  b'level2'   b'car3'  b'zipcode3'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


agraw2 onehot
{'../Datasets_concept_drift/synthetic_data/abrupt_drift/agraw2_1_abrupt_drift_0_noise_balanced.arff': [array([[0.29406669, 0.57232188, 0.95      , ..., 0.        , 0.        ,
        0.        ],
       [0.274077  , 0.29819404, 0.11666667, ..., 0.        , 0.        ,
        0.        ],
       [0.97390317, 0.        , 0.1       , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.03719466, 0.95578652, 0.31666667, ..., 0.        , 0.        ,
        0.        ],
       [0.31007942, 0.89456625, 0.53333333, ..., 0.        , 0.        ,
        0.        ],
       [0.34635195, 0.54364268, 0.65      , ..., 0.        , 0.        ,
        0.        ]]), array([[0.83522758, 0.        , 0.71666667, ..., 0.        , 0.        ,
        0.        ],
       [0.09902868, 0.65900734, 0.48333333, ..., 0.        , 0.        ,
        0.        ],
       [0.85813019, 0.        , 0.93333333, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.3

df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995  138438.140611      0.000000  50.0  b'level2'   b'car8'  b'zipcode6'   
99996   90804.449088      0.000000  77.0  b'level4'  b'car13'  b'zipcode7'   
99997  106569.365368      0.000000  51.0  b'level1'   b'car8'  b'zipcode6'   
99998   84356.498251      0.000000  36.0  b'level1'  b'car18'  b'zipcode5'   
99999   87473.968095      0.000000  54.0  b'level0'  b'car11'  b'zipcode3'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995   39396.704989  18582.273595  72.0  b'level1'  b'car16'  b'zipcode5'   
99996  138438.140611      0.000000  50.0  b'level2'   b'car8'  b'zipcode6'   
99997   90804.449088      0.000000  77.0  b'level4'  b'car13'  b'zipcode7'   
99998  106569.365368      0.000000  51.0  b'level1'   b'car8'  b'zipcode6'   
99999   84356.498251      0.000000  36.0  b'level1'  b'car18'  b'zipcode5'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995  111359.652359      0.000000  61.0  b'level1'  b'car15'  b'zipcode1'   
99996  147593.942041      0.000000  59.0  b'level4'  b'car17'  b'zipcode6'   
99997  123948.303642      0.000000  66.0  b'level4'  b'car11'  b'zipcode4'   
99998   71333.696488  11799.103316  58.0  b'level1'   b'car5'  b'zipcode3'   
99999   39396.704989  18582.273595  72.0  b'level1'  b'car16'  b'zipcode5'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995   71983.840341  19299.479268  45.0  b'level2'   b'car7'  b'zipcode8'   
99996   67069.529360  46815.593925  30.0  b'level3'  b'car14'  b'zipcode5'   
99997  104177.660368      0.000000  48.0  b'level1'   b'car9'  b'zipcode5'   
99998  143771.939648      0.000000  71.0  b'level3'  b'car20'  b'zipcode6'   
99999   36163.251675  59471.494372  53.0  b'level0'  b'car18'  b'zipcode5'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995   68222.567612  33141.251698  41.0  b'level2'   b'car8'  b'zipcode1'   
99996   50428.438359  59324.405616  26.0  b'level4'   b'car1'  b'zipcode2'   
99997  112914.978798      0.000000  41.0  b'level1'   b'car4'  b'zipcode4'   
99998   90178.866228      0.000000  60.0  b'level1'  b'car13'  b'zipcode6'   
99999   37082.251910  22468.199196  43.0  b'level2'  b'car20'  b'zipcode1'   

             hvalue  hyears           loan      class  
0   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


df               salary    commission   age     elevel       car      zipcode  \
0       58227.414161  42921.435776  77.0  b'level2'   b'car6'  b'zipcode5'   
1       55628.982351  22363.143593  27.0  b'level3'  b'car17'  b'zipcode3'   
2      146598.421289      0.000000  26.0  b'level2'  b'car13'  b'zipcode5'   
3       87701.290931      0.000000  52.0  b'level4'  b'car16'  b'zipcode5'   
4       83888.656723      0.000000  72.0  b'level3'  b'car11'  b'zipcode5'   
...              ...           ...   ...        ...       ...          ...   
99995   80507.066152      0.000000  59.0  b'level1'   b'car8'  b'zipcode7'   
99996   43953.844702  23112.195013  36.0  b'level1'  b'car19'  b'zipcode6'   
99997  127958.679326      0.000000  59.0  b'level2'   b'car8'  b'zipcode7'   
99998   58150.729498  54772.811468  66.0  b'level3'  b'car15'  b'zipcode8'   
99999  120616.756315      0.000000  59.0  b'level2'   b'car3'  b'zipcode3'   

              hvalue  hyears           loan      class  
0  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.decode('utf-8')
  y = column_or_1d(y, warn=True)


agraw2 target
{'../Datasets_concept_drift/synthetic_data/abrupt_drift/agraw2_1_abrupt_drift_0_noise_balanced.arff': [array([[0.29406669, 0.57232188, 0.95      , ..., 0.51653096, 0.        ,
        0.87825406],
       [0.274077  , 0.29819404, 0.11666667, ..., 0.        , 0.57218266,
        1.        ],
       [0.97390317, 0.        , 0.1       , ..., 0.51653096, 0.26878201,
        0.87825406],
       ...,
       [0.03719466, 0.95578652, 0.31666667, ..., 1.        , 0.57218266,
        1.        ],
       [0.31007942, 0.89456625, 0.53333333, ..., 1.        , 0.35827864,
        0.81404917],
       [0.34635195, 0.54364268, 0.65      , ..., 0.        , 0.35601348,
        0.52543549]]), array([[0.83522758, 0.        , 0.71666667, ..., 0.4172376 , 0.64208153,
        0.        ],
       [0.09902868, 0.65900734, 0.48333333, ..., 0.4172376 , 0.57887399,
        0.        ],
       [0.85813019, 0.        , 0.93333333, ..., 1.        , 0.69608301,
        0.5830176 ],
       ...,
       [0.3

## Helper function

In [18]:
import kmeans_verbose_helpers


def write_kmeans_results_ucdd_helper(output_filename_no_extension, ref_batches, n_init, max_iter, tol, random_state):
    # dummy = [np.asarray(1), np.asarray(2), np.asarray(3)]
    combinations = []
    for i in range(3):
    #     combinations.append(np.vstack((dummy[i], dummy[(i + 1) % 3])))
        combinations.append(np.vstack((ref_batches[i], ref_batches[(i + 1) % 3])))
        
    all_results_from_combinations = []
    for i, combination in enumerate(combinations):
        filename = output_filename_no_extension + str(i) + '.txt'
        print('filename', filename)
        kmeans_verbose_helpers.write_verbose_kmeans_to_file(result_filename=output_filename_no_extension + str(i) + '.txt',
                                     data_to_cluster=combination,
                                     n_clusters=2, n_init=n_init, max_iter=max_iter, tol=tol, random_state=random_state)
        output_dicts = kmeans_verbose_helpers.convert_kmeans_output_file_to_dicts(filename, n_init=n_init)
        all_results_from_combinations.append(output_dicts)
        kmeans_verbose_helpers.print_stats_from_kmeans_output_dicts(output_dicts)
        
    kmeans_verbose_helpers.print_stats_from_all_combinations(all_results_from_combinations)

## AGRAW2 with categories excluded

### Find the best tol and max_iter (the drift type is irrelevant)

In [19]:
write_kmeans_results_ucdd_helper('agraw2_exclude_new_output', agraw2_exclude_reference_batches[abrupt_agraw2_path],
                                 n_init=100, max_iter=500, tol=0,
                                 random_state=1053)

filename agraw2_exclude_new_output0.txt
random state: 1053
total number of results: 100
maximum number of iterations: 18
minimum initial inertia: 9083.316503699725
maximum initial inertia: 17483.991310771078
number of unique final inertia values: 2
minimum final inertia: 7136.6496570932595
maximum final inertia: 7136.64965709326
total number of convergences: 100
number of strict convergences: 100
number of tol-based convergences: 0
filename agraw2_exclude_new_output1.txt
random state: 1053
total number of results: 100
maximum number of iterations: 26
minimum initial inertia: 9259.811965492709
maximum initial inertia: 17336.6024958001
number of unique final inertia values: 4
minimum final inertia: 7151.44522595734
maximum final inertia: 7151.445344936363
total number of convergences: 100
number of strict convergences: 100
number of tol-based convergences: 0
filename agraw2_exclude_new_output2.txt
random state: 1053
total number of results: 100
maximum number of iterations: 43
minimum in

### Use them for the analysis

In [22]:
import ucdd_eval
import ucdd_supported_parameters as spms



agraw2_exclude_stats = {}
    
for agraw2_path in all_agraw2_data_paths:
    runs_results_bool, final_fpr_mean, fpr_std_err, final_latency_mean, latency_std_err = \
        ucdd_eval.all_drifting_batches_randomness_robust(
        agraw2_exclude_reference_batches[agraw2_path],
        agraw2_exclude_testing_batches[agraw2_path],
        train_batch_strategy=spms.TrainBatchStrategies.SUBMAJORITY,
        additional_check=True,
        n_init=100,
        max_iter=43000,
        tol=0,
        true_drift_idx=2,
        min_runs=2
    )
    agraw2_exclude_stats[agraw2_path] = {
        'runs_results_bool': runs_results_bool,
        'final_fpr_mean': final_fpr_mean,
        'fpr_std_err': fpr_std_err,
        'final_latency_mean': final_latency_mean,
        'latency_std_err': latency_std_err
    }

print('agraw2 STATS')
print(agraw2_exclude_stats)

random_state
0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
random_state
100
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 1

n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
n_init 100 max_iter 43000 tol 0
fpr s.e. 0.0
latency s.e. 0.0
final fpr mean 0.0
final latency mean 1.0
agraw2 STATS
{'../Datasets_concept_drift/synthetic_data/abrupt_drift/agraw2_1_abrupt_drift_0_noise_balanced.arff': {'runs_results_bool': [[False, False, False, False, False, False, False], [False, False, False, False, False, False, False]], 'final_fpr_mean': 0.0, 'fpr_std_err': 0.0, 'final_latency_mean': 1.0, 'latency_std_err': 0.0}, '../Datasets_concept_drift/synthetic_data/gradual_drift/agraw2_1_gradual_drift_0_noise_balanced_05.arff': {'runs_results_bool': [[False, False, False, False, False, False, Fals

### Save the obtained results to csv

In [27]:
import helpers


final_result_dict = {
    'type_of_data': [], 'dataset': [], 'drift': [], 'width': [], 'encoding': [],
    'train_batch_strategy': [], 'additional_check': [],
    'n_init': [], 'max_iter': [], 'tol': [],
    'FPR_mean': [], 'latency_mean': []
}

for data_path, stats_dict in agraw2_exclude_stats.items():
    synthetic_filename_info = helpers.synthetic_data_information(data_path)
    encoding = 'exclude'
    fpr_mean = float(stats_dict['final_fpr_mean'])
    latency_mean = float(stats_dict['final_latency_mean'])
    
    final_result_dict['type_of_data'].append(synthetic_filename_info['type_of_data'])
    final_result_dict['dataset'].append(synthetic_filename_info['dataset_name'])
    final_result_dict['drift'].append(synthetic_filename_info['drift_type'])
    final_result_dict['width'].append(synthetic_filename_info['drift_width'])
    final_result_dict['encoding'].append(encoding)
    final_result_dict['train_batch_strategy'].append('submajority')
    final_result_dict['additional_check'].append('yes')
    final_result_dict['n_init'].append(100)
    final_result_dict['max_iter'].append(43000)
    final_result_dict['tol'].append(0)
    final_result_dict['FPR_mean'].append(fpr_mean)
    final_result_dict['latency_mean'].append(latency_mean)
    
final_result_df = pd.DataFrame.from_dict(final_result_dict)
sorted_final_result_df = final_result_df.sort_values(['drift', 'dataset', 'encoding', 'width'])
final_result_df.to_csv('agraw2_exclude_jupyter_results.csv', index=False)

## AGRAW2 with categories onehot encoded

### Find the best tol and max_iter (the drift type is irrelevant)

In [20]:
write_kmeans_results_ucdd_helper('agraw2_onehot_new_output', agraw2_onehot_reference_batches[abrupt_agraw2_path],
                                 n_init=100, max_iter=500, tol=0,
                                 random_state=1053)

filename agraw2_onehot_new_output0.txt
random state: 1053
total number of results: 100
maximum number of iterations: 37
minimum initial inertia: 105224.89101843184
maximum initial inertia: 115951.42105576495
number of unique final inertia values: 26
minimum final inertia: 58603.13327212382
maximum final inertia: 59998.38063075493
total number of convergences: 100
number of strict convergences: 100
number of tol-based convergences: 0
filename agraw2_onehot_new_output1.txt
random state: 1053
total number of results: 100
maximum number of iterations: 23
minimum initial inertia: 105204.7574983774
maximum initial inertia: 117037.30512573832
number of unique final inertia values: 18
minimum final inertia: 58610.34054204606
maximum final inertia: 60027.82838831125
total number of convergences: 100
number of strict convergences: 100
number of tol-based convergences: 0
filename agraw2_onehot_new_output2.txt
random state: 1053
total number of results: 100
maximum number of iterations: 29
minimum

### Use them for the analysis

In [23]:
import ucdd_eval
import ucdd_supported_parameters as spms



agraw2_onehot_stats1 = {}

for agraw2_path in all_agraw2_data_paths:
    runs_results_bool, final_fpr_mean, fpr_std_err, final_latency_mean, latency_std_err = \
        ucdd_eval.all_drifting_batches_randomness_robust(
        agraw2_onehot_reference_batches[agraw2_path],
        agraw2_onehot_testing_batches[agraw2_path],
        train_batch_strategy=spms.TrainBatchStrategies.SUBMAJORITY,
        additional_check=True,
        n_init=100,
        max_iter=37000,
        tol=0,
        true_drift_idx=2,
        min_runs=2
    )
    agraw2_onehot_stats1[agraw2_path] = {
        'runs_results_bool': runs_results_bool,
        'final_fpr_mean': final_fpr_mean,
        'fpr_std_err': fpr_std_err,
        'final_latency_mean': final_latency_mean,
        'latency_std_err': latency_std_err
    }

print('agraw2 STATS')
print(agraw2_onehot_stats1)

random_state
0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
random_state
100
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 1

n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
fpr s.e. 0.0
latency s.e. 0.0
final fpr mean 0.0
final latency mean 0.0
agraw2 STATS
{'../Datasets_concept_drift/synthetic_data/abrupt_drift/agraw2_1_abrupt_drift_0_noise_balanced.arff': {'runs_results_bool': [[False, False, False, False, True, False, False], [False, False, False, False, True, False, False]], 'final_fpr_mean': 0.0, 'fpr_std_err': 0.0, 'final_latency_mean': 0.5, 'latency_std_err': 0.0}, '../Datasets_concept_drift/synthetic_data/gradual_drift/agraw2_1_gradual_drift_0_noise_balanced_05.arff': {'runs_results_bool': [[False, False, False, False, True, False, False],

### Save the obtained results to csv

In [28]:
import helpers


final_result_dict = {
    'type_of_data': [], 'dataset': [], 'drift': [], 'width': [], 'encoding': [],
    'train_batch_strategy': [], 'additional_check': [],
    'n_init': [], 'max_iter': [], 'tol': [],
    'FPR_mean': [], 'latency_mean': []
}

for data_path, stats_dict in agraw2_onehot_stats1.items():
    synthetic_filename_info = helpers.synthetic_data_information(data_path)
    encoding = 'onehot'
    fpr_mean = float(stats_dict['final_fpr_mean'])
    latency_mean = float(stats_dict['final_latency_mean'])
    
    final_result_dict['type_of_data'].append(synthetic_filename_info['type_of_data'])
    final_result_dict['dataset'].append(synthetic_filename_info['dataset_name'])
    final_result_dict['drift'].append(synthetic_filename_info['drift_type'])
    final_result_dict['width'].append(synthetic_filename_info['drift_width'])
    final_result_dict['encoding'].append(encoding)
    final_result_dict['train_batch_strategy'].append('submajority')
    final_result_dict['additional_check'].append('yes')
    final_result_dict['n_init'].append(100)
    final_result_dict['max_iter'].append(37000)
    final_result_dict['tol'].append(0)
    final_result_dict['FPR_mean'].append(fpr_mean)
    final_result_dict['latency_mean'].append(latency_mean)
    
final_result_df = pd.DataFrame.from_dict(final_result_dict)
sorted_final_result_df = final_result_df.sort_values(['drift', 'dataset', 'encoding', 'width'])
final_result_df.to_csv('agraw2_onehot_jupyter_results.csv', index=False)

### Use them for the analysis (majority vote)

In [24]:
import ucdd_eval
import ucdd_supported_parameters as spms



agraw2_onehot_stats2 = {}

for agraw2_path in all_agraw2_data_paths:
    runs_results_bool, final_fpr_mean, fpr_std_err, final_latency_mean, latency_std_err = \
        ucdd_eval.all_drifting_batches_randomness_robust(
        agraw2_onehot_reference_batches[agraw2_path],
        agraw2_onehot_testing_batches[agraw2_path],
        train_batch_strategy=spms.TrainBatchStrategies.MAJORITY,
        additional_check=True,
        n_init=100,
        max_iter=37000,
        tol=0,
        true_drift_idx=2,
        min_runs=2
    )
    agraw2_onehot_stats2[agraw2_path] = {
        'runs_results_bool': runs_results_bool,
        'final_fpr_mean': final_fpr_mean,
        'fpr_std_err': fpr_std_err,
        'final_latency_mean': final_latency_mean,
        'latency_std_err': latency_std_err
    }

print('agraw2 STATS')
print(agraw2_onehot_stats2)

random_state
0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
random_state
100
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 1

n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
fpr s.e. 0.0
latency s.e. 0.0
final fpr mean 0.0
final latency mean 1.0
agraw2 STATS
{'../Datasets_concept_drift/synthetic_data/abrupt_drift/agraw2_1_abrupt_drift_0_noise_balanced.arff': {'runs_results_bool': [[False, False, False, False, False, False, False], [False, False, False, False, False, False, False]], 'final_fpr_mean': 0.0, 'fpr_std_err': 0.0, 'final_latency_mean': 1.0, 'latency_std_err': 0.0}, '../Datasets_concept_drift/synthetic_data/gradual_drift/agraw2_1_gradual_drift_0_noise_balanced_05.arff': {'runs_results_bool': [[False, False, False, False, False, False, Fals

### Save the obtained results to csv

In [29]:
import helpers


final_result_dict = {
    'type_of_data': [], 'dataset': [], 'drift': [], 'width': [], 'encoding': [],
    'train_batch_strategy': [], 'additional_check': [],
    'n_init': [], 'max_iter': [], 'tol': [],
    'FPR_mean': [], 'latency_mean': []
}

for data_path, stats_dict in agraw2_onehot_stats2.items():
    synthetic_filename_info = helpers.synthetic_data_information(data_path)
    encoding = 'onehot'
    fpr_mean = float(stats_dict['final_fpr_mean'])
    latency_mean = float(stats_dict['final_latency_mean'])
    
    final_result_dict['type_of_data'].append(synthetic_filename_info['type_of_data'])
    final_result_dict['dataset'].append(synthetic_filename_info['dataset_name'])
    final_result_dict['drift'].append(synthetic_filename_info['drift_type'])
    final_result_dict['width'].append(synthetic_filename_info['drift_width'])
    final_result_dict['encoding'].append(encoding)
    final_result_dict['train_batch_strategy'].append('majority')
    final_result_dict['additional_check'].append('yes')
    final_result_dict['n_init'].append(100)
    final_result_dict['max_iter'].append(37000)
    final_result_dict['tol'].append(0)
    final_result_dict['FPR_mean'].append(fpr_mean)
    final_result_dict['latency_mean'].append(latency_mean)
    
final_result_df = pd.DataFrame.from_dict(final_result_dict)
sorted_final_result_df = final_result_df.sort_values(['drift', 'dataset', 'encoding', 'width'])
final_result_df.to_csv('agraw2_onehot_jupyter_results_majority.csv', index=False)

### Use them for the analysis (no check)

In [25]:
import ucdd_eval
import ucdd_supported_parameters as spms



agraw2_onehot_stats3 = {}

for agraw2_path in all_agraw2_data_paths:
    runs_results_bool, final_fpr_mean, fpr_std_err, final_latency_mean, latency_std_err = \
        ucdd_eval.all_drifting_batches_randomness_robust(
        agraw2_onehot_reference_batches[agraw2_path],
        agraw2_onehot_testing_batches[agraw2_path],
        train_batch_strategy=spms.TrainBatchStrategies.SUBMAJORITY,
        additional_check=False,
        n_init=100,
        max_iter=37000,
        tol=0,
        true_drift_idx=2,
        min_runs=2
    )
    agraw2_onehot_stats3[agraw2_path] = {
        'runs_results_bool': runs_results_bool,
        'final_fpr_mean': final_fpr_mean,
        'fpr_std_err': fpr_std_err,
        'final_latency_mean': final_latency_mean,
        'latency_std_err': latency_std_err
    }

print('agraw2 STATS')
print(agraw2_onehot_stats3)

random_state
0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
random_state
100
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 1

n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
n_init 100 max_iter 37000 tol 0
fpr s.e. 0.0
latency s.e. 0.0
final fpr mean 0.0
final latency mean 0.0
agraw2 STATS
{'../Datasets_concept_drift/synthetic_data/abrupt_drift/agraw2_1_abrupt_drift_0_noise_balanced.arff': {'runs_results_bool': [[False, False, False, False, True, False, False], [False, False, False, False, True, False, False]], 'final_fpr_mean': 0.0, 'fpr_std_err': 0.0, 'final_latency_mean': 0.5, 'latency_std_err': 0.0}, '../Datasets_concept_drift/synthetic_data/gradual_drift/agraw2_1_gradual_drift_0_noise_balanced_05.arff': {'runs_results_bool': [[False, False, False, False, True, False, False],

### Save the obtained results to csv

In [31]:
import helpers


final_result_dict = {
    'type_of_data': [], 'dataset': [], 'drift': [], 'width': [], 'encoding': [],
    'train_batch_strategy': [], 'additional_check': [],
    'n_init': [], 'max_iter': [], 'tol': [],
    'FPR_mean': [], 'latency_mean': []
}

for data_path, stats_dict in agraw2_onehot_stats3.items():
    synthetic_filename_info = helpers.synthetic_data_information(data_path)
    encoding = 'onehot'
    fpr_mean = float(stats_dict['final_fpr_mean'])
    latency_mean = float(stats_dict['final_latency_mean'])
    
    final_result_dict['type_of_data'].append(synthetic_filename_info['type_of_data'])
    final_result_dict['dataset'].append(synthetic_filename_info['dataset_name'])
    final_result_dict['drift'].append(synthetic_filename_info['drift_type'])
    final_result_dict['width'].append(synthetic_filename_info['drift_width'])
    final_result_dict['encoding'].append(encoding)
    final_result_dict['train_batch_strategy'].append('submajority')
    final_result_dict['additional_check'].append('no')
    final_result_dict['n_init'].append(100)
    final_result_dict['max_iter'].append(37000)
    final_result_dict['tol'].append(0)
    final_result_dict['FPR_mean'].append(fpr_mean)
    final_result_dict['latency_mean'].append(latency_mean)
    
final_result_df = pd.DataFrame.from_dict(final_result_dict)
sorted_final_result_df = final_result_df.sort_values(['drift', 'dataset', 'encoding', 'width'])
final_result_df.to_csv('agraw2_onehot_jupyter_results_no_check.csv', index=False)

## AGRAW2 with categories target encoded

### Find the best tol and max_iter (the drift type is irrelevant)

In [21]:
write_kmeans_results_ucdd_helper('agraw2_target_new_output', agraw2_target_reference_batches[abrupt_agraw2_path],
                                 n_init=100, max_iter=500, tol=0,
                                 random_state=1053)

filename agraw2_target_new_output0.txt
random state: 1053
total number of results: 100
maximum number of iterations: 20
minimum initial inertia: 16176.827800853413
maximum initial inertia: 29031.88560718201
number of unique final inertia values: 7
minimum final inertia: 12389.53001076715
maximum final inertia: 13803.573549796298
total number of convergences: 100
number of strict convergences: 100
number of tol-based convergences: 0
filename agraw2_target_new_output1.txt
random state: 1053
total number of results: 100
maximum number of iterations: 19
minimum initial inertia: 16778.152451216705
maximum initial inertia: 29186.33702861551
number of unique final inertia values: 9
minimum final inertia: 12405.60465181937
maximum final inertia: 12405.605096531615
total number of convergences: 100
number of strict convergences: 100
number of tol-based convergences: 0
filename agraw2_target_new_output2.txt
random state: 1053
total number of results: 100
maximum number of iterations: 25
minimum 

### Use them for the analysis

In [26]:
import ucdd_eval
import ucdd_supported_parameters as spms



agraw2_target_stats = {}
    
for agraw2_path in all_agraw2_data_paths:
    runs_results_bool, final_fpr_mean, fpr_std_err, final_latency_mean, latency_std_err = \
        ucdd_eval.all_drifting_batches_randomness_robust(
        agraw2_target_reference_batches[agraw2_path],
        agraw2_target_testing_batches[agraw2_path],
        train_batch_strategy=spms.TrainBatchStrategies.SUBMAJORITY,
        additional_check=True,
        n_init=100,
        max_iter=25000,
        tol=0,
        true_drift_idx=2,
        min_runs=2
    )
    agraw2_target_stats[agraw2_path] = {
        'runs_results_bool': runs_results_bool,
        'final_fpr_mean': final_fpr_mean,
        'fpr_std_err': fpr_std_err,
        'final_latency_mean': final_latency_mean,
        'latency_std_err': latency_std_err
    }

print('agraw2 STATS')
print(agraw2_target_stats)

random_state
0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
random_state
100
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 1

n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
n_init 100 max_iter 25000 tol 0
fpr s.e. 0.0
latency s.e. 0.0
final fpr mean 0.0
final latency mean 1.0
agraw2 STATS
{'../Datasets_concept_drift/synthetic_data/abrupt_drift/agraw2_1_abrupt_drift_0_noise_balanced.arff': {'runs_results_bool': [[False, False, False, False, False, False, True], [False, False, False, False, False, False, True]], 'final_fpr_mean': 0.0, 'fpr_std_err': 0.0, 'final_latency_mean': 1.0, 'latency_std_err': 0.0}, '../Datasets_concept_drift/synthetic_data/gradual_drift/agraw2_1_gradual_drift_0_noise_balanced_05.arff': {'runs_results_bool': [[False, False, False, False, False, False, True],

### Save the obtained results to csv

In [32]:
import helpers


final_result_dict = {
    'type_of_data': [], 'dataset': [], 'drift': [], 'width': [], 'encoding': [],
    'train_batch_strategy': [], 'additional_check': [],
    'n_init': [], 'max_iter': [], 'tol': [],
    'FPR_mean': [], 'latency_mean': []
}

for data_path, stats_dict in agraw2_target_stats.items():
    synthetic_filename_info = helpers.synthetic_data_information(data_path)
    encoding = 'target'
    fpr_mean = float(stats_dict['final_fpr_mean'])
    latency_mean = float(stats_dict['final_latency_mean'])
    
    final_result_dict['type_of_data'].append(synthetic_filename_info['type_of_data'])
    final_result_dict['dataset'].append(synthetic_filename_info['dataset_name'])
    final_result_dict['drift'].append(synthetic_filename_info['drift_type'])
    final_result_dict['width'].append(synthetic_filename_info['drift_width'])
    final_result_dict['encoding'].append(encoding)
    final_result_dict['train_batch_strategy'].append('submajority')
    final_result_dict['additional_check'].append('yes')
    final_result_dict['n_init'].append(100)
    final_result_dict['max_iter'].append(25000)
    final_result_dict['tol'].append(0)
    final_result_dict['FPR_mean'].append(fpr_mean)
    final_result_dict['latency_mean'].append(latency_mean)
    
final_result_df = pd.DataFrame.from_dict(final_result_dict)
sorted_final_result_df = final_result_df.sort_values(['drift', 'dataset', 'encoding', 'width'])
final_result_df.to_csv('agraw2_target_jupyter_results.csv', index=False)