## Testing behaviour on concept drift

In [1]:
# import the functions needed for validation and comparison

import numpy as np
import cupy as cp
import pandas as pd
from timeit import default_timer as timer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skmultiflow.data import FileStream, ConceptDriftStream, RandomRBFGenerator, RandomRBFGeneratorDrift
from skmultiflow.neural_networks import PerceptronMask
from skmultiflow.trees import HoeffdingTreeClassifier



# using plotly for plots
#import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [2]:
# stability measurement as proposed in "Measuring the Stability of Feature Selection"
def pearson_stability_ij(arr1, arr2):
    """Pearson-correlation stability between two feature selections.

    Parameters
    ----------
    arr1, arr2 : array-like of shape (d,)
        Selection vectors of equal length; they are expected to be 0/1
        indicator vectors (1 = feature selected). Lists, tuples and
        numpy arrays are all accepted.

    Returns
    -------
    int or float
        ``1`` if both selections are degenerate (nothing or everything
        selected) and of equal size, ``0`` if at least one is degenerate and
        the sizes differ, otherwise the Pearson correlation coefficient of
        the two vectors.
    """
    # Coerce to float arrays so that plain Python sequences work with the
    # element-wise arithmetic below.
    arr1 = np.asarray(arr1, dtype=float)
    arr2 = np.asarray(arr2, dtype=float)

    d = len(arr1)
    k_i = np.sum(arr1)
    k_j = np.sum(arr2)

    # Edge cases as proposed in the paper under 4.1: a selection with no or
    # all features has zero variance, so the correlation is undefined and is
    # replaced by 1 (same size) or 0 (different sizes).
    if k_i in (0, d) or k_j in (0, d):
        return 1 if k_i == k_j else 0

    # Centre both vectors around their mean selection rate.
    x_hat_i = k_i / d
    x_hat_j = k_j / d
    arr1 = arr1 - x_hat_i
    arr2 = arr2 - x_hat_j

    # Covariance divided by the product of the standard deviations.
    dividend = 1/d * np.sum(arr1*arr2)
    divisor = np.sqrt(1/d*np.sum(arr1**2))*np.sqrt(1/d*np.sum(arr2**2))
    return dividend/divisor

def stability_factor(selected_ftrs):
    """Average pairwise stability over a collection of feature selections.

    Adds up ``pearson_stability_ij`` for every unordered pair of entries in
    ``selected_ftrs`` and scales the sum by ``2 / (M * (M - 1))``, where ``M``
    is the number of entries.
    """
    M = len(selected_ftrs)

    # All index pairs (a, b) with a < b, in row-major order.
    index_pairs = [(a, b) for a in range(M) for b in range(a + 1, M)]

    pair_total = 0
    for a, b in index_pairs:
        pair_total += pearson_stability_ij(selected_ftrs[a], selected_ftrs[b])

    return 1/(M*(M-1))*pair_total * 2

In [3]:
# import algorithms
from fires import FIRES
from ofs import MC_OFS
from ofssgr import MC_OFSSGD
from fsds import StreamFeatWeight

### Streaming Data Generation with Concept Drift


This data gets its own notebook because the generated stream has no separate test data and is unbounded.

In [None]:
# Stream generator to test how the algorithms perform on data with concept drift.
# NOTE: the "stream 1" / "stream 2" cells also assign `stream`; whichever of
# these configuration cells is executed last determines the data that is used.
n_selected_ftr = 50   # number of features kept by every selector
batch_size = 100      # samples drawn from the stream per step
n_window = 10         # window of the moving averages over the scores

stream = RandomRBFGeneratorDrift(
    model_random_state=99,
    sample_random_state=50,
    n_classes=6,
    n_features=200,
    n_centroids=50,
    change_speed=0.05,
    num_drift_centroids=25,
)

In [4]:
# stream 1: switch from `start_stream` to `drift_stream` around sample 10000
# (width=1 — presumably an abrupt change; see the ConceptDriftStream docs).
dataset_name = "Concept_drift_1"
n_selected_ftr = 50   # number of features kept by every selector
n_window = 10         # window of the moving averages over the scores
batch_size = 100      # samples drawn from the stream per step

drift_stream = RandomRBFGenerator(
    model_random_state=42,
    sample_random_state=43,
    n_classes=10,
    n_features=100,
)
start_stream = RandomRBFGenerator(
    model_random_state=52,
    sample_random_state=53,
    n_classes=10,
    n_features=100,
    n_centroids=30,
)

stream = ConceptDriftStream(
    stream=start_stream,
    drift_stream=drift_stream,
    position=10000,
    width=1,
)

In [None]:
# stream 2: switch from `start_stream` to `drift_stream` around sample 10000
# (width=1000 — presumably a gradual change; see the ConceptDriftStream docs).
dataset_name = "Concept_drift_2"
n_selected_ftr = 50   # number of features kept by every selector
n_window = 10         # window of the moving averages over the scores
batch_size = 100      # samples drawn from the stream per step

drift_stream = RandomRBFGenerator(
    model_random_state=20,
    sample_random_state=21,
    n_classes=10,
    n_features=300,
)
start_stream = RandomRBFGenerator(
    model_random_state=66,
    sample_random_state=67,
    n_classes=10,
    n_features=300,
)

stream = ConceptDriftStream(
    stream=start_stream,
    drift_stream=drift_stream,
    position=10000,
    width=1000,
)

### FIRES

In [5]:
# Rewind the stream and warm up a fresh perceptron on the first batch,
# handing it all class labels of the stream up front.
stream.restart()
predictor = PerceptronMask()
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, classes=stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [6]:
# FIRES feature selector, sized to the stream's feature count and class labels.
# NOTE(review): mu_init / sigma_init are presumably the initial mean and
# standard deviation of FIRES' weight distribution — confirm against fires.py.
fires_model = FIRES(n_total_ftr=stream.n_features,
                    target_values=stream.target_values,
                    mu_init=0,
                    sigma_init=1,
                    model='softmax')

In [7]:
# Test-then-train evaluation of FIRES: every batch is first used to update the
# feature weights, then to score the predictor, and finally to train it.
fires_accuracy = []
fires_f1 = []

fires_selected_ftrs = []  # one 0/1 selection mask per batch
fires_stability = []

n_batches = 200        # the stream has unlimited samples, so cap the run explicitly
stability_window = 10  # number of most recent selections compared for stability

start_time_all = timer()
for i in range(n_batches):
    # Load a new batch
    x, y = stream.next_sample(batch_size=batch_size)

    # Select features: keep the n_selected_ftr highest-weighted ones
    ftr_weights = fires_model.weigh_features(x, y)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]

    # Truncate x (retain only selected features, 'remove' all others by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # Stability test: record the selection as a 0/1 mask and compare the
    # most recent masks once enough of them are available
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fires_selected_ftrs.append(ftr_array)

    if len(fires_selected_ftrs) >= stability_window:
        stability = stability_factor(fires_selected_ftrs[-stability_window:])
        fires_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)

    fires_accuracy.append(accuracy_score(y, y_pred))
    fires_f1.append(f1_score(y, y_pred, average="weighted", labels=stream.target_values))

    # Train
    predictor.partial_fit(x_reduced, y)

# Take the end timestamp once and derive the run time from it
end_time_all = timer()
fires_run_time = end_time_all - start_time_all
print("The whole FIRES run took {}".format(fires_run_time))

# Smooth both score series with a moving average; the first n_window - 1
# entries of the rolling mean are NaN and are dropped
fires_moving_average = pd.Series(fires_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
fires_f1 = pd.Series(fires_f1).rolling(window=n_window).mean().iloc[n_window-1:].values

The whole FIRES run took 9.716045030000004


### FSDS

In [14]:
# Rewind the stream and warm up a fresh perceptron on the first batch,
# handing it all class labels of the stream up front.
stream.restart()
predictor = PerceptronMask()
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, classes=stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [15]:
# FSDS is configured with the stream's feature count (m) and class count (k).
fsds_model = StreamFeatWeight(m=stream.n_features, k=stream.n_classes)
# NOTE(review): the recorded output of this warm-up call is an all-NaN weight
# vector — verify that FSDS yields finite weights here, otherwise the
# argsort-based feature selection that consumes these weights is not meaningful.
fsds_model.low_rank_approximation(x.T) # needs some pretraining in the first run

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan])

In [16]:
# Test-then-train evaluation of FSDS: every batch is first used to update the
# feature weights, then to score the predictor, and finally to train it.
fsds_accuracy = []
fsds_f1 = []

fsds_selected_ftrs = []  # one 0/1 selection mask per batch
fsds_stability = []

n_batches = 200        # the stream has unlimited samples, so cap the run explicitly
stability_window = 10  # number of most recent selections compared for stability

start_time_all = timer()
for i in range(n_batches):
    # Load a new batch
    x, y = stream.next_sample(batch_size=batch_size)

    # Select features: FSDS receives the batch transposed (features x samples);
    # keep the n_selected_ftr highest-weighted features
    ftr_weights = fsds_model.low_rank_approximation(x.T)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]

    # Truncate x (retain only selected features, 'remove' all others by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # Stability test: record the selection as a 0/1 mask and compare the
    # most recent masks once enough of them are available
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fsds_selected_ftrs.append(ftr_array)

    if len(fsds_selected_ftrs) >= stability_window:
        stability = stability_factor(fsds_selected_ftrs[-stability_window:])
        fsds_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)

    fsds_accuracy.append(accuracy_score(y, y_pred))
    fsds_f1.append(f1_score(y, y_pred, average="weighted", labels=stream.target_values))

    # Train
    predictor.partial_fit(x_reduced, y)

# Take the end timestamp once and derive the run time from it
end_time_all = timer()
fsds_run_time = end_time_all - start_time_all
print("The whole fsds run took {}".format(fsds_run_time))

# Smooth both score series with a moving average; the first n_window - 1
# entries of the rolling mean are NaN and are dropped
fsds_moving_average = pd.Series(fsds_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
fsds_f1 = pd.Series(fsds_f1).rolling(window=n_window).mean().iloc[n_window-1:].values

The whole fsds run took 5.4424781879999955


### OFS

In [24]:
# Rewind the stream and warm up a fresh perceptron on the first batch,
# handing it all class labels of the stream up front.
stream.restart()
predictor = PerceptronMask()
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, classes=stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [25]:
# OFS feature selector. The total feature count is taken from
# stream.n_features, the same attribute that sizes the other selectors and the
# 0/1 selection masks, so all of them agree on the feature dimension.
ofs_model = MC_OFS(
    regularization_param=0.01,
    step_size=0.1,
    n_selected_ftr=n_selected_ftr,
    n_total_ftr=stream.n_features,
    n_classes=stream.n_classes,
)

In [26]:
# Test-then-train evaluation of OFS: every batch is first used to update the
# selector, then to score the predictor, and finally to train it.
ofs_accuracy = []
ofs_f1 = []

ofs_selected_ftrs = []  # one 0/1 selection mask per batch
ofs_stability = []

n_batches = 200        # the stream has unlimited samples, so cap the run explicitly
stability_window = 10  # number of most recent selections compared for stability

start_time_all = timer()
for i in range(n_batches):
    # Load a new batch
    x, y = stream.next_sample(batch_size=batch_size)

    # Select features: OFS is updated one sample at a time with integer labels
    for idx, label in enumerate(np.array(y, dtype='int')):
        ofs_model.train(x[idx], label)

    ftr_selection = ofs_model.get_feature_indices()

    # Truncate x (retain only selected features, 'remove' all others by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # Stability test: record the selection as a 0/1 mask and compare the
    # most recent masks once enough of them are available
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    ofs_selected_ftrs.append(ftr_array)

    if len(ofs_selected_ftrs) >= stability_window:
        stability = stability_factor(ofs_selected_ftrs[-stability_window:])
        ofs_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)

    ofs_accuracy.append(accuracy_score(y, y_pred))
    ofs_f1.append(f1_score(y, y_pred, average="weighted", labels=stream.target_values))

    # Train
    predictor.partial_fit(x_reduced, y)

# Take the end timestamp once and derive the run time from it
end_time_all = timer()
ofs_run_time = end_time_all - start_time_all
print("The whole ofs run took {}".format(ofs_run_time))

# Smooth both score series with a moving average; the first n_window - 1
# entries of the rolling mean are NaN and are dropped
ofs_moving_average = pd.Series(ofs_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
ofs_f1 = pd.Series(ofs_f1).rolling(window=n_window).mean().iloc[n_window-1:].values

The whole ofs run took 5.318906373000118


### OFSSGD

In [27]:
# Rewind the stream and warm up a fresh perceptron on the first batch,
# handing it all class labels of the stream up front.
stream.restart()
predictor = PerceptronMask()
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, classes=stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [28]:
# OFSSGD feature selector. The total feature count is taken from
# stream.n_features, the same attribute that sizes the other selectors and the
# 0/1 selection masks, so all of them agree on the feature dimension.
ofssgd_model = MC_OFSSGD(
    reduction_threshold=0.4,
    reduction_value=0.1,
    regularization_param=0.01,
    step_size=0.2,
    n_total_ftrs=stream.n_features,
    n_classes=stream.n_classes,
)

In [29]:
# Test-then-train evaluation of OFSSGD: every batch is first used to update
# the selector, then to score the predictor, and finally to train it.
ofssgd_accuracy = []
ofssgd_f1 = []

ofssgd_selected_ftrs = []  # one 0/1 selection mask per batch
ofssgd_stability = []

n_batches = 200        # the stream has unlimited samples, so cap the run explicitly
stability_window = 10  # number of most recent selections compared for stability

start_time_all = timer()
for i in range(n_batches):
    # Load a new batch
    x, y = stream.next_sample(batch_size=batch_size)

    # Select features: OFSSGD is updated one sample at a time with integer labels
    for idx, label in enumerate(np.array(y, dtype='int')):
        ofssgd_model.train(x[idx], label)

    ftr_selection = ofssgd_model.get_feature_indices()

    # Truncate x (retain only selected features, 'remove' all others by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # Stability test: record the selection as a 0/1 mask and compare the
    # most recent masks once enough of them are available
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    ofssgd_selected_ftrs.append(ftr_array)

    if len(ofssgd_selected_ftrs) >= stability_window:
        stability = stability_factor(ofssgd_selected_ftrs[-stability_window:])
        ofssgd_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)

    ofssgd_accuracy.append(accuracy_score(y, y_pred))
    ofssgd_f1.append(f1_score(y, y_pred, average="weighted", labels=stream.target_values))

    # Train
    predictor.partial_fit(x_reduced, y)

# Take the end timestamp once and derive the run time from it
end_time_all = timer()
ofssgd_run_time = end_time_all - start_time_all
print("The whole ofssgd run took {}".format(ofssgd_run_time))

# Smooth both score series with a moving average; the first n_window - 1
# entries of the rolling mean are NaN and are dropped
ofssgd_moving_average = pd.Series(ofssgd_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
ofssgd_f1 = pd.Series(ofssgd_f1).rolling(window=n_window).mean().iloc[n_window-1:].values

The whole ofssgd run took 12.743607161


In [30]:
# Line plot of the selection stability of every algorithm, one column per algorithm
title = "Stability"
col_names = ["FIRES", "OFS", "OFSSGD", "FSDS"]
d = {
    "FIRES": fires_stability,
    "OFS": ofs_stability,
    "OFSSGD": ofssgd_stability,
    "FSDS": fsds_stability,
}
df = pd.DataFrame(d, columns=col_names)
fig = px.line(
    df,
    y=col_names,
    title=title,
    labels={"index": "batches", "value": "stability"},
    color_discrete_map={
        'FIRES': 'red',
        'FSDS': 'green',
        'OFS': 'purple',
        "OFSSGD": "yellow",
    },
)
fig.show()

In [44]:
# Line plot of the moving-average accuracy of every algorithm
title = "Moving averages over accuracy while learning with window {}".format(n_window)
col_names = ["FIRES", "OFS", "OFSSGD", "FSDS"]
d = {
    "FIRES": fires_moving_average,
    "OFS": ofs_moving_average,
    "OFSSGD": ofssgd_moving_average,
    "FSDS": fsds_moving_average,
}
df = pd.DataFrame(d, columns=col_names)
fig = px.line(
    df,
    y=col_names,
    title=title,
    labels={"index": "batches", "value": "accuracy"},
    color_discrete_map={
        'FIRES': 'red',
        'FSDS': 'green',
        'OFS': 'purple',
        "OFSSGD": "yellow",
    },
)
fig.show()


In [45]:
# Line plot of the moving-average F1 score of every algorithm
title = "Moving averages over f1 while learning with window {}".format(n_window)
col_names = ["FIRES", "OFS", "OFSSGD", "FSDS"]
d = {
    "FIRES": fires_f1,
    "OFS": ofs_f1,
    "OFSSGD": ofssgd_f1,
    "FSDS": fsds_f1,
}
df = pd.DataFrame(d, columns=col_names)
fig = px.line(
    df,
    y=col_names,
    title=title,
    # the plotted values are F1 scores, so label the y-axis accordingly
    labels={"index": "batches", "value": "f1"},
    color_discrete_map={
        'FIRES': 'red',
        'FSDS': 'green',
        'OFS': 'purple',
        "OFSSGD": "yellow",
        "random": "cyan",
    },
)
fig.show()




In [43]:
# Display the DataFrame built by the most recently executed plotting cell
df

Unnamed: 0,FIRES,OFS,OFSSGD,FSDS,random
0,0.886919,0.915536,0.927867,0.946818,
1,0.936725,0.934983,0.957474,0.977483,
2,0.958306,0.950872,0.985015,0.983374,
3,0.977854,0.957149,0.996911,0.985393,
4,0.985983,0.971819,0.997936,0.986430,
...,...,...,...,...,...
186,0.955043,0.972832,0.997981,0.990103,
187,0.960247,0.951364,0.997981,0.990103,
188,0.963436,0.954151,0.997059,0.990103,
189,0.966584,0.953112,0.996043,0.990103,


In [38]:
# Sanity check: length of the smoothed OFS F1 series
len(ofs_f1)

191

In [40]:
# Sanity check: length of the smoothed FSDS F1 series
len(fsds_f1)

191

In [41]:
# Sanity check: length of the smoothed OFSSGD F1 series
len(ofssgd_f1)

191

In [42]:
# Inspect the smoothed FSDS F1 series (dumps the whole array)
fsds_f1

array([0.94681827, 0.97748309, 0.98337399, 0.98539322, 0.98643026,
       0.98643026, 0.98746694, 0.99038917, 0.99140343, 0.99040735,
       0.99392051, 0.99594475, 0.99799832, 0.99900392, 0.99900392,
       0.99900392, 0.99900392, 0.99900392, 0.99900392, 1.        ,
       1.        , 0.997125  , 0.997125  , 0.997125  , 0.997125  ,
       0.99610987, 0.99610987, 0.99502099, 0.99502099, 0.99400939,
       0.99400939, 0.99688439, 0.99382798, 0.99382798, 0.99382798,
       0.99484311, 0.99484311, 0.995932  , 0.995932  , 0.99222904,
       0.99222904, 0.99222904, 0.99528545, 0.99528545, 0.99528545,
       0.99528545, 0.99528545, 0.99528545, 0.99528545, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 0.99253741, 0.99253741, 0.99253741, 0.99253741,
       0.99253741, 0.98599735, 0.98500071, 0.98500071, 0.98500