# Validation notebook
Here the FIRES implementation for multiclass and regression is validated and compared to other online features selcetion algorithms.


In [1]:
# import the functions needed for validate and comparsion

import numpy as np
import cupy as cp
import pandas as pd
from timeit import default_timer as timer

#from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skmultiflow.data import FileStream, RandomRBFGenerator, ConceptDriftStream
from skmultiflow.neural_networks import PerceptronMask
from skmultiflow.trees import HoeffdingTreeClassifier, ExtremelyFastDecisionTreeClassifier


# using plotly for plots
#import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


In [2]:
# paint mnist
def paint_digit(digit_values):
    fig = px.imshow(digit_values.reshape(28,28), binary_string=True)
    fig.update_layout(coloraxis_showscale=False)
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)
    #fig.show()
    return(fig)

In [3]:
# stability measurment as proposed in "Measurment the Stability of Feature Selection"

# TODO: check for case where nothing changes


def pearson_stability_ij(arr1,arr2):
    d = len(arr1)
    k_i = np.sum(arr1)
    k_j = np.sum(arr2)

    # catch edge cases as proposed in the paper under 4.1
    if (k_i == 0 or k_i == d) and k_i != k_j :
        return 0
    elif (k_j == 0 or k_j == d) and k_i != k_j :
        return 0
    elif (k_i == 0 or k_i == d) and k_i == k_j :
        return 1
    x_hat_i = k_i / d
    x_hat_j = k_j / d
    arr1 = arr1 - x_hat_i
    arr2 = arr2 - x_hat_j
    dividend = 1/d * np.sum(arr1*arr2)
    divisor = np.sqrt(1/d*np.sum(arr1**2))*np.sqrt(1/d*np.sum(arr2**2))
    return dividend/divisor

def stability_factor(selected_ftrs):
   M = len(selected_ftrs)
   sum_stabilities = 0
   for i in range(M):
       for j in range(i+1, M):
           sum_stabilities += pearson_stability_ij(selected_ftrs[i], selected_ftrs[j])
   return 1/(M*(M-1))*sum_stabilities * 2   

In [4]:
# import algorithms
from fires_new import FIRES
from ofs import OFS, MC_OFS
from ofssgr import OFSSGD, MC_OFSSGD
from fsds import StreamFeatWeight

## Multiclass Data

Here the FIRES softmax implementation is compared to the FSDS, OFS and OFSSGD oun multiclass data.


### Load Datasets as Streaming Data

In [5]:
# MNIST data normalized
stream = FileStream('datasets/Multiclass/mnist_train_normalized.csv', target_idx=0)
stream.prepare_for_use()
dataset_name = "mnist_norm"
n_selected_ftr = 100
n_window = 10
batch_size = 100
weights = None

check_ftrs = False

# load test data
test_data = pd.read_csv('datasets/Multiclass/mnist_test_normalized.csv', header=None)
test_y = test_data[0].to_numpy()
test_x = test_data.drop(columns=0).to_numpy()


New instances of the Stream class are now ready to use after instantiation.


In [None]:
# MNIST data
stream = FileStream('datasets/Multiclass/mnist_train.csv', target_idx=0)
stream.prepare_for_use()
dataset_name = "mnist"
n_selected_ftr = 100
n_window = 10
batch_size = 100
weights = None

check_ftrs = False

# load test data
test_data = pd.read_csv('datasets/Multiclass/mnist_test.csv', header=None)
test_y = test_data[0].to_numpy()
test_x = test_data.drop(columns=0).to_numpy()

In [None]:
# Human Activity Recognition
# labels changed from [1,...,6] to [0,...,5]
# rows shuffled
# split into train set with 7352 instances and test set with 2948
stream = FileStream('datasets/Multiclass/har_train.csv', target_idx = 561)
stream.prepare_for_use()
dataset_name = "har"
n_selected_ftr = 100
n_window = 10
batch_size = 50
weights = None

check_ftrs = False


# load test data
test_data = pd.read_csv('datasets/Multiclass/har_test.csv')
test_y = test_data["Class"].to_numpy()
test_x = test_data.drop(columns="Class").to_numpy()


In [None]:
# Covtype scaled to 0,1
# https://archive.ics.uci.edu/ml/datasets/covertype

# rows shuffled
# split into train set with 400000 instances and test set with 180000
stream = FileStream('datasets/Multiclass/covtype.scale01.test.csv', target_idx = 0)
stream.prepare_for_use()
dataset_name = "covtype"
n_selected_ftr = 25
n_window = 50
batch_size = 100
weights = [0.36460521, 0.48759922, 0.06153746, 0.00472796, 0.01633873, 0.02989095, 0.03530048]

check_ftrs = False

# load test data
test_data = pd.read_csv('datasets/Multiclass/covtype.scale01.train.csv', header=None)
test_y = test_data[0].to_numpy()
test_x = test_data.drop(columns=0).to_numpy()

In [None]:
def find_true_ftrs_indices(label_names, start_char):
    indices = []
    for i in range(len(label_names)):
        if label_names[i].startswith(start_char):
            indices.append(i)

    return indices

In [None]:
# synthetic dataset 1 (see data_generation.ipynb)
stream = FileStream('datasets/Multiclass/dataset_1_train.csv', target_idx=100)
stream.prepare_for_use()
dataset_name = "syn_ds_1"
n_selected_ftr = 20 # 15 are really informative
n_window = 10
batch_size = 50
weights = None

# load test data
test_data = pd.read_csv("datasets/Multiclass/dataset_1_test.csv")
test_y = test_data["label"].to_numpy()
test_x = test_data.drop(columns="label").to_numpy()

# get index of real ftrs
true_ftrs = find_true_ftrs_indices(test_data.columns, "y")
check_ftrs = True

In [None]:
# synthetic dataset 2 (see data_generation.ipynb)
stream = FileStream('datasets/Multiclass/dataset_2_train.csv', target_idx=500)
stream.prepare_for_use()
dataset_name = "syn_ds_2"
n_selected_ftr = 30 # 25 are really informative
n_window = 10
batch_size = 100
weights = None

# load test data
test_data = pd.read_csv("datasets/Multiclass/dataset_2_test.csv")
test_y = test_data["label"].to_numpy()
test_x = test_data.drop(columns="label").to_numpy()

# get index of real ftrs
true_ftrs = find_true_ftrs_indices(test_data.columns, "y")
check_ftrs = True

In [None]:
# synthetic dataset 3 (see data_generation.ipynb)
stream = FileStream('datasets/Multiclass/dataset_3_train.csv', target_idx=250)
stream.prepare_for_use()
dataset_name = "syn_ds_3"
n_selected_ftr = 20 # 20 are really informative
n_window = 10
batch_size = 20
weights = [0.1, 0.05, 0.15, 0.2, 0.025, 0.125, 0.075, 0.275]

# load test data
test_data = pd.read_csv("datasets/Multiclass/dataset_3_test.csv")
test_y = test_data["label"].to_numpy()
test_x = test_data.drop(columns="label").to_numpy()

# get index of real ftrs
true_ftrs = find_true_ftrs_indices(test_data.columns, "y")
check_ftrs = True

In [None]:
# synthetic dataset 4 (see data_generation.ipynb)
stream = FileStream('datasets/Multiclass/dataset_4_train.csv', target_idx=210)
stream.prepare_for_use()
dataset_name = "syn_ds_4"
n_selected_ftr = 100
n_window = 10
batch_size = 100
weights=None
check_ftrs = False

# load test data
test_data = pd.read_csv("datasets/Multiclass/dataset_4_test.csv")
test_y = test_data["label"].to_numpy()
test_x = test_data.drop(columns="label").to_numpy()

## Set trainings algorithms settings

In [4]:
perceptron_settings = {}

hoeffding_settings = {"leaf_prediction":"mc"}

fast_decision_settings = {"split_criterion":"info_gain",
                          "splif_confidence":0.0001,
                          "leaf_prediction":"mc" }

In [1]:
pred_algo = "perceptron"
#pred_algo = "hoeffding"
#pred_algo = 'fast_decision'

### Test without feature selection


In [7]:
stream.restart()
if pred_algo =="perceptron":
    predictor = PerceptronMask()
elif pred_algo == "hoeffding":
    predictor = HoeffdingTreeClassifier(leaf_prediction=hoeffding_settings["leaf_prediction"])
elif pred_algo == 'fast_decision':
    predictor = ExtremelyFastDecisionTreeClassifier(split_criterion=fast_decision_settings["split_criterion"],
                                                    split_confidence=fast_decision_settings["split_confidence"],
                                                    leaf_prediction=fast_decision_settings["leaf_prediction"])
x,y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x,y, stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [8]:
accuarcy_pure = []
f1_pure = []
precision = []
recall = []
while stream.has_more_samples():
    x, y = stream.next_sample(batch_size=batch_size)
    y_pred = predictor.predict(x)
    
    accuarcy_pure.append(accuracy_score(y, y_pred))
    f1_pure.append(f1_score(y, y_pred, stream.target_values, 
    average="weighted" ))
    precision.append(precision_score(y, y_pred, stream.target_values,
    average="weighted"))
    recall.append(recall_score(y, y_pred, stream.target_values,
    average="weighted"))

    predictor.partial_fit(x,y)

pure_moving_average = pd.Series(accuarcy_pure).rolling(window=n_window).mean().iloc[n_window-1:].values
pure_f1 = pd.Series(f1_pure).rolling(window=n_window).mean().iloc[n_window-1:].values

ss labels=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] as keyword args. From version 0.25 passing these as positional arguments will result in an error


In [9]:
y_pred = predictor.predict(test_x)
accuracy_no_ofs = accuracy_score(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy_no_ofs))

For the test dataset the previous trained predictor reached: 0.8499


In [10]:
precision = pd.Series(precision).rolling(window=n_window).mean().iloc[n_window-1:].values
recall = pd.Series(recall).rolling(window=n_window).mean().iloc[n_window-1:].values
f1_pure = pd.Series(f1_pure).rolling(window=n_window).mean().iloc[n_window-1:].values

In [11]:
col_names = ["accuracy","f1", "precision", "recall"]
d = {"accuracy":pure_moving_average, "f1":f1_pure, "precision":precision,
"recall":recall}
df = pd.DataFrame(d, columns=col_names)

In [12]:

fig = px.line(df)
fig.show()

### FIRES Framework

In [13]:
stream.restart()
if pred_algo =="perceptron":
    predictor = PerceptronMask()
elif pred_algo == "hoeffding":
    predictor = HoeffdingTreeClassifier(leaf_prediction=hoeffding_settings["leaf_prediction"])
elif pred_algo == 'fast_decision':
    predictor = ExtremelyFastDecisionTreeClassifier(split_criterion=fast_decision_settings["split_criterion"],
                                                    split_confidence=fast_decision_settings["split_confidence"],
                                                    leaf_prediction=fast_decision_settings["leaf_prediction"])
x,y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x,y, stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [14]:
fires_model = FIRES(n_total_ftr=stream.n_features,
                    target_values=stream.target_values,
                    mu_init=0,
                    sigma_init=1,
                    model='softmax',
                    class_probabilities=weights)               

In [15]:
fires_cuda_accuracy = []
fires_f1 = []
fires_cuda_times = []

fires_cuda_selected_ftrs = []
fires_cuda_stability = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=batch_size)
    # Select features
    start_time = timer()
    ftr_weights = fires_model.weigh_features(x,y)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
    fires_cuda_times.append(timer()-start_time)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fires_cuda_selected_ftrs.append(ftr_array)

    if len(fires_cuda_selected_ftrs) >= 10:
        stability = stability_factor(fires_cuda_selected_ftrs[-10:])
        fires_cuda_stability.append(stability)


    # Test
    y_pred = predictor.predict(x_reduced)
    
    fires_cuda_accuracy.append(accuracy_score(y, y_pred))
    fires_f1.append(f1_score(y, y_pred, average="weighted",
    labels=stream.target_values))


    # Train
    predictor.partial_fit(x_reduced, y)

# Restart the FileStream
end_time_all = timer()
fires_cuda_run_time = timer() - start_time_all
print("The whole FIRES run took {}".format(fires_cuda_run_time))

fires_moving_average = pd.Series(fires_cuda_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
fires_f1 = pd.Series(fires_f1).rolling(window=n_window).mean().iloc[n_window-1:].values

if dataset_name == "mnist" or dataset_name == "mnist_norm":
    selection_array = np.zeros((784))
    selection_array[ftr_selection] = 1
    mnist_fig_fires = paint_digit(selection_array)


The whole FIRES run took 120.54885748400011


In [16]:
if check_ftrs:
    true_selected_ftr = set(ftr_selection)&set(true_ftrs)
    fires_perc_ftr_found = len(true_selected_ftr) / len(true_ftrs) * 100
    print("FIRES found {}% of the true informative features.".format(fires_perc_ftr_found))

In [17]:
test_x_selected = np.zeros(test_x.shape)
test_x_selected[:,ftr_selection] = test_x[:,ftr_selection]
y_pred = predictor.predict(test_x)
accuracy_fires = accuracy_score(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy_fires))

For the test dataset the previous trained predictor reached: 0.704


### FSDS algorithm


In [18]:
stream.restart()
if pred_algo =="perceptron":
    predictor = PerceptronMask()
elif pred_algo == "hoeffding":
    predictor = HoeffdingTreeClassifier(leaf_prediction=hoeffding_settings["leaf_prediction"])
elif pred_algo == 'fast_decision':
    predictor = ExtremelyFastDecisionTreeClassifier(split_criterion=fast_decision_settings["split_criterion"],
                                                    split_confidence=fast_decision_settings["split_confidence"],
                                                    leaf_prediction=fast_decision_settings["leaf_prediction"])
x,y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x,y, stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [19]:
fsds_model = StreamFeatWeight(m=stream.n_features, k=stream.n_classes)
fsds_model.low_rank_approximation(x.T) # needs some pretraining in the first run

fsds_selected_ftrs = []
fsds_stability = []

fsds_accuracy = []
fsds_f1 = []
fsds_times = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=batch_size)
    # Select features
    start_time = timer()
    ftr_weights = fsds_model.low_rank_approximation(x.T)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
    fsds_times.append(timer()-start_time)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

     # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fsds_selected_ftrs.append(ftr_array)

    if len(fsds_selected_ftrs) >= 10:
        stability = stability_factor(fsds_selected_ftrs[-10:])
        fsds_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)
    
    fsds_accuracy.append(accuracy_score(y, y_pred))
    fsds_f1.append(f1_score(y, y_pred, average="weighted", 
    labels=stream.target_values))


    # Train
    predictor.partial_fit(x_reduced, y)

# Restart the FileStream
end_time_all = timer()
fsds_run_time = timer() - start_time_all
print("The whole fsds run took {}".format(fsds_run_time))

fsds_moving_average = pd.Series(fsds_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
fsds_f1 = pd.Series(fsds_f1).rolling(window=n_window).mean().iloc[n_window-1:].values

if dataset_name == "mnist" or dataset_name == "mnist_norm":
    
    selection_array = np.zeros((784))
    selection_array[ftr_selection] = 1

    mnist_fig_fsds = paint_digit(selection_array)


The whole fsds run took 8.941547565000292


In [20]:
if check_ftrs:
    true_selected_ftr = set(ftr_selection)&set(true_ftrs)
    fsds_perc_ftr_found = len(true_selected_ftr) / len(true_ftrs) * 100
    print("FSDS found {}% of the true informative features.".format(fsds_perc_ftr_found))

In [21]:
test_x_selected = np.zeros(test_x.shape)
test_x_selected[:,ftr_selection] = test_x[:,ftr_selection]
y_pred = predictor.predict(test_x)
accuracy_fsds = accuracy_score(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy_fsds))

For the test dataset the previous trained predictor reached: 0.7148


### OFS algorithm

In [22]:
stream.restart()
if pred_algo =="perceptron":
    predictor = PerceptronMask()
elif pred_algo == "hoeffding":
    predictor = HoeffdingTreeClassifier(leaf_prediction=hoeffding_settings["leaf_prediction"])
elif pred_algo == 'fast_decision':
    predictor = ExtremelyFastDecisionTreeClassifier(split_criterion=fast_decision_settings["split_criterion"],
                                                    split_confidence=fast_decision_settings["split_confidence"],
                                                    leaf_prediction=fast_decision_settings["leaf_prediction"])
x,y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x,y, stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [23]:
ofs = MC_OFS(regularization_param = 0.01, step_size = 0.1, n_selected_ftr=n_selected_ftr, n_total_ftr=stream.n_num_features, n_classes=stream.n_classes)

ofs_accuracy = []
ofs_f1 = []
ofs_selected_ftrs = []
ofs_stability = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=batch_size)

    # Select features
    for idx, label in enumerate(y):
        ofs.train(x[idx],label)

    ftr_selection = ofs.get_feature_indices()
    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

     # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    ofs_selected_ftrs.append(ftr_array)

    if len(ofs_selected_ftrs) >= 10:
        stability = stability_factor(ofs_selected_ftrs[-10:])
        ofs_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)
    ofs_accuracy.append(accuracy_score(y, y_pred))
    ofs_f1.append(f1_score(y,y_pred, labels=stream.target_values,
    average="weighted"))

    # Train
    predictor.partial_fit(x_reduced, y)

end_time_all = timer()
ofs_run_time = timer() - start_time_all
print("The whole ofs run took {}".format(ofs_run_time))

ofs_moving_average = pd.Series(ofs_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
ofs_f1 = pd.Series(ofs_f1).rolling(window=n_window).mean().iloc[n_window-1:].values

if dataset_name == "mnist" or dataset_name == "mnist_norm":
    selection_array = np.zeros((784))
    selection_array[ftr_selection] = 1
    mnist_fig_ofs = paint_digit(selection_array)
# Restart the FileStream
stream.restart()

The whole ofs run took 8.794773913999961


In [24]:
if check_ftrs:
    true_selected_ftr = set(ftr_selection)&set(true_ftrs)
    ofs_perc_ftr_found = len(true_selected_ftr) / len(true_ftrs)* 100
    print("OFS found {}% of the true informative features.".format(fsds_perc_ftr_found))

In [25]:
test_x_selected = np.zeros(test_x.shape)
test_x_selected[:,ftr_selection] = test_x[:,ftr_selection]
y_pred = predictor.predict(test_x)
accuracy_ofs = accuracy_score(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy_ofs))

For the test dataset the previous trained predictor reached: 0.7718


### OFSSGR algorithm

In [26]:
stream.restart()
if pred_algo =="perceptron":
    predictor = PerceptronMask()
elif pred_algo == "hoeffding":
    predictor = HoeffdingTreeClassifier(leaf_prediction=hoeffding_settings["leaf_prediction"])
elif pred_algo == 'fast_decision':
    predictor = ExtremelyFastDecisionTreeClassifier(split_criterion=fast_decision_settings["split_criterion"],
                                                    split_confidence=fast_decision_settings["split_confidence"],
                                                    leaf_prediction=fast_decision_settings["leaf_prediction"])
x,y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x,y, stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [27]:
ofssgd_model = MC_OFSSGD(reduction_threshold=0.4, reduction_value=0.1, regularization_param=0.01, step_size=0.2, n_total_ftrs=stream.n_num_features, n_classes=stream.n_classes)

ofssgd_accuracy = []
ofssgd_f1 = []
ofssgd_selected_ftrs = []
ofssgd_stability = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=batch_size)

    # Select features
    for idx, label in enumerate(y):
        ofssgd_model.train(x[idx],label)

    ftr_selection = ofssgd_model.get_feature_indices()
    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    ofssgd_selected_ftrs.append(ftr_array)

    if len(ofssgd_selected_ftrs) >= 10:
        stability = stability_factor(ofssgd_selected_ftrs[-10:])
        ofssgd_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)
    ofssgd_accuracy.append(accuracy_score(y, y_pred))
    ofssgd_f1.append(f1_score(y, y_pred, labels=stream.target_values,
    average="weighted"))

    # Train
    predictor.partial_fit(x_reduced, y)

end_time_all = timer()
ofssgd_run_time = timer() - start_time_all
print("The whole ofssgd run took {}".format(ofssgd_run_time))

ofssgd_moving_average = pd.Series(ofssgd_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
ofssgd_f1 = pd.Series(ofssgd_f1).rolling(window=n_window).mean().iloc[n_window-1:].values

if dataset_name == "mnist" or dataset_name == "mnist_norm":
    selection_array = np.zeros((784))
    selection_array[ftr_selection] = 1
    mnist_fig_ofssgd = paint_digit(selection_array)


The whole ofssgd run took 134.64296517099865


In [28]:
if check_ftrs:
    true_selected_ftr = set(ftr_selection)&set(true_ftrs)
    ofssgd_perc_ftr_found = len(true_selected_ftr) / len(true_ftrs) * 100
    print("OFSSGD found {}% of the true informative features.".format(ofssgd_perc_ftr_found))

In [29]:
test_x_selected = np.zeros(test_x.shape)
test_x_selected[:,ftr_selection] = test_x[:,ftr_selection]
y_pred = predictor.predict(test_x)
accuracy_ofssgd = accuracy_score(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy_ofssgd))

For the test dataset the previous trained predictor reached: 0.8328


### Pick n random ftrs in each iteration as benchmark

In [30]:
stream.restart()
if pred_algo =="perceptron":
    predictor = PerceptronMask()
elif pred_algo == "hoeffding":
    predictor = HoeffdingTreeClassifier(leaf_prediction=hoeffding_settings["leaf_prediction"])
elif pred_algo == 'fast_decision':
    predictor = ExtremelyFastDecisionTreeClassifier(split_criterion=fast_decision_settings["split_criterion"],
                                                    split_confidence=fast_decision_settings["split_confidence"],
                                                    leaf_prediction=fast_decision_settings["leaf_prediction"])
x,y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x,y, stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [31]:
random_accuracy = []
random_f1 = []
random_selected_ftrs = []
random_stability = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=batch_size)

    
    # select features
    ftr_selection = np.random.choice(len(x[0]), n_selected_ftr)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    random_selected_ftrs.append(ftr_array)

    if len(random_selected_ftrs) >= 10:
        stability = stability_factor(random_selected_ftrs[-10:])
        random_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)
    random_accuracy.append(accuracy_score(y, y_pred))
    random_f1.append(f1_score(y, y_pred, labels=stream.target_values,
    average="weighted"))

    # Train
    predictor.partial_fit(x_reduced, y)

end_time_all = timer()
random_run_time = timer() - start_time_all
print("The whole random run took {}".format(random_run_time))
random_moving_average = pd.Series(random_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
random_f1 = pd.Series(random_f1).rolling(window=n_window).mean().iloc[n_window-1:].values


The whole random run took 5.922234612999091


### Plot all


In [39]:
# stability
title = "Stability on dataset {}".format(dataset_name)
col_names = ["FIRES", "OFS", "OFSSGD", "FSDS"]
d = {"FIRES":fires_cuda_stability, "OFS":ofs_stability, "OFSSGD":ofssgd_stability, "FSDS":fsds_stability} #"random":random_stability
df = pd.DataFrame(d, columns=col_names)
fig = px.line(df, y = col_names, title=title, labels={"index":"batches", "value":"stability"},color_discrete_map={'FIRES': 'red', 
                                                    'OFS': 'purple','FSDS': 'green', "OFSSGD":"yellow"})
fig.show()

In [42]:
#moving averages
title = "Moving averages over accuracy while learning with window {} on dataset {}".format(n_window, dataset_name)
col_names = ["Pure","FIRES", "OFS", "OFSSGD", "random","FSDS"]# 
d = {"Pure":pure_moving_average, "FIRES":fires_moving_average, "OFS":ofs_moving_average, 
"OFSSGD":ofssgd_moving_average,  "random":random_moving_average,"FSDS":fsds_moving_average}
df = pd.DataFrame(d, columns=col_names)
fig = px.line(df, y=col_names, title=title, labels={"index":"batches", "value":"accuracy"}, color_discrete_map={"Pure":"blue",'FIRES': 'red', 
                                                    'OFS': 'purple','FSDS': 'green', "OFSSGD":"yellow", "random":"cyan"})#
fig.show()


In [34]:
#moving averages
title = "Moving averages over f1 while learning with window {} on dataset {}".format(n_window, dataset_name)
col_names = ["Pure","FIRES", "OFS", "OFSSGD", "FSDS", "random"]
d = {"Pure":pure_f1, "FIRES":fires_f1, "OFS":ofs_f1, 
"OFSSGD":ofssgd_f1, "FSDS":fsds_f1, "random":random_f1}
df = pd.DataFrame(d, columns=col_names)
fig = px.line(df, y=col_names, title=title, labels={"index":"batches", "value":"accuracy"}, color_discrete_map={"Pure":"blue",'FIRES': 'red', 
                                                   'FSDS': 'green', 'OFS': 'purple', "OFSSGD":"yellow", "random":"cyan"})
fig.show()

In [35]:
if dataset_name == "mnist" or dataset_name == "mnist_norm":
    trace1 = mnist_fig_fires['data'][0]
    trace2 = mnist_fig_fsds['data'][0]
    trace3 = mnist_fig_ofs['data'][0]
    trace4 = mnist_fig_ofssgd['data'][0]
    fig = make_subplots(rows=1, cols=4, subplot_titles=("FIRES", "FSDS", "OFS", "OFSSGD"))
    fig.add_trace(trace1, row=1,col=1)
    fig.add_trace(trace2, row=1,col=2)
    fig.add_trace(trace3, row=1,col=3)
    fig.add_trace(trace4, row=1,col=4)
    fig.update_xaxes(
        visible=False 
    )
    fig.update_yaxes(
        visible=False
    )
    fig.show()


In [36]:
# accuracy on test data
col_names = ["Pure","FIRES","FSDS", "OFS", "OFSSGD"]
values = [accuracy_no_ofs, accuracy_fires, accuracy_fsds, accuracy_ofs, accuracy_ofssgd]
fig = px.bar(x=col_names, y=values, title="Accuracy on test data", labels={"y":"accuracy", "x":""}, color=col_names, color_discrete_map={"Pure":"blue",'FIRES': 'red',
                                                   'FSDS': 'green', 'OFS': 'purple', "OFSSGD":"yellow"})
fig.show()

In [37]:
# run times
col_names = ["FIRES","FSDS", "OFS", "OFSSGD"]
values = [fires_cuda_run_time, fsds_run_time, ofs_run_time, ofssgd_run_time]
fig = px.bar(x=col_names, y=values, title="Runtime", labels={"y":"s", "x":""}, color=col_names, color_discrete_map={'FIRES': 'red', 
                                                   'FSDS': 'green', 'OFS': 'purple', "OFSSGD":"yellow"})
fig.show()

In [38]:
if check_ftrs:
    col_names = ["FIRES","FSDS", "OFS", "OFSSGD"]
    values = [fires_perc_ftr_found, fsds_perc_ftr_found, ofs_perc_ftr_found, ofssgd_perc_ftr_found]
    fig = px.bar(x=col_names, y=values, title="True labels found", labels={"y":"%", "x":""}, color=col_names, color_discrete_map={'FIRES': 'red', 
                                                   'FSDS': 'green', 'OFS': 'purple', "OFSSGD":"yellow"})
    fig.show()