# Validation notebook
Here the FIRES implementation for multiclass and regression is validated and compared to other online feature selection algorithms.


In [1]:
# import the functions needed for validation and comparison

import numpy as np
import cupy as cp
import pandas as pd
from timeit import default_timer as timer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skmultiflow.data import FileStream
from skmultiflow.neural_networks import PerceptronMask
from skmultiflow.data.random_rbf_generator_drift import RandomRBFGeneratorDrift

# using plotly for plots
#import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


In [None]:
# paint mnist
def paint_digit(digit_values):
    """Show a flattened digit as a 28x28 image and return the plotly figure.

    Parameters
    ----------
    digit_values : array with a ``reshape`` method
        Values that are reshaped to ``(28, 28)`` before plotting.

    Returns
    -------
    The figure created by ``px.imshow``; it is also displayed via ``fig.show()``.
    """
    image = digit_values.reshape(28, 28)
    fig = px.imshow(image, binary_string=True)
    # Hide the colour scale and the tick labels on both axes.
    fig.update_layout(coloraxis_showscale=False)
    for update_axis in (fig.update_xaxes, fig.update_yaxes):
        update_axis(showticklabels=False)
    fig.show()
    return fig

In [None]:
# stability measurement as proposed in "Measuring the Stability of Feature Selection"

def pearson_stability_ij(arr1, arr2):
    """Pearson correlation between two feature-selection indicator vectors.

    Parameters
    ----------
    arr1, arr2 : array-like of equal length
        One entry per feature; in this notebook 1 marks a selected feature and 0 an
        unselected one.

    Returns
    -------
    float
        Covariance of the two vectors divided by the product of their standard
        deviations. If either vector is constant the denominator is zero and the
        result is ``nan``.
    """
    n_ftrs = len(arr1)
    # Centre both vectors around their mean (= share of selected features).
    centered_i = arr1 - np.sum(arr1) / n_ftrs
    centered_j = arr2 - np.sum(arr2) / n_ftrs
    covariance = 1/n_ftrs * np.sum(centered_i*centered_j)
    std_i = np.sqrt(1/n_ftrs*np.sum(centered_i**2))
    std_j = np.sqrt(1/n_ftrs*np.sum(centered_j**2))
    return covariance/(std_i*std_j)

def stability_factor(selected_ftrs):
    """Average pairwise Pearson stability over a sequence of feature selections.

    Parameters
    ----------
    selected_ftrs : sequence of array-like
        One indicator vector per selection round (see ``pearson_stability_ij``).

    Returns
    -------
    float
        Mean of ``pearson_stability_ij`` over all unordered pairs of selections.

    Raises
    ------
    ValueError
        If fewer than two selections are given; there is no pair to compare and the
        normalisation ``1/(M*(M-1))`` would divide by zero.
    """
    M = len(selected_ftrs)
    if M < 2:
        raise ValueError(
            "stability_factor needs at least two feature selections, got {}".format(M)
        )
    sum_stabilities = 0
    # Each unordered pair (i, j) with i < j is visited once; the factor 2 below
    # accounts for the symmetric pair (j, i).
    for i in range(M):
        for j in range(i+1, M):
            sum_stabilities += pearson_stability_ij(selected_ftrs[i], selected_ftrs[j])
    return 1/(M*(M-1))*sum_stabilities * 2

In [None]:
# import algorithms
from fires import FIRES
from ofs import OFS, MC_OFS
from ofssgr import OFSSGD, MC_OFSSGD
from fsds import StreamFeatWeight

## Multiclass Data

Here the FIRES softmax implementation is compared to FSDS, OFS and OFSSGD on multiclass data.


### Load Datasets as Streaming Data

In [None]:
# MNIST data
stream = FileStream('datasets/Multiclass/mnist_train_normalized.csv', target_idx=0)
stream.prepare_for_use()
dataset_name = "mnist"
n_selected_ftr = 100
n_window = 10
batch_size = 100
weights = None

# load test data
test_data = pd.read_csv('datasets/Multiclass/mnist_test_normalized.csv', header=None)
test_y = test_data[0].to_numpy()
test_x = test_data.drop(columns=0).to_numpy()

# No ground-truth informative features are defined for this dataset, so switch the
# "true features found" checks off explicitly. Without this, `check_ftrs` is either
# undefined (NameError) or left over from a previously run synthetic-dataset cell.
true_ftrs = []
check_ftrs = False


In [None]:
# Human Activity Recognition
# labels changed from [1,...,6] to [0,...,5]
# rows shuffled
# split into train set with 7352 instances and test set with 2948
stream = FileStream('datasets/Multiclass/har_train.csv', target_idx = 561)
stream.prepare_for_use()
dataset_name = "har"
n_selected_ftr = 100
n_window = 10
batch_size = 100
weights = None


# load test data
test_data = pd.read_csv('datasets/Multiclass/har_test.csv')
test_y = test_data["Class"].to_numpy()
test_x = test_data.drop(columns="Class").to_numpy()

# No ground-truth informative features are defined for this dataset, so switch the
# "true features found" checks off explicitly. Without this, `check_ftrs` is either
# undefined (NameError) or left over from a previously run synthetic-dataset cell.
true_ftrs = []
check_ftrs = False


In [None]:
# Covtype scaled to 0,1
# https://archive.ics.uci.edu/ml/datasets/covertype

# rows shuffled
# split into train set with 400000 instances and test set with 180000
# The train split is streamed for learning; the test split is held out for the final
# evaluation (the two file names were previously swapped).
stream = FileStream('datasets/Multiclass/covtype.scale01.train.csv', target_idx = 0)
stream.prepare_for_use()
dataset_name = "covtype"
n_selected_ftr = 25
n_window = 50
batch_size = 100
weights = None

# load test data
test_data = pd.read_csv('datasets/Multiclass/covtype.scale01.test.csv', header=None)
test_y = test_data[0].to_numpy()
test_x = test_data.drop(columns=0).to_numpy()

# No ground-truth informative features are defined for this dataset, so switch the
# "true features found" checks off explicitly. Without this, `check_ftrs` is either
# undefined (NameError) or left over from a previously run synthetic-dataset cell.
true_ftrs = []
check_ftrs = False

In [4]:
def find_true_ftrs_indices(label_names, start_char):
    """Return the positions of all entries in ``label_names`` starting with ``start_char``.

    Parameters
    ----------
    label_names : indexable sequence of str
        Names to inspect, e.g. the columns of a DataFrame.
    start_char : str
        Prefix that marks an entry as a match.

    Returns
    -------
    list of int
        Indices of the matching entries in ascending order.
    """
    return [
        position
        for position in range(len(label_names))
        if label_names[position].startswith(start_char)
    ]

In [None]:
# synthetic dataset 1 (see data_generation.ipynb)
# NOTE(review): the training stream and the held-out test data below are both read from
# dataset_1_test.csv, so the final "test" accuracy is measured on the data the predictor
# was trained on. Confirm whether a separate training file should be streamed here.
stream = FileStream('datasets/Multiclass/dataset_1_test.csv', target_idx=100)
stream.prepare_for_use()
dataset_name = "syn_ds_1"
n_selected_ftr = 20 # 15 are really informative
n_window = 10
batch_size = 100
# Later passed to FIRES as class_probabilities.
weights = None

# load test data
test_data = pd.read_csv("datasets/Multiclass/dataset_1_test.csv")
test_y = test_data["label"].to_numpy()
test_x = test_data.drop(columns="label").to_numpy()

# get index of real ftrs
# Columns whose name starts with "y" are treated as the truly informative features.
true_ftrs = find_true_ftrs_indices(test_data.columns, "y")
# Enables the "true features found" report after each algorithm run.
check_ftrs = True

In [None]:
# synthetic dataset 2 (see data_generation.ipynb)
# NOTE(review): the training stream and the held-out test data below are both read from
# dataset_2_test.csv, so the final "test" accuracy is measured on the data the predictor
# was trained on. Confirm whether a separate training file should be streamed here.
stream = FileStream('datasets/Multiclass/dataset_2_test.csv', target_idx=500)
stream.prepare_for_use()
dataset_name = "syn_ds_2"
n_selected_ftr = 30 # 25 are really informative
n_window = 10
batch_size = 100
# Later passed to FIRES as class_probabilities.
weights = None

# load test data
test_data = pd.read_csv("datasets/Multiclass/dataset_2_test.csv")
test_y = test_data["label"].to_numpy()
test_x = test_data.drop(columns="label").to_numpy()

# get index of real ftrs
# Columns whose name starts with "y" are treated as the truly informative features.
true_ftrs = find_true_ftrs_indices(test_data.columns, "y")
# Enables the "true features found" report after each algorithm run.
check_ftrs = True

In [None]:
# synthetic dataset 3 (see data_generation.ipynb)
# NOTE(review): the training stream and the held-out test data below are both read from
# dataset_3_test.csv, so the final "test" accuracy is measured on the data the predictor
# was trained on. Confirm whether a separate training file should be streamed here.
stream = FileStream('datasets/Multiclass/dataset_3_test.csv', target_idx=250)
stream.prepare_for_use()
dataset_name = "syn_ds_3"
n_selected_ftr = 20 # 20 are really informative
n_window = 10
batch_size = 100
# Later passed to FIRES as class_probabilities (eight values, presumably one per class —
# TODO confirm against the dataset).
weights = [0.1, 0.05, 0.15, 0.2, 0.025, 0.125, 0.075, 0.275]

# load test data
test_data = pd.read_csv("datasets/Multiclass/dataset_3_test.csv")
test_y = test_data["label"].to_numpy()
test_x = test_data.drop(columns="label").to_numpy()

# get index of real ftrs
# Columns whose name starts with "y" are treated as the truly informative features.
true_ftrs = find_true_ftrs_indices(test_data.columns, "y")
# Enables the "true features found" report after each algorithm run.
check_ftrs = True

In [None]:
# stream generator to test how algorithms perform on data with concept drift
# NOTE(review): this cell only replaces `stream`. dataset_name, n_selected_ftr, n_window,
# batch_size, weights, test_x/test_y and check_ftrs keep whatever an earlier dataset cell
# assigned. Presumably a generator never runs out of samples, in which case the
# `while stream.has_more_samples()` loops below would not terminate — confirm before use.
stream = RandomRBFGeneratorDrift(
    model_random_state=99,
    sample_random_state=50,
    n_classes=4,
    n_features=10,
    n_centroids=50,
    change_speed=0.87,
    num_drift_centroids=50,
)

### Test without feature selection


In [None]:
# Rewind the stream first (as every other section does) so that re-running this section
# starts from the first batch instead of an exhausted or partially consumed stream.
stream.restart()
# Warm up a fresh predictor on the first batch; stream.target_values is passed as the
# third argument of partial_fit.
predictor = PerceptronMask()
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, stream.target_values)

In [None]:
# Test-then-train over the rest of the stream using all features.
accuarcy_pure = []
while stream.has_more_samples():
    x, y = stream.next_sample(batch_size=batch_size)

    # Score the batch before the predictor is updated with it ...
    y_pred = predictor.predict(x)
    accuarcy_pure.append(accuracy_score(y, y_pred))

    # ... then learn from the same batch.
    predictor.partial_fit(x, y)

In [None]:
# Per-batch accuracy of the predictor trained on all features.
fig = px.line(
    y=accuarcy_pure,
    title="Accuracy without ftr selection",
    labels={"x": "batches", "y": "accuracy"},
)
fig.show()

In [None]:
# Accuracy of the stream-trained predictor on the held-out test data, using all features.
# This is the "Pure" bar in the test-accuracy chart at the end of the notebook.
y_pred = predictor.predict(test_x)
accuracy_no_ofs = accuracy_score(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy_no_ofs))

### FIRES Framework

In [None]:
# Rewind the stream and warm up a fresh predictor on the first batch (all features).
stream.restart()
predictor = PerceptronMask()
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, stream.target_values)

In [None]:
# still without d, regularizer set 0.1 without being in the derivatives
# FIRES with the softmax model; `weights` from the dataset cell is handed over as
# class_probabilities.
fires_model = FIRES(
    n_total_ftr=stream.n_features,
    target_values=stream.target_values,
    mu_init=0,
    sigma_init=1,
    model='softmax',
    class_probabilities=weights,
)

In [None]:
# Per-batch results of the FIRES run.
fires_cuda_accuracy = []
fires_cuda_times = []

fires_cuda_selected_ftrs = []
fires_cuda_stability = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=batch_size)

    # Select features (only weighting + top-n selection count towards the per-batch time)
    start_time = timer()
    ftr_weights = fires_model.weigh_features(x, y)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
    fires_cuda_times.append(timer() - start_time)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test: store the selection as a 0/1 indicator vector and compare the
    # ten most recent selections
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fires_cuda_selected_ftrs.append(ftr_array)

    if len(fires_cuda_selected_ftrs) >= 10:
        stability = stability_factor(fires_cuda_selected_ftrs[-10:])
        fires_cuda_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)
    fires_cuda_accuracy.append(accuracy_score(y, y_pred))

    # Train
    predictor.partial_fit(x_reduced, y)

# Total run time is taken from the timestamp captured right after the loop.
end_time_all = timer()
fires_cuda_run_time = end_time_all - start_time_all
print("The whole FIRES run took {}".format(fires_cuda_run_time))
fires_moving_average = pd.Series(fires_cuda_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
# Restart the FileStream
stream.restart()

In [None]:
if check_ftrs:
    # Share of the known informative features contained in the selection of the last batch.
    true_selected_ftr = set(ftr_selection) & set(true_ftrs)
    fires_perc_ftr_found = len(true_selected_ftr) / len(true_ftrs)
    # The value is a fraction in [0, 1]; the "%" format spec prints it as a percentage.
    print("FIRES found {:.1%} of the true informative features.".format(fires_perc_ftr_found))

In [None]:
# Evaluate on the held-out test data restricted to the last FIRES selection: all other
# features are zeroed, exactly as was done for the training batches.
test_x_selected = np.zeros(test_x.shape)
test_x_selected[:, ftr_selection] = test_x[:, ftr_selection]
y_pred = predictor.predict(test_x_selected)
accuracy_fires = accuracy_score(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy_fires))

### FIRES without cuda

Only one batch because of long calculation time

In [None]:
# Rewind the stream and warm up a fresh predictor on the first batch (all features).
stream.restart()
print("Choosen dataset: {}".format(dataset_name))
predictor = PerceptronMask()
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, stream.target_values)

In [None]:
# Second FIRES instance; unlike the one in the previous section, no class_probabilities
# are passed here.
fires_model = FIRES(
    n_total_ftr=stream.n_features,
    target_values=stream.target_values,
    mu_init=0,
    sigma_init=1,
    model='softmax',
)
print(fires_model.n_mc_samples)


# Time feature weighting and top-n selection for a single batch only.
x, y = stream.next_sample(batch_size=batch_size)
# Select features
start_time = timer()
ftr_weights = fires_model.weigh_features(x, y)  # Get feature weights with FIRES
ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
print(timer() - start_time)

### FSDS algorithm


In [None]:
# Rewind the stream and warm up a fresh predictor on the first batch (all features).
stream.restart()
print("Choosen dataset: {}".format(dataset_name))
predictor = PerceptronMask()
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, stream.target_values)

In [None]:
fsds_model = StreamFeatWeight(m=stream.n_features, k=stream.n_classes)
# `x` is still the warm-up batch loaded in the previous cell.
fsds_model.low_rank_approximation(x.T) # needs some pretraining in the first run

# Per-batch results of the FSDS run.
fsds_selected_ftrs = []
fsds_stability = []

fsds_accuracy = []
fsds_times = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=batch_size)

    # Select features (only weighting + top-n selection count towards the per-batch time)
    start_time = timer()
    ftr_weights = fsds_model.low_rank_approximation(x.T)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
    fsds_times.append(timer() - start_time)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test: store the selection as a 0/1 indicator vector and compare the
    # ten most recent selections
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fsds_selected_ftrs.append(ftr_array)

    if len(fsds_selected_ftrs) >= 10:
        stability = stability_factor(fsds_selected_ftrs[-10:])
        fsds_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)
    fsds_accuracy.append(accuracy_score(y, y_pred))

    # Train
    predictor.partial_fit(x_reduced, y)

# Total run time is taken from the timestamp captured right after the loop.
end_time_all = timer()
fsds_run_time = end_time_all - start_time_all
print("The whole fsds run took {}".format(fsds_run_time))
fsds_moving_average = pd.Series(fsds_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
# Restart the FileStream
stream.restart()

In [None]:
if check_ftrs:
    # Share of the known informative features contained in the selection of the last batch.
    # Stored under its own name so the FIRES result is not overwritten.
    true_selected_ftr = set(ftr_selection) & set(true_ftrs)
    fsds_perc_ftr_found = len(true_selected_ftr) / len(true_ftrs)
    # The value is a fraction in [0, 1]; the "%" format spec prints it as a percentage.
    print("FSDS found {:.1%} of the true informative features.".format(fsds_perc_ftr_found))

In [None]:
# Evaluate on the held-out test data restricted to the last FSDS selection: all other
# features are zeroed, exactly as was done for the training batches.
test_x_selected = np.zeros(test_x.shape)
test_x_selected[:, ftr_selection] = test_x[:, ftr_selection]
y_pred = predictor.predict(test_x_selected)
accuracy_fsds = accuracy_score(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy_fsds))

### OFS algorithm

In [None]:
# Rewind the stream and warm up a fresh predictor on the first batch (all features).
stream.restart()
predictor = PerceptronMask()
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, stream.target_values)

In [None]:
ofs = MC_OFS(
    regularization_param=0.01,
    step_size=0.1,
    n_selected_ftr=n_selected_ftr,
    n_total_ftr=stream.n_num_features,
    n_classes=stream.n_classes,
)

# Per-batch results of the OFS run.
ofs_accuracy = []
ofs_selected_ftrs = []
ofs_stability = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=batch_size)

    # Select features: OFS is trained one instance at a time
    for sample, label in zip(x, y):
        ofs.train(sample, label)

    ftr_selection = ofs.get_feature_indices()
    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test: store the selection as a 0/1 indicator vector and compare the
    # ten most recent selections
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    ofs_selected_ftrs.append(ftr_array)

    if len(ofs_selected_ftrs) >= 10:
        stability = stability_factor(ofs_selected_ftrs[-10:])
        ofs_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)
    ofs_accuracy.append(accuracy_score(y, y_pred))

    # Train
    predictor.partial_fit(x_reduced, y)

# Total run time is taken from the timestamp captured right after the loop.
end_time_all = timer()
ofs_run_time = end_time_all - start_time_all
print("The whole ofs run took {}".format(ofs_run_time))
ofs_moving_average = pd.Series(ofs_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
# Restart the FileStream
stream.restart()

In [None]:
if check_ftrs:
    # Share of the known informative features contained in the selection of the last batch.
    # Stored under its own name so the FIRES result is not overwritten.
    true_selected_ftr = set(ftr_selection) & set(true_ftrs)
    ofs_perc_ftr_found = len(true_selected_ftr) / len(true_ftrs)
    # The value is a fraction in [0, 1]; the "%" format spec prints it as a percentage.
    print("OFS found {:.1%} of the true informative features.".format(ofs_perc_ftr_found))

In [None]:
# Evaluate on the held-out test data restricted to the last OFS selection: all other
# features are zeroed, exactly as was done for the training batches.
test_x_selected = np.zeros(test_x.shape)
test_x_selected[:, ftr_selection] = test_x[:, ftr_selection]
y_pred = predictor.predict(test_x_selected)
accuracy_ofs = accuracy_score(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy_ofs))

### OFSSGR algorithm

In [None]:
# Rewind the stream and warm up a fresh predictor on the first batch (all features).
stream.restart()
predictor = PerceptronMask()
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, stream.target_values)

In [None]:
ofssgd_model = MC_OFSSGD(
    reduction_threshold=0.4,
    reduction_value=0.1,
    regularization_param=0.01,
    step_size=0.2,
    n_total_ftrs=stream.n_num_features,
    n_classes=stream.n_classes,
)

# Per-batch results of the OFSSGD run.
ofssgd_accuracy = []
ofssgd_selected_ftrs = []
ofssgd_stability = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=batch_size)

    # Select features: OFSSGD is trained one instance at a time
    for sample, label in zip(x, y):
        ofssgd_model.train(sample, label)

    ftr_selection = ofssgd_model.get_feature_indices()
    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test: store the selection as a 0/1 indicator vector and compare the
    # ten most recent selections
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    ofssgd_selected_ftrs.append(ftr_array)

    if len(ofssgd_selected_ftrs) >= 10:
        stability = stability_factor(ofssgd_selected_ftrs[-10:])
        ofssgd_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)
    ofssgd_accuracy.append(accuracy_score(y, y_pred))

    # Train
    predictor.partial_fit(x_reduced, y)

# Total run time is taken from the timestamp captured right after the loop.
end_time_all = timer()
ofssgd_run_time = end_time_all - start_time_all
print("The whole ofssgd run took {}".format(ofssgd_run_time))
ofssgd_moving_average = pd.Series(ofssgd_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
# Restart the FileStream
stream.restart()

In [None]:
if check_ftrs:
    # Share of the known informative features contained in the selection of the last batch.
    # Stored under its own name so the FIRES result is not overwritten.
    true_selected_ftr = set(ftr_selection) & set(true_ftrs)
    ofssgd_perc_ftr_found = len(true_selected_ftr) / len(true_ftrs)
    # The value is a fraction in [0, 1]; the "%" format spec prints it as a percentage.
    print("OFSSGD found {:.1%} of the true informative features.".format(ofssgd_perc_ftr_found))

In [None]:
# Evaluate on the held-out test data restricted to the last OFSSGD selection: all other
# features are zeroed, exactly as was done for the training batches.
test_x_selected = np.zeros(test_x.shape)
test_x_selected[:, ftr_selection] = test_x[:, ftr_selection]
y_pred = predictor.predict(test_x_selected)
accuracy_ofssgd = accuracy_score(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy_ofssgd))

### Pick n random ftrs in each iteration as benchmark

In [None]:
# Rewind the stream and warm up a fresh predictor on the first batch (all features).
stream.restart()
predictor = PerceptronMask()
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, stream.target_values)

In [None]:
# Per-batch results of the random-selection benchmark.
random_accuracy = []
random_selected_ftrs = []
random_stability = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=batch_size)

    # select features: draw distinct feature indices. Sampling without replacement
    # guarantees that n_selected_ftr different features are picked (capped at the number
    # of available features), which matches what the other algorithms select.
    n_available_ftr = len(x[0])
    ftr_selection = np.random.choice(
        n_available_ftr, min(n_selected_ftr, n_available_ftr), replace=False
    )

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test: store the selection as a 0/1 indicator vector and compare the
    # ten most recent selections
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    random_selected_ftrs.append(ftr_array)

    if len(random_selected_ftrs) >= 10:
        stability = stability_factor(random_selected_ftrs[-10:])
        random_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)
    random_accuracy.append(accuracy_score(y, y_pred))

    # Train
    predictor.partial_fit(x_reduced, y)

# Total run time is taken from the timestamp captured right after the loop.
end_time_all = timer()
random_run_time = end_time_all - start_time_all
print("The whole random run took {}".format(random_run_time))
# Moving average over the accuracies of THIS run (previously the OFSSGD accuracies were used).
random_moving_average = pd.Series(random_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values
# Restart the FileStream
stream.restart()

### Plot all


In [None]:
# stability
# One line per algorithm: pairwise stability of the ten most recent selections per batch.
title = "Stability on dataset {}".format(dataset_name)
col_names = ["FIRES", "OFS", "OFSSGD", "FSDS", "random"]
d = dict(zip(
    col_names,
    [fires_cuda_stability, ofs_stability, ofssgd_stability, fsds_stability, random_stability],
))
df = pd.DataFrame(d, columns=col_names)
fig = px.line(
    df,
    y=col_names,
    title=title,
    labels={"index": "batches", "value": "stability"},
)
fig.show()

In [None]:
#moving averages
# One line per algorithm: rolling mean of the per-batch accuracy with window n_window.
title = "Moving averages over accuracy while learning with window {} on dataset {}".format(n_window, dataset_name)
col_names = ["FIRES", "OFS", "OFSSGD", "FSDS", "random"]
d = dict(zip(
    col_names,
    [fires_moving_average, ofs_moving_average, ofssgd_moving_average,
     fsds_moving_average, random_moving_average],
))
df = pd.DataFrame(d, columns=col_names)
fig = px.line(
    df,
    y=col_names,
    title=title,
    labels={"index": "batches", "value": "accuracy"},
)
fig.show()


In [None]:
# NOTE(review): mnist_fig_fires / mnist_fig_fsds / mnist_fig_ofs / mnist_fig_ofssgd are not
# assigned anywhere in the visible notebook — presumably figures returned by paint_digit();
# confirm where they are created, otherwise this cell fails on a fresh kernel.
trace1 = mnist_fig_fires['data'][0]
trace2 = mnist_fig_fsds['data'][0]
trace3 = mnist_fig_ofs['data'][0]
trace4 = mnist_fig_ofssgd['data'][0]

# Place the first trace of each figure side by side, one column per algorithm.
fig = make_subplots(rows=1, cols=4, subplot_titles=("FIRES", "FSDS", "OFS", "OFSSGD"))
for col, trace in enumerate((trace1, trace2, trace3, trace4), start=1):
    fig.add_trace(trace, row=1, col=col)
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)


In [None]:
# accuracy on test data
# One bar per run: "Pure" is the predictor trained without feature selection.
col_names = ["Pure", "FIRES", "FSDS", "OFS", "OFSSGD"]
values = [accuracy_no_ofs, accuracy_fires, accuracy_fsds, accuracy_ofs, accuracy_ofssgd]
fig = px.bar(
    x=col_names,
    y=values,
    title="Accuracy on test data",
    labels={"y": "accuracy", "x": ""},
    color=col_names,
)
fig.show()

In [None]:
# run times
# Total wall-clock time of each algorithm's streaming loop. The FIRES run stores its
# total under `fires_cuda_run_time`; `fires_run_time` is never assigned in this notebook.
col_names = ["FIRES", "FSDS", "OFS", "OFSSGD"]
values = [fires_cuda_run_time, fsds_run_time, ofs_run_time, ofssgd_run_time]
fig = px.bar(
    x=col_names,
    y=values,
    title="Runtime",
    labels={"y": "run time", "x": ""},
    color=col_names,
)
fig.show()