## Regression Data
The FIRES Model can't be compared to a real datastream algorithm, instead the SGDRegressor form scikit learn

In [1]:
# import the functions needed for validate and comparsion

import numpy as np
import cupy as cp
import pandas as pd
from timeit import default_timer as timer

from sklearn.model_selection import train_test_split
#mean_squared_error: mse squared=true, rmse squared=false
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from skmultiflow.data import FileStream
from sklearn.linear_model import SGDRegressor
from skmultiflow.data.regression_generator import RegressionGenerator

# using plotly for plots
import plotly.express as px
from plotly.subplots import make_subplots


In [2]:
from fires import FIRES

In [3]:
# stability measurment as proposed in "Measurment the Stability of Feature Selection"

def pearson_stability_ij(arr1,arr2):
    d = len(arr1)
    k_i = np.sum(arr1)
    k_j = np.sum(arr2)
    x_hat_i = k_i / d
    x_hat_j = k_j / d
    arr1 = arr1 - x_hat_i
    arr2 = arr2 - x_hat_j
    dividend = 1/d * np.sum(arr1*arr2)
    divisor = np.sqrt(1/d*np.sum(arr1**2))*np.sqrt(1/d*np.sum(arr2**2))
    return dividend/divisor

def stability_factor(selected_ftrs):
   M = len(selected_ftrs)
   sum_stabilities = 0
   for i in range(M):
       for j in range(i+1, M):
           sum_stabilities += pearson_stability_ij(selected_ftrs[i], selected_ftrs[j])
   return 1/(M*(M-1))*sum_stabilities * 2   

### Load Datasets as Streaming Data

In [11]:
stream = FileStream('datasets/Regression/ailerons.csv', target_idx=40)
stream.prepare_for_use()
dataset_name = "ailerons"
n_selected_ftr = 10

# load test data
test_data = pd.read_csv('datasets/Regression/ailerons_test.csv', header=None)
test_y = test_data[40].to_numpy()
test_x = test_data.drop(columns=40).to_numpy()

In [None]:
stream = FileStream('datasets/Regression/ailerons_norm.csv', target_idx=40)
stream.prepare_for_use()
dataset_name = "ailerons_norm"
n_selected_ftr = 10

# load test data
test_data = pd.read_csv('datasets/Regression/ailerons_test_norm.csv', header=None)
test_y = test_data[40].to_numpy()
test_x = test_data.drop(columns=40).to_numpy()

In [32]:
stream = FileStream('datasets/Regression/dataset_1_train.csv', target_idx=150)
stream.prepare_for_use()
dataset_name = "dataset_1"
n_selected_ftr = 30 #25 are informative

true_ftrs = [27, 30, 31, 33, 36, 37, 42, 43, 44, 47, 50, 62, 70, 71, 75, 83, 84, 92, 106, 111, 121, 127, 129, 130, 134]
check_ftrs = True

# load test data
test_data = pd.read_csv('datasets/Regression/dataset_1_test.csv')
test_y = test_data["y"].to_numpy()
test_x = test_data.drop(columns="y").to_numpy()

In [4]:
# normalized data
stream = FileStream('datasets/Regression/dataset_1_norm_train.csv', target_idx=150)
stream.prepare_for_use()
dataset_name = "dataset_1_normalized"
n_selected_ftr = 30 #25 are informative

true_ftrs = [27, 30, 31, 33, 36, 37, 42, 43, 44, 47, 50, 62, 70, 71, 75, 83, 84, 92, 106, 111, 121, 127, 129, 130, 134]
check_ftrs = True

# load test data
test_data = pd.read_csv('datasets/Regression/dataset_1_norm_test.csv')
test_y = test_data["y"].to_numpy()
test_x = test_data.drop(columns="y").to_numpy()

### FIRES for Regression

In [5]:
# use SGDRegressor as predictor
stream.restart()
predictor = SGDRegressor()
X, y = stream.next_sample(batch_size=100)
predictor.partial_fit(X,y)

SGDRegressor()

In [6]:
fires_model = FIRES(n_total_ftr=stream.n_features,
                    target_values=None,
                    mu_init=0,
                    sigma_init=1,
                    model='regression')  

In [7]:
fires_cuda_mse = []
fires_cuda_rmse = []
fires_cuda_msa = []
fires_cuda_r2 = []
#fsds_f1 = []
fires_cuda_times = []

fires_cuda_selected_ftrs = []
fires_cuda_stability = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=100)
    # Select features
    start_time = timer()
    ftr_weights = fires_model.weigh_features(x,y)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
    fires_cuda_times.append(timer()-start_time)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fires_cuda_selected_ftrs.append(ftr_array)

    if len(fires_cuda_selected_ftrs) >= 10:
        stability = stability_factor(fires_cuda_selected_ftrs[-10:])
        fires_cuda_stability.append(stability)


    # Test
    y_pred = predictor.predict(x_reduced)
    
    fires_cuda_mse.append(mean_squared_error(y, y_pred, squared=True))
    fires_cuda_rmse.append(mean_squared_error(y, y_pred, squared=False))
    


    # Train
    predictor.partial_fit(x_reduced, y)

# Restart the FileStream
end_time_all = timer()
fires_cuda_run_time = timer() - start_time_all
print("The whole fires_cuda run took {}".format(fires_cuda_run_time))

stream.restart()

The whole fires_cuda run took 9.55578021999986


In [9]:
# informative features found
if check_ftrs:
    true_selected_ftr = set(ftr_selection)&set(true_ftrs)
    fires_perc_ftr_found = len(true_selected_ftr) / len(true_ftrs)
    print("FIRES found {}% of the true informative features.".format(fires_perc_ftr_found))

FIRES found 0.2% of the true informative features.


In [10]:
y_pred = predictor.predict(test_x)
fires_cuda_mse_test = mean_squared_error(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(fires_cuda_mse_test))

For the test dataset the previous trained predictor reached: 0.376254665333794


### Feature selection via SGDRegressor


In [11]:
# use SGDRegressor as predictor
stream.restart
predictor = SGDRegressor()
X, y = stream.next_sample(batch_size=100)
predictor.partial_fit(X,y)

SGDRegressor()

In [12]:
sgdr_model = SGDRegressor(penalty="l1") #penalty could be elasticnet as well
#n_selectey_ftrs?

In [13]:
sgdr_mse = []
sgdr_rmse = []
sgdr_mae = []
sgdr_r2 = []

sgdr_times = []

sgdr_selected_ftrs = []
sgdr_stability = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=100)
    # Select features
    start_time = timer()
    sgdr_model.partial_fit(x,y)
    ftr_weights = sgdr_model.coef_
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
    sgdr_times.append(timer()-start_time)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    sgdr_selected_ftrs.append(ftr_array)

    if len(sgdr_selected_ftrs) >= 10:
        stability = stability_factor(sgdr_selected_ftrs[-10:])
        sgdr_stability.append(stability)


    # Test
    y_pred = predictor.predict(x_reduced)
    
    sgdr_mse.append(mean_squared_error(y, y_pred))
    sgdr_rmse.append(mean_squared_error(y, y_pred, squared=False))
    


    # Train
    predictor.partial_fit(x_reduced, y)

# Restart the FileStream
end_time_all = timer()
sgdr_run_time = timer() - start_time_all
print("The whole sgdr run took {}".format(sgdr_run_time))
stream.restart()

The whole sgdr run took 0.5348831219998829


In [14]:
# informative features found
if check_ftrs:
    true_selected_ftr = set(ftr_selection)&set(true_ftrs)
    sgdr_perc_ftr_found = len(true_selected_ftr) / len(true_ftrs)
    print("SGDR found {} \% of the true informative features.".format(sgdr_perc_ftr_found))

SGDR found 1.0 \% of the true informative features.


In [18]:
y_pred = predictor.predict(test_x)
sgdr_mse_test = mean_squared_error(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(sgdr_mse_test))

For the test dataset the previous trained predictor reached: 0.2826533166638735


### Plot all


In [19]:
# stability
title = "Stability on dataset {}".format(dataset_name)
col_names = ["FIRES", "SGDR"]
d = {"FIRES":fires_cuda_stability, "SGDR":sgdr_stability}
df = pd.DataFrame(d, columns=col_names)
fig = px.line(df, y = col_names, title=title, labels={"index":"batches", "value":"stability"})
fig.show()

In [20]:
title = "MSE on dataset {}".format(dataset_name)
col_names = ["FIRES", "SGDR"]
d = {"FIRES":fires_cuda_mse, "SGDR":sgdr_mse}
df = pd.DataFrame(d, columns=col_names)
fig = px.line(df, y = col_names, title=title, labels={"index":"batches", "value":"mse"})
fig.show()

In [21]:
title = "RMSE on dataset {}".format(dataset_name)
col_names = ["FIRES", "SGDR"]
d = {"FIRES":fires_cuda_rmse, "SGDR":sgdr_rmse}
df = pd.DataFrame(d, columns=col_names)
fig = px.line(df, y = col_names, title=title, labels={"index":"batches", "value":"rmse"})
fig.show()

In [22]:
col_names = ["FIRES", "SGDR" ]
values = [fires_cuda_run_time, sgdr_run_time]
fig = px.bar(x=col_names, y=values, title="Runtime", labels={"y":"s", "x":""}, color=col_names)
fig.show()

In [23]:
col_names = ["FIRES", "SGDR" ]
values = [fires_cuda_mse_test, sgdr_mse_test]
fig = px.bar(x=col_names, y=values, title="MSE on test dataset", labels={"y":"mse", "x":""}, color=col_names)
fig.show()