## Regression Data
The FIRES model can't be compared to a real data-stream algorithm here; instead, it is compared to the SGDRegressor from scikit-learn.

In [18]:
# import the functions needed for validation and comparison

import numpy as np
import cupy as cp
import pandas as pd
from timeit import default_timer as timer

from sklearn.model_selection import train_test_split
#mean_squared_error: mse squared=true, rmse squared=false
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from skmultiflow.data import FileStream
from sklearn.linear_model import SGDRegressor

# using plotly for plots
#import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


In [3]:
from fires_cuda import FIRES as FC

In [4]:
# stability measurement as proposed in "Measuring the Stability of Feature Selection"

def pearson_stability_ij(arr1,arr2):
    """Pearson correlation between two feature-selection indicator vectors.

    Each vector is centred on its own mean (its sum divided by the length of
    ``arr1``). The mean product of the centred vectors is then divided by the
    product of their root-mean-square deviations.

    Parameters
    ----------
    arr1, arr2 : numpy arrays of equal length. The notebook passes 0/1 masks
        marking which features were selected in a batch.

    Returns
    -------
    The correlation coefficient of the two vectors. With NumPy inputs, a
    constant vector makes the denominator zero and the result is ``nan``.
    """
    n_ftrs = len(arr1)
    inv_n = 1/n_ftrs
    # Centre both vectors on their respective means.
    centred_i = arr1 - np.sum(arr1) / n_ftrs
    centred_j = arr2 - np.sum(arr2) / n_ftrs
    covariance = inv_n * np.sum(centred_i*centred_j)
    spread_i = np.sqrt(inv_n*np.sum(centred_i**2))
    spread_j = np.sqrt(inv_n*np.sum(centred_j**2))
    return covariance/(spread_i*spread_j)

def stability_factor(selected_ftrs):
    """Average pairwise Pearson stability over a collection of selections.

    Every unordered pair of entries in ``selected_ftrs`` is scored with
    ``pearson_stability_ij`` and the scores are averaged over the
    ``M*(M-1)/2`` pairs, where ``M`` is the number of entries.

    Parameters
    ----------
    selected_ftrs : sequence of indicator vectors, one per batch.
        At least two entries are needed; with fewer, ``M*(M-1)`` is zero and
        the final division fails.
    """
    n_sets = len(selected_ftrs)
    pair_total = 0
    # Visit each unordered pair once: every entry against the ones after it.
    for idx, left in enumerate(selected_ftrs):
        for right in selected_ftrs[idx + 1:]:
            pair_total += pearson_stability_ij(left, right)
    return 1/(n_sets*(n_sets-1)) * pair_total * 2

### Load Datasets as Streaming Data

In [11]:
# Ailerons data set: column 40 is used as the regression target.
dataset_name = "ailerons"
n_selected_ftr = 10

stream = FileStream('datasets/Regression/ailerons.csv', target_idx=40)
stream.prepare_for_use()

# load test data
# The file is read with header=None, so pandas labels the columns 0..N and
# the target is addressed by its integer label.
test_data = pd.read_csv('datasets/Regression/ailerons_test.csv', header=None)
test_x = test_data.drop(columns=40).to_numpy()
test_y = test_data[40].to_numpy()

In [32]:
# dataset_1: column 150 of the training CSV is used as the regression target.
dataset_name = "dataset_1"
n_selected_ftr = 30 #25 are informative

# Presumably the 25 informative features mentioned above (ground truth).
true_ftrs = [
    'x27', 'x30', 'x31', 'x33', 'x36',
    'x37', 'x42', 'x43', 'x44', 'x47',
    'x50', 'x62', 'x70', 'x71', 'x75',
    'x83', 'x84', 'x92', 'x106', 'x111',
    'x121', 'x127', 'x129', 'x130', 'x134',
]

stream = FileStream('datasets/Regression/dataset_1_train.csv', target_idx=150)
stream.prepare_for_use()

# load test data
# The test CSV has a header row; the target column is named "y".
test_data = pd.read_csv('datasets/Regression/dataset_1_test.csv')
test_x = test_data.drop(columns="y").to_numpy()
test_y = test_data["y"].to_numpy()

In [None]:
# NOTE(review): despite the "_normalized" name, this cell reads the same
# train/test CSVs as the "dataset_1" cell and applies no scaling here.
# Confirm whether normalised files or a scaling step are missing.
dataset_name = "dataset_1_normalized"
n_selected_ftr = 30 #25 are informative

# Presumably the 25 informative features mentioned above (ground truth).
true_ftrs = [
    'x27', 'x30', 'x31', 'x33', 'x36',
    'x37', 'x42', 'x43', 'x44', 'x47',
    'x50', 'x62', 'x70', 'x71', 'x75',
    'x83', 'x84', 'x92', 'x106', 'x111',
    'x121', 'x127', 'x129', 'x130', 'x134',
]

stream = FileStream('datasets/Regression/dataset_1_train.csv', target_idx=150)
stream.prepare_for_use()

# load test data
# The test CSV has a header row; the target column is named "y".
test_data = pd.read_csv('datasets/Regression/dataset_1_test.csv')
test_x = test_data.drop(columns="y").to_numpy()
test_y = test_data["y"].to_numpy()

### FIRES for Regression

In [39]:
# use SGDRegressor as predictor
# Rewind the stream, then warm up a fresh SGDRegressor on the first 1000
# samples (all features) before the feature-selection loop runs.
stream.restart()
predictor = SGDRegressor()
X, y = stream.next_sample(batch_size=1000)
predictor.partial_fit(X,y)

SGDRegressor()

In [34]:
# Sanity check: print the targets of the next 10 samples; the cell's last
# expression displays the predictor's estimates for the same samples.
# Note: this draws 10 more samples from the stream.
X, y =stream.next_sample(batch_size=10)
print(y)
predictor.predict(X)

[-295.65145819  409.55425795  295.36028728   55.16084949   34.99561704
 -303.54263279   93.03827939 -444.5412978   146.47726406 -410.59013398]


array([-267.27249416,  409.79375524,  242.39939776,   68.48606322,
         52.13583996, -234.67642999,   47.29037493, -365.94698416,
        148.13613816, -409.89881475])

In [40]:
# Build the FIRES feature-weighting model (imported from fires_cuda as FC),
# sized to the number of features in the stream and set to regression mode.
# NOTE(review): mu_init / sigma_init look like the initial mean and standard
# deviation of the model's parameter distribution — confirm against fires_cuda.
fires_model = FC(n_total_ftr=stream.n_features,
                 target_values=None,
                 mu_init=0,
                 sigma_init=1,
                 model='regression')  

In [41]:
# Test-then-train evaluation of FIRES feature selection.
# For every batch of 100 samples: weigh the features with FIRES, keep the
# top `n_selected_ftr`, score the predictor on the reduced batch, then train
# the predictor on that same reduced batch.

# Per-batch metrics of the predictor on the feature-reduced input
fires_cuda_mse = []
fires_cuda_rmse = []
fires_cuda_mae = []
fires_cuda_msa = fires_cuda_mae  # original (misspelled) name, kept as an alias of the MAE list
fires_cuda_r2 = []
#fsds_f1 = []
fires_cuda_times = []  # seconds spent on weighing + ranking features, per batch

fires_cuda_selected_ftrs = []  # one 0/1 selection mask per batch
fires_cuda_stability = []  # stability over a sliding window of the last 10 masks

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=100)
    # Select features
    start_time = timer()
    ftr_weights = fires_model.weigh_features(x,y)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
    fires_cuda_times.append(timer()-start_time)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fires_cuda_selected_ftrs.append(ftr_array)

    if len(fires_cuda_selected_ftrs) >= 10:
        stability = stability_factor(fires_cuda_selected_ftrs[-10:])
        fires_cuda_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)

    # RMSE is derived from the MSE instead of using the `squared=` keyword,
    # which newer scikit-learn releases deprecate and then drop.
    batch_mse = mean_squared_error(y, y_pred)
    fires_cuda_mse.append(batch_mse)
    fires_cuda_rmse.append(np.sqrt(batch_mse))
    # These two lists were declared but never filled before.
    fires_cuda_mae.append(mean_absolute_error(y, y_pred))
    fires_cuda_r2.append(r2_score(y, y_pred))

    # Train
    predictor.partial_fit(x_reduced, y)

# Use the timestamp taken right after the loop for the total run time.
end_time_all = timer()
fires_cuda_run_time = end_time_all - start_time_all
print("The whole fires_cuda run took {}".format(fires_cuda_run_time))
# Restart the FileStream
stream.restart()

The whole fires_cuda run took 7.987379730000612


In [42]:
# Per-batch MSE of the predictor during the FIRES run.
# NOTE(review): only `y` is passed, so plotly may name the x column "index"
# rather than "x" — confirm that the "batches" label actually renders.
fig = px.line(
    y=fires_cuda_mse,
    title="MSE for Fires algorithm",
    labels={"x":"batches", "y":"mse"},
)
fig.show()

In [43]:
# Per-batch RMSE of the predictor during the FIRES run.
# NOTE(review): only `y` is passed, so plotly may name the x column "index"
# rather than "x" — confirm that the "batches" label actually renders.
fig = px.line(
    y=fires_cuda_rmse,
    title="RMSE for Fires algorithm",
    labels={"x":"batches", "y":"rmse"},
)
fig.show()

In [44]:
# Sliding-window stability of the FIRES feature selection.
# The y-axis label now reads "stability" (it said "accuracy"), matching the
# plotted quantity and the equivalent SGDR plot.
# NOTE(review): only `y` is passed, so plotly may name the x column "index"
# rather than "x" — confirm that the "batches" label actually renders.
fig = px.line(
    y=fires_cuda_stability,
    title="Stability for Fires algorithm",
    labels={"x":"batches", "y":"stability"},
)
fig.show()

In [46]:
# Score the predictor trained in the FIRES loop on the hold-out test set.
# The value stored in `accuracy` is the mean squared error, not an accuracy.
# NOTE(review): test_x is passed with all features, whereas the loop trained
# the predictor on batches with unselected features zeroed — confirm intended.
y_pred = predictor.predict(test_x)
accuracy = mean_squared_error(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy))

For the test dataset the previous trained predictor reached: 4668.614986487397


### Feature selection via SGDRegressor


In [47]:
# use SGDRegressor as predictor
# Rewind the stream and warm up a fresh SGDRegressor on the first 1000
# samples (all features). The original `stream.restart` lacked the call
# parentheses, so it only referenced the method and never rewound the stream.
stream.restart()
predictor = SGDRegressor()
X, y = stream.next_sample(batch_size=1000)
predictor.partial_fit(X,y)

SGDRegressor()

In [48]:
# Baseline feature ranker: an L1-penalised SGDRegressor whose coefficients
# are read as feature weights by the evaluation loop.
sgdr_model = SGDRegressor(penalty="l1") #penalty could be elasticnet as well
# TODO: n_selected_ftr?

In [49]:
# Test-then-train evaluation of the SGDRegressor baseline.
# For every batch of 100 samples: update the L1-penalised `sgdr_model`, rank
# features by its coefficients, keep the top `n_selected_ftr`, score the
# predictor on the reduced batch, then train the predictor on that batch.

# Per-batch metrics of the predictor on the feature-reduced input
sgdr_mse = []
sgdr_rmse = []
sgdr_mae = []
sgdr_r2 = []

sgdr_times = []  # seconds spent on fitting + ranking features, per batch

sgdr_selected_ftrs = []  # one 0/1 selection mask per batch
sgdr_stability = []  # stability over a sliding window of the last 10 masks

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=100)
    # Select features
    start_time = timer()
    sgdr_model.partial_fit(x,y)
    # Rank by coefficient magnitude: a large negative coefficient is as
    # influential as a large positive one, but a signed sort would rank it last.
    ftr_weights = np.abs(sgdr_model.coef_)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
    sgdr_times.append(timer()-start_time)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    sgdr_selected_ftrs.append(ftr_array)

    if len(sgdr_selected_ftrs) >= 10:
        stability = stability_factor(sgdr_selected_ftrs[-10:])
        sgdr_stability.append(stability)

    # Test
    y_pred = predictor.predict(x_reduced)

    # RMSE is derived from the MSE instead of using the `squared=` keyword,
    # which newer scikit-learn releases deprecate and then drop.
    batch_mse = mean_squared_error(y, y_pred)
    sgdr_mse.append(batch_mse)
    sgdr_rmse.append(np.sqrt(batch_mse))
    # These two lists were declared but never filled before.
    sgdr_mae.append(mean_absolute_error(y, y_pred))
    sgdr_r2.append(r2_score(y, y_pred))

    # Train
    predictor.partial_fit(x_reduced, y)

# Use the timestamp taken right after the loop for the total run time.
end_time_all = timer()
sgdr_run_time = end_time_all - start_time_all
print("The whole sgdr run took {}".format(sgdr_run_time))
# Restart the FileStream
stream.restart()

The whole sgdr run took 0.4057393660004891


In [50]:
# Per-batch MSE of the predictor during the SGDR run.
# NOTE(review): only `y` is passed, so plotly may name the x column "index"
# rather than "x" — confirm that the "batches" label actually renders.
fig = px.line(
    y=sgdr_mse,
    title="MSE for SGDR algorithm",
    labels={"x":"batches", "y":"mse"},
)
fig.show()

In [51]:
# Per-batch RMSE of the predictor during the SGDR run.
# NOTE(review): only `y` is passed, so plotly may name the x column "index"
# rather than "x" — confirm that the "batches" label actually renders.
fig = px.line(
    y=sgdr_rmse,
    title="RMSE for SGDR algorithm",
    labels={"x":"batches", "y":"rmse"},
)
fig.show()

In [52]:
# Sliding-window stability of the SGDR feature selection.
# NOTE(review): only `y` is passed, so plotly may name the x column "index"
# rather than "x" — confirm that the "batches" label actually renders.
fig = px.line(
    y=sgdr_stability,
    title="Stability for SGDR algorithm",
    labels={"x":"batches", "y":"stability"},
)
fig.show()

In [None]:
# Score the predictor trained in the SGDR loop on the hold-out test set.
# `accuracy_score` is never imported in this notebook and is a classification
# metric, so this cell now uses mean squared error, like the FIRES evaluation.
# The variable keeps the name `accuracy` but holds the MSE (lower is better).
# NOTE(review): test_x is passed with all features, whereas the loop trained
# the predictor on batches with unselected features zeroed — confirm intended.
y_pred = predictor.predict(test_x)
accuracy = mean_squared_error(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy))