## Regression Data
The FIRES Model can't be compared to a real datastream algorithm, instead the SGDRegressor form scikit learn

In [4]:
# import the functions needed for validate and comparsion

import numpy as np
import cupy as cp
import pandas as pd
from timeit import default_timer as timer
import os

from sklearn.model_selection import train_test_split
#mean_squared_error: mse squared=true, rmse squared=false
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from skmultiflow.data import FileStream
from sklearn.linear_model import SGDRegressor
from skmultiflow.data.regression_generator import RegressionGenerator


# using plotly for plots
import plotly.express as px
from plotly.subplots import make_subplots


In [5]:
from fires import FIRES

In [1]:
from stability import stability_factor

### Load Datasets as Streaming Data

In [7]:
stream = FileStream('datasets/Regression/ailerons_norm.csv', target_idx=40)
stream.prepare_for_use()
dataset_name = "ailerons_norm"
n_selected_ftr = 10

# load test data
test_data = pd.read_csv('datasets/Regression/ailerons_test_norm.csv', header=None)
test_y = test_data[40].to_numpy()
test_x = test_data.drop(columns=40).to_numpy()

New instances of the Stream class are now ready to use after instantiation.


In [49]:
# normalized data
stream = FileStream('datasets/Regression/dataset_1_norm_train.csv', target_idx=150)
stream.prepare_for_use()
dataset_name = "dataset_1_normalized"
n_selected_ftr = 38 #25 are informative

# load test data
test_data = pd.read_csv('datasets/Regression/dataset_1_norm_test.csv')
test_x = test_data.drop(columns="y").to_numpy()
test_y = test_data["y"].to_numpy()


'prepare_for_use' has been deprecated in v0.5.0 and will be removed in v0.7.0.
New instances of the Stream class are now ready to use after instantiation.



In [50]:
# prepare folder for plots
folder = "plots/regression/{}".format(dataset_name)
if not os.path.exists(folder):
    os.makedirs(folder)

export_type = "pdf" # "png", "jpeg", "webp", "pdf", "svg"

### Without FS

In [51]:
stream.restart()
predictor = SGDRegressor()
X, y = stream.next_sample(batch_size=100)
predictor.partial_fit(X,y)

SGDRegressor()

In [52]:
pure_rmse = []
while stream.has_more_samples():
    x, y = stream.next_sample(batch_size=100)

    # Test
    y_pred = predictor.predict(x)
    
    pure_rmse.append(mean_squared_error(y, y_pred, squared=False))

    # Train
    predictor.partial_fit(x, y)


y_pred = predictor.predict(test_x)
pure_rmse_test = mean_squared_error(test_y, y_pred, squared=False)

### FIRES for Regression

In [53]:
# use SGDRegressor as predictor
stream.restart()
predictor = SGDRegressor()
X, y = stream.next_sample(batch_size=100)
predictor.partial_fit(X,y)

SGDRegressor()

In [54]:
fires_model = FIRES(n_total_ftr=stream.n_features,
                    target_values=None,
                    mu_init=0,
                    sigma_init=1,
                    model='regression')  

In [55]:
fires_mse = []
fires_rmse = []
fires_msa = []
fires_r2 = []
fires_times = []

fires_selected_ftrs = []
fires_stability = [0,0,0,0,0,0,0,0,0,0]

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=100)
    # Select features
    start_time = timer()
    ftr_weights = fires_model.weigh_features(x,y)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
    fires_times.append(timer()-start_time)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fires_selected_ftrs.append(ftr_array)

    if len(fires_selected_ftrs) >= 10:
        stability = stability_factor(fires_selected_ftrs[-10:])
        fires_stability.append(stability)


    # Test
    y_pred = predictor.predict(x_reduced)
    
    fires_mse.append(mean_squared_error(y, y_pred, squared=True))
    fires_rmse.append(mean_squared_error(y, y_pred, squared=False))
    fires_r2.append(r2_score(y,y_pred))
    


    # Train
    predictor.partial_fit(x_reduced, y)

# Restart the FileStream
end_time_all = timer()
fires_run_time = timer() - start_time_all
print("The whole fires run took {}".format(fires_run_time))


The whole fires run took 8.39037617799977


In [56]:
y_pred = predictor.predict(test_x)
fires_mse_test = mean_squared_error(test_y, y_pred, squared=False)
print("For the test dataset the previous trained predictor reached: {}".format(fires_mse_test))

For the test dataset the previous trained predictor reached: 0.3512403699277188


### Feature selection via SGDRegressor


In [57]:
# use SGDRegressor as predictor
stream.restart()
predictor = SGDRegressor()
X, y = stream.next_sample(batch_size=100)
predictor.partial_fit(X,y)

SGDRegressor()

In [58]:
sgdr_model = SGDRegressor(penalty="l1") #penalty could be elasticnet as well
#n_selectey_ftrs?

In [59]:
sgdr_mse = []
sgdr_rmse = []
sgdr_mae = []
sgdr_r2 = []

sgdr_times = []

sgdr_selected_ftrs = []
sgdr_stability = [0,0,0,0,0,0,0,0,0,0]

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=100)
    # Select features
    start_time = timer()
    sgdr_model.partial_fit(x,y)
    ftr_weights = sgdr_model.coef_
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
    sgdr_times.append(timer()-start_time)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    sgdr_selected_ftrs.append(ftr_array)

    if len(sgdr_selected_ftrs) >= 10:
        stability = stability_factor(sgdr_selected_ftrs[-10:])
        sgdr_stability.append(stability)


    # Test
    y_pred = predictor.predict(x_reduced)
    
    sgdr_mse.append(mean_squared_error(y, y_pred))
    sgdr_rmse.append(mean_squared_error(y, y_pred, squared=False))
    sgdr_r2.append(r2_score(y,y_pred))

    


    # Train
    predictor.partial_fit(x_reduced, y)

# Restart the FileStream
end_time_all = timer()
sgdr_run_time = timer() - start_time_all
print("The whole sgdr run took {}".format(sgdr_run_time))
stream.restart()

The whole sgdr run took 0.46406184800025585


In [60]:
y_pred = predictor.predict(test_x)
sgdr_mse_test = mean_squared_error(test_y, y_pred, squared=False)
print("For the test dataset the previous trained predictor reached: {}".format(sgdr_mse_test))

For the test dataset the previous trained predictor reached: 0.4988061261940327


### Random FS


In [61]:
stream.restart()
predictor = SGDRegressor()
X, y = stream.next_sample(batch_size=100)
predictor.partial_fit(X,y)

SGDRegressor()

In [62]:
random_rmse = []

while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=100)

    
    # select features
    ftr_selection = np.random.choice(len(x[0]), n_selected_ftr)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]


    # Test
    y_pred = predictor.predict(x_reduced)

    random_rmse.append(mean_squared_error(y, y_pred, squared=False))
    

    # Train
    predictor.partial_fit(x_reduced, y)

In [63]:
y_pred = predictor.predict(test_x)
random_rmse_test = mean_squared_error(test_y, y_pred, squared=False)

### Plot all


In [64]:
# stability
title = "Stability on dataset {}".format(dataset_name)
col_names = ["FIRES", "SGDR"]
d = {"FIRES":fires_stability, "SGDR":sgdr_stability}
df = pd.DataFrame(d, columns=col_names)
fig = px.line(df, y = col_names, title=title, labels={"index":"batches", "value":"stability"}, color_discrete_map={'FIRES': 'red', "SGDR": "green"})
stability_trace = fig["data"]
fig.show()

In [65]:
title = "RMSE on dataset {}".format(dataset_name)
col_names = ["Pure", "FIRES", "SGDR", "Random"]
d = {"Pure":pure_rmse, "FIRES":fires_rmse, "SGDR":sgdr_rmse, "Random":random_rmse}
df = pd.DataFrame(d, columns=col_names)
fig = px.line(df, y = col_names, title=title, labels={"index":"batches", "value":"rmse"}, color_discrete_map={"Pure":'blue','FIRES': 'red', "SGDR": "green", "Random":"cyan"})
rmse_trace = fig["data"]
fig.show()

In [66]:
len(pure_rmse)

169

In [67]:
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.1) # subplot_titles=("Stability","Accuracy", "F1-Score"),
for i in range(len(stability_trace)):
    stability_trace[i]["showlegend"] = False
    if stability_trace[i]["name"] == "FIRES":
        trace = stability_trace[i]
    else:
        fig.add_trace(stability_trace[i], row=1, col=1)
fig.add_trace(trace, row=1, col=1)
for i in range(len(rmse_trace)):
    if rmse_trace[i]["name"] == "FIRES":
        trace = rmse_trace[i]
    else:
        fig.add_trace(rmse_trace[i], row=2, col=1)
fig.add_trace(trace, row=2, col=1)
fig.update_xaxes(title_text="batches", row=2, col=1)
fig.update_yaxes(title_text="Stability", row=1, col=1)
fig.update_yaxes(title_text="RMSE", row=2, col=1)

fig.write_image("{}/all_scores_{}.{}".format(folder, dataset_name, export_type))
fig.show()

In [68]:
col_names = ["FIRES", "SGDR" ]
values = [fires_run_time, sgdr_run_time]
fig = px.bar(x=col_names, y=values, title="Runtime", labels={"y":"s", "x":""}, color=col_names, color_discrete_map={'FIRES': 'red', "SGDR": "blue"})
with open("{}/runtime.csv".format(folder),"a") as outfile:
    outfile.write("{},{}\n".format( values[0],values[1]) )
fig.show()


In [69]:
values = [pure_rmse_test, fires_mse_test, sgdr_mse_test, random_rmse_test]
with open("{}/rmse.csv".format(folder),"a") as outfile:
    outfile.write("{},{},{},{}\n".format( values[0],values[1], values[2], values[3]))
