## Concept Drift in Regression Data

In [1]:
# import the functions needed for validate and comparsion

import numpy as np
import cupy as cp
import pandas as pd
from timeit import default_timer as timer
import os

from sklearn.model_selection import train_test_split
#mean_squared_error: mse squared=true, rmse squared=false
from sklearn.metrics import mean_squared_error
from skmultiflow.data import ConceptDriftStream, FileStream
from sklearn.linear_model import SGDRegressor
from skmultiflow.data.regression_generator import RegressionGenerator
from sklearn.preprocessing import MinMaxScaler


# using plotly for plots
import plotly.express as px
from plotly.subplots import make_subplots


In [2]:
from fires import FIRES

In [3]:
# stability measurment as proposed in "Measurment the Stability of Feature Selection"

# TODO: check for case where nothing changes


def pearson_stability_ij(arr1,arr2):
    d = len(arr1)
    k_i = np.sum(arr1)
    k_j = np.sum(arr2)

    # catch edge cases as proposed in the paper under 4.1
    if (k_i == 0 or k_i == d) and k_i != k_j :
        return 0
    elif (k_j == 0 or k_j == d) and k_i != k_j :
        return 0
    elif (k_i == 0 or k_i == d) and k_i == k_j :
        return 1
    x_hat_i = k_i / d
    x_hat_j = k_j / d
    arr1 = arr1 - x_hat_i
    arr2 = arr2 - x_hat_j
    dividend = 1/d * np.sum(arr1*arr2)
    divisor = np.sqrt(1/d*np.sum(arr1**2))*np.sqrt(1/d*np.sum(arr2**2))
    return dividend/divisor

def stability_factor(selected_ftrs):
   M = len(selected_ftrs)
   sum_stabilities = 0
   for i in range(M):
       for j in range(i+1, M):
           sum_stabilities += pearson_stability_ij(selected_ftrs[i], selected_ftrs[j])
   return 1/(M*(M-1))*sum_stabilities * 2   

In [4]:
stream = ConceptDriftStream(stream=RegressionGenerator(random_state=42, n_features=100, n_informative=25), drift_stream=RegressionGenerator(random_state=52, n_features=100, n_informative=25), position=10100, width=10)

dataset_name = "Concept_drift_10"
n_selected_ftr = 25
n_window = 100
batch_size = 10

In [17]:

stream = ConceptDriftStream(stream=RegressionGenerator(random_state=42, n_features=100, n_informative=25), drift_stream=RegressionGenerator(random_state=52, n_features=100, n_informative=25), position=10100, width=1000)

dataset_name = "Concept_drift_1000"
n_selected_ftr = 25
n_window = 100
batch_size = 10

In [18]:
# prepare folder for plots
folder = "plots/regression/{}".format(dataset_name)
if not os.path.exists(folder):
    os.makedirs(folder)

export_type = "pdf" # "png", "jpeg", "webp", "pdf", "svg"

In [19]:
# use SGDRegressor as predictor
stream.restart()
predictor = SGDRegressor()
scaler = MinMaxScaler()
scaler_y = MinMaxScaler()
X, y = stream.next_sample(batch_size=100)
scaler.partial_fit(X)
scaler_y.partial_fit(y.reshape(-1,1))

X = scaler.transform(X)
y = scaler_y.transform(y.reshape(-1,1))
predictor.partial_fit(X,y)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



SGDRegressor()

In [20]:
fires_model = FIRES(n_total_ftr=stream.n_features,
                    target_values=None,
                    mu_init=0,
                    sigma_init=1,
                    model='regression')  

In [21]:

fires_rmse = []


fires_selected_ftrs = []
fires_stability = []

start_time_all = timer()
for i in range(2000):
    # Load a new sample
    x, y = stream.next_sample(batch_size=10)
    scaler.partial_fit(x)
    scaler_y.partial_fit(y.reshape(-1,1))
    x = scaler.transform(x)
    y = scaler_y.transform(y.reshape(-1,1))
    # Select features
    ftr_weights = fires_model.weigh_features(x,y)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fires_selected_ftrs.append(ftr_array)

    if len(fires_selected_ftrs) >= 10:
        stability = stability_factor(fires_selected_ftrs[-10:])
        fires_stability.append(stability)


    # Test
    y_pred = predictor.predict(x_reduced)
    
    fires_rmse.append(mean_squared_error(y, y_pred, squared=False))


    # Train
    predictor.partial_fit(x_reduced, y)

# Restart the FileStream
fires_rmse = pd.Series(fires_rmse).rolling(window=n_window).mean().iloc[n_window-1:].values
end_time_all = timer()
fires_run_time = timer() - start_time_all
print("The whole fires run took {}".format(fires_run_time))


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expec

In [22]:
ftr_weights

array([2.13630148e-01, 8.79536918e-01, 6.58064451e-01, 4.63800787e-01,
       8.40052268e-01, 3.13605186e-01, 5.56519077e-01, 3.92571808e-01,
       4.78888198e-01, 9.17384737e-01, 3.44046148e-01, 1.00000000e+00,
       1.91860994e-01, 4.74117531e-01, 5.56254420e-01, 4.19584410e-01,
       4.03309842e-01, 7.44665151e-01, 5.80021893e-01, 6.23429675e-01,
       4.46622376e-01, 4.90413307e-01, 4.79074001e-01, 5.00052374e-01,
       7.34887928e-01, 1.01105767e-01, 6.11671735e-01, 4.03725616e-01,
       9.35798852e-02, 6.79304699e-01, 5.16162970e-01, 6.87607563e-01,
       5.83070168e-01, 5.95437957e-01, 5.68352087e-01, 4.15345295e-01,
       2.57013878e-01, 7.60150441e-01, 4.48455125e-01, 6.14226913e-01,
       2.30418570e-01, 4.58630968e-01, 5.40856627e-01, 0.00000000e+00,
       6.07162831e-01, 4.09956358e-01, 2.13179868e-01, 5.88292227e-01,
       4.20190930e-01, 2.45345293e-01, 4.76074025e-01, 2.72224512e-01,
       4.63458963e-01, 1.82396822e-04, 4.32763957e-01, 7.66466555e-01,
      

In [23]:
# use SGDRegressor as predictor
stream.restart()
predictor = SGDRegressor()
scaler = MinMaxScaler()
scaler_y = MinMaxScaler()
X, y = stream.next_sample(batch_size=100)
scaler.partial_fit(X)
scaler_y.partial_fit(y.reshape(-1,1))

X = scaler.transform(X)
y = scaler_y.transform(y.reshape(-1,1))
predictor.partial_fit(X,y)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



SGDRegressor()

In [24]:
sgdr_model = SGDRegressor(penalty="l1") #penalty could be elasticnet as well
#n_selectey_ftrs?

In [25]:
sgdr_rmse = []

sgdr_selected_ftrs = []
sgdr_stability = []

start_time_all = timer()
for i in range(2000):
    # Load a new sample
    x, y = stream.next_sample(batch_size=10)
    scaler.partial_fit(x)
    scaler_y.partial_fit(y.reshape(-1,1))
    x = scaler.transform(x)
    y = scaler_y.transform(y.reshape(-1,1))
    # Select features
    sgdr_model.partial_fit(x,y)
    ftr_weights = sgdr_model.coef_
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    sgdr_selected_ftrs.append(ftr_array)

    if len(sgdr_selected_ftrs) >= 10:
        stability = stability_factor(sgdr_selected_ftrs[-10:])
        sgdr_stability.append(stability)


    # Test
    y_pred = predictor.predict(x_reduced)
    
    sgdr_rmse.append(mean_squared_error(y, y_pred, squared=False))
    
    


    # Train
    predictor.partial_fit(x_reduced, y)

# Restart the FileStream
sgdr_rmse = pd.Series(sgdr_rmse).rolling(window=n_window).mean().iloc[n_window-1:].values
end_time_all = timer()
sgdr_run_time = timer() - start_time_all
print("The whole sgdr run took {}".format(sgdr_run_time))



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expec

In [26]:
# stability
title = "Stability on dataset {}".format(dataset_name)
col_names = ["FIRES", "SGDR"]
d = {"FIRES":fires_stability, "SGDR":sgdr_stability}
df = pd.DataFrame(d, columns=col_names)
fig = px.line(df, y = col_names, title=title, labels={"index":"batches", "value":"stability"}, color_discrete_map={'FIRES': 'red', "SGDR": "green"})
stability_trace = fig["data"]
fig.show()

In [27]:
title = "RMSE on dataset {}".format(dataset_name)
col_names = ["FIRES", "SGDR"]
d = {"FIRES":fires_rmse, "SGDR":sgdr_rmse}
df = pd.DataFrame(d, columns=col_names)
fig = px.line(df, y = col_names, title=title, labels={"index":"batches", "value":"rmse"}, color_discrete_map={'FIRES': 'red', "SGDR": "green"})
rmse_trace = fig["data"]
fig.show()

In [28]:
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.1) # subplot_titles=("Stability","Accuracy", "F1-Score"),
for i in range(len(stability_trace)):
    stability_trace[i]["showlegend"] = False
    if stability_trace[i]["name"] == "FIRES":
        trace = stability_trace[i]
    else:
        fig.add_trace(stability_trace[i], row=1, col=1)
fig.add_trace(trace, row=1, col=1)
for i in range(len(rmse_trace)):
    fig.add_trace(rmse_trace[i], row=2, col=1)
fig.update_xaxes(title_text="batches", row=2, col=1)
fig.update_yaxes(title_text="Stability", row=1, col=1)
fig.update_yaxes(title_text="RMSE", row=2, col=1)

fig.write_image("{}/{}.{}".format(folder, dataset_name, export_type))
fig.show()