## Testing MinMaxScaler() partial_fit for online normalization

In [1]:
# import the functions needed for validate and comparsion

import numpy as np
import cupy as cp
import pandas as pd
from timeit import default_timer as timer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skmultiflow.data import FileStream
from skmultiflow.neural_networks import PerceptronMask
from skmultiflow.data.random_rbf_generator_drift import RandomRBFGeneratorDrift
from sklearn.preprocessing import MinMaxScaler

# using plotly for plots
#import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


In [2]:
# stability measurment as proposed in "Measurment the Stability of Feature Selection"

def pearson_stability_ij(arr1,arr2):
    d = len(arr1)
    k_i = np.sum(arr1)
    k_j = np.sum(arr2)
    x_hat_i = k_i / d
    x_hat_j = k_j / d
    arr1 = arr1 - x_hat_i
    arr2 = arr2 - x_hat_j
    dividend = 1/d * np.sum(arr1*arr2)
    divisor = np.sqrt(1/d*np.sum(arr1**2))*np.sqrt(1/d*np.sum(arr2**2))
    return dividend/divisor

def stability_factor(selected_ftrs):
   M = len(selected_ftrs)
   sum_stabilities = 0
   for i in range(M):
       for j in range(i+1, M):
           sum_stabilities += pearson_stability_ij(selected_ftrs[i], selected_ftrs[j])
   return 1/(M*(M-1))*sum_stabilities * 2   

In [3]:
# import algorithms
from fires import FIRES
from ofs import OFS, MC_OFS
from ofssgr import OFSSGD, MC_OFSSGD
from fsds import StreamFeatWeight

In [4]:
# Human Activity Recognition
# labels changed from [1,...,6] to [0,...,5]
# rows shuffled
# split into train set with 7352 instances and test set with 2948
stream = FileStream('datasets/Multiclass/har_train.csv', target_idx = 561)
stream.prepare_for_use()
dataset_name = "har"
n_selected_ftr = 100
n_window = 10
batch_size = 20
weights = None

check_ftrs = False


# load test data
test_data = pd.read_csv('datasets/Multiclass/har_test.csv')
test_y = test_data["Class"].to_numpy()
test_x = test_data.drop(columns="Class").to_numpy()

In [4]:
def find_true_ftrs_indices(label_names, start_char):
    indices = []
    for i in range(len(label_names)):
        if label_names[i].startswith(start_char):
            indices.append(i)

    return indices

In [15]:
# synthetic dataset 1 (see data_generation.ipynb)
stream = FileStream('datasets/Multiclass/dataset_4_test.csv', target_idx=210)
stream.prepare_for_use()
dataset_name = "syn_ds_1"
n_selected_ftr = 100 # 15 are really informative
n_window = 10
batch_size = 100
weights = None

# load test data
test_data = pd.read_csv("datasets/Multiclass/dataset_4_test.csv")
test_y = test_data["label"].to_numpy()
test_x = test_data.drop(columns="label").to_numpy()


In [16]:
stream.restart()
predictor = PerceptronMask()
scaler = MinMaxScaler()
x,y = stream.next_sample(batch_size=batch_size)
# online normalization
scaler.partial_fit(x)
x = scaler.transform(x)

predictor.partial_fit(x,y, stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [17]:
fires_model = FIRES(n_total_ftr=stream.n_features,
                    target_values=stream.target_values,
                    mu_init=0,
                    sigma_init=1,
                    model='softmax',
                    class_probabilities=weights)

       

In [18]:
fires_cuda_accuracy = []
#fsds_f1 = []
#fires_cuda_times = []

fires_cuda_selected_ftrs = []
fires_cuda_stability = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=batch_size)
    scaler.partial_fit(x)
    x = scaler.transform(x)

    # Select features
    #start_time = timer()
    ftr_weights = fires_model.weigh_features(x,y)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
    #fires_cuda_times.append(timer()-start_time)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fires_cuda_selected_ftrs.append(ftr_array)

    if len(fires_cuda_selected_ftrs) >= 10:
        stability = stability_factor(fires_cuda_selected_ftrs[-10:])
        fires_cuda_stability.append(stability)


    # Test
    y_pred = predictor.predict(x_reduced)
    
    fires_cuda_accuracy.append(accuracy_score(y, y_pred))
    #fsds_f1.append(f1_score(y, y_pred, average=None, labels=stream.target_values))


    # Train
    predictor.partial_fit(x_reduced, y)

# Restart the FileStream
end_time_all = timer()
fires_cuda_run_time = timer() - start_time_all
print("The whole FIRES run took {}".format(fires_cuda_run_time))

fires_moving_average = pd.Series(fires_cuda_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values


The whole FIRES run took 2.137562492000143


In [19]:
test_x_selected = np.zeros(test_x.shape)
test_x_selected[:,ftr_selection] = test_x[:,ftr_selection]
y_pred = predictor.predict(test_x)
accuracy_fires = accuracy_score(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy_fires))

For the test dataset the previous trained predictor reached: 0.3046


### without normalization


In [20]:
stream.restart()
predictor = PerceptronMask()
x,y = stream.next_sample(batch_size=batch_size)

predictor.partial_fit(x,y, stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [21]:
fires_model = FIRES(n_total_ftr=stream.n_features,
                         target_values=stream.target_values,
                         mu_init=0,
                         sigma_init=1,
                         model='softmax',
                         class_probabilities=weights)  

In [22]:
fires_cuda_accuracy = []
#fsds_f1 = []
#fires_cuda_times = []

fires_cuda_selected_ftrs = []
fires_stability = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=batch_size)
    
    # Select features
    #start_time = timer()
    ftr_weights = fires_model.weigh_features(x,y)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
    #fires_cuda_times.append(timer()-start_time)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fires_cuda_selected_ftrs.append(ftr_array)

    if len(fires_cuda_selected_ftrs) >= 10:
        stability = stability_factor(fires_cuda_selected_ftrs[-10:])
        fires_stability.append(stability)


    # Test
    y_pred = predictor.predict(x_reduced)
    
    fires_cuda_accuracy.append(accuracy_score(y, y_pred))
    #fsds_f1.append(f1_score(y, y_pred, average=None, labels=stream.target_values))


    # Train
    predictor.partial_fit(x_reduced, y)

# Restart the FileStream
end_time_all = timer()
fires_cuda_run_time = timer() - start_time_all
print("The whole FIRES run took {}".format(fires_cuda_run_time))

fires_moving_average_org = pd.Series(fires_cuda_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values

The whole FIRES run took 2.158005478000632


In [23]:
test_x_selected = np.zeros(test_x.shape)
test_x_selected[:,ftr_selection] = test_x[:,ftr_selection]
y_pred = predictor.predict(test_x)
accuracy_fires = accuracy_score(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy_fires))

For the test dataset the previous trained predictor reached: 0.1584


### data completely normalized


In [13]:
# Human Activity Recognition
# labels changed from [1,...,6] to [0,...,5]
# rows shuffled
# split into train set with 7352 instances and test set with 2948
stream = FileStream('datasets/Multiclass/har_train_norm.csv', target_idx = 561)
stream.prepare_for_use()
dataset_name = "har"
n_selected_ftr = 100
n_window = 10
batch_size = 20
weights = None

check_ftrs = False


# load test data
test_data = pd.read_csv('datasets/Multiclass/har_test_norm.csv')
test_y = test_data["Class"].to_numpy()
test_x = test_data.drop(columns="Class").to_numpy()

In [14]:
stream.restart()
predictor = PerceptronMask()
x,y = stream.next_sample(batch_size=batch_size)

predictor.partial_fit(x,y, stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [15]:
fires_model = FIRES(n_total_ftr=stream.n_features,
                         target_values=stream.target_values,
                         mu_init=0,
                         sigma_init=1,
                         model='softmax',
                         class_probabilities=weights)  

In [16]:
fires_cuda_accuracy = []
#fsds_f1 = []
#fires_cuda_times = []

fires_cuda_selected_ftrs = []
fires_pre_stability = []

start_time_all = timer()
while stream.has_more_samples():
    # Load a new sample
    x, y = stream.next_sample(batch_size=batch_size)
    
    # Select features
    #start_time = timer()
    ftr_weights = fires_model.weigh_features(x,y)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]
    #fires_cuda_times.append(timer()-start_time)

    # Truncate x (retain only selected features, 'remove' all others, e.g. by replacing them with 0)
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # stability test
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fires_cuda_selected_ftrs.append(ftr_array)

    if len(fires_cuda_selected_ftrs) >= 10:
        stability = stability_factor(fires_cuda_selected_ftrs[-10:])
        fires_pre_stability.append(stability)


    # Test
    y_pred = predictor.predict(x_reduced)
    
    fires_cuda_accuracy.append(accuracy_score(y, y_pred))
    #fsds_f1.append(f1_score(y, y_pred, average=None, labels=stream.target_values))


    # Train
    predictor.partial_fit(x_reduced, y)

# Restart the FileStream
end_time_all = timer()
fires_cuda_run_time = timer() - start_time_all
print("The whole FIRES run took {}".format(fires_cuda_run_time))

fires_moving_average_pre = pd.Series(fires_cuda_accuracy).rolling(window=n_window).mean().iloc[n_window-1:].values

The whole FIRES run took 7.336717555999712


In [17]:
test_x_selected = np.zeros(test_x.shape)
test_x_selected[:,ftr_selection] = test_x[:,ftr_selection]
y_pred = predictor.predict(test_x)
accuracy_fires = accuracy_score(test_y, y_pred)
print("For the test dataset the previous trained predictor reached: {}".format(accuracy_fires))

For the test dataset the previous trained predictor reached: 0.8679565512559403


### Plots

In [24]:
col = ["normalized", "original"]#, "previous_normalized"]
data = {"normalized":fires_moving_average, "original": fires_moving_average_org}#, "previous_normalized":fires_moving_average_pre}
df = pd.DataFrame(data, columns=col)
fig = px.line(df, title="Accuracy", labels={"index":"batches", "value":"accuracy"})
fig.show()

In [25]:
col = ["normalized", "original", "previous_normalized"]
data = {"normalized":fires_cuda_stability, "original": fires_stability, "previous_normalized":fires_pre_stability}
df = pd.DataFrame(data, columns=col)
fig = px.line(df, title="Stability", labels={"index":"batches", "value":"stability"})
fig.show()

NameError: name 'fires_pre_stability' is not defined