## Test whether the algorithms find highly correlated features

### Create dataset with extra features

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [16]:
# covtype as test data, don't run this.
# The code is disabled by wrapping it in a string literal. It documents how
# 'covtype_ex_ftr.csv' is built from 'covtype.csv':
#   * column 0 is taken as the class label,
#   * a feature `labels * 2` (fully determined by the label) is appended,
#   * a feature drawn with np.random.rand is appended after it,
#   * all columns are min-max scaled, then column 0 is reset to the unscaled labels,
#   * the result is written without header and without index.
"""
df = pd.read_csv('/home/kitten/BA_FIRES/fires/datasets/Multiclass/covtype.csv')
labels = df.iloc[:,0]
target = labels * 2
random_ftr = np.random.rand(len(target))
df[len(df.columns)] = target
df[len(df.columns)] = random_ftr


#normalize data
df = pd.DataFrame(MinMaxScaler().fit_transform(df))
df[0] = labels

df.to_csv('/home/kitten/BA_FIRES/fires/datasets/Multiclass/covtype_ex_ftr.csv', header=None, index=None)
"""

In [8]:
# Reference only (kept commented out): relative class frequencies of `labels`.
# `labels` is only defined inside the disabled dataset-creation cell above.
# The last line is the recorded result of that computation.
#values, counts = np.unique(labels, return_counts=True)
#class_weights = counts / len(labels)
#[0.36460521, 0.48759922, 0.06153746, 0.00472796, 0.01633873, 0.02989095, 0.03530048]

### Test algorithms on this data

In [3]:
from skmultiflow.data import FileStream
from skmultiflow.neural_networks import PerceptronMask

In [4]:
import plotly.express as px

In [5]:
# import algorithms
from fires import FIRES
from ofs import OFS, MC_OFS
from ofssgr import OFSSGD, MC_OFSSGD
from fsds import StreamFeatWeight

In [6]:
# --- Data stream -------------------------------------------------------------
# Column 0 of the CSV is used as the target; the remaining columns are features.
stream = FileStream('datasets/Multiclass/covtype_ex_ftr.csv', target_idx=0)
stream.prepare_for_use()
dataset_name = "covtype"

# --- Experiment settings -----------------------------------------------------
batch_size = 50        # samples drawn from the stream per iteration
n_window = 50
n_selected_ftr = 25    # number of top-weighted features kept in every batch

# --- Probe features ----------------------------------------------------------
# Positions (within the feature matrix) of the two features whose weights are
# tracked: the label-derived feature and the random feature.
# NOTE(review): indices follow from the column order in the disabled
# dataset-creation cell - confirm if the CSV is regenerated differently.
cor_ftr_index = 54
rand_ftr_index = 55

# --- Class probabilities passed to FIRES ------------------------------------
weights = [
    0.36460521,
    0.48759922,
    0.06153746,
    0.00472796,
    0.01633873,
    0.02989095,
    0.03530047,
]

In [7]:
# Sanity check: the class probabilities handed to FIRES should sum to 1.
np.sum(weights)

1.0

In [8]:
# Start the FIRES run from a clean slate: an untrained classifier and a
# stream rewound to its first sample.
predictor = PerceptronMask()
stream.restart()

# Warm-up on the first batch. The complete label set is passed on this first
# partial_fit call so the classifier knows every class up front.
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, classes=stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [9]:
# FIRES feature weighter, sized to all features of the stream and given the
# stream's label set plus the class probabilities defined in the config cell.
# NOTE(review): 'softmax' and `class_probabilities` are options of the local
# `fires` module - their exact semantics are not visible from this notebook.
fires_model = FIRES(
    n_total_ftr=stream.n_features,
    target_values=stream.target_values,
    mu_init=0,
    sigma_init=1,
    model='softmax',
    class_probabilities=weights,
)


In [10]:
# Per-batch bookkeeping for the FIRES run.
fires_cuda_accuracy = []       # test-then-train accuracy of the predictor, one entry per batch
fires_f1 = []                  # declared but not filled in this cell
fires_cor_weights = []         # FIRES weight of the label-derived feature, one entry per batch
fires_rand_weights = []        # FIRES weight of the random feature, one entry per batch

fires_cuda_selected_ftrs = []  # binary selection mask (1 = selected), one entry per batch
fires_cuda_stability = []      # only used by the disabled stability check below

while stream.has_more_samples():
    # Load a new batch
    x, y = stream.next_sample(batch_size=batch_size)

    # Weigh all features on this batch and record the two probe features
    ftr_weights = fires_model.weigh_features(x, y)
    fires_cor_weights.append(ftr_weights[cor_ftr_index])
    fires_rand_weights.append(ftr_weights[rand_ftr_index])

    # Indices of the n_selected_ftr largest weights (descending order)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]

    # Truncate x: keep the selected features, replace all others with 0
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # Remember which features were selected in this batch
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fires_cuda_selected_ftrs.append(ftr_array)

    # Stability check - disabled: `stability_factor` is not defined or imported
    # in the cells shown in this notebook.
    # if len(fires_cuda_selected_ftrs) >= 10:
    #     stability = stability_factor(fires_cuda_selected_ftrs[-10:])
    #     fires_cuda_stability.append(stability)

    # Test on the reduced batch *before* training on it, and keep the score
    # (previously the prediction was computed and then thrown away).
    y_pred = predictor.predict(x_reduced)
    fires_cuda_accuracy.append(np.mean(np.ravel(y_pred) == np.ravel(y)))

    # Train
    predictor.partial_fit(x_reduced, y)

In [11]:
# One boolean per batch: was the probe feature part of the FIRES selection?
cor_ftr = [mask[cor_ftr_index] == 1 for mask in fires_cuda_selected_ftrs]
rand_ftr = [mask[rand_ftr_index] == 1 for mask in fires_cuda_selected_ftrs]

In [12]:
# Fraction of batches in which each probe feature was selected
# (first line: correlated feature, second line: random feature).
for selected_flags in (cor_ftr, rand_ftr):
    print(np.mean(selected_flags))

1.0
1.0


In [13]:
# FIRES: weight of the correlated vs. the random probe feature, one point per batch.
# The frame is built in one step so `df` never holds a plain dict.
df = pd.DataFrame({"cor": fires_cor_weights, "rand": fires_rand_weights})

# Title and axis labels make this figure distinguishable from the other
# algorithms' plots, which share the same column names.
fig = px.line(
    df,
    title="FIRES: weight of correlated vs. random feature",
    labels={"index": "batch", "value": "feature weight", "variable": "feature"},
)
fig.show()

In [100]:
# Start the FSDS run from a clean slate: an untrained classifier and a
# stream rewound to its first sample.
predictor = PerceptronMask()
stream.restart()

# Warm-up on the first batch. The complete label set is passed on this first
# partial_fit call. `x` from this batch is reused by the next cell to
# pre-train the FSDS model.
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, classes=stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [101]:
# FSDS feature weighter. It is fed the transposed batch (features x samples).
# The first call uses the warm-up batch `x` left over from the previous cell
# ("needs some pretraining in the first run").
# NOTE(review): the recorded run of this cell stopped with
# "LinAlgError: SVD did not converge"; the cause is not visible from here.
fsds_model = StreamFeatWeight(m=stream.n_features, k=stream.n_classes)
fsds_model.low_rank_approximation(x.T)

# Per-batch bookkeeping: selection masks and the weights of the two probe features.
fsds_selected_ftrs, fsds_cor_weights, fsds_rand_weights = [], [], []

while stream.has_more_samples():
    # Load a new batch
    x, y = stream.next_sample(batch_size=batch_size)

    # Weigh all features on this batch and record the two probe features
    ftr_weights = fsds_model.low_rank_approximation(x.T)
    fsds_cor_weights.append(ftr_weights[cor_ftr_index])
    fsds_rand_weights.append(ftr_weights[rand_ftr_index])

    # Indices of the n_selected_ftr largest weights (descending order)
    ftr_selection = np.argsort(ftr_weights)[::-1][:n_selected_ftr]

    # Remember which features were selected in this batch (1 = selected)
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    fsds_selected_ftrs.append(ftr_array)

    # Truncate x: keep the selected features, replace all others with 0
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # Test on the reduced batch, then train on it
    y_pred = predictor.predict(x_reduced)
    predictor.partial_fit(x_reduced, y)


LinAlgError: SVD did not converge

In [None]:
# FSDS: weight of the correlated vs. the random probe feature, one point per batch.
# The frame is built in one step so `df` never holds a plain dict.
df = pd.DataFrame({"cor": fsds_cor_weights, "rand": fsds_rand_weights})

# Title and axis labels make this figure distinguishable from the other
# algorithms' plots, which share the same column names.
fig = px.line(
    df,
    title="FSDS: weight of correlated vs. random feature",
    labels={"index": "batch", "value": "feature weight", "variable": "feature"},
)
fig.show()

In [None]:
# Debugging aid: display whatever feature-weight vector `ftr_weights` currently holds.
# NOTE(review): if the FSDS loop aborted, this is the value from the last
# successful assignment, not necessarily from the failing batch.
ftr_weights

In [14]:
# Start the OFS run from a clean slate: an untrained classifier and a
# stream rewound to its first sample.
predictor = PerceptronMask()
stream.restart()

# Warm-up on the first batch. The complete label set is passed on this first
# partial_fit call so the classifier knows every class up front.
x, y = stream.next_sample(batch_size=batch_size)
predictor.partial_fit(x, y, classes=stream.target_values)

PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
               fit_intercept=True, max_iter=1000, n_iter_no_change=5,
               n_jobs=None, penalty=None, random_state=0, shuffle=True,
               tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [15]:
# Multi-class OFS feature selector.
# n_total_ftr uses stream.n_features, the same count the other algorithms and
# the selection mask below use. (The original passed stream.n_num_features,
# which only matches when the stream has no categorical features.)
ofs = MC_OFS(regularization_param=0.01,
             step_size=0.1,
             n_selected_ftr=n_selected_ftr,
             n_total_ftr=stream.n_features,
             n_classes=stream.n_classes)

# Per-batch bookkeeping: selection masks and the weights of the two probe features.
ofs_selected_ftrs = []
ofs_cor_weights = []
ofs_rand_weights = []


while stream.has_more_samples():
    # Load a new batch
    x, y = stream.next_sample(batch_size=batch_size)

    # OFS is trained one sample at a time
    for idx, label in enumerate(y):
        ofs.train(x[idx], label)

    # Current selection and weights after this batch; record the two probe features
    ftr_selection, ftr_weights = ofs.get_feature_indices(return_weights=True)
    ofs_cor_weights.append(ftr_weights[cor_ftr_index])
    ofs_rand_weights.append(ftr_weights[rand_ftr_index])

    # Truncate x: keep the selected features, replace all others with 0
    x_reduced = np.zeros(x.shape)
    x_reduced[:, ftr_selection] = x[:, ftr_selection]

    # Remember which features were selected in this batch (1 = selected)
    ftr_array = np.zeros(stream.n_features)
    ftr_array[ftr_selection] = 1
    ofs_selected_ftrs.append(ftr_array)

    # Test
    y_pred = predictor.predict(x_reduced)

    # Train
    predictor.partial_fit(x_reduced, y)

In [16]:
# OFS: weight of the correlated vs. the random probe feature, one point per batch.
# The frame is built in one step so `df` never holds a plain dict.
df = pd.DataFrame({"cor": ofs_cor_weights, "rand": ofs_rand_weights})

# Title and axis labels make this figure distinguishable from the other
# algorithms' plots, which share the same column names.
fig = px.line(
    df,
    title="OFS: weight of correlated vs. random feature",
    labels={"index": "batch", "value": "feature weight", "variable": "feature"},
)
fig.show()