In [None]:
import numpy as np
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import accuracy_score, confusion_matrix
import os
import sys
import json
from timeit import default_timer as timer
from copy import deepcopy
from ucimlrepo import fetch_ucirepo 

In [None]:
# Working directory of the kernel (os.getcwd(); not necessarily the folder that contains this notebook)
current_dir = os.getcwd()
# Absolute path of the parent of the working directory
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
# Put the parent directory first on sys.path so it is searched before any other location
sys.path.insert(0, parent_dir)

# Import the FLEXC models
# (assumes the 'flexc' package/module is located in the parent directory — TODO confirm)
from flexc import FLEXC_Labels, FLEXC_Centroids

In [None]:
# Path of the JSON file with the experiment seeds, looked up in the working directory.
# The experiment cell loads it when present and otherwise generates and saves new seeds.
seeds_file = os.path.join(current_dir, "seeds.json")

In [None]:
def extract_balanced(N, x, y):
    """
    Split (x, y) into a class-balanced subset and the remaining "tail".

    Each class contributes the same number of samples to the balanced part:
    min(N, size of the rarest class). The samples of every class are shuffled
    with NumPy's global RNG (np.random), so seed it beforehand to get a
    reproducible split.

    Args:
        N: Upper bound on the number of samples taken per class
            (np.inf means "as many as the rarest class allows").
        x (np.ndarray): Samples, indexed along the first axis.
        y (np.ndarray): One label per sample.

    Returns:
        tuple: (x_balanced, x_tail, y_balanced, y_tail) as NumPy arrays.
        Classes with fewer than two samples are left out of both parts, and
        nothing is extracted at all when the per-class quota is zero.
    """
    labels, label_counts = np.unique(y, return_counts=True)

    # Same quota for every class, capped by the rarest class
    quota = min(N, np.min(label_counts))

    picked_x, picked_y = [], []
    rest_x, rest_y = [], []

    for label in labels:
        members = np.nonzero(y == label)[0]

        # Classes that are too small (or an empty quota) contribute nothing
        if quota == 0 or len(members) < 2:
            continue

        shuffled = np.random.permutation(members)
        head, remainder = shuffled[:quota], shuffled[quota:]

        picked_x.extend(x[head])
        picked_y.extend(y[head])
        rest_x.extend(x[remainder])
        rest_y.extend(y[remainder])

    return np.array(picked_x), np.array(rest_x), np.array(picked_y), np.array(rest_y)

In [None]:
# Dataset selector: 0 -> "glass", any other value -> "sat_image"
dataset_id = 1
# Number of worker processes used to run the seeds in parallel
max_workers = 1
# Number of random seeds, i.e. independent repetitions of the experiment
n_seeds = 25

In [None]:
# Human-readable dataset name, used to build the results folder name
dataset_name = "glass" if dataset_id == 0 else "sat_image"
results_dir = os.path.join(os.getcwd(), f"results_{dataset_name}")
# exist_ok=False: raises FileExistsError when the results folder already exists
# NOTE(review): presumably deliberate, so a previous run is not overwritten — confirm
os.makedirs(results_dir, exist_ok=False)
# JSON file that receives the collected results at the end of the notebook
results_file = os.path.join(results_dir, "results.json")

In [7]:
# fetch dataset
# ids passed to fetch_ucirepo for the two supported datasets
glass_id = 42
sat_image_id=146
dataset = fetch_ucirepo(id=glass_id if dataset_id == 0 else sat_image_id)

# data (as pandas dataframes)
X = dataset.data.features
y = dataset.data.targets

# metadata
print(dataset.metadata)

# variable information
print(dataset.variables)


# Flatten the targets to a 1-D integer array
y = y.values.astype(int).reshape(-1)
# Re-encode the original label values as consecutive integers 0..n_classes-1
# (np.unique returns the labels sorted, so the order of the labels is kept)
y_mapping = {k: i for i, k in enumerate(np.unique(y))}
y = np.array([y_mapping[i] for i in y])
# Features as a float32 NumPy array
X = X.values.astype(np.float32)


{'uci_id': 146, 'name': 'Statlog (Landsat Satellite)', 'repository_url': 'https://archive.ics.uci.edu/dataset/146/statlog+landsat+satellite', 'data_url': 'https://archive.ics.uci.edu/static/public/146/data.csv', 'abstract': 'Multi-spectral values of pixels in 3x3 neighbourhoods in a satellite image, and the classification associated with the central pixel in each neighbourhood', 'area': 'Climate and Environment', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 6435, 'num_features': 36, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Wed Feb 14 2024', 'dataset_doi': '10.24432/C55887', 'creators': ['Ashwin Srinivasan'], 'intro_paper': None, 'additional_info': {'summary': "The database consists of the multi-spectral values of pixels in 3x3 neighbourhoods in a satellite image, and the classification as

In [8]:
# Sanity check: array shapes and number of samples per (re-encoded) class
print(f"dataset shape: {X.shape}")
print(f"labels shape: {y.shape}")

print(f"labels: {np.unique(y, return_counts=True)}")

dataset shape: (6435, 36)
labels shape: (6435,)
labels: (array([0, 1, 2, 3, 4, 5]), array([1533,  703, 1358,  626,  707, 1508]))


In [9]:
# Original classes that are merged into the single "normal" class 0
class_0 = [0,1] if dataset_id == 0 else [0,2,5]
print(f"Merged classes into class 0: {class_0}")
# The remaining classes are kept separate and renumbered 1, 2, ... in sorted order
other_classes = np.setdiff1d(np.unique(y), class_0)
classes_mapping = {K: 0 for K in class_0} | {k: i+1 for i, k in enumerate(other_classes)}
# NOTE: y is rebound in place, so this cell is not idempotent: running it a second
# time without re-running the loading cell applies the mapping to already merged labels.
y = np.array([classes_mapping[i] for i in y])
print(f"labels: {np.unique(y, return_counts=True)}")

Merged classes into class 0: [0, 2, 5]
labels: (array([0, 1, 2, 3]), array([4399,  703,  626,  707]))


In [10]:
def dataset_with_contamination(X, y, contamination=0.1):
    """
    Build a dataset of normal samples (label 0) plus a controlled amount of anomalies.

    `contamination` is the ratio of anomalies to normal samples: the target
    number of anomalies is int(n_normal * contamination). When that many
    anomalies are available, all normal samples are kept and the anomalies
    are subsampled. Otherwise all anomalies are kept and the normal samples
    are subsampled to int(n_anomalies / contamination).

    Subsampling uses NumPy's global RNG (np.random.choice, without replacement).

    Args:
        X (np.ndarray): Samples, indexed along the first axis.
        y (np.ndarray): Labels; 0 marks a normal sample, anything else an anomaly.
        contamination (float): Desired anomalies-per-normal ratio.

    Returns:
        tuple: (x, y) with the normal samples first, followed by the anomalies.
    """
    is_normal = y == 0
    normal_x, normal_y = X[is_normal], y[is_normal]
    anomalous_x, anomalous_y = X[~is_normal], y[~is_normal]

    wanted_anomalies = int(len(normal_x) * contamination)

    if wanted_anomalies <= anomalous_x.shape[0]:
        # Enough anomalies: keep every normal sample, draw the anomalies
        keep = np.random.choice(len(anomalous_x), wanted_anomalies, replace=False)
        features = np.concatenate((normal_x, anomalous_x[keep]))
        labels = np.concatenate((normal_y, anomalous_y[keep]))
        return features, labels

    # Too few anomalies: keep them all and shrink the normal set to match the ratio
    available_anomalies = anomalous_x.shape[0]
    wanted_normals = int(available_anomalies / contamination)
    keep = np.random.choice(len(normal_x), wanted_normals, replace=False)
    features = np.concatenate((normal_x[keep], anomalous_x))
    labels = np.concatenate((normal_y[keep], anomalous_y))
    return features, labels

In [11]:
# Preview of the split that experiment() rebuilds for every seed.
# NOTE: extract_balanced and dataset_with_contamination draw from NumPy's global RNG,
# which is not seeded in this cell, so the exact samples differ between runs.

# Class-balanced pool (as many samples per class as the rarest class has) and the leftover tail
x_balanced, x_tail, y_balanced, y_tail= extract_balanced(np.inf, X , y)

# Unlabelled training set: built from the tail with 0.1 anomalies per normal sample
x_train_unsup, y_train_unsup = dataset_with_contamination(
    x_tail, y_tail, contamination=0.1
)

# Labelled pool: 70% supervised training set, 30% test set, stratified by class
x_train_sup, x_test, y_train_sup, y_test = train_test_split(
    x_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced
)

print(f"Supervised Train set size: {x_train_sup.shape[0]}")
print(f"Unupervised train set size: {x_train_unsup.shape[0]}")
print(f"Test set size: {x_test.shape[0]}")

Supervised Train set size: 1752
Unupervised train set size: 1738
Test set size: 752


In [12]:
# Per-class sample counts of the three sets built in the previous cell
print(f"Labels in supervised train set: {np.unique(y_train_sup, return_counts=True)}")
print(f"Labels in unsupervised train set: {np.unique(y_train_unsup, return_counts=True)}")
print(f"Labels in test set: {np.unique(y_test, return_counts=True)}")

Labels in supervised train set: (array([0, 1, 2, 3]), array([438, 438, 438, 438]))
Labels in unsupervised train set: (array([0, 1, 3]), array([1580,   77,   81]))
Labels in test set: (array([0, 1, 2, 3]), array([188, 188, 188, 188]))


# Experiment functions

In [None]:
# Number of estimators used by every model created in experiment()
n_estimators = 100
# Passed to FLEXC_Centroids as score_threshold (inject_knowledge) and alarm_threshold (predict_labels)
flexc_centroids_threshold = 0.1
# NOTE(review): not referenced by any code visible in this notebook — confirm whether it is still needed
flexc_labels_threshold = 0.0

In [76]:
def incremental_dataset_sizes(n_total, x_min, k):
    """
    Compute dataset increments s_i such that log(cumulative size) grows linearly.

    The ideal cumulative sizes are exp(alpha * i) for i = 0..k-1 with
    alpha = log(n_total) / (k - 1), i.e. they go from 1 to n_total in equal
    steps on a log scale. The increments are the rounded differences of that
    series, so their sum is only approximately n_total (rounding, the
    `x_min` floor on the first step and the floor of 1 on every step all
    shift it).

    Args:
        n_total (int | float): Total number of samples in the dataset (must be > 0).
        x_min (int): Minimum number of samples in the first dataset.
        k (int): Number of steps (must be >= 1).

    Returns:
        list of int: Dataset sizes to add at each step (length k).

    Raises:
        ValueError: If `k` is smaller than 1 or `n_total` is not positive.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    if n_total <= 0:
        raise ValueError(f"n_total must be positive, got {n_total}")

    if k == 1:
        # alpha would be log(n_total) / 0. The series always starts at exp(0) = 1,
        # so a single step is just the first term with the usual floors applied.
        return [max(1, x_min)]

    alpha = np.log(n_total) / (k - 1)
    cumulative_sizes = np.exp(alpha * np.arange(k))
    # prepend=0 makes the first increment equal to the first cumulative size
    increments = np.round(np.diff(cumulative_sizes, prepend=0)).astype(int).tolist()
    increments[0] = max(x_min, increments[0])  # Ensure first dataset has enough points
    return [max(1, s) for s in increments]  # Ensure all increments are at least 1

In [None]:
def set_seeds(seed):
    """
    Set random seeds for reproducibility.

    Seeds Python's `random` module and NumPy's global RNG. Seeding stays
    best-effort (a seed rejected by one generator does not abort the run),
    but a rejected seed is reported with a RuntimeWarning instead of being
    swallowed silently, because the run is then not reproducible.

    Args:
        seed: Seed value; None leaves both generators untouched.
    """
    if seed is None:
        return

    import random

    seeders = (
        ("random", random.seed),
        ("numpy.random", np.random.seed),
    )
    for generator_name, seed_generator in seeders:
        try:
            seed_generator(seed)
        except (TypeError, ValueError) as exc:
            # e.g. np.random.seed rejects negative or too large integers
            warnings.warn(
                f"Could not seed {generator_name} with {seed!r}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )

In [None]:
def experiment(X, y, seed, n_blocks):
    """
    Run one seeded comparison of the FLEXC models against the baselines.

    The data is split into a class-balanced labelled pool and a tail. The tail
    becomes the unlabelled training set (0.1 anomalies per normal sample); the
    pool is split 70/30 into a supervised training set and a test set. The
    supervised training set is then revealed in `n_blocks` growing,
    class-balanced blocks, the last one being the whole set. For every block
    all models are created from scratch, trained, timed and scored on the
    same test set.

    Args:
        X (np.ndarray): Feature matrix.
        y (np.ndarray): Integer labels; 0 is the normal class, other values are anomaly classes.
        seed (int | None): Seed for the global RNGs, the train/test split and every model.
        n_blocks (int): Number of supervised training-set sizes to evaluate.

    Returns:
        dict: method name -> metric name -> {"block:<points per class>": [values]}.
        Methods: "flexc_labels", "flexc_centroids", "flexc_centroids_v2", "rf",
        "rf_self", "rf_mislead", "iso + rf". Metrics: "accuracy",
        "confusion_matrix" (nested lists), "training_time" and "inference_time"
        (seconds, measured with timeit.default_timer).
    """
    set_seeds(seed)

    x_balanced, x_tail, y_balanced, y_tail = extract_balanced(np.inf, X, y)

    x_train_unsup, y_train_unsup = dataset_with_contamination(
        x_tail, y_tail, contamination=0.1
    )

    x_train_sup, x_test, y_train_sup, y_test = train_test_split(
        x_balanced, y_balanced, test_size=0.3, random_state=seed, stratify=y_balanced
    )
    # in this case these are points per class
    blocks_points_per_class = incremental_dataset_sizes(
        n_total=x_train_sup.shape[0]/len(np.unique(y_train_sup)),
        x_min=1,
        k=n_blocks
    )

    # metrics: one defaultdict(list) per (method, metric), keyed by block
    method_names = (
        "flexc_labels",
        "flexc_centroids",
        "flexc_centroids_v2",
        "rf",
        "rf_self",
        "rf_mislead",
        "iso + rf",
    )
    metric_names = ("accuracy", "confusion_matrix", "training_time", "inference_time")
    results = {
        method: {metric: defaultdict(list) for metric in metric_names}
        for method in method_names
    }

    def _record(method, block_key, y_pred, training_time, inference_time):
        """Store the timings and the test-set scores of one method for one block."""
        entry = results[method]
        entry["training_time"][block_key].append(training_time)
        entry["inference_time"][block_key].append(inference_time)
        entry["accuracy"][block_key].append(accuracy_score(y_test, y_pred))
        entry["confusion_matrix"][block_key].append(confusion_matrix(y_test, y_pred).tolist())

    last_block = len(blocks_points_per_class) - 1

    for i, _ in enumerate(blocks_points_per_class):

        # cumulative number of points per class revealed up to this block
        n_train_sup = np.sum(blocks_points_per_class[:i+1])
        if i == last_block:
            # last block is the whole dataset
            n_train_sup = np.inf
        x_train_sup_iter, x_discarded, y_train_sup_iter, y_discarded = extract_balanced(n_train_sup, x_train_sup, y_train_sup)

        # set correct n_train_sup
        if i == last_block:
            # last block is the whole dataset: report the actual points per class
            classes, counts = np.unique(y_train_sup_iter, return_counts=True)
            n_train_sup = np.min(counts)

        block_key = f"block:{n_train_sup}"

        # "clean" supervised data: labelled anomalies only (normal class 0 removed)
        anomaly_rows = y_train_sup_iter != 0
        x_train_sup_clean_iter = x_train_sup_iter[anomaly_rows]
        y_train_sup_clean_iter = y_train_sup_iter[anomaly_rows]

        # Initialize the models
        flexc_labels_model = FLEXC_Labels(
            n_estimators=n_estimators,
            contamination=0.1,
            random_state=seed,
            bootstrap=True
        )
        flexc_centroids_model = FLEXC_Centroids(
            n_estimators=n_estimators,
            contamination=0.1,
            random_state=seed,
            bootstrap=True
        )
        flexc_centroids_v2_model = deepcopy(flexc_centroids_model)
        rf_model = RandomForestClassifier(
            n_estimators=n_estimators,
            random_state=seed,
            bootstrap=True
        )
        rf_mislead_model = RandomForestClassifier(
            n_estimators=n_estimators,
            random_state=seed,
            bootstrap=True
        )
        # contamination is left at the IsolationForest default
        iso_f_model = IsolationForest(
            n_estimators=n_estimators,
            bootstrap=True,
            random_state=seed
        )
        rf_iso_model = RandomForestClassifier(
            n_estimators=n_estimators,
            random_state=seed,
            bootstrap=True
        )

        ###### FLEX_C Labels
        t = timer()
        flexc_labels_model.fit(x_train_unsup)
        flexc_labels_model.inject_knowledge(x_train_sup_clean_iter, y_train_sup_clean_iter)
        training_time = timer() - t

        t = timer()
        flexc_labels_pred, counts = flexc_labels_model.predict_labels(x_test)
        # NaN predictions are mapped to the normal class 0
        flexc_labels_pred = np.nan_to_num(flexc_labels_pred, nan=0)
        inference_time = timer() - t

        _record("flexc_labels", block_key, flexc_labels_pred, training_time, inference_time)

        ###### FLEX_C Centroids
        t = timer()
        flexc_centroids_model.fit(x_train_unsup)
        flexc_centroids_model.inject_knowledge(x_train_sup_clean_iter, y_train_sup_clean_iter, score_threshold=flexc_centroids_threshold)
        training_time = timer() - t

        t = timer()
        # alarm_norm False removes the vote from untrained classifiers
        # NOTE(review): the test labels are passed to predict_labels (y=y_test);
        # confirm they are not used to produce the predictions themselves.
        flexc_centroids_pred, alarm = flexc_centroids_model.predict_labels(x_test, alarm_threshold=flexc_centroids_threshold, y=y_test, alarm_norm=False)
        flexc_centroids_pred = np.argmax(flexc_centroids_pred, axis=1)
        inference_time = timer() - t

        _record("flexc_centroids", block_key, flexc_centroids_pred, training_time, inference_time)

        ###### FLEX_C Centroids V2
        t = timer()
        flexc_centroids_v2_model.fit(x_train_unsup)
        flexc_centroids_v2_model.inject_knowledge(x_train_sup_clean_iter, y_train_sup_clean_iter, score_threshold=flexc_centroids_threshold)
        training_time = timer() - t

        t = timer()
        # same as above but with the default alarm_norm
        flexc_centroids_v2_pred, alarm = flexc_centroids_v2_model.predict_labels(x_test, alarm_threshold=flexc_centroids_threshold, y=y_test)
        flexc_centroids_v2_pred = np.argmax(flexc_centroids_v2_pred, axis=1)
        inference_time = timer() - t

        _record("flexc_centroids_v2", block_key, flexc_centroids_v2_pred, training_time, inference_time)

        ###### Isolation Forest + RF
        t = timer()
        iso_f_model.fit(x_train_unsup)
        rf_iso_model.fit(x_train_sup_clean_iter, y_train_sup_clean_iter)
        training_time = timer() - t

        t = timer()
        iso_pred = iso_f_model.predict(x_test)
        outliers_mask = iso_pred == -1
        # inliers are predicted as the normal class, outliers are classified by the RF
        iso_pred[~outliers_mask] = 0
        if outliers_mask.any():
            # scikit-learn raises ValueError when asked to predict on zero samples,
            # so the RF is only queried when at least one outlier was flagged
            iso_pred[outliers_mask] = rf_iso_model.predict(x_test[outliers_mask])
        inference_time = timer() - t

        _record("iso + rf", block_key, iso_pred, training_time, inference_time)

        ###### RF
        t = timer()
        rf_model.fit(x_train_sup_iter, y_train_sup_iter)
        rf_training_time = timer() - t

        t = timer()
        rf_pred = rf_model.predict(x_test)
        rf_inference_time = timer() - t

        _record("rf", block_key, rf_pred, rf_training_time, rf_inference_time)

        # semi self supervised RF: the fitted RF pseudo-labels the unlabelled set,
        # then a copy is refitted on labelled + pseudo-labelled data.
        # Its timings include those of the plain RF it builds on.
        t = timer()
        rf_self = deepcopy(rf_model)
        rf_y_train_self = rf_model.predict(x_train_unsup)
        rf_self.fit(np.concatenate([x_train_sup_iter, x_train_unsup]), np.concatenate([y_train_sup_iter, rf_y_train_self]))
        training_time = timer() - t + rf_training_time

        t = timer()
        rf_self_pred = rf_self.predict(x_test)
        inference_time = timer() - t + rf_inference_time

        _record("rf_self", block_key, rf_self_pred, training_time, inference_time)

        # mislead RF: the whole unlabelled set is labelled as normal (class 0),
        # although it contains anomalies.
        #
        # Our method should be more resistant to this false information.
        # In real life it is difficult to guarantee that normal data contains no faults,
        # whereas faulty data is certain: the fault had to happen for the company
        # to categorize and collect it.
        t = timer()
        rf_mislead_model.fit(
            np.concatenate([x_train_sup_clean_iter, x_train_unsup]),
            np.concatenate([y_train_sup_clean_iter, np.zeros_like(y_train_unsup)])
        )
        training_time = timer() - t

        t = timer()
        rf_mislead_pred = rf_mislead_model.predict(x_test)
        inference_time = timer() - t

        _record("rf_mislead", block_key, rf_mislead_pred, training_time, inference_time)

    return results

# Experiment

In [None]:
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def run_single_experiment(seed, X, y, n_blocks):
    """
    Run `experiment` for a single seed and tag the outcome with that seed.

    Returns:
        tuple: (f"seed:{seed}", results returned by `experiment`).
    """
    print(f"Running experiment with seed {seed}")
    outcome = experiment(X, y, seed, n_blocks=n_blocks)
    return f"seed:{seed}", outcome

# Collected results, keyed by "seed:<seed>"
experiment_results = {}

# load or generate seeds
if os.path.exists(seeds_file):
    with open(seeds_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        random_seeds = data.get("seeds", [])
    # A seeds file written for a different n_seeds is rejected instead of being reused
    if len(random_seeds) != n_seeds:
        raise ValueError(f"Expected {n_seeds} seeds, but found {len(random_seeds)} in {seeds_file}. Please delete the file to generate new seeds.")
    print("Loaded existing seeds.")
else:
    # n_seeds distinct integers in [0, 10000), stored so later runs reuse the same seeds
    random_seeds = np.random.choice(np.arange(10000), size=n_seeds, replace=False).tolist()
    
    data_to_save = {"seeds": random_seeds}

    with open(seeds_file, 'w', encoding='utf-8') as f: 
        json.dump(data_to_save, f, indent=4)
    print("Generated and saved new seeds.")

# Number of supervised training-set sizes evaluated per seed
n_blocks = 5 if dataset_id == 0 else 10

# parallel execution of the seeds
with ProcessPoolExecutor(max_workers=max_workers) as executor:
    # Bind the dataset and block count so that map() only has to supply the seed
    run_exp = partial(run_single_experiment, X=X, y=y, n_blocks=n_blocks)
    for key, result in executor.map(run_exp, random_seeds):
        experiment_results[key] = result

Running experiment with seed 2439


In [81]:
# Write the results of all seeds to the results JSON file
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(experiment_results, f, indent=4)