In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine, load_digits, load_breast_cancer
from mislabelling import symmetric_noise, pair_noise, NNAR
from testing import *

In [2]:
# Experiment settings. Each of these is forwarded as a keyword argument to
# run_noise_level_experiment by the experiment loop.
RESOLUTION = 10
TRIALS = 35
N_ESTIMATORS = 10
TEST_SIZE = 0.25
ITERATIONS = 20

# Noise generators, selected by position in the experiment loop.
# pair_noise is listed three times because the loop passes different extra
# arguments depending on the index.
noises = (
    symmetric_noise,  # index 0
    pair_noise,       # index 1: no extra arguments
    pair_noise,       # index 2: run with noise_ratio=get_ratio(data)
    pair_noise,       # index 3: run with unique_pairs=True
    NNAR,             # index 4: run with clf set to `rf` fitted on the dataset
)
datasets = (load_wine(), load_digits(), load_breast_cancer(), load_gmm5())
rf = RandomForestClassifier(random_state=42)

# Accumulators: the experiment loop appends one entry to each list per
# (dataset, noise) experiment.
accuracies_mean, accuracies_se = [], []
relabelling_f1_success, relabelling_f1_se = [], []
relabelling_acc_success, relabelling_acc_se = [], []

In [3]:
def get_ratio(data):
    """Compute per-class noise ratios, scaled so the smallest ratio equals 1.

    The raw ratio of a class is ``1 - count / total``, i.e. the share of
    samples that do NOT belong to it; every ratio is then divided by the
    smallest one. The class counts and the ratios (rounded to three
    significant digits) are printed as a side effect.

    Args:
        data: Object exposing a ``target`` array of class labels.

    Returns:
        NumPy array with one ratio per distinct label, in the order
        ``np.unique`` reports the labels.
    """
    labels, label_counts = np.unique(data.target, return_counts=True)
    print({int(label): int(n) for label, n in zip(labels, label_counts)})

    # Complement of each class's share of all samples.
    noise_ratio = 1 - label_counts / sum(label_counts)
    # Rescale so the smallest ratio becomes exactly 1.
    noise_ratio = noise_ratio / min(noise_ratio)
    print([float(f"{ratio:.3g}") for ratio in noise_ratio])
    return noise_ratio

In [6]:
# Run the noise-level experiments for every dataset and collect the summaries.
#
# NOTE: the accumulator lists appended to at the bottom of the loop are created
# in the configuration cell. Re-running this cell without re-running that one
# appends a second copy of every result.
for dataset_number, data in enumerate(datasets, start=1):
    # enumerate() gives the position directly; datasets.index(data) would
    # locate it by comparing dataset objects with ==.
    print(f"Starting Experiments for Dataset {dataset_number}")
    # Only noise index 4 (NNAR) is run here; widen the range to cover the
    # other entries of `noises`.
    for i in range(4, 5):
        print(f"Starting Experiments for Noise {i + 1}")

        # Optional extras, only set for the noise variants that need them.
        unique_pairs = None
        noise_ratio = None
        clf = None

        if i == 2:
            noise_ratio = get_ratio(data)
        if i == 3:
            unique_pairs = True
        if i == 4:
            # NNAR receives a classifier fitted on the full dataset.
            rf.fit(data.data, data.target)
            clf = rf

        # Keyword arguments shared by both runs below.
        experiment_kwargs = dict(
            n_estimators=N_ESTIMATORS, trials=TRIALS,
            resolution=RESOLUTION, test_size=TEST_SIZE, iterations=ITERATIONS,
            noise_ratio=noise_ratio, clf=clf, unique_pairs=unique_pairs,
        )

        # First run: default settings of run_noise_level_experiment.
        accuracies_all, auc_all, relabelling_f1_all, relabelling_acc_all, x_axis = run_noise_level_experiment(
            data, RandomForestClassifier, noises[i], **experiment_kwargs
        )
        # Second run: bootstrapping enabled, control disabled.
        accuracies_boot, auc_boot, relabelling_f1_boot, relabelling_acc_boot, x_axis = run_noise_level_experiment(
            data, RandomForestClassifier, noises[i],
            control=False, bootstrapping=True, **experiment_kwargs
        )

        # Append the first axis-0 entry of each bootstrapped result to the
        # corresponding result of the first run.
        accuracies_all = np.concatenate([accuracies_all, accuracies_boot[:1]], axis=0)
        # Fixed: this previously concatenated accuracies_all instead of auc_all.
        auc_all = np.concatenate([auc_all, auc_boot[:1]], axis=0)
        relabelling_f1_all = np.concatenate([relabelling_f1_all, relabelling_f1_boot[:1]], axis=0)
        relabelling_acc_all = np.concatenate([relabelling_acc_all, relabelling_acc_boot[:1]], axis=0)

        # Process results
        accuracies_mean_exp, accuracies_se_exp = process_experiment_result(accuracies_all)
        relabelling_f1_success_exp, relabelling_f1_se_exp = process_experiment_result(relabelling_f1_all)
        relabelling_acc_success_exp, relabelling_acc_se_exp = process_experiment_result(relabelling_acc_all)

        accuracies_mean.append(accuracies_mean_exp)
        accuracies_se.append(accuracies_se_exp)
        relabelling_f1_success.append(relabelling_f1_success_exp)
        relabelling_f1_se.append(relabelling_f1_se_exp)
        relabelling_acc_success.append(relabelling_acc_success_exp)
        relabelling_acc_se.append(relabelling_acc_se_exp)

Starting Experiments for Dataset 1
Starting Experiments for Noise 5
Starting Experiments for Dataset 2
Starting Experiments for Noise 5
Starting Experiments for Dataset 3
Starting Experiments for Noise 5
Starting Experiments for Dataset 4
Starting Experiments for Noise 5


In [7]:
# Flatten the accumulated summaries into one row per innermost entry and
# write them to a CSV file.
output_path = "experiment_results_2.csv"
results = []

# NOTE(review): each accumulator holds one entry per (dataset, noise)
# experiment, so 'Dataset' is really the experiment index. It equals the
# dataset index only while a single noise type is run per dataset.
for dataset_idx, dataset_results in enumerate(zip(accuracies_mean, accuracies_se, relabelling_f1_success, relabelling_f1_se, relabelling_acc_success, relabelling_acc_se)):
    # NOTE(review): 'Noise Type' enumerates the first axis of each processed
    # result; what that axis represents depends on process_experiment_result
    # -- confirm the column label.
    for noise_idx, (acc_mean, acc_se, f1_mean, f1_se, acc_succ_mean, acc_succ_se) in enumerate(zip(*dataset_results)):
        for res_idx, (am, ase, f1m, f1se, asm, asse) in enumerate(zip(acc_mean, acc_se, f1_mean, f1_se, acc_succ_mean, acc_succ_se)):
            results.append({
                'Dataset': dataset_idx,
                'Noise Type': noise_idx,
                'Noise Level Index': res_idx,
                'Accuracy Mean': am,
                'Accuracy SE': ase,
                'Relabelling F1 Mean': f1m,
                'Relabelling F1 SE': f1se,
                'Relabelling Accuracy Mean': asm,
                'Relabelling Accuracy SE': asse
            })

# Build the DataFrame once from the list of row dicts.
results_df = pd.DataFrame(results)

# Save to CSV
results_df.to_csv(output_path, index=False)

# Report the path actually written; the old message named a different file.
print(f"Saved results to {output_path}")

Saved results to experiment_results.csv
