# **TESTING CUSTOM RANDOM FOREST IMPLEMENTATION**

# Model

In [2]:
import numpy as np
from imblearn.over_sampling import ADASYN, BorderlineSMOTE, RandomOverSampler, SMOTE
from sklearn.base import clone
from sklearn.ensemble._base import _set_random_states
from sklearn.ensemble._forest import ForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._param_validation import StrOptions

def _unwrap_data(X, y, sample_weight):
    if sample_weight is None:
        return X, y

    return np.repeat(X, sample_weight.astype(int), axis=0), np.repeat(y, sample_weight.astype(int), axis=0)

class OSRandomForestClassifier(ForestClassifier):
    """Random forest whose trees oversample the data they are trained on.

    Every tree is an ``OSDecisionTreeClassifier`` configured with the forest's
    ``oversampling_strategy``; the oversampling itself happens inside the tree.

    Parameters
    ----------
    oversampling_strategy : str, default="random"
        Name of the oversampler each tree uses. It is forwarded to the trees
        unvalidated; the tree decides which names are supported.
    print_indices_list : list of int or None, default=None
        Positions of the trees that get ``print_var=True``. Positions are
        1-based in creation order (the counter is incremented before it is
        compared). ``None`` is stored as an empty list.
    n_estimators, bootstrap, oob_score, n_jobs, random_state, verbose,
    warm_start, class_weight, max_samples
        Passed straight to ``ForestClassifier.__init__``.
    criterion, max_depth, min_samples_split, min_samples_leaf,
    min_weight_fraction_leaf, max_features, max_leaf_nodes,
    min_impurity_decrease, ccp_alpha, monotonic_cst
        Stored on the forest and copied onto every tree through
        ``estimator_params``.
    """

    _parameter_constraints: dict = {
        **ForestClassifier._parameter_constraints,
        **DecisionTreeClassifier._parameter_constraints,
        "class_weight": [
            StrOptions({"balanced_subsample", "balanced"}),
            dict,
            list,
            None,
        ],
    }
    # The forest does not expose the tree-level ``splitter`` parameter.
    _parameter_constraints.pop("splitter")

    def __init__(
        self,
        oversampling_strategy="random",
        print_indices_list=None,
        n_estimators=100,
        *,
        criterion="gini",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features="sqrt",
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        bootstrap=True,
        oob_score=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
        warm_start=False,
        class_weight=None,
        ccp_alpha=0.0,
        max_samples=None,
        monotonic_cst=None,
    ):
        super().__init__(
            estimator=OSDecisionTreeClassifier(
                oversampling_strategy=oversampling_strategy,
                print_var=False,
            ),
            n_estimators=n_estimators,
            estimator_params=(
                "criterion",
                "max_depth",
                "min_samples_split",
                "min_samples_leaf",
                "min_weight_fraction_leaf",
                "max_features",
                "max_leaf_nodes",
                "min_impurity_decrease",
                "random_state",
                "ccp_alpha",
                "monotonic_cst",
                # Listed here so each new tree reads the forest's *current*
                # value. Without it the strategy is frozen into the template
                # tree built above, and a later
                # ``set_params(oversampling_strategy=...)`` is ignored.
                "oversampling_strategy",
            ),
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight,
            max_samples=max_samples,
        )

        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.monotonic_cst = monotonic_cst
        self.ccp_alpha = ccp_alpha
        self.oversampling_strategy = oversampling_strategy
        # Number of trees created so far by ``_make_estimator``.
        # NOTE(review): this is only initialised here, so it keeps counting
        # across repeated ``fit`` calls on the same instance - confirm that is
        # the intended meaning of the positions in ``print_indices_list``.
        self.current_tree_count = 0
        self.print_indices_list = print_indices_list if print_indices_list is not None else []

    def _make_estimator(self, append=True, random_state=None):
        """Create one tree configured from the forest's current parameters.

        Parameters
        ----------
        append : bool, default=True
            When true the new tree is also appended to ``self.estimators_``.
        random_state : object or None, default=None
            When given, used to seed the new tree via ``_set_random_states``.

        Returns
        -------
        OSDecisionTreeClassifier
            A clone of ``self.estimator_`` with every ``estimator_params``
            value copied from the forest and ``print_var`` set according to
            ``print_indices_list``.
        """
        self.current_tree_count += 1
        tree_index = self.current_tree_count

        tree_params = {name: getattr(self, name) for name in self.estimator_params}
        tree_params["print_var"] = tree_index in self.print_indices_list

        estimator = clone(self.estimator_)
        estimator.set_params(**tree_params)

        if random_state is not None:
            _set_random_states(estimator, random_state)

        if append:
            self.estimators_.append(estimator)

        return estimator


class OSDecisionTreeClassifier(DecisionTreeClassifier):
    """Decision tree that oversamples its training data before fitting.

    Parameters
    ----------
    print_var : bool, default=False
        Debug flag stored on the instance; the code in this class does not
        read it.
    oversampling_strategy : str, default="random"
        One of ``"random"``, ``"SMOTE"``, ``"BorderlineSMOTE"`` or
        ``"ADASYN"``; selects the imbalanced-learn sampler used in ``_fit``.
    criterion, splitter, max_depth, min_samples_split, min_samples_leaf,
    min_weight_fraction_leaf, max_features, random_state, max_leaf_nodes,
    min_impurity_decrease, class_weight, ccp_alpha, monotonic_cst
        Passed unchanged to ``DecisionTreeClassifier.__init__``.
    """

    def __init__(
        self,
        *,
        print_var=False,
        oversampling_strategy="random",
        criterion="gini",
        splitter="best",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        class_weight=None,
        ccp_alpha=0.0,
        monotonic_cst=None,
    ):
        super().__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            random_state=random_state,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            class_weight=class_weight,
            ccp_alpha=ccp_alpha,
            monotonic_cst=monotonic_cst,
        )
        self.oversampling_strategy = oversampling_strategy
        self.print_var = print_var

    def _fit(
        self,
        X,
        y,
        sample_weight=None,
        check_input=True,
        missing_values_in_feature_mask=None,
    ):
        """Oversample the weighted training data, then fit the tree on it.

        The incoming ``sample_weight`` is treated as per-row repetition counts
        and expanded into repeated rows by ``_unwrap_data``; the expanded set
        is passed to the selected sampler. The tree is then fitted on the
        resampled data with fresh weights: 1 for each expanded row and 0.5 for
        each row the sampler added.

        Parameters
        ----------
        X, y : array-like
            Training samples and targets.
        sample_weight : array-like or None, default=None
            Repetition counts for the rows of ``X``; not forwarded as weights.
        check_input, missing_values_in_feature_mask
            Forwarded unchanged to ``DecisionTreeClassifier._fit``.

        Returns
        -------
        Whatever ``DecisionTreeClassifier._fit`` returns.

        Raises
        ------
        ValueError
            If ``oversampling_strategy`` is not one of the supported names.
        """
        if self.oversampling_strategy == "random":
            sampler = RandomOverSampler(random_state=self.random_state)
        elif self.oversampling_strategy == "SMOTE":
            sampler = SMOTE(random_state=self.random_state)
        elif self.oversampling_strategy == "BorderlineSMOTE":
            sampler = BorderlineSMOTE(random_state=self.random_state)
        elif self.oversampling_strategy == "ADASYN":
            sampler = ADASYN(random_state=self.random_state)
        else:
            raise ValueError(
                f"Oversampling strategy {self.oversampling_strategy} is not supported."
            )

        X_drawn, y_drawn = _unwrap_data(X, y, sample_weight)
        X_resampled, y_resampled = sampler.fit_resample(X_drawn, y_drawn)

        # Down-weight the rows added by the sampler relative to the drawn rows.
        # This assumes the sampler returns the input rows first and appends
        # the generated rows after them.
        n_drawn = len(X_drawn)
        n_added = len(X_resampled) - n_drawn
        sample_weight = np.concatenate([np.ones(n_drawn), np.full(n_added, 0.5)])

        return super()._fit(
            X_resampled,
            y_resampled,
            sample_weight=sample_weight,
            check_input=check_input,
            missing_values_in_feature_mask=missing_values_in_feature_mask,
        )

# Test

In [3]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from imblearn.datasets import fetch_datasets
# Smoke test: fit the SMOTE-based forest on the "us_crime" dataset obtained
# through imbalanced-learn's fetch_datasets and report hold-out metrics.
us_crime = fetch_datasets()["us_crime"]
X, y = us_crime.data, us_crime.target

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

rf = OSRandomForestClassifier(oversampling_strategy="SMOTE", random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("=== RF with SMOTE ===")
print(classification_report(y_test, y_pred))

=== RF with SMOTE ===
              precision    recall  f1-score   support

          -1       0.96      0.97      0.96       554
           1       0.57      0.51      0.54        45

    accuracy                           0.93       599
   macro avg       0.77      0.74      0.75       599
weighted avg       0.93      0.93      0.93       599



# **TESTING MODEL COMPARATOR**

# Comparator

In [75]:
from imblearn.datasets import fetch_datasets
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, ADASYN
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from os_sklearn.ensemble._forest import OSRandomForestClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from umap import UMAP
import warnings
warnings.filterwarnings('ignore')


class Comparator:
    """Compare oversampling inside a forest with oversampling before it.

    For every dataset up to three pipelines are run, each over ``iterations``
    fresh stratified train/test splits:

    * "bagging"      - ``OSRandomForestClassifier`` with a given strategy,
    * "augmentation" - the training split is oversampled once and a plain
      ``RandomForestClassifier`` is fitted on the result,
    * baseline       - a plain ``RandomForestClassifier`` on the raw split.

    Result layout
    -------------
    One run's results are a list of "slots", each a list with one value per
    iteration. Per-class metrics are stored class-major: the slot for class
    number ``c`` and the ``k``-th non-accuracy metric is
    ``c * n_per_class_metrics + k``. When ``'accuracy'`` is requested it
    occupies the last slot. ``print_table`` and ``plot_violin_metrics`` read
    the slots with the same formula.

    Parameters
    ----------
    datasets : list of [X, y] or None
        ``None`` loads ``us_crime`` and ``letter_img`` via ``fetch_datasets``.
    test_size : float
        Forwarded to ``train_test_split``.
    oversampling_strategies : list of str or None
        ``None`` means ``['random', 'SMOTE', 'BorderlineSMOTE', 'ADASYN']``.
    metrics : list of str or None
        ``None`` means ``['precision', 'recall', 'f1-score', 'accuracy']``.
    n_trees : int
        Number of trees in every forest.
    iterations : int
        Number of random splits evaluated per pipeline.
    print_indices_list : list of list of int or None
        Entry ``j`` lists the trees whose visualization data is collected in
        iteration ``j``. ``None`` means "tree 0 of the first iteration only",
        sized to ``iterations``.
    dataset_name : list of str or None
        Display names matching ``datasets``; ``None`` means
        ``['us_crime', 'letter_img']``.
    mode : str
        ``'both'``, ``'bagging'`` or ``'augmentation'``.
    """

    def __init__(
            self,
            datasets=None,
            test_size=0.2,
            oversampling_strategies=None, # 'random', 'SMOTE', 'BorderlineSMOTE', 'ADASYN'
            metrics=None, # 'precision', 'recall', 'f1-score', 'accuracy'
            n_trees=100,
            iterations=100,
            print_indices_list=None,
            dataset_name=None,
            mode='both' # 'both', 'bagging', 'augmentation'
    ):
        # Defaults are resolved here rather than in the signature so that they
        # are not shared between instances, and so the datasets are only
        # fetched when actually needed (and only once).
        if datasets is None:
            benchmark = fetch_datasets()
            datasets = [[benchmark[name].data, benchmark[name].target]
                        for name in ('us_crime', 'letter_img')]
        if oversampling_strategies is None:
            oversampling_strategies = ['random', 'SMOTE', 'BorderlineSMOTE', 'ADASYN']
        if metrics is None:
            metrics = ['precision', 'recall', 'f1-score', 'accuracy']
        if print_indices_list is None:
            # One entry per iteration, so any ``iterations`` value is covered.
            print_indices_list = [[0]] + [[] for _ in range(iterations - 1)]
        if dataset_name is None:
            dataset_name = ['us_crime', 'letter_img']

        self.datasets = datasets
        self.test_size = test_size
        self.oversampling_strategies = oversampling_strategies
        self.metrics = metrics
        self.n_trees = n_trees
        self.iterations = iterations
        self.print_indices_list = print_indices_list
        self.dataset_name = dataset_name
        self.mode = mode
        # Total number of tree indices requested across all iterations.
        self.visuals = sum(len(indices) for indices in self.print_indices_list if indices)

    def prepare_data(self, dataset):
        """Return a fresh stratified ``X_train, X_test, y_train, y_test`` split."""
        return train_test_split(dataset[0], dataset[1], stratify=dataset[1], test_size=self.test_size)

    def compute(self):
        """Run the pipelines selected by ``mode`` on every dataset.

        Fills ``results_bgg``, ``results_aug``, ``results_rf``,
        ``visualization_data_bgg`` and ``visualization_data_aug`` with one
        entry per dataset (only the lists relevant to ``mode`` are filled).

        Raises
        ------
        ValueError
            If ``mode`` is not ``'both'``, ``'bagging'`` or ``'augmentation'``.
        """
        print('=========================================================================')
        print('=========================     START COMPUTING     =======================')
        print('=========================================================================\n')
        print(f'Mode: {self.mode}')
        print(f'Iterations: {self.iterations}')
        print(f'Oversampling strategies: {self.oversampling_strategies}')
        print(f'Metrics: {self.metrics}')
        print(f'Number of trees: {self.n_trees}')
        print(f'Datasets: {self.dataset_name}')

        self.results_bgg = []
        self.results_aug = []
        self.results_rf = []
        self.visualization_data_bgg = []
        self.visualization_data_aug = []
        for i, dataset in enumerate(self.datasets):
            dataset_name = self.dataset_name[i]
            print(f'\n \n + DATASET: {dataset_name}')
            if self.mode not in ('both', 'bagging', 'augmentation'):
                raise ValueError(f"Mode {self.mode} is not supported. Choose from 'both', 'bagging', or 'augmentation'.")

            if self.mode in ('both', 'bagging'):
                bgg_results, bgg_visualization_data = self.compute_bagging(dataset, dataset_name)
                self.results_bgg.append(bgg_results)
                self.visualization_data_bgg.append(bgg_visualization_data)
            if self.mode in ('both', 'augmentation'):
                aug_results, aug_visualization_data = self.compute_augmentation(dataset, dataset_name)
                self.results_aug.append(aug_results)
                self.visualization_data_aug.append(aug_visualization_data)
            # The baseline is always computed, after the selected pipelines.
            self.results_rf.append(self.compute_baseline(dataset, dataset_name))

        print('\n=========================================================================')
        print('==================     COMPUTING ENDED SUCCESSFULLY      ================')
        print('=========================================================================\n')

    def _print_progress_bar(self, iteration, total, prefix='', length=30):
        """Redraw a one-line text progress bar; ends the line when complete."""
        percent = f"{100 * (iteration / float(total)):.1f}"
        filled_length = int(length * iteration // total)
        bar = '█' * filled_length + '-' * (length - filled_length)
        print(f'\r{prefix} |{bar}| {percent}% Complete', end='\r')
        if iteration == total:
            print()

    @staticmethod
    def _per_class_metrics(metrics):
        """Return the requested metrics that are reported per class."""
        return [metric for metric in metrics if metric != 'accuracy']

    @staticmethod
    def _slot_index(class_pos, metric, per_class_metrics):
        """Return the result slot holding ``metric`` for class number ``class_pos``."""
        return class_pos * len(per_class_metrics) + per_class_metrics.index(metric)

    def _new_result_slots(self, class_names):
        """Create the empty slot list for one run (see the class docstring)."""
        n_slots = len(self._per_class_metrics(self.metrics)) * len(class_names)
        if 'accuracy' in self.metrics:
            n_slots += 1
        return [[] for _ in range(n_slots)]

    def _record_report(self, report, class_names, slots):
        """Append one iteration's metric values from ``report`` to ``slots``."""
        per_class = self._per_class_metrics(self.metrics)
        position = 0
        for cls in class_names:
            class_report = report[str(cls)]
            for metric in per_class:
                if metric not in class_report:
                    raise ValueError(f"Metric {metric} not found in report for class {cls}.")
                slots[position].append(class_report[metric])
                position += 1
        if 'accuracy' in self.metrics:
            slots[-1].append(report['accuracy'])

    @staticmethod
    def _evaluate(forest, X_test, y_test, class_names):
        """Predict on the test split and return the classification report dict."""
        y_pred = forest.predict(X_test)
        return classification_report(y_test, y_pred, output_dict=True, target_names=[str(c) for c in class_names])

    @staticmethod
    def _make_sampler(strategy):
        """Return a default-configured oversampler for ``strategy``."""
        if strategy == "random":
            return RandomOverSampler()
        if strategy == "SMOTE":
            return SMOTE()
        if strategy == "BorderlineSMOTE":
            return BorderlineSMOTE()
        if strategy == "ADASYN":
            return ADASYN()
        raise ValueError(f"Oversampling strategy {strategy} is not supported.")

    def compute_bagging(self, dataset, dataset_name):
        """Evaluate ``OSRandomForestClassifier`` for every strategy.

        Returns
        -------
        tuple
            ``(results, visualization_data)``, both with one entry per
            strategy: the result slots, and the ``visualization_pack`` of
            every tree requested through ``print_indices_list``.
        """
        print('\n-=-=-=-=-=-=   BAGGING   =-=-=-=-=-')
        class_names = np.unique(dataset[1])
        results = []
        visualization_data = []

        for strategy in self.oversampling_strategies:
            strategy_results = self._new_result_slots(class_names)
            strategy_visualization_data = []
            for j in range(self.iterations):
                self._print_progress_bar(j + 1, self.iterations, prefix=f'{strategy} - bagging')
                indices = self.print_indices_list[j]

                X_train, X_test, y_train, y_test = self.prepare_data(dataset)
                # NOTE(review): ``data_name`` / ``iteration`` and the trees'
                # ``visualization_pack`` attribute are assumed to be provided
                # by the OSRandomForestClassifier imported from ``os_sklearn``
                # - confirm against that package.
                forest = OSRandomForestClassifier(
                    oversampling_strategy=strategy,
                    print_indices_list=indices,
                    n_estimators=self.n_trees,
                    data_name=dataset_name,
                    iteration=j)
                forest.fit(X_train, y_train)

                if indices is not None:
                    for i in indices:
                        strategy_visualization_data.append(forest.estimators_[i].visualization_pack)

                report = self._evaluate(forest, X_test, y_test, class_names)
                self._record_report(report, class_names, strategy_results)

            results.append(strategy_results)
            visualization_data.append(strategy_visualization_data)

        return results, visualization_data

    def compute_augmentation(self, dataset, dataset_name):
        """Evaluate "oversample first, then plain random forest" per strategy.

        Returns
        -------
        tuple
            ``(results, visualization_data)``, both with one entry per
            strategy. Each visualization entry is
            ``[X_train, X_resampled, y_resampled, dataset_name, strategy, j]``.

        Raises
        ------
        ValueError
            If a strategy name is not supported.
        """
        print('\n-=-=-=-=-=-=   AUGMENTATION   =-=-=-=-=-')
        class_names = np.unique(dataset[1])
        results = []
        visualization_data = []

        for strategy in self.oversampling_strategies:
            strategy_results = self._new_result_slots(class_names)
            strategy_visualization_data = []
            for j in range(self.iterations):
                self._print_progress_bar(j + 1, self.iterations, prefix=f'{strategy} - augmentation')

                X_train, X_test, y_train, y_test = self.prepare_data(dataset)
                sampler = self._make_sampler(strategy)
                forest = RandomForestClassifier(
                    n_estimators=self.n_trees
                )

                X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
                forest.fit(X_resampled, y_resampled)

                if self.print_indices_list[j] is not None:
                    strategy_visualization_data.append([X_train, X_resampled, y_resampled, dataset_name, strategy, j])

                report = self._evaluate(forest, X_test, y_test, class_names)
                self._record_report(report, class_names, strategy_results)

            results.append(strategy_results)
            visualization_data.append(strategy_visualization_data)

        return results, visualization_data

    def compute_baseline(self, dataset, dataset_name):
        """Evaluate a plain ``RandomForestClassifier``; return its result slots."""
        print('\n-=-=-=-=-=-=   BASELINE   =-=-=-=-=-')
        class_names = np.unique(dataset[1])
        results = self._new_result_slots(class_names)
        for j in range(self.iterations):
            self._print_progress_bar(j + 1, self.iterations, prefix='baseline')

            X_train, X_test, y_train, y_test = self.prepare_data(dataset)

            forest = RandomForestClassifier(
                n_estimators=self.n_trees
            )
            forest.fit(X_train, y_train)

            report = self._evaluate(forest, X_test, y_test, class_names)
            self._record_report(report, class_names, results)

        return results

    def print_table(self, results, class_names, metrics):
        """Print min / mean / max / std of every metric in ``results``.

        Parameters
        ----------
        results : list of list of float
            Result slots of one run, in the layout described on the class.
        class_names : sequence
            Class labels, in the order used when the slots were filled.
        metrics : list of str
            Metrics to print; ``'accuracy'`` is read from the last slot.
        """
        per_class = self._per_class_metrics(metrics)
        for metric in metrics:
            if metric == 'accuracy':
                vals = np.array(results[-1])
                print(f"\n{'Accuracy':>12} {'min':>12} {'avg':>12} {'max':>12} {'std':>12}")
                print(f"{'':>12} {np.min(vals):12.4f} {np.mean(vals):12.4f} {np.max(vals):12.4f} {np.std(vals):12.4f}")
            else:
                print(f"\n{'':>12} {'min_'+metric:>12} {'avg_'+metric:>12} {'max_'+metric:>12} {'std_'+metric:>12}")
                for class_pos, cls in enumerate(class_names):
                    # Slots are stored class-major, so the index must be
                    # computed rather than counted up metric by metric.
                    vals = np.array(results[self._slot_index(class_pos, metric, per_class)])
                    print(f"{str(cls):>12} {np.min(vals):12.4f} {np.mean(vals):12.4f} {np.max(vals):12.4f} {np.std(vals):12.4f}")

    def plot_violin_metrics(self, dataset_index):
        """Draw one violin plot per metric (and class) comparing all methods.

        Raises
        ------
        ValueError
            If ``mode`` is not ``'both'``, ``'bagging'`` or ``'augmentation'``.
        """
        dataset_name = self.dataset_name[dataset_index]
        if self.mode == 'both':
            names = [f"{name} bagging" for name in self.oversampling_strategies] + \
                    [f"{name} augmentation" for name in self.oversampling_strategies] + \
                    ['baseline']
            results = self.results_bgg[dataset_index] + self.results_aug[dataset_index] + [self.results_rf[dataset_index]]
        elif self.mode == 'bagging':
            names = [f"{name} bagging" for name in self.oversampling_strategies] + ['baseline']
            results = self.results_bgg[dataset_index] + [self.results_rf[dataset_index]]
        elif self.mode == 'augmentation':
            names = [f"{name} augmentation" for name in self.oversampling_strategies] + ['baseline']
            results = self.results_aug[dataset_index] + [self.results_rf[dataset_index]]
        else:
            raise ValueError(f"Mode {self.mode} is not supported. Choose from 'both', 'bagging', or 'augmentation'.")

        labels = np.unique(self.datasets[dataset_index][1])
        metrics = self.metrics
        per_class = self._per_class_metrics(metrics)

        # Long-format rows: (value, method, metric, class).
        records = []
        for metric in metrics:
            if metric == 'accuracy':
                for method, method_results in zip(names, results):
                    records.extend((value, method, metric, 'accuracy') for value in method_results[-1])
            else:
                for class_pos, cls in enumerate(labels):
                    slot = self._slot_index(class_pos, metric, per_class)
                    for method, method_results in zip(names, results):
                        records.extend((value, method, metric, str(cls)) for value in method_results[slot])

        df = pd.DataFrame(records, columns=['Value', 'Method', 'Metric', 'Class'])
        palettes = [["#b39ddb"],
                   ["#ffcc80"],
                   ["#a5d6a7"],
                   ["#90caf9"]]
        for m, metric in enumerate(metrics):
            # One colour per metric, wrapping around if more metrics than colours.
            palette = palettes[m % len(palettes)]
            if metric == 'accuracy':
                plt.figure(figsize=(8, 6))
                sns.violinplot(
                    data=df[df['Metric'] == 'accuracy'],
                    x='Method', y='Value',
                    palette=palette
                )
                plt.title(f'Accuracy for {dataset_name}')
                plt.xlabel(None)
                plt.ylabel(None)
                plt.xticks(rotation=70)
                plt.tight_layout()
                plt.show()
            else:
                for cls in labels:
                    plt.figure(figsize=(8, 6))
                    sns.violinplot(
                        data=df[(df['Metric'] == metric) & (df['Class'] == str(cls))],
                        x='Method', y='Value',
                        palette=palette
                    )
                    plt.title(f'{metric} (class {cls}) for {dataset_name}')
                    plt.xticks(rotation=70)
                    plt.xlabel(None)
                    plt.ylabel(None)
                    plt.tight_layout()
                    plt.show()

    def plot_data(self, X_drawn, X_resampled, y_resampled, data_name, oversampling_strategy, iteration, index, type):
        """Scatter-plot original versus synthetic samples in two dimensions.

        The first ``len(X_drawn)`` rows of ``X_resampled`` are drawn as
        original samples and the remaining rows as synthetic ones. Data with
        one feature is padded with a zero column, data with two features is
        plotted as is, and anything wider is projected to 2-D with UMAP.
        After the plot, per-class counts of both groups are printed.

        ``type`` (which shadows the builtin, kept for call compatibility) is
        the pipeline label used in the title, e.g. ``'bagging'``.
        """
        X_drawn_tmp = np.array(X_drawn)
        X_resampled_tmp = np.array(X_resampled)
        y_resampled_tmp = np.array(y_resampled)

        projection_note = ''
        if X_resampled_tmp.shape[1] == 1:
            X_plot = np.hstack([X_resampled_tmp, np.zeros((X_resampled_tmp.shape[0], 1))])
        elif X_resampled_tmp.shape[1] == 2:
            X_plot = X_resampled_tmp
        else:
            umap_model = UMAP(n_components=2)
            X_plot = umap_model.fit_transform(X_resampled_tmp)
            # The title names the projection that was really applied.
            projection_note = ' (and UMAP)'

        marker = len(X_drawn_tmp)
        classes = np.unique(y_resampled_tmp)

        # Separate colour maps so both groups stay distinguishable per class.
        orig_palette = sns.color_palette("magma", len(classes))
        synth_palette = sns.color_palette("viridis", len(classes))

        plt.figure(figsize=(7, 5))

        # Plot synthetic samples
        for idx, cls in enumerate(classes):
            synth_mask = (y_resampled_tmp[marker:] == cls)
            if np.any(synth_mask):
                sns.scatterplot(
                    x=X_plot[marker:, 0][synth_mask],
                    y=X_plot[marker:, 1][synth_mask],
                    color=synth_palette[idx],
                    marker="X",
                    s=100,
                    label=f"Synthetic class {int(cls)}",
                    linewidth=0.4
                )

        # Plot original samples
        for idx, cls in enumerate(classes):
            orig_mask = (y_resampled_tmp[:marker] == cls)
            sns.scatterplot(
                x=X_plot[:marker, 0][orig_mask],
                y=X_plot[:marker, 1][orig_mask],
                color=orig_palette[idx],
                marker="o",
                linewidth=0.4,
                s=40,
                label=f"Original class {int(cls)}"
            )

        plt.title(f"{data_name} data after {oversampling_strategy} {type}{projection_note}, passed to tree no. {index} in forest no. {iteration}")
        plt.legend()
        plt.tight_layout()
        plt.show()

        print('')
        for idx, cls in enumerate(classes):
            orig_mask = (y_resampled_tmp[:marker] == cls)
            print(f"Class {int(cls)} has {np.sum(orig_mask)} original samples and {np.sum(y_resampled_tmp[marker:] == cls)} synthetic samples after {oversampling_strategy} oversampling.")
        print('')

    def summary(self):
        """Plot the collected samples and print/plot the results per dataset.

        Must be called after ``compute``, which creates the result attributes
        read here.
        """
        print('\n', '=' * 73)
        print('=========================         SUMMARY         =======================')
        print('=' * 73, '\n')

        for i in range(len(self.datasets)):
            print('*' * 5, f' DATASET: {self.dataset_name[i]}\n')
            class_names = np.unique(self.datasets[i][1])

            if self.mode in ['both', 'bagging']:
                for j, strategy in enumerate(self.oversampling_strategies):
                    print('\n', '\n', f'\n+++ {strategy} - bagging')
                    # Iterate over what was actually collected instead of an
                    # expected count, which can exceed it.
                    for pack in self.visualization_data_bgg[i][j]:
                        # The pack's field order differs from plot_data's
                        # parameter order, hence the 4/3 and 6/5 swaps.
                        self.plot_data(pack[0],
                                       pack[1],
                                       pack[2],
                                       pack[4],
                                       pack[3],
                                       pack[6],
                                       pack[5],
                                       'bagging')
                    results = self.results_bgg[i][j]
                    self.print_table(results, class_names, self.metrics)

            if self.mode in ['both', 'augmentation']:
                for j, strategy in enumerate(self.oversampling_strategies):
                    print(f'\n \n+++ {strategy} - augmentation +++')
                    packs = self.visualization_data_aug[i][j]
                    if packs:
                        # Only the first recorded iteration is plotted.
                        first = packs[0]
                        self.plot_data(first[0],
                                       first[1],
                                       first[2],
                                       first[3],
                                       first[4],
                                       first[5],
                                       '-',
                                       'augmentation')
                    results = self.results_aug[i][j]
                    self.print_table(results, class_names, self.metrics)

            self.print_table(self.results_rf[i], class_names, self.metrics)

            self.plot_violin_metrics(i)


# Small configuration (2 trees, 2 iterations) for a quick end-to-end run.
# The dataset collection is fetched once and reused for every array instead of
# calling fetch_datasets() separately for each data/target pair.
_benchmark = fetch_datasets()
porownanie = Comparator(
    datasets=[
        [_benchmark[name].data, _benchmark[name].target]
        for name in ('us_crime', 'letter_img')
    ],
    test_size=0.2,
    oversampling_strategies=['random', 'SMOTE', 'BorderlineSMOTE', 'ADASYN'],
    metrics=['precision', 'recall', 'f1-score', 'accuracy'],
    n_trees=2,
    iterations=2,
    print_indices_list=[[0],[]],
    dataset_name=['us_crime', 'letter_img'],
    mode='both'
)

# Tests

In [None]:
# Run the comparison with every Comparator default (no arguments overridden);
# compute() stores its results on the instance for the summary step.
comparator = Comparator()
comparator.compute()


Mode: both
Iterations: 100
Oversampling strategies: ['random', 'SMOTE', 'BorderlineSMOTE', 'ADASYN']
Metrics: ['precision', 'recall', 'f1-score', 'accuracy']
Number of trees: 100
Datasets: ['us_crime', 'letter_img']

 
 + DATASET: us_crime

-=-=-=-=-=-=   BAGGING   =-=-=-=-=-
random - bagging |██████████████████████████████| 100.0% Complete
SMOTE - bagging |██████████████████████████████| 100.0% Complete
BorderlineSMOTE - bagging |████--------------------------| 14.0% Complete

In [None]:
# Print the tables and draw the plots for the results gathered by compute().
comparator.summary()