In [1]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
from IPython.core.display import display
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook

from multi_imbalance.ensemble.SOUPBagging import SOUPBagging
from multi_imbalance.ensemble.mrbbagging import MRBBagging
from multi_imbalance.resampling.SOUP import SOUP
from multi_imbalance.resampling.MDO import MDO
from multi_imbalance.resampling.GlobalCS import GlobalCS

from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE
from multi_imbalance.resampling.spider import SPIDER3

from sklearn.neighbors import KNeighborsClassifier
import warnings
import logging
from multi_imbalance.utils.data import load_arff_datasets
from multi_imbalance.utils.min_int_maj import maj_int_min
# Quiet the notebook: raise the root logger threshold to CRITICAL and
# suppress every Python warning raised from this point on.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
warnings.filterwarnings('ignore')

from IPython.display import clear_output
# Clear this cell's output; wait=True defers the clear until new output arrives.
clear_output(wait=True)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# def green_valid_backgroud(s):
#     correct = ['1czysty-cut', '2delikatne-cut', '3mocniej-cut','4delikatne-bezover-cut', 'cmc', 'dermatology', 'new_ecoli','new_vehicle','thyroid-newthyroid']
#     return ['background-color: green' if v in correct else '' for v in list(s.index)]
# 


def bold_max(s):
    """Build per-element CSS that renders the maximum of a Series in bold.

    Returns a list with one entry per element of ``s``: ``'font-weight: bold'``
    where the element equals ``s.max()`` (ties are all marked) and ``''``
    everywhere else. Intended for ``DataFrame.style.apply``.
    """
    peak = s.max()
    return ['font-weight: bold' if hit else '' for hit in (s == peak)]
    
def print_scores(scores, name, only_read_dt=False, columns=None, base=None):
    """Display a score table with its summaries and export all of them.

    Parameters
    ----------
    scores : mapping
        Passed to ``pd.DataFrame`` and transposed; in this notebook it is
        ``{dataset_name: {method_name: score}}``, so rows become datasets and
        columns become methods.
    name : str
        Prefix of every file written (``{name}_main``, ``{name}_median``,
        ``{name}_meanrank``; each as ``.tex`` and ``.csv``).
    only_read_dt : bool
        When true, the first four rows of the table are dropped.
        NOTE(review): the reason for skipping exactly four rows is not visible
        here -- confirm against the dataset ordering.
    columns : list, optional
        Subset / ordering of columns to keep.
    base : DataFrame, optional
        Reference table joined on the index (inner join) in front of the scores.

    Side effects: shows three tables via ``display`` and writes six files into
    the current working directory.
    """
    table = pd.DataFrame(scores).T
    if only_read_dt:
        table = table.iloc[4:]
    if columns is not None:
        table = table[columns]
    if base is not None:
        table = pd.merge(base, table, left_index=True, right_index=True)

    # Bold the best (largest) value in every row.
    display(table.style.apply(bold_max, axis=1))

    # The raw table is exported before any missing value is imputed.
    with open(f'{name}_main.tex', 'w') as tf:
        tf.write(table.to_latex())
    table.to_csv(f'{name}_main.csv')

    # Impute missing scores with the column median. A new frame is created
    # rather than filling in place, because `table` may be a column slice of
    # another frame at this point.
    filled = table.fillna(table.median())

    # Despite the historical "median" file name, this summary is the column MEAN.
    mean_scores = filled.mean().sort_values(ascending=False).to_frame('Mean G-mean')
    display(mean_scores)

    # Rank the columns within each row (1 = highest score), then average the ranks.
    mean_ranks = filled.rank(axis=1, ascending=False).mean().sort_values().to_frame('Mean rank')
    display(mean_ranks)

    # File names are kept unchanged so existing consumers keep working.
    with open(f'{name}_median.tex', 'w') as tf:
        tf.write(mean_scores.to_latex())
    mean_scores.to_csv(f'{name}_median.csv')
    with open(f'{name}_meanrank.tex', 'w') as tf:
        tf.write(mean_ranks.to_latex())
    mean_ranks.to_csv(f'{name}_meanrank.csv')
# print_scores(scores_knn)

In [3]:
def resample_data(resample, seed, X_train, y_train, no_classes, dataset_name):
    """Apply the resampling strategy named by ``resample`` to one training fold.

    Parameters
    ----------
    resample : str
        Strategy name. ``'base'`` and any name containing ``'soupbg'`` or
        ``'mrbbag'`` leave the data untouched (those ensembles resample
        internally). Otherwise: a name containing ``'soup'`` -> SOUP,
        ``'global'`` -> GlobalCS, ``'smote'`` -> SMOTE, a name containing
        ``'mdo'`` -> MDO, ``'spider'`` -> SPIDER3.
    seed : int
        Random seed handed to SMOTE and MDO.
    X_train, y_train : array-like
        Training features and labels. The SOUP, GlobalCS, SMOTE and MDO
        branches receive copies; the SPIDER3 branch receives a float64 cast
        of the features.
    no_classes : int
        Number of classes; sizes the SPIDER3 cost matrix.
    dataset_name : str
        Key into the module-level ``maj_int_min`` mapping.

    Returns
    -------
    (X_resampled, y_resampled)

    Raises
    ------
    ValueError
        If ``resample`` matches no known strategy.
    """
    # The bagging check must run BEFORE the generic 'soup' substring test:
    # 'soupbg...' contains 'soup', so SOUP pre-processing was previously
    # applied to the bagging ensembles and the dedicated branch was unreachable.
    if resample == 'base' or 'soupbg' in resample or 'mrbbag' in resample:
        return X_train, y_train

    if 'soup' in resample:
        soup = SOUP(k=3)
        return soup.fit_transform(np.copy(X_train), np.copy(y_train), maj_int_min=maj_int_min[dataset_name])

    if resample == 'global':
        global_cs = GlobalCS()
        return global_cs.fit_transform(np.copy(X_train), np.copy(y_train), shuffle=False)

    if resample == 'smote':
        smote = SMOTE(random_state=seed)
        # `fit_sample` was removed from imbalanced-learn; `fit_resample` is the
        # supported name for the same operation.
        return smote.fit_resample(np.copy(X_train), np.copy(y_train))

    if 'mdo' in resample:
        mdo = MDO(k=5, k1_frac=.3, seed=seed)
        return mdo.fit_transform(np.copy(X_train), np.copy(y_train), maj_int_min=maj_int_min[dataset_name])

    if resample == 'spider':
        # Uniform misclassification cost: 1 off the diagonal, 0 on it.
        cost = np.ones((no_classes, no_classes))
        np.fill_diagonal(cost, 0)
        groups = maj_int_min[dataset_name]
        clf = SPIDER3(k=5, cost=cost, majority_classes=groups['maj'], intermediate_classes=groups['int'], minority_classes=groups['min'])
        return clf.fit_transform(X_train.astype(np.float64), y_train)

    raise ValueError(f'Bad type{resample}')



def test_resampling(res, dataset_values, dataset_name, n_repeats=10, n_splits=5):
    """Cross-validate one resampling / ensemble strategy on one dataset.

    For each of ``n_repeats`` seeds a shuffled ``StratifiedKFold`` with
    ``n_splits`` folds is run. In every fold the first ``cat_length`` columns
    are standardised with statistics fitted on the training part, the training
    part is resampled through ``resample_data``, and a 3-NN and a decision tree
    are trained and scored with the geometric mean (all classes, and minority
    classes only).

    Parameters
    ----------
    res : str
        Strategy name understood by ``resample_data``. Names containing one of
        ``soupbg005/015/030/050/100`` additionally wrap each classifier in
        ``SOUPBagging`` with that many members; any name containing
        ``'soupbg'`` is scored once per voting strategy.
    dataset_values : object
        Must expose ``data``, ``target`` and ``cat_length``.
    dataset_name : str
        Key into the module-level ``maj_int_min`` mapping.
    n_repeats : int
        Number of repeated cross-validations; the repeat index is also the seed.
    n_splits : int
        Number of folds per repeat.

    Returns
    -------
    defaultdict
        ``{metric_name: mean over repeats of the per-repeat mean fold score}``.
    """
    X, y, scale_index = dataset_values.data, dataset_values.target, dataset_values.cat_length

    no_classes = np.unique(y).size
    minority_class = maj_int_min[dataset_name]['min']
    # Substring of `res` -> ensemble size; checked in this order, first match wins.
    soupbg_sizes = (('soupbg005', 5), ('soupbg015', 15), ('soupbg030', 30),
                    ('soupbg050', 50), ('soupbg100', 100))
    run_data = defaultdict(lambda: defaultdict(list))  # {metric: {run_number: [fold scores]}}

    def record_scores(clf_name, suffix, run_id, y_true, y_pred):
        # Store the overall and the minority-only G-mean of one prediction.
        gmean = geometric_mean_score(y_true, y_pred, correction=0.001)
        minority_gmean = geometric_mean_score(y_true, y_pred, labels=minority_class, correction=0.001)
        run_data['g_mean_{}{}'.format(clf_name, suffix)][run_id].append(gmean)
        run_data['g_mean_{}_minority{}'.format(clf_name, suffix)][run_id].append(minority_gmean)

    for i in range(n_repeats):
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=i)
        for train_index, test_index in skf.split(X, y):
            # NOTE(review): assumes X and y are NumPy arrays, so index-array
            # selection yields copies and the in-place scaling below does not
            # touch the full dataset -- confirm.
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            if scale_index > 0:
                # Fit on the training fold only to avoid leaking test statistics.
                normalizer = StandardScaler().fit(X_train[:, :scale_index])
                X_train[:, :scale_index] = normalizer.transform(X_train[:, :scale_index])
                X_test[:, :scale_index] = normalizer.transform(X_test[:, :scale_index])
            # Resample once per fold; both classifiers share the result.
            X_train_resampled, y_train_resampled = resample_data(res, i, X_train, y_train, no_classes, dataset_name)

            for clf_name in ['knn', 'tree']:
                if clf_name == 'knn':
                    clf = KNeighborsClassifier(n_neighbors=3)
                else:
                    clf = DecisionTreeClassifier(random_state=i)

                for tag, n_classifiers in soupbg_sizes:
                    if tag in res:
                        clf = SOUPBagging(clf, n_classifiers=n_classifiers)
                        break

                clf.fit(X_train_resampled, y_train_resampled)
                if 'soupbg' in res:
                    # One fitted ensemble is evaluated under every voting strategy.
                    for strategy in ['average', 'optimistic', 'pessimistic', 'mixed', 'global']:
                        y_pred = clf.predict(X_test, strategy=strategy, maj_int_min=maj_int_min[dataset_name])
                        record_scores(clf_name, '_' + strategy, str(i), y_test, y_pred)
                else:
                    y_pred = clf.predict(X_test)
                    record_scores(clf_name, '', str(i), y_test, y_pred)

    # Average the folds within each repeat first, then average the repeats.
    result_data = defaultdict(int)
    for metric_name, runs in run_data.items():
        result_data[metric_name] = np.mean([np.mean(fold_scores) for fold_scores in runs.values()])

    return result_data


def provide_test_and_get_scores(datasets, clf_res_names):
    """Evaluate every strategy in ``clf_res_names`` on every dataset.

    ``datasets`` maps a dataset name to the object handed to
    ``test_resampling``. The result is a nested mapping
    ``scores[metric][dataset_name][strategy]`` holding each metric rounded to
    four decimals. Progress over datasets is shown with a notebook bar.
    """
    scores = defaultdict(lambda: defaultdict(dict))
    progress = tqdm_notebook(datasets.items(), total=len(datasets), desc='1st loop')
    for dataset_name, dataset_values in progress:
        for resample in clf_res_names:
            metrics = test_resampling(resample, dataset_values, dataset_name)
            for metric, value in metrics.items():
                scores[metric][dataset_name][resample] = round(value, 4)
    return scores

# Strategies to evaluate in this run; each name is interpreted by resample_data / test_resampling.
clf_res_names =['soup']
# Load the ARFF datasets; return_cat_length=True is what provides the `cat_length` read by test_resampling.
datasets = load_arff_datasets(return_cat_length=True)
# Runs the full evaluation of every strategy on every dataset; result is scores[metric][dataset][strategy].
scores = provide_test_and_get_scores(datasets, clf_res_names)

HBox(children=(IntProgress(value=0, description='1st loop', max=19, style=ProgressStyle(description_width='ini…




#### Gmean knn

In [4]:
# G-mean over all classes for the 3-NN classifier; also writes g_mean_knn_*.tex/.csv.
print_scores(scores['g_mean_knn'], 'g_mean_knn')

Unnamed: 0,soup
1czysty-cut,0.9661
2delikatne-cut,0.7759
3mocniej-cut,0.5387
4delikatne-bezover-cut,0.8739
balance-scale,0.6126
car,0.4655
cleveland,0.2107
cleveland_v2,0.2515
cmc,0.4808
dermatology,0.9486


Unnamed: 0,Mean G-mean
soup,0.627042


Unnamed: 0,Mean rank
soup,1.0


#### Gmean minority knn

In [5]:
# G-mean restricted to the minority classes for the 3-NN classifier; also writes g_mean_knn_minority_*.tex/.csv.
print_scores(scores['g_mean_knn_minority'], 'g_mean_knn_minority')

Unnamed: 0,soup
1czysty-cut,0.9992
2delikatne-cut,0.7008
3mocniej-cut,0.4442
4delikatne-bezover-cut,0.9158
balance-scale,0.4956
car,0.3905
cleveland,0.1729
cleveland_v2,0.1994
cmc,0.5159
dermatology,1.0


Unnamed: 0,Mean G-mean
soup,0.621047


Unnamed: 0,Mean rank
soup,1.0


#### Gmean tree


In [6]:
# G-mean over all classes for the decision tree; also writes g_mean_tree_*.tex/.csv.
print_scores(scores['g_mean_tree'], 'g_mean_tree')

Unnamed: 0,soup
1czysty-cut,0.9574
2delikatne-cut,0.775
3mocniej-cut,0.5664
4delikatne-bezover-cut,0.8749
balance-scale,0.5575
car,0.8795
cleveland,0.1028
cleveland_v2,0.1422
cmc,0.4755
dermatology,0.9458


Unnamed: 0,Mean G-mean
soup,0.649932


Unnamed: 0,Mean rank
soup,1.0


#### Gmean minority tree

In [7]:
# G-mean restricted to the minority classes for the decision tree; also writes g_mean_tree_minority_*.tex/.csv.
print_scores(scores['g_mean_tree_minority'], 'g_mean_tree_minority')

Unnamed: 0,soup
1czysty-cut,0.9542
2delikatne-cut,0.67
3mocniej-cut,0.4292
4delikatne-bezover-cut,0.8833
balance-scale,0.3684
car,0.9344
cleveland,0.0701
cleveland_v2,0.0959
cmc,0.5067
dermatology,0.98


Unnamed: 0,Mean G-mean
soup,0.636074


Unnamed: 0,Mean rank
soup,1.0
