In [1]:
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
from distython import HVDM, HEOM
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
from IPython.core.display import display
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook

from multi_imbalance.datasets import load_datasets
from multi_imbalance.ensemble.SOUPBagging import SOUPBagging
from multi_imbalance.resampling.SOUP import SOUP
from multi_imbalance.resampling.MDO import MDO
from multi_imbalance.resampling.GlobalCS import GlobalCS

from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE
from multi_imbalance.resampling.spider import SPIDER3

from sklearn.neighbors import KNeighborsClassifier
import warnings
import logging
# Quiet the notebook: raise the root logger's threshold so that only CRITICAL
# records are emitted.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
# Ignore everything reported through the `warnings` module. NOTE(review): this
# also hides numerical RuntimeWarnings (e.g. a mean over an empty list), so
# NaN results further down appear without any diagnostic.
warnings.filterwarnings('ignore')
# Per-dataset grouping of class labels into majority ('maj'), intermediate
# ('int') and minority ('min') classes. resample_data() looks a dataset up here
# by name and forwards the three lists to SPIDER3 for the 'spider' strategy.
# NOTE(review): several keys ('balance_scale', 'hayes_roth', 'ecoli') differ
# from the dataset names that appear in the result tables ('balance-scale',
# 'hayes-roth', 'new_ecoli'), so a lookup by dataset name would raise KeyError
# for them -- confirm the keys against what load_datasets() returns.
# NOTE(review): 'winequailty_red' looks like a misspelling of "winequality" --
# confirm before relying on it.
maj_int_min = {
    'balance_scale' : {
        'maj': [2, 1],
        'int': [],
        'min': [0]
    }, 
    'cleveland': {
        'maj': [0],
        'int': [1],
        'min': [2,3,4]
    }, 
    'cmc': {
        'maj': [0],
        'int': [2],
        'min': [1]
    }, 
    'dermatology': {
        'maj': [0],
        'int': [2,1,4,3],
        'min': [5]
    }, 
    'ecoli': {
        'maj': [0,1],
        'int': [7,4,5],
        'min': [6,3,2]
    }, 
    'glass': {
        'maj': [1,0],
        'int': [5],
        'min': [2,3,4]
    }, 
    'hayes_roth': {
        'maj': [0,1],
        'int': [],
        'min': [2]
    }, 
    'new_thyroid': {
        'maj': [0],
        'int': [],
        'min': [1,2]
    }, 
    'winequailty_red': {
        'maj': [2,3],
        'int': [4],
        'min': [1,5,0]
    }, 
    'yeast': {
        'maj': [0,7],
        'int': [6, 5],
        'min': [4,3,2,9,8,1]
    }
}
from IPython.display import clear_output
# Ask IPython to clear this cell's output area (import-time messages).
# With wait=True the clearing is deferred until new output is available.
clear_output(wait=True)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [33]:
def resample_data(resample, seed, X_train, y_train, no_classes, dataset_name):
    """Apply the requested resampling strategy to one training fold.

    Parameters
    ----------
    resample : str
        Strategy name. One of ``'base'`` (no resampling), ``'soup'``,
        ``'global'``, ``'smote'``, ``'mdo'``, ``'spider'``, or any name
        containing ``'soupbg'`` (data is returned untouched because the
        SOUPBagging ensemble resamples internally).
    seed : int
        Seed forwarded to SMOTE (``random_state``) and MDO (``seed``).
    X_train, y_train : array-like
        Training features and labels. They are copied before being handed to
        SOUP, GlobalCS, SMOTE and MDO; SPIDER3 receives a float64 cast of
        ``X_train``.
    no_classes : int
        Number of classes; sizes the square cost matrix used by SPIDER3.
    dataset_name : str
        Key into the module-level ``maj_int_min`` mapping (``'spider'`` only).

    Returns
    -------
    tuple
        ``(X_train_resampled, y_train_resampled)``.

    Raises
    ------
    ValueError
        If ``resample`` is not a recognised strategy name.
    KeyError
        For ``'spider'`` when ``dataset_name`` is missing from ``maj_int_min``.
    """
    if resample == 'base':
        X_train_resampled, y_train_resampled = X_train, y_train
    elif resample == 'soup':
        soup = SOUP(k=3)
        X_train_resampled, y_train_resampled = soup.fit_transform(np.copy(X_train), np.copy(y_train))
    elif resample == 'global':
        global_cs = GlobalCS()
        X_train_resampled, y_train_resampled = global_cs.fit_transform(
            np.copy(X_train), np.copy(y_train), shuffle=False)
    elif resample == 'smote':
        smote = SMOTE(random_state=seed)
        # `fit_sample` was a deprecated alias that newer imbalanced-learn
        # releases removed; `fit_resample` is the supported entry point.
        X_train_resampled, y_train_resampled = smote.fit_resample(np.copy(X_train), np.copy(y_train))
    elif resample == 'mdo':
        mdo = MDO(k=7, k1_frac=0.5, seed=seed)
        X_train_resampled, y_train_resampled = mdo.fit_transform(np.copy(X_train), np.copy(y_train))
    elif resample == 'spider':
        # Uniform misclassification cost: 1 everywhere except 0 on the diagonal.
        cost = np.ones((no_classes, no_classes))
        np.fill_diagonal(cost, 0)
        class_groups = maj_int_min[dataset_name]
        clf = SPIDER3(
            k=5,
            cost=cost,
            majority_classes=class_groups['maj'],
            intermediate_classes=class_groups['int'],
            minority_classes=class_groups['min'],
        )
        X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)
    elif 'soupbg' in resample:
        # SOUP Bagging resamples inside the ensemble, so pass the data through.
        X_train_resampled, y_train_resampled = X_train, y_train
    else:
        raise ValueError(f'Bad type{resample}')
    return X_train_resampled, y_train_resampled



def test_resampling(res, dataset_values, dataset_name):
    """Score one resampling strategy on one dataset with repeated stratified CV.

    Runs 10 repetitions (seeds 0..9) of 10-fold stratified cross-validation.
    In every fold the features are standardised with a scaler fitted on the
    training part only, the training part is resampled via ``resample_data``
    and a 3-nearest-neighbours classifier is fitted and scored on the test
    part.

    Parameters
    ----------
    res : str
        Resampling strategy name understood by ``resample_data``. A name of
        the form ``'soupbg<digits>'`` (e.g. ``'soupbg015'``) additionally
        wraps the classifier in ``SOUPBagging`` with that many classifiers.
    dataset_values
        Object exposing ``data`` (features) and ``target`` (labels); both are
        indexed with the integer index arrays produced by the splitter.
    dataset_name : str
        Forwarded to ``resample_data``.

    Returns
    -------
    defaultdict
        Keys ``'g_mean_knn'``, ``'avg_acc_knn'`` and ``'g_mean_tree'``. Each
        value is the mean over repetitions of the per-repetition mean fold
        score. ``'g_mean_tree'`` is NaN while only the kNN classifier is
        evaluated.
    """
    X, y = dataset_values.data, dataset_values.target
    no_classes = np.unique(y).size
    result_data = defaultdict(int)
    run_data = defaultdict(lambda: defaultdict(list))  # {metric: {run_number: [scores]}}

    # 'soupbg<digits>' encodes the ensemble size in the strategy name. Names
    # without a numeric suffix leave the classifier unwrapped, as before.
    soupbg_prefix = 'soupbg'
    bagging_size_text = res[len(soupbg_prefix):] if res.startswith(soupbg_prefix) else ''
    n_bagging_classifiers = int(bagging_size_text) if bagging_size_text.isdecimal() else None

    for i in range(10):
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Fit the scaler on the training fold only to avoid test leakage.
            normalizer = StandardScaler().fit(X_train)
            X_train = normalizer.transform(X_train)
            X_test = normalizer.transform(X_test)
            X_train_resampled, y_train_resampled = resample_data(
                res, i, X_train, y_train, no_classes, dataset_name)

            # Add 'tree' to this list to evaluate the decision tree as well.
            for clf_name in ['knn']:
                if clf_name == 'knn':
                    clf = KNeighborsClassifier(n_neighbors=3)
                elif clf_name == 'tree':
                    clf = DecisionTreeClassifier(random_state=i)
                if n_bagging_classifiers is not None:
                    clf = SOUPBagging(clf, n_classifiers=n_bagging_classifiers)

                clf.fit(X_train_resampled, y_train_resampled)
                y_pred = clf.predict(X_test)
                gmean = geometric_mean_score(y_test, y_pred, correction=0.001)
                # Average accuracy = unweighted mean of the per-class recalls.
                avg_acc = np.mean(recall_score(y_test, y_pred, average=None))
                run_data['g_mean_{}'.format(clf_name)][str(i)].append(gmean)
                run_data['avg_acc_{}'.format(clf_name)][str(i)].append(avg_acc)

    def get_score_from_metric(run_data, metric):
        """Mean over runs of each run's mean fold score; NaN if nothing was recorded."""
        runs_scores_list = list(run_data[metric].values())  # [[fold scores of one run], ...]
        if not runs_scores_list:
            # Explicit NaN instead of relying on np.mean([]) (which also warns).
            return np.nan
        return np.mean([np.mean(fold_scores) for fold_scores in runs_scores_list])

    result_data['g_mean_knn'] = get_score_from_metric(run_data, 'g_mean_knn')
    result_data['avg_acc_knn'] = get_score_from_metric(run_data, 'avg_acc_knn')
    result_data['g_mean_tree'] = get_score_from_metric(run_data, 'g_mean_tree')
    return result_data


def provide_test_and_get_scores(datasets):
    """Evaluate every configured resampling strategy on every dataset.

    Parameters
    ----------
    datasets : mapping
        ``{dataset_name: dataset_values}``; each value is handed to
        ``test_resampling`` together with its name.

    Returns
    -------
    tuple of defaultdict
        ``(scores_knn, scores_tree, avg_acc)``, each shaped
        ``{dataset_name: {resampling_name: score rounded to 4 decimals}}`` and
        filled from the ``'g_mean_knn'``, ``'g_mean_tree'`` and
        ``'avg_acc_knn'`` results respectively.
    """
    # Other strategy names used in earlier experiments: 'smote', 'mdo',
    # 'soupbg005', 'soupbg015', 'soupbg030', 'soupbg050', 'soupbg100'.
    clf_res_names = ['base', 'global', 'soup']

    scores_knn, scores_tree, avg_acc = defaultdict(dict), defaultdict(dict), defaultdict(dict)
    # Pair every output table with the metric key it is filled from.
    table_sources = (
        (scores_knn, 'g_mean_knn'),
        (scores_tree, 'g_mean_tree'),
        (avg_acc, 'avg_acc_knn'),
    )

    progress = tqdm_notebook(datasets.items(), total=len(datasets), desc='1st loop')
    for dataset_name, dataset_values in progress:
        for resample in clf_res_names:
            metrics = test_resampling(resample, dataset_values, dataset_name)
            for table, metric_key in table_sources:
                table[dataset_name][resample] = round(metrics[metric_key], 4)
    return scores_knn, scores_tree, avg_acc

# Load the benchmark datasets and run the whole evaluation. Each of the three
# results maps dataset name -> {resampling name -> rounded score}.
datasets = load_datasets()
scores_knn, scores_tree, avg_acc = provide_test_and_get_scores(datasets)

HBox(children=(IntProgress(value=0, description='1st loop', max=17, style=ProgressStyle(description_width='ini…




In [34]:
def green_valid_backgroud(s):
    """Build one CSS declaration per index label of ``s``.

    Labels found in the hard-coded list of dataset names get a green
    background; all others get an empty string (no styling). Intended for
    ``DataFrame.style.apply``.
    """
    highlighted = (
        '1czysty-cut', '2delikatne-cut', '3mocniej-cut', '4delikatne-bezover-cut',
        'cmc', 'dermatology', 'new_ecoli', 'new_vehicle', 'thyroid-newthyroid',
    )
    css = []
    for label in s.index:
        if label in highlighted:
            css.append('background-color: green')
        else:
            css.append('')
    return css

def print_scores(scores, only_read_dt=False, title="G-MEAN"):
    """Display a score mapping as a styled table (datasets as rows).

    Parameters
    ----------
    scores : mapping
        ``{dataset_name: {resampling_name: score}}``.
    only_read_dt : bool, default False
        When true, the first four rows of the table are dropped before
        display. This relies on row order: in the result tables the first
        four rows are the '1czysty-cut' ... '4delikatne-bezover-cut' datasets.
    title : str, default "G-MEAN"
        Heading displayed above the table. Pass a different label when the
        scores are not G-mean values (e.g. average accuracy).
    """
    display(title)
    scores_table = pd.DataFrame(scores).T
    if only_read_dt:
        scores_table = scores_table.iloc[4:]
    # Keep the DataFrame and its Styler under separate names.
    styled_table = scores_table.style.apply(green_valid_backgroud)
    display(styled_table)

    # Disabled summary tables (mean score and mean rank per method):
    # scores_table.fillna(scores_table.median(), inplace=True)
    # display(pd.DataFrame(scores_table.mean().sort_values(ascending=False),columns=['Mean G-mean']))
    # display(pd.DataFrame(scores_table.rank(axis=1,ascending=False).mean().sort_values(),columns=['Mean rank']))
# kNN G-mean for every dataset.
print_scores(scores_knn)


'G-MEAN'

Unnamed: 0,base,global,soup
1czysty-cut,0.959,0.9681,0.9685
2delikatne-cut,0.679,0.7251,0.7706
3mocniej-cut,0.4081,0.4893,0.5319
4delikatne-bezover-cut,0.7814,0.8135,0.8693
balance-scale,0.0958,0.0892,0.5777
cleveland,0.0343,0.0631,0.0688
cleveland_v2,0.0266,0.0858,0.1065
cmc,0.4095,0.4326,0.4551
dermatology,0.9522,0.9416,0.9431
glass,0.222,0.4073,0.5154


### Wszystkie zbiory danych:
#### Drzewo

In [21]:
# scores_tree is filled from the 'g_mean_tree' result, which only holds real
# values when test_resampling evaluates the 'tree' classifier (it currently
# loops over ['knn'] only, hence the empty table).
print_scores(scores_tree)


'G-MEAN'

Unnamed: 0,base,global,soup
1czysty-cut,,,
2delikatne-cut,,,
3mocniej-cut,,,
4delikatne-bezover-cut,,,
balance-scale,,,
cleveland,,,
cleveland_v2,,,
cmc,,,
dermatology,,,
glass,,,


Unnamed: 0,Mean G-mean
base,
global,
soup,


Unnamed: 0,Mean rank
base,
global,
soup,


#### kNN - 5

In [22]:
# kNN G-mean for every dataset.
print_scores(scores_knn)
# NOTE: avg_acc holds the mean per-class recall of the kNN classifier, yet
# print_scores shows it under the same 'G-MEAN' heading by default.
print_scores(avg_acc)


'G-MEAN'

Unnamed: 0,base,global,soup
1czysty-cut,0.959,0.968,0.96
2delikatne-cut,0.679,0.725,0.779
3mocniej-cut,0.408,0.489,0.533
4delikatne-bezover-cut,0.781,0.813,0.889
balance-scale,0.096,0.089,0.69
cleveland,0.034,0.063,0.077
cleveland_v2,0.027,0.086,0.108
cmc,0.41,0.433,0.452
dermatology,0.952,0.942,0.944
glass,0.222,0.407,0.482


Unnamed: 0,Mean G-mean
soup,0.620353
global,0.549706
base,0.515353


Unnamed: 0,Mean rank
soup,1.470588
global,1.941176
base,2.588235


'G-MEAN'

Unnamed: 0,base,global,soup
1czysty-cut,0.96,0.969,0.961
2delikatne-cut,0.71,0.739,0.784
3mocniej-cut,0.506,0.53,0.556
4delikatne-bezover-cut,0.791,0.819,0.899
balance-scale,0.584,0.525,0.706
cleveland,0.295,0.305,0.318
cleveland_v2,0.297,0.339,0.361
cmc,0.428,0.439,0.458
dermatology,0.957,0.948,0.949
glass,0.623,0.684,0.678


Unnamed: 0,Mean G-mean
soup,0.683529
global,0.667824
base,0.652588


Unnamed: 0,Mean rank
soup,1.588235
global,1.911765
base,2.5


### Rzeczywiste zbiory danych
#### Drzewo

In [23]:
# only_read_dt=True makes print_scores drop the first four rows of the table.
# The tree scores stay empty while only the kNN classifier is evaluated.
print_scores(scores_tree,only_read_dt=True)


'G-MEAN'

Unnamed: 0,base,global,soup
balance-scale,,,
cleveland,,,
cleveland_v2,,,
cmc,,,
dermatology,,,
glass,,,
hayes-roth,,,
new_ecoli,,,
new_led7digit,,,
new_vehicle,,,


Unnamed: 0,Mean G-mean
base,
global,
soup,


Unnamed: 0,Mean rank
base,
global,
soup,


#### kNN

In [24]:
# kNN G-mean with the first four rows of the table dropped.
print_scores(scores_knn,only_read_dt=True)

'G-MEAN'

Unnamed: 0,base,global,soup
balance-scale,0.096,0.089,0.69
cleveland,0.034,0.063,0.077
cleveland_v2,0.027,0.086,0.108
cmc,0.41,0.433,0.452
dermatology,0.952,0.942,0.944
glass,0.222,0.407,0.482
hayes-roth,0.49,0.673,0.503
new_ecoli,0.711,0.712,0.763
new_led7digit,0.749,0.309,0.735
new_vehicle,0.924,0.923,0.888


Unnamed: 0,Mean G-mean
soup,0.568077
global,0.488462
base,0.456462


Unnamed: 0,Mean rank
soup,1.538462
global,2.0
base,2.461538
