# 1 Imports

#### 1.1 General

In [1]:
import numpy as np
import pandas as pd
import re
import docx
from os import chdir, listdir
from math import sqrt

#### 1.2 scikit-learn

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV

#### 1.3 imbalance-learn

In [3]:
from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.datasets import make_imbalance

# 2 Functions

#### 2.1 Helper function

In [4]:
def measures(estimator, X, y):
    y_predicted = estimator.predict(X)
    true_positive = (y_predicted == 1)[y == 1].sum()
    false_positive = (y_predicted == 1)[y == 0].sum()
    true_negative = (y_predicted == 0)[y == 0].sum()
    false_negative = (y_predicted == 0)[y == 1].sum()
    return true_positive, false_positive, true_negative, false_negative

#### 2.2 F-measure

In [5]:
def F_measure(estimator, X, y):
    true_positive, false_positive, true_negative, false_negative = measures(estimator, X, y)
    return 2 * true_positive / (2 * true_positive + false_positive + false_negative)

#### 2.3 G-mean

In [6]:
def G_mean(estimator, X, y):
    true_positive, false_positive, true_negative, false_negative = measures(estimator, X, y)
    sensitivity = true_positive / (true_positive + false_negative)
    specificity = true_negative / (true_negative + false_positive)
    return sqrt(sensitivity * specificity)

#### 2.3 Export table function

In [7]:
def export_table(df, docx_path):
    doc = docx.Document(docx_path)
    table = doc.add_table(df.shape[0]+1, df.shape[1])
    for j in range(df.shape[1]):
        table.cell(0, j).text = df.columns[j]
    for i in range(df.shape[0]):
        for j in range(df.shape[1]):
            table.cell(i+1, j).text = str(df.values[i,j])
    doc.save(docx_path)

# 3 Data sets

#### 3.1 Datasets container

In [8]:
imbalanced_datasets_initial = []
imbalanced_datasets_undersampled = []

#### 3.2 Get csv files

In [9]:
chdir("/Users/gdouzas/Main/data/imbalanced/")
csv_files = listdir()
csv_files

['breast_tissue.csv',
 'ecoli.csv',
 'eucalyptus.csv',
 'glass.csv',
 'haberman.csv',
 'heart.csv',
 'iris.csv',
 'libra.csv',
 'liver_disorders.csv',
 'pima.csv',
 'segment.csv',
 'vehicle.csv',
 'wine.csv']

#### 3.2 Load and append datasets to containers

In [10]:
for file in csv_files:
    ds = np.genfromtxt(file, delimiter=",", skip_header=1)
    target_index = ds.shape[1] - 1
    nan_indices = np.isnan(ds).any(axis=1)
    X, y = ds[~nan_indices, 0:target_index], ds[~nan_indices, target_index].astype(int)
    imbalanced_datasets_initial.append((X, y))

#### 3.3 Undersample and append datasets to containers

In [11]:
for X, y in imbalanced_datasets_initial:
    ratio = (y == 1).sum() / (y == 0).sum()
    X_, y_ = make_imbalance(X, y, ratio=ratio/5, min_c_=1)
    imbalanced_datasets_undersampled.append((X_, y_))

#### 3.4 Final datasets

In [12]:
imbalanced_datasets = imbalanced_datasets_initial + imbalanced_datasets_undersampled

#### 3.5 Imbalance Ratio of datasets

In [13]:
datasets_names_initial = [re.sub(".csv", "", csv_file) for csv_file in csv_files]
datasets_names_undersampled = [ds + "_undersampled" for ds in datasets_names_initial]
datasets_names = datasets_names_initial + datasets_names_undersampled
minority_class_names = 2 * ["car, fad", "pp", "5", "1", "2", "1", "2", "1, 2, 3", "1", "1", "WINDOW", "van", "2"]
datasets_description = pd.DataFrame({}, columns=["Dataset", "Minority class", "# of features", "# of instances", "# of minority instances", "# of majority instances", "Imbalanced ratio"])
for ind, (X, y) in enumerate(imbalanced_datasets):
    num_features = X.shape[1]
    num_instances = y.size
    num_minority_instances = (y == 1).sum()
    num_majority_instances = (y == 0).sum()
    IR = round((y == 0).sum() / (y == 1).sum(), 2)
    datasets_description.loc[len(datasets_description)] = [datasets_names[ind], minority_class_names[ind], num_features, num_instances, num_minority_instances, num_majority_instances, IR]
data_types = ["object", "object", "int64", "int64", "int64", "int64", "float64"]
for ind, feature in enumerate(datasets_description.columns):
    datasets_description[feature] = datasets_description[feature].astype(data_types[ind])
datasets_description

Unnamed: 0,Dataset,Minority class,# of features,# of instances,# of minority instances,# of majority instances,Imbalanced ratio
0,breast_tissue,"car, fad",9,106,36,70,1.94
1,ecoli,pp,7,336,52,284,5.46
2,eucalyptus,5,8,642,98,544,5.55
3,glass,1,9,214,70,144,2.06
4,haberman,2,3,306,81,225,2.78
5,heart,1,13,270,120,150,1.25
6,iris,2,4,150,50,100,2.0
7,libra,"1, 2, 3",90,360,72,288,4.0
8,liver_disorders,1,6,345,145,200,1.38
9,pima,1,8,768,268,500,1.87


#### 3.6 Export as word table

In [15]:
export_table(datasets_description, '/Users/gdouzas/Main/projects/academic/SOMO/table1.docx')

# 4 Hyperparameter tuning

#### 4.1 Define the hyperparameter space

In [14]:
hyperparameters = {'max_depth':[2, 3, 5, 8], 'n_estimators':[10, 50, 80, 100]}

#### 4.2 Get the optimal hyperparameters

In [15]:
optimal_hyperparameters = []
for X, y in imbalanced_datasets:
        clf = GradientBoostingClassifier()
        clfs = GridSearchCV(clf, hyperparameters)
        clfs.fit(X, y)
        optimal_hyperparameters.append(clfs.best_params_)
optimal_hyperparameters

[{'max_depth': 2, 'n_estimators': 50},
 {'max_depth': 2, 'n_estimators': 100},
 {'max_depth': 2, 'n_estimators': 10},
 {'max_depth': 5, 'n_estimators': 80},
 {'max_depth': 3, 'n_estimators': 50},
 {'max_depth': 2, 'n_estimators': 50},
 {'max_depth': 2, 'n_estimators': 10},
 {'max_depth': 5, 'n_estimators': 50},
 {'max_depth': 2, 'n_estimators': 80},
 {'max_depth': 2, 'n_estimators': 100},
 {'max_depth': 5, 'n_estimators': 100},
 {'max_depth': 2, 'n_estimators': 100},
 {'max_depth': 2, 'n_estimators': 80},
 {'max_depth': 2, 'n_estimators': 10},
 {'max_depth': 2, 'n_estimators': 10},
 {'max_depth': 2, 'n_estimators': 10},
 {'max_depth': 2, 'n_estimators': 10},
 {'max_depth': 2, 'n_estimators': 10},
 {'max_depth': 3, 'n_estimators': 100},
 {'max_depth': 2, 'n_estimators': 50},
 {'max_depth': 2, 'n_estimators': 50},
 {'max_depth': 2, 'n_estimators': 10},
 {'max_depth': 3, 'n_estimators': 100},
 {'max_depth': 3, 'n_estimators': 100},
 {'max_depth': 3, 'n_estimators': 50},
 {'max_depth': 2, 

# 5 Experiment

##### 5.1 Parameters

In [16]:
cv_scores = []
metrics = ['roc_auc', F_measure, G_mean]
random_states = [5 * i for i in range(5)]

#### 5.2 Run experiment

In [17]:
for random_state in random_states:
    cv = StratifiedKFold(n_splits=5, random_state=random_state)
    algorithms = [LogisticRegression(random_state=random_state), GradientBoostingClassifier(random_state=random_state)]
    methods = [None, RandomOverSampler(random_state=random_state), SMOTE(random_state=random_state, k_neighbors=3), SMOTE(random_state=random_state, kind='borderline1', k_neighbors=3), ADASYN(random_state=random_state, n_neighbors=3)]
    for algorithm in algorithms:
        for ind, (X, y) in enumerate(imbalanced_datasets):
            if isinstance(algorithm, GradientBoostingClassifier):
                algorithm.set_params(**optimal_hyperparameters[ind])
            for method in methods:
                for metric in metrics:
                    if method is None:
                        clf = algorithm
                    else:
                        clf = make_pipeline(method, algorithm)
                    cv_scores.append(cross_val_score(clf, X, y, cv=cv, scoring=metric).mean())
cv_scores

[0.92882653061224496,
 0.51000000000000001,
 0.55507595273056165,
 0.93290816326530612,
 0.66560637430202652,
 0.70313615713499344,
 0.90918367346938778,
 0.61688888888888893,
 0.68670940033430061,
 0.93877551020408156,
 0.69724159898072946,
 0.72217486568142308,
 0.93086734693877538,
 0.6711304347826087,
 0.69811727354245678,
 0.90709728867623585,
 0.28515406162464985,
 0.38369230193991272,
 0.90439393939393931,
 0.71848107448107446,
 0.87532753990645595,
 0.90324618364092046,
 0.73682291943161504,
 0.88065848880769459,
 0.91223285486443384,
 0.61016261277921124,
 0.78952971014521689,
 0.89859022556390966,
 0.58947181850407659,
 0.80511869770046651,
 0.81495055171057107,
 0.26268285091814503,
 0.39755897440615184,
 0.84712430924405813,
 0.4688922072130392,
 0.73466956621914559,
 0.84018420158449136,
 0.46571908516873622,
 0.73436585675850385,
 0.84431192660550458,
 0.47141581311301656,
 0.7399148164525039,
 0.86007511132571479,
 0.47262189037993696,
 0.74200090731486978,
 0.8093420126

#### 5.3 Results parameters

In [18]:
algorithms_names = ["LR", "GBM"]
metrics_names = ["AUC", "F", "G"]
methods_names = ["None", "Random", "SMOTE", "Borderline SMOTE", "ADASYN"]

#### 5.4 Mean results

In [19]:
mean_cv_scores = np.array(cv_scores).reshape(len(random_states), -1).mean(axis=0).reshape(len(algorithms), len(imbalanced_datasets), len(methods), len(metrics))
mean_results = pd.DataFrame()
for ind1, alg_name in enumerate(algorithms_names):
    for ind2, ds_name in enumerate(datasets_names):
        partial_results = pd.DataFrame(mean_cv_scores[ind1, ind2, :, :].transpose(), columns=methods_names)
        partial_results.insert(0, "Metric", metrics_names)
        partial_results.insert(0, "Dataset", ds_name)
        partial_results.insert(0, "Algorithm", alg_name)
        mean_results = pd.concat([mean_results, partial_results])
mean_results = mean_results.reset_index(drop=True)

#### 5.5 Standard deviation results

In [20]:
std_cv_scores = np.array(cv_scores).reshape(len(random_states), -1).std(axis=0).reshape(len(algorithms), len(imbalanced_datasets), len(methods), len(metrics))
std_results = pd.DataFrame()
for ind1, alg_name in enumerate(algorithms_names):
    for ind2, ds_name in enumerate(datasets_names):
        partial_results = pd.DataFrame(std_cv_scores[ind1, ind2, :, :].transpose(), columns=methods_names)
        partial_results.insert(0, "Metric", metrics_names)
        partial_results.insert(0, "Dataset", ds_name)
        partial_results.insert(0, "Algorithm", alg_name)
        std_results = pd.concat([std_results, partial_results])
std_results = std_results.reset_index(drop=True)

#### 5.6. Export results

In [196]:
methods_names = methods_names
results = mean_results[["Algorithm", "Dataset", "Metric"]].copy()
for ind1, method_name in enumerate(methods_names):
    results[method_name] = round(mean_results[method_name], 3).apply(str) + " \u00B1 " + round(std_results[method_name], 3).apply(str)
for ind, alg_name in enumerate(algorithms_names):
    export_table(results[results["Algorithm"] == alg_name][results.columns[1:]], '/Users/gdouzas/Main/projects/academic/SOMO/table%s.docx' % str(ind + 3))

#### 5.7 Oversampling methods ranking

In [23]:
mean_results

Unnamed: 0,Algorithm,Dataset,Metric,None,Random,SMOTE,Borderline SMOTE,ADASYN,Cluster SMOTE,SOMO
0,LR,breast_tissue,AUC,0.928010,0.929286,0.948112,0.945000,0.933724,0.945423,0.942621
1,LR,breast_tissue,F,0.510000,0.645443,0.654330,0.706849,0.693929,0.674688,0.664150
2,LR,breast_tissue,G,0.555076,0.670669,0.684545,0.726819,0.711404,0.706143,0.687900
3,LR,ecoli,AUC,0.907097,0.903002,0.903145,0.911569,0.897456,0.902774,0.912187
4,LR,ecoli,F,0.285154,0.715849,0.722750,0.614662,0.587837,0.645913,0.699364
5,LR,ecoli,G,0.383692,0.873614,0.877223,0.790958,0.805643,0.835682,0.862589
6,LR,eucalyptus,AUC,0.814986,0.847528,0.843287,0.835169,0.859666,0.855361,0.848730
7,LR,eucalyptus,F,0.262683,0.471137,0.467724,0.454660,0.475030,0.462053,0.471617
8,LR,eucalyptus,G,0.397559,0.741522,0.734123,0.722865,0.741242,0.738283,0.746134
9,LR,glass,AUC,0.809342,0.811312,0.809441,0.810524,0.811115,0.806248,0.819927


In [22]:
ranking

Unnamed: 0,None,Random,SMOTE,Borderline SMOTE,ADASYN,Cluster SMOTE,SOMO
0,7,6,1,3,5,2,4
1,7,6,5,1,2,3,4
2,7,6,5,1,2,3,4
3,3,5,4,2,7,6,1
4,7,2,1,5,6,4,3
5,7,2,1,6,5,4,3
6,7,4,5,6,1,2,3
7,7,3,4,6,1,5,2
8,7,2,5,6,3,4,1
9,6,2,5,4,3,7,1


In [21]:
ranking = mean_results.apply(lambda row: len(row[3:]) - row[3:].argsort().argsort(), axis=1)
aggregated_ranking = round(pd.concat([mean_results[["Algorithm", "Metric"]], ranking], axis=1).groupby(["Algorithm", "Metric"]).mean(), 2)
aggregated_ranking

Unnamed: 0_level_0,Unnamed: 1_level_0,None,Random,SMOTE,Borderline SMOTE,ADASYN,Cluster SMOTE,SOMO
Algorithm,Metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GBM,AUC,4.81,4.88,3.46,4.27,4.31,3.65,2.62
GBM,F,5.65,4.58,3.62,3.65,4.0,3.46,3.04
GBM,G,6.31,4.19,3.62,3.73,3.73,3.23,3.19
LR,AUC,4.92,4.08,4.08,4.23,4.23,3.92,2.54
LR,F,5.54,4.65,3.54,4.0,3.27,4.12,2.88
LR,G,6.23,4.38,3.65,4.27,3.69,3.31,2.46


#### 5.8 Export rankings

In [201]:
export_table(pd.concat([aggregated_ranking[3:6], aggregated_ranking[0:3]]), '/Users/gdouzas/Main/projects/academic/SOMO/table5.docx')

#### 5.9 Friedman test

In [272]:
friedman_results = pd.DataFrame({}, columns=["Algorithm", "Metric", "p-value"])
for alg_name in algorithms_names:
    for metric_name in metrics_names:
        partial_ranking = pd.concat([mean_results[["Algorithm", "Metric"]], ranking], axis=1)[(mean_results["Algorithm"] == alg_name) & (mean_results["Metric"] == metric_name)]
        friedman_inputs = []
        for method_name in methods_names:
            friedman_inputs.append(partial_ranking[method_name])
        pvalue = round(scipy.stats.friedmanchisquare(*friedman_inputs).pvalue, 2)
        friedman_results.loc[len(friedman_results)] = [alg_name, metric_name, pvalue]
friedman_results

Unnamed: 0,Algorithm,Metric,p-value
0,LR,AUC,6.122881e-16
1,LR,F,4.46386e-08
2,LR,G,2.249841e-11
3,GBM,AUC,4.305293e-05
4,GBM,F,0.002043343
5,GBM,G,1.75001e-06


#### Export Friedman test results

In [273]:
export_table(friedman_results, '/Users/gdouzas/Main/projects/academic/SOMO/table6.docx')