In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from DataGenerator import ImbalanceGenerator
from sklearn.model_selection import train_test_split
from Hierarchy import EngineTaxonomy
from anytree import RenderTree
import numpy as np
import random
from pathlib import Path
import pandas as pd
import sklearn.metrics as skm
from fairlearn.metrics import MetricFrame
from sklearn.metrics import accuracy_score, silhouette_score, davies_bouldin_score, calinski_harabasz_score
from pymfe.mfe import MFE

from fairlearn.metrics import count
from sklearn.linear_model import LogisticRegression
from dcm import dcm


### Loading Adult Data

Load Adult Data and extract statistics/Metrics out of it

In [5]:
import openml
# Load ADult
dataset_id = 1590
dataset = openml.datasets.get_dataset(dataset_id)
(X, y, categorical, names) = dataset.get_data(
    target=dataset.default_target_attribute,
    dataset_format="dataframe", 
)
print(categorical)
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

X_num = X.select_dtypes(include=numerics)
X_num
X

2022-03-22 13:12:22 INFO openml.datasets.dataset: pickle load data adult


[False, True, False, True, False, True, True, True, True, True, False, False, False, True]


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802.0,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40,United-States
1,38,Private,89814.0,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50,United-States
2,28,Local-gov,336951.0,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40,United-States
3,44,Private,160323.0,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40,United-States
4,18,,103497.0,Some-college,10,Never-married,,Own-child,White,Female,0.0,0.0,30,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302.0,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38,United-States
48838,40,Private,154374.0,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40,United-States
48839,58,Private,151910.0,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40,United-States
48840,22,Private,201490.0,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20,United-States


In [None]:
groups = X["sex"]
groups = [int(g == 'Male') for g in groups]
groups_race = [int(g == 'Black') for g in X["race"]]
#print(groups)
print(f"Silhouette: {silhouette_score(X_num,y)}")
print(f"DBI: {davies_bouldin_score(X_num,y)}")
print(f"CHI: {calinski_harabasz_score(X_num,y)}")
print(f"Gini: {ImbalanceGenerator().gini(y)}")
print(f"Gini (sex): {ImbalanceGenerator().gini(groups)}")
print(f"Gini (race): {ImbalanceGenerator().gini(groups_race)}")

### DON'T DO THIS!!!#####
# Takes too much RAM ...
#mfe = MFE(groups=["all"], features=["c1","c2","sil", "f1v", "f2", "n1", "n2", "impconceptvar", "wg_dist"])
#mfe.fit(X_num.to_numpy(), y.to_numpy())
#ft = mfe.extract()
#print(f"{x}: y" for x, y in zip(ft[0], ft[1]))
########

Silhouette: -0.004170237204554553
DBI: 37.04498080341358
CHI: 14.007493479729758
Gini: 0.2607182343065393
Gini (sex): 0.16848204414233647
Gini (race): 0.40407845706564016




## Getting Statistics from Generated Data

Generate Data and store statistics. Complex statistics are stored using pymfe.

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from rpy2.robjects.packages import importr

f = 100
n = 1000

gi = 'medium'
ci = 'medium'
c = 10
gs = 0.25

metric_mapping = {'c1': "Class entropy", "c2": "Imbalance Degree", "f1v.mean": "Fishers DR", "f2.mean": "Class Overlap", "vdu": "Dunn Index",
                 "n1": "Border Points", "n2.mean": "Inter/Intra Class Dist", "sil": "SIL", "n3.mean": "NN Error", "ch": "CHI"}

metric_mapping = {'c1': "Class entropy", "c2": "Imbalance Degree", "f1v": "Fishers DR", "f2": "Class Overlap", "vdu": "Dunn Index",
                 "n1": "Border Points", "n2": "Inter/Intra Class Dist", "sil": "SIL", "n3": "NN Error", "ch": "CHI"}

result_df = pd.DataFrame()
for c in [10, 30, 50, 70, 100]:#, 100]:
    for gs in [0, 0.25, 0.5, 0.75, 1.0]:
        for gi in ['balanced', 'medium', 'imbalanced']:
            for ci in ['balanced', 'medium', 'imbalanced']:

                print('---------------------------------')
                print(f'---- #classes: {c}, gs={gs}--------')
                generator = ImbalanceGenerator(n_features=f,
                                           n=n,
                                           c=c,
                                           features_remove_percent=0,
                                           hardcoded=False,
                                           group_imbalance=gi,
                                           cls_imbalance=ci,
                                           class_overlap=1.5,
                                           root=EngineTaxonomy().create_taxonomy(),
                                           gs=gs,
                                           n_group_features=10)
                df = generator.generate_data_with_product_hierarchy()
                df = df.dropna(how='all')
                X, y = df[[f"F{i}" for i in range(f)]].to_numpy(), df["target"].to_numpy()
                groups = df["group"].to_numpy()
                
                stats = {}
                ##########################################
                # Drop rows where we have only NaN values! --> Happens if we have two classes for only one sample ...
                #not_nan_rows = [id_ for id_, x in enumerate(X.isna().sum(axis=1)) if x == 0]
                #X = X.to_numpy()[not_nan_rows, :]
                #y = y.to_numpy()[not_nan_rows]
                #groups = groups[not_nan_rows].to_numpy()
                #########################################
                complexit_metrics = ["f1v", "n1", "n3",]
                #####################################
                ## Complexity metrics  from PyMFE ###
                mfe = MFE(groups=["all"], features=complexit_metrics, 
                          summary=None)
                mfe.fit(X, y)
                ft = mfe.extract()
                for metric, value in zip(ft[0], ft[1]):
                    print(f"{metric_mapping[metric]} (C): {value}")
                    stats[f"{metric_mapping[metric]} (C)"] = value
                continue
                
                for group in groups:
                    group_df = df[df["group"] == group]
                    group_X, group_y = group_df[[f"F{i}" for i in range(f)]].to_numpy(), group_df["target"].to_numpy()
                    mfe = MFE(groups=["all"], features=complexit_metrics, summary=["mean"])
                    mfe.fit(group_X, group_y)
                    ft = mfe.extract()
                    for metric, value in zip(ft[0], ft[1]):
                        print(f"{metric_mapping[metric]} (G): {value}")
                        stats[f"{metric_mapping[metric]} (G)"] = value
                    
                ######################################
                stats_df[f"Gini (C)"] = generator.gini(y)
                stats_df[f"Gini (G)"] = generator.gini(groups)
                
                stats_df = pd.DataFrame({metric_mapping[metric]: [value] for metric, value in stats_dict.items()})

                # Basic stats (#instances etc.)
                stats_df["avg #n groups"] = df.groupby(['group']).size().mean()
                stats_df["avg #n classes+groups"] = df.groupby(['group', 'target']).size().mean()
                stats_df["min #n groups"] = df.groupby(['group']).size().min()
                stats_df["max #n groups"] = df.groupby(['group']).size().max()

                stats_df["target"] = target_name
                stats_df["#c"] = c
                stats_df["gs"] = gs
                stats_df["#n"] = n
                stats_df["gi"] = gi
                stats_df["ci"] = ci

                result_df = pd.concat([result_df, stats_df])

                result_df.to_csv('complexity_metrics.csv', sep=';', decimal=',')

---------------------------------
---- #classes: 10, gs=0--------
features that are currently not used: []
Fishers DR (C): [0.11851664 0.11414226 0.06513007 0.01117213 0.11529027 0.03196682
 0.00763868 0.05692798 0.04819866 0.17478719 0.18340871 0.04670633
 0.13525853 0.11829427 0.05454593 0.12647413 0.14833988 0.176784
 0.12196554 0.14011589 0.06536736 0.0533614  0.19423448 0.14275483
 0.03161747 0.12734315 0.05808788 0.04152191 0.10770614 0.1764405
 0.03732524 0.01582012 0.00154818 0.03531833 0.04417765 0.07512624
 0.01819383 0.12514176 0.09270522 0.00570789 0.09575337 0.05810333
 0.07179398 0.10310402 0.21414404]
Border Points (C): 0.771
NN Error (C): [1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 0 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1
 1 1 0 1 1 1 1 0 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 0 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 

features that are currently not used: []
Fishers DR (C): [0.11939018 0.14696901 0.08134678 0.08861677 0.10091805 0.11114572
 0.08269545 0.20523423 0.12924338 0.07788953 0.06080866 0.04180455
 0.087697   0.04552521 0.07030317 0.08565857 0.11086414 0.09247004
 0.10143628 0.11057382 0.11192033 0.06103487 0.11239496 0.0847711
 0.03582104 0.09218078 0.04216372 0.06332301 0.07742567 0.0501971
 0.08808905 0.06359172 0.05634352 0.1171723  0.08147612 0.129111
 0.08639328 0.15169654 0.15857821 0.05123973 0.1084376  0.12571044
 0.11626043 0.08708802 0.22359578]
Border Points (C): 0.808
NN Error (C): [1 1 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1 1 0 1 1 1 0 1 0 0 1
 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

features that are currently not used: []
Fishers DR (C): [0.11689589 0.26655287 0.08919454 0.10911039 0.02291762 0.03453062
 0.01979325 0.14699123 0.06369065 0.14275793 0.04950049 0.0121268
 0.02169401 0.00697483 0.00186973 0.09064794 0.01666297 0.09415331
 0.07163064 0.02349714 0.03499734 0.01763634 0.18363361 0.0666138
 0.14410039 0.1039284  0.13511794 0.15372686 0.06840969 0.07811982
 0.22367697 0.18769942 0.28561289 0.04442631 0.13509296 0.43322087
 0.55858885 0.01388602 0.10571471 0.32135777 0.01961282 0.20188551
 0.01058028 0.20400986 0.07333457]
Border Points (C): 0.66
NN Error (C): [0 1 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 1 0 0 0 0 1 1 0 1 1 1 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0
 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1
 1 1 0 0 0 1 0 0 0 0 0 0 0 1

features that are currently not used: []
Fishers DR (C): [0.12374977 0.06886506 0.05464677 0.00616178 0.01085471 0.0064855
 0.00177769 0.00276721 0.00290988 0.07786755 0.04392679 0.00534887
 0.01032472 0.00578426 0.00137001 0.00230177 0.00197918 0.11036037
 0.0206907  0.02347512 0.01854579 0.00600968 0.00860013 0.0066877
 0.01925289 0.02316378 0.02060992 0.00418403 0.00622216 0.00486788
 0.06890406 0.04788599 0.00560675 0.01320658 0.00868991 0.12188058
 0.04310051 0.06594432 0.06639822 0.04184499 0.0572513  0.06851259
 0.07279375 0.0802262  0.29172024]
Border Points (C): 0.604
NN Error (C): [1 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 1 1 0 1 0 1
 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0
 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 1
 1 1 0 0 0 0 1 0 1 0 0 1 0 

features that are currently not used: []
Fishers DR (C): [0.18708537 0.15276777 0.13620781 0.02301883 0.02872722 0.0203192
 0.04994211 0.0524406  0.0489483  0.21801131 0.19513944 0.03212641
 0.03253431 0.03332657 0.03467627 0.0418514  0.0414756  0.23092841
 0.06227711 0.04497062 0.05705608 0.06824011 0.05296539 0.05819791
 0.04406273 0.036107   0.0379351  0.04322863 0.04702688 0.05509597
 0.10315845 0.05825805 0.0637825  0.03582566 0.03994975 0.13889574
 0.11591501 0.02221777 0.01954039 0.13154716 0.03630276 0.02546882
 0.03497044 0.04407747 0.25226097]
Border Points (C): 0.614
NN Error (C): [0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 0 1 1 0 0 0 0 1 0 1 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 0
 0 1 1 1 1 0 1 1 1 0 0 1 1 1 1 0 1 0 1 0 0 1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0
 1 0 1 1 0 0 1 0 1 0 1 1 1 1 0 0 0 1 0 0 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 1
 1 1 1 1 1 0 1 1 0 1 1 1 0 1 0 0 1 1 0 0 0 0 1 1 0 1 1 1 1 1 1 0 0 0 1 0 1
 0 1 0 1 0 1 0 1 0 0 1 0 0

features that are currently not used: []
Fishers DR (C): [0.07491132 0.01740267 0.00881554 0.00182531 0.00235521 0.00123807
 0.00056757 0.00079163 0.00069144 0.12567989 0.05495725 0.00324397
 0.0040788  0.00212882 0.00099749 0.00096573 0.00096471 0.15437524
 0.02175868 0.01892733 0.01268577 0.00383522 0.00327959 0.00324414
 0.00768177 0.01298267 0.00732716 0.00169692 0.00311863 0.00264415
 0.04129335 0.01344515 0.00080583 0.0028904  0.00214473 0.06918079
 0.00259356 0.00406395 0.00286356 0.00330453 0.00744984 0.00237075
 0.05391521 0.01980115 0.23655421]
Border Points (C): 0.568
NN Error (C): [1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0
 0 1 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 1 1 1 0 0
 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 1 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0
 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 1 0 1 1 

features that are currently not used: []
Fishers DR (C): [6.48140006e-02 4.86627147e-02 1.81061719e-02 4.38927056e-03
 3.70237981e-03 2.24416197e-03 7.83140001e-04 8.87933321e-04
 7.45722158e-04 1.90990530e-02 6.46460195e-02 2.06234205e-03
 2.34472306e-03 3.97947588e-03 1.78193061e-05 6.99320758e-04
 7.42499621e-03 4.31843381e-02 1.80958245e-02 1.23083569e-02
 7.50949772e-03 2.86699881e-03 5.35182721e-03 3.43335070e-03
 6.31864639e-03 5.00049306e-03 8.66071410e-03 1.77814108e-04
 2.17128854e-03 1.18049412e-02 3.85014708e-02 1.98864504e-02
 3.47228669e-03 6.60888566e-03 3.17311510e-03 4.22159858e-02
 3.46923984e-02 1.92270117e-02 9.88416079e-03 4.63817864e-03
 1.96036563e-02 1.20522674e-02 3.80962086e-02 6.21016413e-04
 5.24847417e-02]
Border Points (C): 0.377
NN Error (C): [0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 0 0 0 0
 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1

features that are currently not used: []
Fishers DR (C): [0.05895373 0.10965607 0.04376738 0.02401151 0.0129201  0.00782598
 0.01076705 0.01302565 0.01171245 0.10442437 0.02979743 0.00581983
 0.00067654 0.00292578 0.00256614 0.01001003 0.00782362 0.10356129
 0.02980637 0.01411855 0.01185775 0.01911851 0.01541584 0.01567114
 0.03857178 0.0540235  0.03086266 0.07929525 0.00936376 0.02630738
 0.2183091  0.08262578 0.22724334 0.010187   0.02899314 0.19867308
 0.53102986 0.0073734  0.05497114 0.22939902 0.00853613 0.05532036
 0.00761508 0.09412507 0.0702171 ]
Border Points (C): 0.365
NN Error (C): [1 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0
 0 1 0 1 1 0 1 0 1 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0
 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1
 1 1 0 0 0 1 0 0 0 0 0 0 

features that are currently not used: []
Fishers DR (C): [1.21088764e-01 4.44510832e-02 2.82909570e-02 1.22596676e-03
 2.15140840e-03 1.28151859e-03 2.54091449e-04 3.81230217e-04
 4.06063708e-04 5.33456943e-02 2.81960251e-02 1.31255264e-03
 2.10415035e-03 1.14053906e-03 1.84129162e-04 3.26561992e-04
 2.76233751e-04 1.12865731e-01 6.49518733e-03 9.43390579e-03
 5.53622104e-03 1.22086944e-03 1.96904312e-03 1.69930063e-03
 4.79518650e-03 7.62476423e-03 6.96488505e-03 7.06403975e-04
 1.26094329e-03 1.16419000e-03 6.47142169e-02 4.64288695e-02
 1.36808415e-03 2.41305003e-03 1.73877892e-03 1.08131814e-01
 2.98582778e-02 4.17587432e-02 4.44545935e-02 4.12877005e-02
 3.92642563e-02 4.73426767e-02 6.76672076e-02 6.77370480e-02
 2.85471353e-01]
Border Points (C): 0.569
NN Error (C): [1 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0
 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 1 0 1 1 0 1 0 1 0 1 1 1 1 0 1 0 1
 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 1 1 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0

features that are currently not used: []
Fishers DR (C): [0.17598774 0.11242065 0.09973852 0.02360395 0.02427325 0.01877126
 0.04470983 0.03604072 0.03880594 0.18707343 0.15699693 0.03163121
 0.02685369 0.0245211  0.02632206 0.0308171  0.03156295 0.2265729
 0.0502252  0.03564819 0.04323006 0.05369885 0.03298362 0.03677535
 0.04093169 0.02870272 0.03459509 0.04007918 0.03201485 0.03497213
 0.10486079 0.05807214 0.06221634 0.02856592 0.02840318 0.11406435
 0.0944021  0.01672437 0.01455191 0.1235954  0.02408042 0.02007536
 0.0234853  0.02836903 0.25229007]
Border Points (C): 0.586
NN Error (C): [0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1 0 1 0 1 0 1 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 0 0 1 0 1 0 0 0
 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0
 1 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0
 1 1 1 1 0 0 1 1 0 1 1 1 1 0 0 0 0 1 0 0 1 0 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1
 0 1 0 1 0 1 0 1 0 0 1 1 0

features that are currently not used: []
Fishers DR (C): [6.92620161e-02 5.99897656e-03 3.31315458e-03 4.51506562e-04
 6.86945948e-04 3.42025109e-04 1.59331453e-04 2.24094042e-04
 1.81632558e-04 1.13435376e-01 3.81186611e-02 1.01198601e-03
 1.28137721e-03 6.10874280e-04 2.92431221e-04 2.76426219e-04
 2.61111815e-04 1.56192963e-01 7.20820920e-03 7.93399691e-03
 4.66171862e-03 1.32860094e-03 1.12334976e-03 1.02992813e-03
 3.02149962e-03 4.28976325e-03 2.82273597e-03 5.68528664e-04
 9.80520810e-04 7.57114759e-04 3.92678715e-02 1.14872091e-02
 2.12012237e-04 1.00847845e-03 7.09594813e-04 6.25849744e-02
 9.22302234e-04 1.33147544e-03 8.62677352e-04 2.61468715e-03
 2.03891488e-03 7.49109573e-04 4.75844543e-02 1.93591994e-02
 2.39434401e-01]
Border Points (C): 0.552
NN Error (C): [1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0
 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0

In [88]:
def extract_mfes(X, y, meta_feature_set, summary=["mean"], groups=["all"]):
    mfe = MFE(groups=["all"], features=meta_feature_set,
              summary=summary)
    mfe.fit(X, y)
    ft = mfe.extract()
    return ft

metric_mapping = {'c1': "Class entropy", "c2": "Imbalance Degree", "f1v.mean": "Fishers DR", "f2.mean": "Class Overlap", "vdu": "Dunn Index",
                 "n1": "Border Points", "n2.mean": "Inter/Intra Class Dist", "sil": "SIL", "n3.mean": "NN Error", "ch": "CHI"}
c = 100
gs = 0
f = 100
n = 1000
gi = 'very_balanced'
ci = 'very_balanced'
generator = ImbalanceGenerator(n_features=f,
                           n_samples_total=n,
                           total_n_classes=c,
                           features_remove_percent=0,
                           hardcoded=False,
                           group_imbalance=gi,
                           cls_imbalance=ci,
                           class_overlap=1.5,
                           root=EngineTaxonomy().create_taxonomy(),
                           group_separation=gs,
                           n_group_features=10)
df = generator.generate_data_with_product_hierarchy()
print(df.shape)
df = df.dropna(how='any')
print(df.shape)
X, y = df[[f"F{i}" for i in range(f)]].to_numpy(), df["target"].to_numpy()
groups = df["group"].unique()



---------------------------------
---- #classes: 100, gs=0--------
features that are currently not used: []
(1000, 107)
(1000, 107)


In [89]:
print('---------------------------------')
print(f'---- #classes: {c}, gs={gs}--------')
stats = {}
##########################################
# Drop rows where we have only NaN values! --> Happens if we have two classes for only one sample ...
#not_nan_rows = [id_ for id_, x in enumerate(X.isna().sum(axis=1)) if x == 0]
#X = X.to_numpy()[not_nan_rows, :]
#y = y.to_numpy()[not_nan_rows]
#groups = groups[not_nan_rows].to_numpy()
#########################################
complexit_metrics = ["f1v", "f2", "n1", "n2", "c1", "c2", "n3",]
CVI_s = ["sil", "ch", "vdu"]
#####################################
## Complexity metrics  from PyMFE ###
ft = extract_mfes(X,y,complexit_metrics)
for metric, value in zip(ft[0], ft[1]):
    print(f"{metric_mapping[metric]} (C): {value}")
    stats[f"{metric_mapping[metric]} (C)"] = value


for group in groups:
    group_df = df[df["group"] == group]
    group_X, group_y = group_df[[f"F{i}" for i in range(f)]].to_numpy(), group_df["target"].to_numpy()
    ft = extract_mfes(group_X,group_y, complexit_metrics)
    for metric, value in zip(ft[0], ft[1]):
        metric_name = f"{metric_mapping[metric]} (G)"
        if metric_name in stats:
            stats[f"{metric_mapping[metric]} (G)"].append(value)
        else:
            stats[f"{metric_mapping[metric]} (G)"] = [value]

for key, value in stats.items():
    if "(G)" in key:
        stats[key] = np.nanmean(value)
        
ft = extract_mfes(X, y, CVI_s)
for metric, value in zip(ft[0], ft[1]):
    print(f"{metric_mapping[metric]}: {value}")
    stats[f"{metric_mapping[metric]}"] = value

print(stats)

---------------------------------
---- #classes: 100, gs=0--------




Class entropy (C): 0.9662486989182697
Imbalance Degree (C): 0.003101517527922426
Fishers DR (C): 0.4889595538591901
Class Overlap (C): 1.1347203224354809e-10
Border Points (C): 0.971
Inter/Intra Class Dist (C): 0.5539196892195815
NN Error (C): 0.956


 Exception message: ValueError('expected matrix').
 Will set it as 'np.nan' for all summary functions.
 Exception message: ValueError('expected matrix').
 Will set it as 'np.nan' for all summary functions.


CHI: 1.1896144478301796
SIL: -0.05636389132834306
Dunn Index: 0.0
{'Class entropy (C)': 0.9662486989182697, 'Imbalance Degree (C)': 0.003101517527922426, 'Fishers DR (C)': 0.4889595538591901, 'Class Overlap (C)': 1.1347203224354809e-10, 'Border Points (C)': 0.971, 'Inter/Intra Class Dist (C)': 0.5539196892195815, 'NN Error (C)': 0.956, 'Class entropy (G)': 0.9845745347192153, 'Imbalance Degree (G)': 0.011084872217797954, 'Fishers DR (G)': 0.8266767236588964, 'Class Overlap (G)': 1.0280397974792868e-20, 'Border Points (G)': 0.8683133755940131, 'Inter/Intra Class Dist (G)': 0.5212593117138279, 'NN Error (G)': 0.802475154225389, 'CHI': 1.1896144478301796, 'SIL': -0.05636389132834306, 'Dunn Index': 0.0}


In [3]:


np.random.seed(0)
random.seed(0)
stats = []

for n in [1000, 2000, 5000, 7000, 10000]:
    for f in [20, 50, 80, 100]:
        for c in [10, 30, 50, 70, 100]:
            for gi in ["very_balanced", "balanced", "medium","imbalanced", "very_imbalanced"]:
                for ci in ["very_balanced", "balanced", "medium","imbalanced", "very_imbalanced"]:
                    for gs in [0, 0.5, 1, 3, 5]:
                        generator = ImbalanceGenerator(n_features=f,
                                                       n_samples_total=n,
                                                       total_n_classes=c,
                                                       features_remove_percent=0,
                                                       hardcoded=False,
                                                       group_imbalance=gi,
                                                       cls_imbalance=ci,
                                                       class_overlap=1.5,
                                                       root=EngineTaxonomy().create_taxonomy(),
                                                       group_separation=gs,
                                                       n_group_features=10)

                        dataset_name = f'n{n}_f{f}_c{c}_gi{gi}_ci{ci}_gs{gs}'
                        df = generator.generate_data_with_product_hierarchy()
                        print(RenderTree(generator.root))
                        df_train, df_test = train_test_split(df, train_size=0.7, stratify=df[["group", "target"]])

                        Path("data").mkdir(parents=True, exist_ok=True)
                        Path("stats").mkdir(parents=True, exist_ok=True)
                        Path("predictions").mkdir(parents=True, exist_ok=True)

                        df_train.to_csv(f'data/train_{dataset_name}.csv')
                        df_test.to_csv(f'data/test_{dataset_name}.csv')

                        #### Dataset statistics ####
                        class_gini = generator.gini(df['target'])
                        group_gini = generator.gini(df['group'])
                        avg_n_groups = df.groupby(['group']).size().mean()
                        avg_c_groups = df.groupby(['group', 'target']).size().mean()
                        min_n_groups = df.groupby(['group']).size().min()
                        max_n_groups = df.groupby(['group']).size().max()

                        stats.append({'gini class': class_gini,
                                      'gini group': group_gini, 
                                      'min n groups': min_n_groups,
                                      'max n groups': max_n_groups,
                                      'avg n groups': avg_n_groups, 
                                      'avg c groups': avg_c_groups,
                                     'dataset':dataset_name})
                        pd.DataFrame(stats).to_csv('stats/stats.csv')

                        ### Prediction part ###
                        X_train, y_train = df_train[[f"F{i}" for i in range(f)]], df_train["target"]
                        X_test, y_test = df_test[[f"F{i}" for i in range(f)]], df_test["target"]

                        ## Whole data ##
                        model_X = LogisticRegression()
                        model_X.fit(X_train, y_train)
                        y_pred_X = model_X.predict(X_test)
                        mf_X = MetricFrame({'accuracy':skm.accuracy_score,
                                'count': count}, y_true=y_test, y_pred=y_pred_X, sensitive_features=df_test['group'])
                        mf_X.by_group.to_csv(f'predictions/X_{dataset_name}.csv')

                        ## For each group ##
                        model_repo = {}
                        for group in df_train["group"]:
                            group_df = df_train[df_train["group"] == group]
                            group_X = group_df[[f"F{i}" for i in range(f)]]
                            group_y = group_df["target"]
                            model = LogisticRegression()
                            model.fit(group_X, group_y)
                            model_repo[group] = model
                        df_test["predictions"] = df.apply(lambda row: model_repo[row["group"]].predict(row[[f"F{i}" for i in range(f)]].to_numpy().reshape(1,-1))[0], axis=1)
                        mf = MetricFrame({'accuracy':skm.accuracy_score,
                                'count': count},y_true=df_test['target'],y_pred=df_test['predictions'],
                                         sensitive_features=df_test['group'])
                        mf.by_group.to_csv(f'predictions/group_{dataset_name}.csv')

features that are currently not used: []
Level-0;Engine[n_samples=1000, n_classes=10, classes=(0, 10)]
├── Level-1;Diesel[n_samples=496, n_classes=6, classes=(0, 6)]
│   ├── Level-2;DE-OM1[n_samples=160, n_classes=2, classes=(0, 2)]
│   │   ├── Level-3;DE-OM1-2[n_samples=26, n_classes=2, classes=(0, 2), class_occurences=[12, 14]]
│   │   ├── Level-3;DE-OM1-3[n_samples=27, n_classes=2, classes=(0, 2), class_occurences=[13, 14]]
│   │   ├── Level-3;DE-OM1-4[n_samples=34, n_classes=2, classes=(0, 2), class_occurences=[16, 18]]
│   │   ├── Level-3;DE-OM1-5[n_samples=36, n_classes=2, classes=(0, 2), class_occurences=[17, 19]]
│   │   └── Level-3;DE-OM1-6[n_samples=37, n_classes=2, classes=(0, 2), class_occurences=[17, 20]]
│   ├── Level-2;DE-OM2[n_samples=161, n_classes=3, classes=(2, 5)]
│   │   ├── Level-3;DE-OM2-1[n_samples=30, n_classes=2, classes=(2, 4), class_occurences=[14, 16]]
│   │   ├── Level-3;DE-OM2-2[n_samples=31, n_classes=2, classes=(3, 5), class_occurences=[15, 16]]
│   │  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Found 29 subgroups. Evaluation may be slow




Found 29 subgroups. Evaluation may be slow


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [10]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, train_size=0.7, stratify=df["group"])

## Run Classifier on whole data and on each group separately

In [61]:
from sklearn.ensemble import RandomForestClassifier

X_train, y_train = df_train[[f"F{i}" for i in range(100)]], df_train["target"]
X_test, y_test = df_test[[f"F{i}" for i in range(100)]], df_test["target"]

model_X = RandomForestClassifier()
model_X.fit(X_train, y_train)

RandomForestClassifier()

In [62]:
y_pred_X = model_X.predict(X_test)

In [63]:
accuracy_score(y_pred_X, y_test)

0.11

In [58]:
model_X = LogisticRegression()
model_X.fit(X_train, y_train)
y_pred_X = model_X.predict(X_test)
accuracy_score(y_pred_X, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.13333333333333333

In [33]:
import pandas as pd
pd.DataFrame({'y_true': y_test, 'y_pred': y_pred_X, 'group': df_test['group']})

Unnamed: 0,y_true,y_pred,group
249,35,12,DE-OM3-3
611,13,47,GE-OM3-6
433,45,33,GE-OM1-6
770,67,12,GE-OM3-11
852,88,75,GE-OM3-12
...,...,...,...
535,33,23,GE-OM1-7
151,25,12,DE-OM2-6
987,91,47,GE-OM3-13
256,12,12,DE-OM3-3


In [34]:
import sklearn.metrics as skm
from fairlearn.metrics import MetricFrame
from sklearn.metrics import accuracy_score
from fairlearn.metrics import count
from functools import partial

mf_X = MetricFrame({'accuracy': skm.accuracy_score, 
                    'F1': partial(skm.f1_score, average='weighted'), 
                    'prec': partial(skm.precision_score, average='weighted'), 
                    'recall': partial(skm.recall_score, average='weighted'),
                            'count': count},
                 y_true=y_test,
                 y_pred=y_pred_X,
                 sensitive_features=df_test['group'])




In [35]:
mf_X.by_group

Unnamed: 0_level_0,accuracy,F1,prec,recall,count
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DE-OM1-2,0.0,0.0,0.0,0.0,1
DE-OM1-3,0.0,0.0,0.0,0.0,2
DE-OM1-4,0.0,0.0,0.0,0.0,2
DE-OM1-5,0.0,0.0,0.0,0.0,5
DE-OM1-6,0.0,0.0,0.0,0.0,8
DE-OM2-1,0.0,0.0,0.0,0.0,3
DE-OM2-2,0.0,0.0,0.0,0.0,4
DE-OM2-3,0.0,0.0,0.0,0.0,4
DE-OM2-5,0.0,0.0,0.0,0.0,7
DE-OM2-6,0.0,0.0,0.0,0.0,11


In [92]:
train_test_split(df, train_size=0.7, stratify=df[["group"]])
model_repo = {}
for group in df_train["group"].unique():
    group_df = df_train[df_train["group"] == group]
    group_X = group_df[[f"F{i}" for i in range(100)]].to_numpy()
    group_y = group_df["target"].to_numpy()
    model = RandomForestClassifier()
    model.fit(group_X, group_y)
    model_repo[group] = model

In [83]:
#df_test["predictions"] =
y_group_pred = df_test.apply(lambda row: model_repo[row["group"]].predict(row[[f"F{i}" for i in range(100)]].to_numpy().reshape(1,-1))[0], axis=1).to_numpy()

In [84]:
print(len(y_group_pred))

300


In [85]:
print(accuracy_score(y_group_pred, df_test["target"]))

0.37


In [71]:
mf = MetricFrame({'accuracy': skm.accuracy_score, 
                    'F1': partial(skm.f1_score, average='weighted'), 
                    'prec': partial(skm.precision_score, average='weighted'), 
                    'recall': partial(skm.recall_score, average='weighted'),
                            'count': count},
                 y_true=df_test['target'],
                 y_pred=df_test['predictions'],
                 sensitive_features=df_test['group'])
mf.by_group



Unnamed: 0_level_0,accuracy,F1,prec,recall,count
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DE-OM1-2,0.0,0.0,0.0,0.0,1
DE-OM1-3,1.0,1.0,1.0,1.0,2
DE-OM1-4,0.5,0.333333,0.25,0.5,2
DE-OM1-5,0.6,0.45,0.36,0.6,5
DE-OM1-6,0.5,0.333333,0.25,0.5,8
DE-OM2-1,1.0,1.0,1.0,1.0,3
DE-OM2-2,0.5,0.5,0.5,0.5,4
DE-OM2-3,0.0,0.0,0.0,0.0,4
DE-OM2-5,0.0,0.0,0.0,0.0,7
DE-OM2-6,0.454545,0.30303,0.227273,0.454545,11


In [75]:
mf_g = mf.by_group
mf_g["Model"] = "G"
mf_x = mf_X.by_group
mf_x["Model"] = "X"
pd.concat([mf.by_group, mf_X.by_group])

Unnamed: 0_level_0,accuracy,F1,prec,recall,count,Model
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DE-OM1-2,0.0,0.0,0.0,0.0,1,G
DE-OM1-3,1.0,1.0,1.0,1.0,2,G
DE-OM1-4,0.5,0.333333,0.25,0.5,2,G
DE-OM1-5,0.6,0.45,0.36,0.6,5,G
DE-OM1-6,0.5,0.333333,0.25,0.5,8,G
DE-OM2-1,1.0,1.0,1.0,1.0,3,G
DE-OM2-2,0.5,0.5,0.5,0.5,4,G
DE-OM2-3,0.0,0.0,0.0,0.0,4,G
DE-OM2-5,0.0,0.0,0.0,0.0,7,G
DE-OM2-6,0.454545,0.30303,0.227273,0.454545,11,G


In [73]:
new_df = mf.by_group.copy()
new_df["accuracy"] = mf.by_group["accuracy"] - mf_X.by_group["accuracy"]
new_df

Unnamed: 0_level_0,accuracy,count
group,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-1.0,1
10,0.142857,7
11,0.4,10
12,0.09375,32
13,0.0,1
14,1.0,2
15,0.6,5
16,-0.133333,15
17,0.6,5
18,0.142857,7


### Run R Library (ECoL) 

In [15]:
c = 20
gs = 0
f = 50
n = 1000
gi = 'medium'
ci = 'medium'
generator = ImbalanceGenerator(n_features=f,
                           n=n,
                           c=c,
                           features_remove_percent=0,
                           hardcoded=False,
                           group_imbalance=gi,
                           cls_imbalance=ci,
                           class_overlap=1.5,
                           root=EngineTaxonomy().create_taxonomy(),
                           gs=gs,
                           n_group_features=10)
df = generator.generate_data_with_product_hierarchy()

features that are currently not used: []


In [16]:
#df.to_csv("test.csv", inde)

In [18]:
from rpy2.robjects.packages import importr
import rpy2.robjects as robjects

ECoL = importr('ECoL')
rpy2_df = robjects.r('''
data = read.csv("test.csv");
complex_ = complexity(data[2:51], factor(data$target));
data.frame(as.list(complex_))
''')
import rpy2.robjects as ro

pd_dt = ro.conversion.rpy2py(rpy2_df)
pd_dt

overlapping.F1.mean,overlapping.F1.sd,overlapping.F1v.mean,...,network.ClsCoef,network.Hubs.mean,network.Hubs.sd
0.95834,0.020884,0.081297,...,0.57918,0.923723,0.181625


In [19]:
from rpy2.robjects import pandas2ri

from rpy2.robjects.conversion import localconverter
with localconverter(ro.default_converter + pandas2ri.converter):
  pd_from_r_df = ro.conversion.rpy2py(rpy2_df)
pd_from_r_df

Unnamed: 0,overlapping.F1.mean,overlapping.F1.sd,overlapping.F1v.mean,overlapping.F1v.sd,overlapping.F2.mean,overlapping.F2.sd,overlapping.F3.mean,overlapping.F3.sd,overlapping.F4.mean,overlapping.F4.sd,...,linearity.L3.sd,dimensionality.T2,dimensionality.T3,dimensionality.T4,balance.C1,balance.C2,network.Density,network.ClsCoef,network.Hubs.mean,network.Hubs.sd
1,0.95834,0.020884,0.081297,0.07338,0.060903,0.207843,0.672028,0.193688,0.035155,0.158907,...,0.040067,0.05,0.001,0.02,0.897766,0.037984,0.978016,0.57918,0.923723,0.181625


In [20]:
pd_from_r_df.T

Unnamed: 0,1
overlapping.F1.mean,0.95834
overlapping.F1.sd,0.020884
overlapping.F1v.mean,0.081297
overlapping.F1v.sd,0.07338
overlapping.F2.mean,0.060903
overlapping.F2.sd,0.207843
overlapping.F3.mean,0.672028
overlapping.F3.sd,0.193688
overlapping.F4.mean,0.035155
overlapping.F4.sd,0.158907


In [9]:
pd_from_r_df.T

Unnamed: 0,1
overlapping.F1.mean,0.361129
overlapping.F1.sd,0.272835
overlapping.F1v.mean,0.029457
overlapping.F1v.sd,0.065424
overlapping.F2.mean,2.3e-05
overlapping.F2.sd,0.000157
overlapping.F3.mean,0.168758
overlapping.F3.sd,0.274824
overlapping.F4.mean,0.001307
overlapping.F4.sd,0.008769
