In [1]:
import os
from itertools import product
from subprocess import call
from time import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder

In [2]:
def compute_thresholds(df):
    """Find candidate split thresholds for every feature column.

    For each feature the rows are sorted by that feature, and a threshold is
    placed halfway between two consecutive, distinct feature values whenever
    the class label changes across that gap.  A feature value shared by rows
    of several classes is relabelled with its own unique negative marker, so
    the gaps on both sides of such a value always receive a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns labelled ``0 .. k-1`` (feature ``i`` must also sit at
        position ``i``) plus one column ``'y'`` with the class labels.  All
        columns must be numeric.  The frame is not modified.
        NOTE(review): negative integers are used as internal markers, so real
        labels are presumably non-negative -- confirm against the datasets.

    Returns
    -------
    threshs : list of numpy.ndarray
        ``threshs[i]`` holds the thresholds of feature ``i`` in ascending order.
    threshs_raveled : list
        All thresholds concatenated feature by feature.
    """
    threshs = []
    threshs_raveled = []
    cur_neg = -1  # next unused marker for a feature value with mixed labels
    for i in range(df.shape[1] - 1):
        df_sorted = df.sort_values(by=i).reset_index(drop=True)
        feature_vals = df_sorted[i].to_numpy()
        # Private, writable copy of the labels: writing into the array behind
        # ``df_sorted['y'].values`` fails when pandas Copy-on-Write is active.
        labels = df_sorted['y'].to_numpy().copy()

        # A single group-by pass finds the feature values that carry more than
        # one label (instead of re-scanning the frame once per row).
        labels_per_value = df_sorted.groupby(i)['y'].nunique()
        for val in labels_per_value.index[labels_per_value > 1]:
            labels[feature_vals == val] = cur_neg
            cur_neg -= 1

        # A boundary lies between consecutive sorted rows that differ in both
        # the (possibly re-marked) label and the feature value.
        is_boundary = (np.diff(labels) != 0) & (np.diff(feature_vals) != 0)
        ids = np.flatnonzero(is_boundary) + 1
        threshs.append((feature_vals[ids] + feature_vals[ids - 1]) / 2)
        threshs_raveled += list(threshs[-1])

    return threshs, threshs_raveled

In [3]:
def gen_bool_matrix(df: pd.DataFrame, threshs, threshs_raveled):
    """Build the boolean matrix telling which thresholds separate which row pairs.

    Every pair of rows taken from two different classes produces one output
    row; entry ``j`` of that row is True when threshold ``j`` lies between the
    two rows' values of the feature the threshold belongs to (one value is
    ``<=`` the threshold and the other is ``>`` it).

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns at positions ``0 .. k-1`` and the label column ``'y'``
        in the last position.
    threshs : sequence of sequences
        ``threshs[i]`` are the thresholds of the feature at position ``i``.
    threshs_raveled : sequence
        The same thresholds flattened feature by feature.

    Returns
    -------
    pandas.DataFrame
        One boolean column per threshold, labelled with the owning feature's
        column label (labels therefore repeat).  Rows are ordered by class
        pair (in sorted label order), then by row of the first class, then by
        row of the second class.
    """
    unique_labels = sorted(df['y'].unique())

    # Positional index of the owning feature for every threshold, then the label.
    n_threshs = []
    for i in range(len(threshs)):
        n_threshs += [i] * len(threshs[i])
    n_threshs += [-1]  # label

    df_stretched = df.iloc[:, n_threshs]
    ts = np.asarray(threshs_raveled)
    n_cols = len(ts)

    # Per class: is each (row, threshold) value at-or-below / strictly above?
    # The classes are kept in a positional list and addressed by position, so
    # labels do not have to be 0..k-1.
    below, above = [], []
    for label in unique_labels:
        rows = df_stretched[df_stretched['y'] == label].drop(columns=['y']).values
        below.append(rows <= ts)
        above.append(rows > ts)

    blocks = []
    for a in range(len(unique_labels)):
        for b in range(a + 1, len(unique_labels)):
            # Broadcast (n_a, 1, T) against (1, n_b, T): every row of class a
            # is paired with every row of class b.
            separated = (below[a][:, None, :] & above[b][None, :, :]) | \
                        (above[a][:, None, :] & below[b][None, :, :])
            n_pairs = below[a].shape[0] * below[b].shape[0]
            blocks.append(separated.reshape(n_pairs, n_cols))

    if blocks:
        data = np.concatenate(blocks, axis=0)
    else:
        # fewer than two classes: there are no pairs to separate
        data = np.empty((0, n_cols), dtype=bool)
    return pd.DataFrame(data, columns=df_stretched.columns[:-1])

In [4]:
def get_weights(df, threshs, column_names, rank_formula="-", recalibrate=True):
    """Score every threshold of every feature and return the scores as a one-row frame.

    For each feature column of ``df`` (every column except ``'y'``) and each
    of its thresholds, the rows in the interval just left of the threshold
    (``NL``) and just right of it (``NR``) are selected.  For every ordered
    pair of distinct classes ``(cls1, cls2)`` seen on either side, a score
    ``c`` is computed from

    * ``KL`` / ``KR`` -- number of rows of ``cls1`` / ``cls2`` in the whole frame,
    * ``left_n``      -- number of ``cls1`` rows inside ``NL``,
    * ``right_n``     -- number of ``cls2`` rows inside ``NR``,

    and the weight of the threshold is the mean of ``c`` over all such pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the label column ``'y'``.
    threshs : sequence
        Indexed by feature column label; each entry must provide ``.tolist()``
        (e.g. a numpy array) and hold that feature's thresholds in ascending order.
    column_names : sequence
        Column labels for the returned frame, one per threshold.
    rank_formula : {"-", "+", "anti+"}
        ``"-"``     : ``|KR/(KR+KL)*left_n - KL/(KR+KL)*right_n|``
        ``"+"``     : ``-|KR/(KR+KL)*left_n + KL/(KR+KL)*right_n|``
        ``"anti+"`` : ``|KR/(KR+KL)*left_n + KL/(KR+KL)*right_n|``
    recalibrate : bool
        When true, each feature's weights are rescaled so that their mean
        equals the mean of all per-feature means.

    Returns
    -------
    pandas.DataFrame
        A single row holding all weights, feature by feature.

    Raises
    ------
    NotImplementedError
        If ``rank_formula`` is not one of the values above (raised when the
        first class pair is scored).
    AssertionError
        If the intervals of a feature do not partition the rows of ``df`` or
        one of them is empty.
    """
    weights = []
    for feature in df.columns:
        if feature == 'y':
            continue
        # Pad with +-inf so every threshold has a left and a right neighbour.
        feature_ts = [-np.inf] + threshs[feature].tolist() + [+np.inf]
        obj = df[feature]
        # Global class sizes, indexed by class label.
        classes = df['y'].value_counts().sort_index()
        w = []  # one weight per threshold of this feature
        N = []  # number of rows per interval, used only by the sanity checks below
        
        for i in range(1, len(feature_ts) - 1):
            #left_class = df[(obj > feature_ts[i-1]) & (obj <= feature_ts[i])]['y'].value_counts(ascending=True).argmax()
            #right_class = df[(obj < feature_ts[i+1]) & (obj >= feature_ts[i])]['y'].value_counts(ascending=True).argmax()
            #assert left_class != right_class
            # Row masks of the interval left / right of threshold i.
            NL = (obj > feature_ts[i-1]) & (obj <= feature_ts[i])
            NR = (obj < feature_ts[i+1]) & (obj >= feature_ts[i])
            NL_classes = df[NL]['y'].value_counts().sort_index()
            NR_classes = df[NR]['y'].value_counts().sort_index()
        
            res = []
        
            # Give both sides the same set of classes: a class missing on one
            # side is appended there with a count of 0.
            for cls in NL_classes.index:
                if cls not in NR_classes.index:
                    NR_classes.at[cls] = 0
            for cls in NR_classes.index:
                if cls not in NL_classes.index:
                    NL_classes.at[cls] = 0
        
            # Score every ordered pair of distinct classes.
            for cls1 in NL_classes.index:
                for cls2 in NR_classes.index:
                    if cls1 == cls2:
                        continue
                    KL = classes[cls1]
                    KR = classes[cls2]
                    left_n = (df[NL]['y'] == cls1).sum()
                    right_n = (df[NR]['y'] == cls2).sum()
                    if rank_formula == "-":
                        c = abs(KR / (KR + KL) * left_n - KL / (KR + KL) * right_n)
                    elif rank_formula == "+":
                        c = -abs(KR / (KR + KL) * left_n + KL / (KR + KL) * right_n)
                    elif rank_formula == "anti+":
                        c = abs(KR / (KR + KL) * left_n + KL / (KR + KL) * right_n)
                    else:
                        raise NotImplementedError
                    res.append(c)
            w.append(np.mean(res))
            
            N.append(NL.sum())
        # The right interval of the last threshold closes the partition.
        # NOTE(review): NR comes from the final loop iteration; for a feature
        # without thresholds the loop never runs, so NR is undefined or stale here.
        N.append(NR.sum())
        
        # Sanity checks: the intervals cover every row exactly once and none is empty.
        assert sum(N) == df.shape[0], len(threshs[feature])
        assert 0 not in N
        assert len(w) + 1 == len(N)

        weights.append(np.array(w))

    # recalibrate: bring every feature's mean weight to the common mean `mu`
    if recalibrate:
        means = [w.mean() for w in weights]
        mu = np.mean(means)
        weights = [w * (mu / w.mean()) for w in weights]

    # Flatten the per-feature arrays into one vector, feature by feature.
    res = []
    for elem in weights:
        res += elem.tolist()
    res = np.array(res)

    temp = pd.DataFrame(res.reshape(1, -1))
    temp.columns = column_names

    return temp

In [5]:
def get_encoded_df(M, cols, df, thresh, threshs_raveled, train=True):
    """Replace every feature value by the index of the interval it falls into.

    The intervals of a feature are delimited by the thresholds selected for
    it: with sorted thresholds ``t1 < t2 < ...`` the bins are ``(-inf, t1]``,
    ``(t1, t2]``, ..., ``(t_last, +inf]`` and are numbered from 0.

    Parameters
    ----------
    M : pandas.DataFrame
        Boolean matrix whose column ``j`` is labelled with the feature that
        threshold ``j`` belongs to (labels repeat, one per threshold).
    cols : iterable of int
        Positions of the selected thresholds (into ``M.columns`` and
        ``threshs_raveled``).  They may be given in any order.
    df : pandas.DataFrame
        Data to encode; must contain the feature columns named in
        ``M.columns`` and the label column ``'y'``.
    thresh : unused
        Kept so that existing callers keep working.
    threshs_raveled : sequence
        Threshold values, aligned with ``M.columns``.
    train : bool
        When true, assert that every bin of every feature is populated.

    Returns
    -------
    pandas.DataFrame
        Integer frame with one column per distinct feature of ``M`` (in order
        of first appearance) followed by ``'y'``.  A feature for which no
        threshold was selected is encoded as a single bin (all zeros).
    """
    features_threshs = {}
    for c in cols:
        features_threshs.setdefault(M.columns[c], []).append(threshs_raveled[c])

    df_ = {}
    # M.columns repeats a feature once per candidate threshold; encode each
    # feature only once.
    for feature in M.columns.unique():
        # Sort so the interval scan below is valid for any order of `cols`;
        # a feature without selected thresholds gets the single bin (-inf, +inf].
        selected = sorted(features_threshs.get(feature, []))
        all_t = [-np.inf] + selected + [+np.inf]
        values = df[feature]
        feature_copy = values.values.copy()
        for t_idx in range(1, len(all_t)):
            # compare against the untouched column, write into the copy
            cond = (values > all_t[t_idx - 1]) & (values <= all_t[t_idx])
            feature_copy[cond.values] = t_idx - 1
        df_[feature] = feature_copy
        if train:
            assert len(np.unique(feature_copy)) == len(selected) + 1, feature
    df_['y'] = df['y'].copy()
    df_ = pd.DataFrame(df_)

    return df_.astype(int)

In [6]:
def encode_nums(df, threshs=None):
    """Ordinal-encode the float columns of ``df`` by bin index.

    Without ``threshs`` the bin edges of each float column are derived from
    the column itself: ``-inf``, every distinct value except the largest, and
    ``+inf``.  A value is replaced by the index of the half-open interval
    ``(edge[k], edge[k+1]]`` that contains it.  With ``threshs`` the supplied
    edge lists (one per float column, in column order) are used instead.

    Returns ``(encoded_frame, edges)`` when ``threshs`` is None, otherwise
    only the encoded frame.  The input frame is not modified.
    """
    encoded = df.copy()
    float_cols = [col for col in encoded.columns if encoded[col].dtype == float]
    fitted_edges = []
    for pos, col in enumerate(float_cols):
        if not threshs:
            distinct = np.unique(np.sort(encoded[col].values)).tolist()
            edges = [-np.inf] + distinct[:-1] + [+np.inf]
            fitted_edges.append(edges)
        else:
            edges = threshs[pos]

        # Start from the truncated values; every value that lands in a bin is overwritten.
        codes = encoded[col].values.copy().astype(int)
        for upper in range(1, len(edges)):
            lower = upper - 1
            in_bin = (encoded[col] > edges[lower]) & (encoded[col] <= edges[upper])
            codes[in_bin] = lower
        encoded[col] = codes

    if threshs is None:
        return encoded, fitted_edges
    return encoded

# Dataset

In [7]:
def rf_test(filename, n_splits=5, sample=None):
    """Random-forest baseline: mean stratified k-fold accuracy on one dataset.

    Reads ``filename`` (CSV with a ``y`` label column), drops rows whose
    feature vectors are duplicated, optionally keeps a fixed-seed random
    sample of at most ``sample`` rows, and evaluates a 500-tree random forest
    with ``n_splits`` stratified folds.  The accuracy of every fold is
    printed; the mean over the folds is returned.
    """
    raw = pd.read_csv(filename)
    labels = raw.y
    data = pd.DataFrame(raw.drop(columns=['y']).values)
    data['y'] = labels.values
    feature_cols = list(range(data.shape[1] - 1))
    data = data.drop_duplicates(subset=feature_cols)

    if sample:
        data = data.sample(min(sample, len(data)), random_state=42)

    X = data.drop(columns=['y']).values
    y = data['y'].values
    splitter = StratifiedKFold(n_splits=n_splits)

    accs = []
    for i, (train_index, test_index) in enumerate(splitter.split(X, y)):
        forest = RandomForestClassifier(500, random_state=42)
        forest.fit(X[train_index], y[train_index])
        y_test = y[test_index]
        y_hat = forest.predict(X[test_index])

        acc = (y_hat == y_test).sum() / len(y_test)
        print(f"{i}th fold, accuracy {acc}")
        accs.append(acc)

    return np.mean(accs)

In [8]:
# Random-forest baseline on a sample of at most 100 Titanic rows, 3 stratified folds.
rf_test('../data/Titanic/cleaned.csv', sample=100, n_splits=3)

0th fold, accuracy 0.6176470588235294
1th fold, accuracy 0.5454545454545454
2th fold, accuracy 0.7878787878787878


0.6503267973856209

# Whole pipeline

In [10]:
def test_method(filename, algorithm, n_iter, noiter_frac, alpha, rank_function, fitness_function, n_folds, p=None, encode=False, rank_formula="-", recalibrate=True, sample=None, random_state=42):
    """Cross-validate the encode-then-classify pipeline on one dataset.

    Per stratified fold the function

    1. encodes the features -- with ``encode=True`` by computing candidate
       thresholds, writing ``./data/bool.csv`` and ``./data/ranks.csv``,
       running the external ``./CorrectEncoding.o`` and reading the selected
       threshold positions from ``./data/results.txt``; otherwise by plain
       ordinal encoding of the float columns (``encode_nums``);
    2. one-hot encodes the result;
    3. for every class, writes ``X_k`` / ``X_no`` / ``X_test`` to ``./data``,
       runs the external ``./ADR.o`` one-vs-all and reads the votes of the
       ``irr`` and ``pred`` classifiers from ``./data/scores_0_*.txt``;
    4. predicts the class with the highest vote and records the accuracy.

    Parameters
    ----------
    filename : str
        CSV file with a ``y`` label column.
    algorithm, n_iter, noiter_frac, alpha, rank_function, fitness_function
        Passed verbatim (as strings) to ``./CorrectEncoding.o``; their meaning
        is defined by that program.
    n_folds : int
        Number of stratified folds.
    p : int or None
        Argument for ``./ADR.o``.  When None it is derived from the shape of
        the positive-class matrix of each one-vs-all run.
    encode : bool
        Select the threshold-based encoding (True) or the plain ordinal one.
    rank_formula, recalibrate
        Forwarded to ``get_weights``.
    sample : int or None
        Keep at most this many rows (sampled with a fixed seed of 42).
    random_state : int
        Seed of the global numpy RNG that draws the per-fold seed handed to
        ``./CorrectEncoding.o``.

    Returns
    -------
    times : list of float
        Wall-clock seconds of the ``./CorrectEncoding.o`` call per fold
        (``0.0`` when ``encode`` is false).
    accs : dict
        ``{'irr': [...], 'pred': [...]}`` with one accuracy per fold.

    Notes
    -----
    The exit codes of the external programs are not checked.
    """
    def clean():
        # Delete score files of an earlier ADR run so stale votes cannot be read back.
        files = ['scores_0_pred.txt', 'scores_0_irr.txt']
        present = os.listdir("./data")
        for f in files:
            if f in present:
                os.remove(f"./data/{f}")

    def choose_p(X_k):
        # A caller-supplied p wins; otherwise derive it from the m x n shape
        # of the positive-class matrix and never go below 1.
        if p is not None:
            return p
        m, n = X_k.shape[0], X_k.shape[1]
        p_ = round(0.5 * np.log2(m * n) - 0.5 * np.log2(np.log2(m * n)) - np.log2(np.log2(np.log2(n))))
        return 1 if (p_ <= 0) else p_

    np.random.seed(random_state)
    df = pd.read_csv(filename)
    y = df.y
    # Rebuild the frame so that features are labelled 0..k-1 and 'y' comes last.
    df = pd.DataFrame(df.drop(columns=['y']).values)
    df['y'] = y.values
    df.drop_duplicates(inplace=True, subset=list(range(df.shape[1] - 1)))
    if sample:
        df = df.sample(min(sample, len(df)), random_state=42) # ALWAYS 42!!!!!!!!!
    print(f"Df shape: {df.shape}")
    y = df['y']
    all_c = np.unique(y)  # sorted class labels; vote rows are stacked in this order

    kf = StratifiedKFold(n_splits=n_folds)

    accs = {'irr': [], 'pred': []}
    times = []
    for i, (train_index, test_index) in enumerate(kf.split(df, y.values)):
        # Drawn in every fold (even when unused) to keep the seed sequence stable.
        seed = np.random.randint(0, 1e6)
        df_train = df.iloc[train_index].reset_index(drop=True).copy()
        df_test = df.iloc[test_index].reset_index(drop=True).copy()

        # every column (features and label) must vary inside the training fold
        assert df_train.nunique().min() > 1

        if encode:
            threshs, threshs_raveled = compute_thresholds(df_train)
            M = gen_bool_matrix(df_train, threshs, threshs_raveled)
            # every cross-class row pair must be separated by at least one threshold
            assert M.sum(axis=1).min() >= 1
            M.astype(int).to_csv('./data/bool.csv', index=False)
            weights = get_weights(df_train, threshs, M.columns, rank_formula=rank_formula, recalibrate=recalibrate)
            weights.to_csv('./data/ranks.csv', index=False, header=False)

            t = time()
            call(["./CorrectEncoding.o", str(seed), str(M.shape[0]), str(M.shape[1]), str(n_iter), str(noiter_frac), str(alpha), rank_function, fitness_function, algorithm])
            t = time() - t
            with open("./data/results.txt", 'r') as file:
                cols = list(map(int, file.readline().split()))

            df_train = get_encoded_df(M, cols, df_train, threshs, threshs_raveled, train=True)
            df_test = get_encoded_df(M, cols, df_test, threshs, threshs_raveled, train=False)

        else:
            t = 0.0
            df_train, enc_threshs = encode_nums(df_train)
            df_test = encode_nums(df_test, enc_threshs)
        times.append(t)

        X_train, y_train = df_train.drop(columns=['y']).values, df_train['y'].values
        X_test, y_test = df_test.drop(columns=['y']).values, df_test['y'].values

        enc = OneHotEncoder()
        X_train = enc.fit_transform(X_train).toarray().astype(int)
        X_test = enc.transform(X_test).toarray().astype(int)

        # One-vs-all classification (used for binary tasks as well).
        votes = {'irr': [], 'pred': []}
        for c in all_c:
            X_k = pd.DataFrame(X_train[y_train == c])
            X_no = pd.DataFrame(X_train[y_train != c])
            X_te = pd.DataFrame(X_test)
            X_k.to_csv('./data/X_k.txt', index=False, header=False, sep=' ')
            X_no.to_csv('./data/X_no.txt', index=False, header=False, sep=' ')
            X_te.to_csv('./data/X_test.txt', index=False, header=False, sep=' ')

            clean()
            call(["./ADR.o", str(choose_p(X_k))])

            for classifier in ['irr', 'pred']:
                with open(f"./data/scores_0_{classifier}.txt") as f:
                    # the votes are the whitespace-separated numbers after the last ' | '
                    votes[classifier].append(list(map(float, f.read().strip().split(' | ')[-1].split())))
                assert len(votes[classifier][-1]) == len(X_test)

        for classifier in ['irr', 'pred']:
            scores = np.array(votes[classifier])
            # argmax gives a position in all_c; map it back to the label so
            # the comparison is also right when labels are not 0..k-1.
            y_pred = all_c[scores.argmax(axis=0)]
            accs[classifier].append((y_pred == y_test).mean())

    return times, accs

./CorrectEncoding.o 42 90932 245 250 2 0.5 elementwise maxbinsnum gencode+

In [12]:
    # Single-dataset run: Iris with the 'gencode+' encoder, 5 folds,
    # elementwise ranks, "+" rank formula, recalibrated weights.
    times, accs = test_method(
        filename=f'../data/Iris/cleaned.csv',
        algorithm='gencode+',
        n_iter=250, 
        noiter_frac=2,
        alpha=0.5,
        rank_function='elementwise',
        fitness_function='maxbinsnum',
        n_folds=5,
        p=None,
        encode=True,
        rank_formula="+",
        recalibrate=True,
        sample=None,
        random_state=42
    )

Df shape: (147, 5)
(4562, 48)
(4562, 54)
(4640, 54)
(4641, 53)
(4641, 54)


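Datasets, in the order used by every result list below:
['Medicine', 'Music', 'Cars', 'Cameras', 'Iris', 'Titanic']

Whole data, no encoding (each tuple is the mean (irr, pred) accuracy for one dataset)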

[(0.6052515201768933, 0.6141293532338308),
 (0.26, 0.26),
 (0.4807317073170731, 0.7645121951219512),
 (0.3207164179104478, 0.3675273631840796),
 (0.3264367816091954, 0.3264367816091954),
 (0.5947712418300654, 0.6941176470588235)]

 Greedy encoding

 [(0.6334549474847984, 0.6349143173023769),
 (0.472, 0.56),
 (0.8184146341463416, 0.8042682926829269),
 (0.6842985074626865, 0.7311194029850745),
 (0.8372413793103448, 0.9386206896551723),
 (0.6797385620915033, 0.7908496732026143)]

 CODE3

 [(0.6393587617468215, 0.6453067993366501),
 (0.6120000000000001, 0.6200000000000001),
 (0.7748780487804878, 0.7796341463414634),
 (0.6782935323383084, 0.7231094527363184),
 (0.7475862068965518, 0.9048275862068966),
 (0.6784313725490196, 0.7594771241830065)]

 GENCODE+

 [(0.6349364289662798, 0.6660917634051963),
 (0.6040000000000001, 0.6120000000000001),
 (0.8482926829268294, 0.8631707317073172),
 (0.6673781094527363, 0.7241293532338309),
 (0.8622988505747127, 0.9312643678160919), # fake
 (0.673202614379085, 0.7856209150326798)]

In [17]:
# Benchmark on all six datasets: algorithm='greedy', n_iter=250,
# elementwise ranks, rank_formula="+", recalibrate=True.
# Collects the mean encoding time and the mean (irr, pred) accuracy per task.
all_times, all_accs = [], []
for task in ['Medicine', 'Music', 'Cars', 'Cameras', 'Iris', 'Titanic']:
    times, accs = test_method(
        filename=f'../data/{task}/cleaned.csv',
        algorithm='greedy',
        n_iter=250, 
        noiter_frac=2,
        alpha=0.5,
        rank_function='elementwise',
        fitness_function='maxbinsnum',
        n_folds=5,
        p=None,
        encode=True,
        rank_formula="+",
        recalibrate=True,
        sample=None,
        random_state=42
    )
    all_times.append(np.mean(times))
    all_accs.append((np.mean(accs['irr']), np.mean(accs['pred'])))
    print(f"Task {task}, avg_time {all_times[-1]} all_accs {all_accs[-1]}")

Df shape: (674, 19)
Task Medicine, avg_time 1.3010878086090087 all_accs (0.6334549474847984, 0.6349143173023769)
Df shape: (250, 15)
Task Music, avg_time 1.244109296798706 all_accs (0.472, 0.56)
Df shape: (204, 23)
Task Cars, avg_time 0.31728811264038087 all_accs (0.8184146341463416, 0.8042682926829269)
Df shape: (1004, 12)
Task Cameras, avg_time 42.021964359283444 all_accs (0.6842985074626865, 0.7311194029850745)
Df shape: (147, 5)
Task Iris, avg_time 0.04492635726928711 all_accs (0.8372413793103448, 0.9386206896551723)
Df shape: (765, 9)
Task Titanic, avg_time 5.652727699279785 all_accs (0.6797385620915033, 0.7908496732026143)


In [18]:
# Display the per-task mean times and (irr, pred) accuracies of the 'greedy' run.
all_times, all_accs

([1.3010878086090087,
  1.244109296798706,
  0.31728811264038087,
  42.021964359283444,
  0.04492635726928711,
  5.652727699279785],
 [(0.6334549474847984, 0.6349143173023769),
  (0.472, 0.56),
  (0.8184146341463416, 0.8042682926829269),
  (0.6842985074626865, 0.7311194029850745),
  (0.8372413793103448, 0.9386206896551723),
  (0.6797385620915033, 0.7908496732026143)])

In [13]:
# Benchmark on all six datasets: algorithm='code3', n_iter=500 (the other runs use 250),
# elementwise ranks, rank_formula="+", recalibrate=True.
all_times, all_accs = [], []
for task in ['Medicine', 'Music', 'Cars', 'Cameras', 'Iris', 'Titanic']:
    times, accs = test_method(
        filename=f'../data/{task}/cleaned.csv',
        algorithm='code3',
        n_iter=500, 
        noiter_frac=2,
        alpha=0.5,
        rank_function='elementwise',
        fitness_function='maxbinsnum',
        n_folds=5,
        p=None,
        encode=True,
        rank_formula="+",
        recalibrate=True,
        sample=None,
        random_state=42
    )
    all_times.append(np.mean(times))
    all_accs.append((np.mean(accs['irr']), np.mean(accs['pred'])))
    print(f"Task {task}, avg_time {all_times[-1]} all_accs {all_accs[-1]}")

Df shape: (674, 19)
Task Medicine, avg_time 9.340883111953735 all_accs (0.6394140409065783, 0.654228855721393)
Df shape: (250, 15)
Task Music, avg_time 4.681877660751343 all_accs (0.524, 0.5559999999999999)
Df shape: (204, 23)
Task Cars, avg_time 1.066990041732788 all_accs (0.7942682926829268, 0.8285365853658536)
Df shape: (1004, 12)
Task Cameras, avg_time 172.34560232162477 all_accs (0.6583930348258706, 0.7002487562189055)
Df shape: (147, 5)
Task Iris, avg_time 0.21153702735900878 all_accs (0.7193103448275862, 0.7813793103448277)
Df shape: (765, 9)
Task Titanic, avg_time 23.173298931121828 all_accs (0.677124183006536, 0.776470588235294)


In [14]:
# Display the per-task mean times and (irr, pred) accuracies of the 'code3' run.
all_times, all_accs

([9.340883111953735,
  4.681877660751343,
  1.066990041732788,
  172.34560232162477,
  0.21153702735900878,
  23.173298931121828],
 [(0.6394140409065783, 0.654228855721393),
  (0.524, 0.5559999999999999),
  (0.7942682926829268, 0.8285365853658536),
  (0.6583930348258706, 0.7002487562189055),
  (0.7193103448275862, 0.7813793103448277),
  (0.677124183006536, 0.776470588235294)])

In [15]:
# Benchmark on all six datasets: algorithm='gencode', n_iter=250,
# elementwise ranks, rank_formula="+", recalibrate=True.
all_times, all_accs = [], []
for task in ['Medicine', 'Music', 'Cars', 'Cameras', 'Iris', 'Titanic']:
    times, accs = test_method(
        filename=f'../data/{task}/cleaned.csv',
        algorithm='gencode',
        n_iter=250, 
        noiter_frac=2,
        alpha=0.5,
        rank_function='elementwise',
        fitness_function='maxbinsnum',
        n_folds=5,
        p=None,
        encode=True,
        rank_formula="+",
        recalibrate=True,
        sample=None,
        random_state=42
    )
    all_times.append(np.mean(times))
    all_accs.append((np.mean(accs['irr']), np.mean(accs['pred'])))
    print(f"Task {task}, avg_time {all_times[-1]} all_accs {all_accs[-1]}")

Df shape: (674, 19)
Task Medicine, avg_time 4.45449275970459 all_accs (0.6246213377556661, 0.6379657269209509)
Df shape: (250, 15)
Task Music, avg_time 2.5882005214691164 all_accs (0.548, 0.548)
Df shape: (204, 23)
Task Cars, avg_time 0.6235669612884521 all_accs (0.77, 0.7746341463414634)
Df shape: (1004, 12)
Task Cameras, avg_time 76.6169870853424 all_accs (0.6902885572139305, 0.7311293532338309)
Df shape: (147, 5)
Task Iris, avg_time 0.13288440704345703 all_accs (0.8289655172413793, 0.8698850574712644)
Df shape: (765, 9)
Task Titanic, avg_time 11.446849918365478 all_accs (0.6718954248366013, 0.776470588235294)


In [16]:
# Display the per-task mean times and (irr, pred) accuracies of the 'gencode' run.
all_times, all_accs

([4.45449275970459,
  2.5882005214691164,
  0.6235669612884521,
  76.6169870853424,
  0.13288440704345703,
  11.446849918365478],
 [(0.6246213377556661, 0.6379657269209509),
  (0.548, 0.548),
  (0.77, 0.7746341463414634),
  (0.6902885572139305, 0.7311293532338309),
  (0.8289655172413793, 0.8698850574712644),
  (0.6718954248366013, 0.776470588235294)])

In [22]:
# Benchmark on all six datasets: algorithm='gencode+', n_iter=250,
# rank_function='elementwise', rank_formula="+", recalibrate=True.
all_times, all_accs = [], []
for task in ['Medicine', 'Music', 'Cars', 'Cameras', 'Iris', 'Titanic']:
    times, accs = test_method(
        filename=f'../data/{task}/cleaned.csv',
        algorithm='gencode+',
        n_iter=250, 
        noiter_frac=2,
        alpha=0.5,
        rank_function='elementwise',
        fitness_function='maxbinsnum',
        n_folds=5,
        p=None,
        encode=True,
        rank_formula="+",
        recalibrate=True,
        sample=None,
        random_state=42
    )
    all_times.append(np.mean(times))
    all_accs.append((np.mean(accs['irr']), np.mean(accs['pred'])))
    print(f"Task {task}, avg_time {all_times[-1]} all_accs {all_accs[-1]}")

Df shape: (674, 19)
Task Medicine, avg_time 4.525192165374756 all_accs (0.636506357103372, 0.6408734107241569)
Df shape: (250, 15)
Task Music, avg_time 2.000855827331543 all_accs (0.5800000000000001, 0.5800000000000001)
Df shape: (204, 23)
Task Cars, avg_time 0.5133427143096924 all_accs (0.808658536585366, 0.8384146341463413)
Df shape: (1004, 12)
Task Cameras, avg_time 55.15505781173706 all_accs (0.6842885572139303, 0.7141542288557214)
Df shape: (147, 5)
Task Iris, avg_time 0.1390974998474121 all_accs (0.8694252873563219, 0.9314942528735631)
Df shape: (765, 9)
Task Titanic, avg_time 10.239969444274902 all_accs (0.6797385620915033, 0.7790849673202614)


In [23]:
# Display the results of the 'gencode+' / elementwise / "+" / recalibrated run.
all_times, all_accs

([4.525192165374756,
  2.000855827331543,
  0.5133427143096924,
  55.15505781173706,
  0.1390974998474121,
  10.239969444274902],
 [(0.636506357103372, 0.6408734107241569),
  (0.5800000000000001, 0.5800000000000001),
  (0.808658536585366, 0.8384146341463413),
  (0.6842885572139303, 0.7141542288557214),
  (0.8694252873563219, 0.9314942528735631),
  (0.6797385620915033, 0.7790849673202614)])

In [24]:
# Benchmark on all six datasets: algorithm='gencode+', n_iter=250,
# rank_function='elementwise', rank_formula="+", recalibrate=False.
all_times, all_accs = [], []
for task in ['Medicine', 'Music', 'Cars', 'Cameras', 'Iris', 'Titanic']:
    times, accs = test_method(
        filename=f'../data/{task}/cleaned.csv',
        algorithm='gencode+',
        n_iter=250, 
        noiter_frac=2,
        alpha=0.5,
        rank_function='elementwise',
        fitness_function='maxbinsnum',
        n_folds=5,
        p=None,
        encode=True,
        rank_formula="+",
        recalibrate=False,
        sample=None,
        random_state=42
    )
    all_times.append(np.mean(times))
    all_accs.append((np.mean(accs['irr']), np.mean(accs['pred'])))
    print(f"Task {task}, avg_time {all_times[-1]} all_accs {all_accs[-1]}")

Df shape: (674, 19)
Task Medicine, avg_time 4.7061420440673825 all_accs (0.6305583195135434, 0.6557324488667773)
Df shape: (250, 15)
Task Music, avg_time 1.9709696292877197 all_accs (0.532, 0.532)
Df shape: (204, 23)
Task Cars, avg_time 0.4903904438018799 all_accs (0.7990243902439025, 0.8189024390243901)
Df shape: (1004, 12)
Task Cameras, avg_time 61.78007264137268 all_accs (0.6574228855721393, 0.6962437810945274)
Df shape: (147, 5)
Task Iris, avg_time 0.1283331871032715 all_accs (0.8910344827586207, 0.9183908045977013)
Df shape: (765, 9)
Task Titanic, avg_time 10.319379901885986 all_accs (0.6732026143790849, 0.7843137254901962)


In [25]:
# Display the results of the 'gencode+' / elementwise / "+" / non-recalibrated run.
all_times, all_accs

([4.7061420440673825,
  1.9709696292877197,
  0.4903904438018799,
  61.78007264137268,
  0.1283331871032715,
  10.319379901885986],
 [(0.6305583195135434, 0.6557324488667773),
  (0.532, 0.532),
  (0.7990243902439025, 0.8189024390243901),
  (0.6574228855721393, 0.6962437810945274),
  (0.8910344827586207, 0.9183908045977013),
  (0.6732026143790849, 0.7843137254901962)])

In [26]:
# Benchmark on all six datasets: algorithm='gencode+', n_iter=250,
# rank_function='groupwise', rank_formula="+", recalibrate=False.
all_times, all_accs = [], []
for task in ['Medicine', 'Music', 'Cars', 'Cameras', 'Iris', 'Titanic']:
    times, accs = test_method(
        filename=f'../data/{task}/cleaned.csv',
        algorithm='gencode+',
        n_iter=250, 
        noiter_frac=2,
        alpha=0.5,
        rank_function='groupwise',
        fitness_function='maxbinsnum',
        n_folds=5,
        p=None,
        encode=True,
        rank_formula="+",
        recalibrate=False,
        sample=None,
        random_state=42
    )
    all_times.append(np.mean(times))
    all_accs.append((np.mean(accs['irr']), np.mean(accs['pred'])))
    print(f"Task {task}, avg_time {all_times[-1]} all_accs {all_accs[-1]}")

Df shape: (674, 19)
Task Medicine, avg_time 4.8019242763519285 all_accs (0.6290547263681592, 0.6483250414593698)
Df shape: (250, 15)
Task Music, avg_time 2.344622802734375 all_accs (0.5760000000000001, 0.5760000000000001)
Df shape: (204, 23)
Task Cars, avg_time 0.5771588325500489 all_accs (0.8240243902439024, 0.809390243902439)
Df shape: (1004, 12)
Task Cameras, avg_time 66.74905843734741 all_accs (0.6395174129353233, 0.6953134328358209)
Df shape: (147, 5)
Task Iris, avg_time 0.15609617233276368 all_accs (0.8556321839080461, 0.9245977011494253)
Df shape: (765, 9)
Task Titanic, avg_time 13.857624816894532 all_accs (0.6679738562091504, 0.7803921568627451)


In [27]:
# Display the results of the 'gencode+' / groupwise / "+" / non-recalibrated run.
all_times, all_accs

([4.8019242763519285,
  2.344622802734375,
  0.5771588325500489,
  66.74905843734741,
  0.15609617233276368,
  13.857624816894532],
 [(0.6290547263681592, 0.6483250414593698),
  (0.5760000000000001, 0.5760000000000001),
  (0.8240243902439024, 0.809390243902439),
  (0.6395174129353233, 0.6953134328358209),
  (0.8556321839080461, 0.9245977011494253),
  (0.6679738562091504, 0.7803921568627451)])

In [28]:
# Benchmark on all six datasets: algorithm='gencode+', n_iter=250,
# rank_function='groupwise', rank_formula="+", recalibrate=True.
all_times, all_accs = [], []
for task in ['Medicine', 'Music', 'Cars', 'Cameras', 'Iris', 'Titanic']:
    times, accs = test_method(
        filename=f'../data/{task}/cleaned.csv',
        algorithm='gencode+',
        n_iter=250, 
        noiter_frac=2,
        alpha=0.5,
        rank_function='groupwise',
        fitness_function='maxbinsnum',
        n_folds=5,
        p=None,
        encode=True,
        rank_formula="+",
        recalibrate=True,
        sample=None,
        random_state=42
    )
    all_times.append(np.mean(times))
    all_accs.append((np.mean(accs['irr']), np.mean(accs['pred'])))
    print(f"Task {task}, avg_time {all_times[-1]} all_accs {all_accs[-1]}")

Df shape: (674, 19)
Task Medicine, avg_time 5.554165172576904 all_accs (0.6498728579325593, 0.6527805417357657)
Df shape: (250, 15)
Task Music, avg_time 2.349442720413208 all_accs (0.532, 0.52)
Df shape: (204, 23)
Task Cars, avg_time 0.6046135425567627 all_accs (0.7796341463414633, 0.8335365853658537)
Df shape: (1004, 12)
Task Cameras, avg_time 72.94669871330261 all_accs (0.6643880597014925, 0.7231094527363184)
Df shape: (147, 5)
Task Iris, avg_time 0.15090174674987794 all_accs (0.8696551724137931, 0.9317241379310344)
Df shape: (765, 9)
Task Titanic, avg_time 13.430852127075195 all_accs (0.673202614379085, 0.7790849673202614)


In [29]:
# Display the results of the 'gencode+' / groupwise / "+" / recalibrated run.
all_times, all_accs

([5.554165172576904,
  2.349442720413208,
  0.6046135425567627,
  72.94669871330261,
  0.15090174674987794,
  13.430852127075195],
 [(0.6498728579325593, 0.6527805417357657),
  (0.532, 0.52),
  (0.7796341463414633, 0.8335365853658537),
  (0.6643880597014925, 0.7231094527363184),
  (0.8696551724137931, 0.9317241379310344),
  (0.673202614379085, 0.7790849673202614)])

In [30]:
# Benchmark on all six datasets: algorithm='gencode+', n_iter=250,
# rank_function='elementwise', rank_formula="-", recalibrate=True.
all_times, all_accs = [], []
for task in ['Medicine', 'Music', 'Cars', 'Cameras', 'Iris', 'Titanic']:
    times, accs = test_method(
        filename=f'../data/{task}/cleaned.csv',
        algorithm='gencode+',
        n_iter=250, 
        noiter_frac=2,
        alpha=0.5,
        rank_function='elementwise',
        fitness_function='maxbinsnum',
        n_folds=5,
        p=None,
        encode=True,
        rank_formula="-",
        recalibrate=True,
        sample=None,
        random_state=42
    )
    all_times.append(np.mean(times))
    all_accs.append((np.mean(accs['irr']), np.mean(accs['pred'])))
    print(f"Task {task}, avg_time {all_times[-1]} all_accs {all_accs[-1]}")

Df shape: (674, 19)
Task Medicine, avg_time 5.278198528289795 all_accs (0.6349364289662798, 0.6660917634051963)
Df shape: (250, 15)
Task Music, avg_time 2.275641345977783 all_accs (0.6040000000000001, 0.6120000000000001)
Df shape: (204, 23)
Task Cars, avg_time 0.5580727100372315 all_accs (0.8482926829268294, 0.8631707317073172)
Df shape: (1004, 12)
Task Cameras, avg_time 69.15301852226257 all_accs (0.6673781094527363, 0.7241293532338309)
Df shape: (147, 5)
Task Iris, avg_time 0.14617457389831542 all_accs (0.8351724137931035, 0.8351724137931035)
Df shape: (765, 9)
Task Titanic, avg_time 12.732571363449097 all_accs (0.673202614379085, 0.7856209150326798)


In [31]:
# Display the results of the 'gencode+' / elementwise / "-" / recalibrated run.
all_times, all_accs

([5.278198528289795,
  2.275641345977783,
  0.5580727100372315,
  69.15301852226257,
  0.14617457389831542,
  12.732571363449097],
 [(0.6349364289662798, 0.6660917634051963),
  (0.6040000000000001, 0.6120000000000001),
  (0.8482926829268294, 0.8631707317073172),
  (0.6673781094527363, 0.7241293532338309),
  (0.8351724137931035, 0.8351724137931035),
  (0.673202614379085, 0.7856209150326798)])

In [32]:
# Benchmark on all six datasets: algorithm='gencode+', n_iter=250,
# rank_function='elementwise', rank_formula="-", recalibrate=False.
all_times, all_accs = [], []
for task in ['Medicine', 'Music', 'Cars', 'Cameras', 'Iris', 'Titanic']:
    times, accs = test_method(
        filename=f'../data/{task}/cleaned.csv',
        algorithm='gencode+',
        n_iter=250, 
        noiter_frac=2,
        alpha=0.5,
        rank_function='elementwise',
        fitness_function='maxbinsnum',
        n_folds=5,
        p=None,
        encode=True,
        rank_formula="-",
        recalibrate=False,
        sample=None,
        random_state=42
    )
    all_times.append(np.mean(times))
    all_accs.append((np.mean(accs['irr']), np.mean(accs['pred'])))
    print(f"Task {task}, avg_time {all_times[-1]} all_accs {all_accs[-1]}")

Df shape: (674, 19)
Task Medicine, avg_time 4.568147802352906 all_accs (0.6394140409065783, 0.6498065229408512)
Df shape: (250, 15)
Task Music, avg_time 2.079791450500488 all_accs (0.556, 0.556)
Df shape: (204, 23)
Task Cars, avg_time 0.5154966354370117 all_accs (0.8184146341463416, 0.833780487804878)
Df shape: (1004, 12)
Task Cameras, avg_time 57.329112434387206 all_accs (0.6494378109452736, 0.6962686567164179)
Df shape: (147, 5)
Task Iris, avg_time 0.13687868118286134 all_accs (0.8289655172413793, 0.8289655172413793)
Df shape: (765, 9)
Task Titanic, avg_time 10.946486282348634 all_accs (0.6745098039215687, 0.7843137254901961)


In [33]:
# Display the results of the 'gencode+' / elementwise / "-" / non-recalibrated run.
all_times, all_accs

([4.568147802352906,
  2.079791450500488,
  0.5154966354370117,
  57.329112434387206,
  0.13687868118286134,
  10.946486282348634],
 [(0.6394140409065783, 0.6498065229408512),
  (0.556, 0.556),
  (0.8184146341463416, 0.833780487804878),
  (0.6494378109452736, 0.6962686567164179),
  (0.8289655172413793, 0.8289655172413793),
  (0.6745098039215687, 0.7843137254901961)])

In [34]:
# Benchmark on all six datasets: algorithm='gencode+', n_iter=250,
# rank_function='groupwise', rank_formula="-", recalibrate=False.
all_times, all_accs = [], []
for task in ['Medicine', 'Music', 'Cars', 'Cameras', 'Iris', 'Titanic']:
    times, accs = test_method(
        filename=f'../data/{task}/cleaned.csv',
        algorithm='gencode+',
        n_iter=250, 
        noiter_frac=2,
        alpha=0.5,
        rank_function='groupwise',
        fitness_function='maxbinsnum',
        n_folds=5,
        p=None,
        encode=True,
        rank_formula="-",
        recalibrate=False,
        sample=None,
        random_state=42
    )
    all_times.append(np.mean(times))
    all_accs.append((np.mean(accs['irr']), np.mean(accs['pred'])))
    print(f"Task {task}, avg_time {all_times[-1]} all_accs {all_accs[-1]}")

Df shape: (674, 19)
Task Medicine, avg_time 4.535551309585571 all_accs (0.6453067993366501, 0.6467993366500829)
Df shape: (250, 15)
Task Music, avg_time 2.1001484870910643 all_accs (0.62, 0.6279999999999999)
Df shape: (204, 23)
Task Cars, avg_time 0.5218067646026612 all_accs (0.8140243902439025, 0.8285365853658536)
Df shape: (1004, 12)
Task Cameras, avg_time 66.36536836624146 all_accs (0.6553781094527362, 0.7041940298507463)
Df shape: (147, 5)
Task Iris, avg_time 0.13694353103637696 all_accs (0.890574712643678, 0.890574712643678)
Df shape: (765, 9)
Task Titanic, avg_time 10.99763526916504 all_accs (0.681045751633987, 0.7856209150326797)


In [35]:
# Display the results of the 'gencode+' / groupwise / "-" / non-recalibrated run.
all_times, all_accs

([4.535551309585571,
  2.1001484870910643,
  0.5218067646026612,
  66.36536836624146,
  0.13694353103637696,
  10.99763526916504],
 [(0.6453067993366501, 0.6467993366500829),
  (0.62, 0.6279999999999999),
  (0.8140243902439025, 0.8285365853658536),
  (0.6553781094527362, 0.7041940298507463),
  (0.890574712643678, 0.890574712643678),
  (0.681045751633987, 0.7856209150326797)])

In [36]:
# Benchmark on all six datasets: algorithm='gencode+', n_iter=250,
# rank_function='groupwise', rank_formula="-", recalibrate=True.
all_times, all_accs = [], []
for task in ['Medicine', 'Music', 'Cars', 'Cameras', 'Iris', 'Titanic']:
    times, accs = test_method(
        filename=f'../data/{task}/cleaned.csv',
        algorithm='gencode+',
        n_iter=250, 
        noiter_frac=2,
        alpha=0.5,
        rank_function='groupwise',
        fitness_function='maxbinsnum',
        n_folds=5,
        p=None,
        encode=True,
        rank_formula="-",
        recalibrate=True,
        sample=None,
        random_state=42
    )
    all_times.append(np.mean(times))
    all_accs.append((np.mean(accs['irr']), np.mean(accs['pred'])))
    print(f"Task {task}, avg_time {all_times[-1]} all_accs {all_accs[-1]}")

Df shape: (674, 19)
Task Medicine, avg_time 4.510274362564087 all_accs (0.6468546158098397, 0.6453731343283582)
Df shape: (250, 15)
Task Music, avg_time 2.064577913284302 all_accs (0.588, 0.588)
Df shape: (204, 23)
Task Cars, avg_time 0.5179290294647216 all_accs (0.7991463414634147, 0.8434146341463415)
Df shape: (1004, 12)
Task Cameras, avg_time 81.58032422065735 all_accs (0.6454577114427861, 0.7101990049751243)
Df shape: (147, 5)
Task Iris, avg_time 0.16032109260559083 all_accs (0.8974712643678162, 0.9319540229885057)
Df shape: (765, 9)
Task Titanic, avg_time 13.144987726211548 all_accs (0.6745098039215687, 0.7869281045751634)


In [37]:
# Display the results of the 'gencode+' / groupwise / "-" / recalibrated run.
all_times, all_accs

([4.510274362564087,
  2.064577913284302,
  0.5179290294647216,
  81.58032422065735,
  0.16032109260559083,
  13.144987726211548],
 [(0.6468546158098397, 0.6453731343283582),
  (0.588, 0.588),
  (0.7991463414634147, 0.8434146341463415),
  (0.6454577114427861, 0.7101990049751243),
  (0.8974712643678162, 0.9319540229885057),
  (0.6745098039215687, 0.7869281045751634)])