In [4]:
import numpy as np
import pandas as pd
from subprocess import call
from itertools import product
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold

In [5]:
def compute_thresholds(df):
    """Find candidate split thresholds for every feature column of ``df``.

    For each feature (addressed by the integer column labels
    ``0 .. df.shape[1] - 2``) the rows are sorted by that feature and a
    threshold is placed at the midpoint between two consecutive rows whenever
    both the feature value and the label ``y`` differ between them.

    A feature value that occurs with more than one label is first given its
    own negative pseudo-label, so the label always "changes" when entering or
    leaving such a value.  This assumes the real labels never coincide with
    the pseudo-labels ``-1, -2, ...``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns labelled ``0, 1, ...`` plus a label column ``'y'``.
        The frame is not modified.

    Returns
    -------
    threshs : list of numpy.ndarray
        ``threshs[i]`` holds the thresholds of feature ``i`` in ascending order.
    threshs_raveled : list of float
        All thresholds concatenated feature by feature.
    """
    threshs = []
    threshs_raveled = []
    cur_neg = -1  # next unused pseudo-label; shared across features
    for i in range(df.shape[1] - 1):
        df_sorted = df.sort_values(by=i).reset_index(drop=True)

        # Relabel every feature value that carries more than one label.
        # Assigning through .loc (instead of writing into `.values`) keeps
        # this valid when pandas hands out read-only arrays.
        for val in df_sorted[i].unique():
            same_value = df_sorted[i] == val
            if df_sorted.loc[same_value, 'y'].nunique(dropna=False) > 1:
                df_sorted.loc[same_value, 'y'] = cur_neg
                cur_neg -= 1

        values = df_sorted[i].to_numpy()
        labels = df_sorted['y'].to_numpy()

        # Row k (k >= 1) starts a new segment when both the feature value and
        # the (pseudo-)label differ from row k - 1.
        boundary = (np.diff(labels) != 0) & (np.diff(values) != 0)
        ids = np.flatnonzero(boundary) + 1

        threshs.append((values[ids] + values[ids - 1]) / 2)
        threshs_raveled += list(threshs[-1])
    return threshs, threshs_raveled

In [6]:
def gen_bool_matrix(df: pd.DataFrame, threshs, threshs_raveled):
    """Build the boolean "separation" matrix for all pairs of differently labelled rows.

    Every row of the result corresponds to one pair ``(row1, row2)`` whose
    labels differ, and every column to one threshold.  A cell is ``True``
    when the threshold separates the pair, i.e. one row's feature value is
    ``<=`` the threshold and the other's is ``>`` it.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by a label column ``'y'``.
    threshs : sequence of array-like
        ``threshs[i]`` are the thresholds of the feature at column position ``i``.
    threshs_raveled : sequence of float
        The same thresholds flattened feature by feature.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with one column per threshold; each column is named
        after the feature the threshold belongs to, so names repeat.  Pairs
        are ordered by ascending (label1, label2) with ``label1 < label2``,
        then by row of the first class, then by row of the second class.
    """
    thresholds = np.asarray(threshs_raveled, dtype=float)

    # Column position of the feature each raveled threshold belongs to.
    column_positions = []
    for position, feature_threshs in enumerate(threshs):
        column_positions += [position] * len(feature_threshs)

    stretched = df.iloc[:, column_positions]
    values = stretched.to_numpy()
    labels = df['y'].to_numpy()

    # One block of rows per class, stored by position in the sorted label
    # list so that labels need not be the integers 0..k-1.
    unique_labels = sorted(df['y'].unique())
    classes = [values[labels == label] for label in unique_labels]

    blocks = []
    for first in range(len(classes)):
        for second in range(first + 1, len(classes)):
            left, right = classes[first], classes[second]
            left_le, left_gt = left <= thresholds, left > thresholds
            right_le, right_gt = right <= thresholds, right > thresholds
            # Broadcast to (rows of first class, rows of second class, thresholds).
            separated = (
                (left_le[:, None, :] & right_gt[None, :, :])
                | (left_gt[:, None, :] & right_le[None, :, :])
            )
            blocks.append(separated.reshape(len(left) * len(right), thresholds.size))

    if blocks:
        matrix = np.concatenate(blocks, axis=0)
    else:
        # Fewer than two classes: there is no pair to separate.
        matrix = np.empty((0, thresholds.size), dtype=bool)
    return pd.DataFrame(matrix, columns=stretched.columns)

In [7]:
def get_weights(df, threshs, column_names, rank_formula="-", recalibrate=True):
    """Compute one rank (weight) per threshold and return them as a one-row frame.

    For threshold ``t_i`` of a feature the left bin is ``(t_{i-1}, t_i]`` and
    the right bin is ``[t_i, t_{i+1})``, with infinite sentinels at both ends.
    For every ordered pair of distinct classes ``(cls1, cls2)`` present in
    either bin, let ``KL``/``KR`` be the total counts of ``cls1``/``cls2`` in
    ``df`` and ``left_n``/``right_n`` the count of ``cls1`` in the left bin
    and of ``cls2`` in the right bin.  The pair score is

    * ``"-"``     : ``|KR/(KR+KL)*left_n - KL/(KR+KL)*right_n|``
    * ``"+"``     : ``-|KR/(KR+KL)*left_n + KL/(KR+KL)*right_n|``
    * ``"anti+"`` : ``|KR/(KR+KL)*left_n + KL/(KR+KL)*right_n|``

    and the weight of the threshold is the mean score over all such pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns labelled ``0, 1, ...`` plus a label column ``'y'``.
    threshs : sequence of array-like
        ``threshs[feature]`` are the thresholds of that feature.  A feature
        without thresholds contributes no weights.
    column_names : sequence
        Column labels for the returned frame, one per threshold.
    rank_formula : {"-", "+", "anti+"}
        Which pair score to use.
    recalibrate : bool
        If true, rescale each feature's weights so that every feature has the
        same mean weight (the mean of the per-feature means).

    Returns
    -------
    pandas.DataFrame
        A single row holding all weights, feature by feature.

    Raises
    ------
    NotImplementedError
        If ``rank_formula`` is not one of the supported values.
    AssertionError
        If the bins of a feature do not partition the rows of ``df`` or one
        of the bins is empty.
    """
    if rank_formula not in ("-", "+", "anti+"):
        raise NotImplementedError

    labels = df['y']
    classes = labels.value_counts().sort_index()  # total count per class

    weights = []
    for feature in df.columns:
        if feature == 'y':
            continue
        feature_ts = [-np.inf] + np.asarray(threshs[feature], dtype=float).tolist() + [+np.inf]
        obj = df[feature]
        w = []
        N = []  # bin sizes, used only for the sanity checks below

        for i in range(1, len(feature_ts) - 1):
            NL = (obj > feature_ts[i - 1]) & (obj <= feature_ts[i])
            NR = (obj < feature_ts[i + 1]) & (obj >= feature_ts[i])
            NL_classes = labels[NL].value_counts().sort_index()
            NR_classes = labels[NR].value_counts().sort_index()

            # Both sides iterate over the union of classes seen in either
            # bin; classes missing on one side are appended with count 0.
            left_order = list(NL_classes.index)
            right_order = list(NR_classes.index)
            right_order += [cls for cls in left_order if cls not in NR_classes.index]
            left_order += [cls for cls in right_order if cls not in NL_classes.index]

            res = []
            for cls1 in left_order:
                for cls2 in right_order:
                    if cls1 == cls2:
                        continue
                    KL = classes.loc[cls1]
                    KR = classes.loc[cls2]
                    left_n = NL_classes.get(cls1, 0)
                    right_n = NR_classes.get(cls2, 0)
                    if rank_formula == "-":
                        c = abs(KR / (KR + KL) * left_n - KL / (KR + KL) * right_n)
                    elif rank_formula == "+":
                        c = -abs(KR / (KR + KL) * left_n + KL / (KR + KL) * right_n)
                    else:  # "anti+"
                        c = abs(KR / (KR + KL) * left_n + KL / (KR + KL) * right_n)
                    res.append(c)
            w.append(np.mean(res))

            N.append(NL.sum())

        # Last bin, computed from the sentinels so that it is also defined
        # for a feature that has no thresholds at all.
        last_bin = (obj < feature_ts[-1]) & (obj >= feature_ts[-2])
        N.append(last_bin.sum())

        assert sum(N) == df.shape[0]
        assert 0 not in N
        assert len(w) + 1 == len(N)

        weights.append(np.array(w))

    # recalibrate: give every feature the same mean weight.  Features without
    # thresholds have no mean and are left out of the computation.
    if recalibrate:
        means = [w.mean() for w in weights if w.size]
        if means:
            mu = np.mean(means)
            weights = [w * (mu / w.mean()) if w.size else w for w in weights]

    res = np.array([value for w in weights for value in w.tolist()])

    temp = pd.DataFrame(res.reshape(1, -1))
    temp.columns = column_names

    return temp

In [8]:
def get_encoded_df(M, cols, df, thresh, threshs_raveled, train=True):
    """Replace every feature value by the index of the bin it falls into.

    The bins of a feature are delimited by the selected thresholds of that
    feature, sorted ascending.  Bin ``k`` is ``(t_{k-1}, t_k]``, so a value
    equal to a threshold belongs to the bin on its left, matching the
    ``<=`` / ``>`` convention used when the boolean matrix is built.

    Parameters
    ----------
    M : pandas.DataFrame
        Boolean matrix; only its column labels are used.  ``M.columns[c]`` is
        the feature that threshold ``c`` belongs to.
    cols : iterable of int
        Positions of the selected thresholds, in any order.
    df : pandas.DataFrame
        Data to encode; must contain the features named in ``M.columns`` and
        a label column ``'y'``.
    thresh : object
        Unused; kept for interface compatibility.
    threshs_raveled : sequence of float
        Threshold values indexed like the columns of ``M``.
    train : bool
        If true, assert that every bin of every feature is populated.

    Returns
    -------
    pandas.DataFrame
        Integer frame with one column per distinct feature in ``M.columns``
        (in order of first appearance) followed by ``'y'``.  A feature none
        of whose thresholds was selected is encoded as a single bin ``0``.
    """
    # Group the selected thresholds by the feature they cut.
    features_threshs = {}
    for c in cols:
        features_threshs.setdefault(M.columns[c], set()).add(threshs_raveled[c])

    df_ = {}
    # M.columns repeats a feature once per threshold; visit each feature once.
    for feature in dict.fromkeys(M.columns):
        cuts = np.array(sorted(features_threshs.get(feature, ())), dtype=float)
        # side='left' counts the thresholds strictly below each value, which
        # is exactly the index of the (t_{k-1}, t_k] bin containing it.
        codes = np.searchsorted(cuts, df[feature].to_numpy(), side='left')
        if train:
            assert len(np.unique(codes)) == len(cuts) + 1, feature
        df_[feature] = codes
    df_['y'] = df['y'].copy()
    df_ = pd.DataFrame(df_)

    return df_.astype(int)

# Dataset

In [6]:
# Load the Medicine dataset.  The feature columns are rebuilt from the raw
# value array, so they end up labelled 0..n-1, and the label is re-attached
# as the last column 'y'.
# NOTE(review): other cells in this notebook read from "../data/..."; confirm
# which relative path matches the current working directory.
dataset_filename = "../../data/Medicine/cleaned.csv"
df = pd.read_csv(dataset_filename)

y = df['y']
df = pd.DataFrame(df.drop(columns=['y']).values)
df['y'] = y.values

# Keep one row per distinct feature vector; the label is not part of the key.
feature_columns = list(range(df.shape[1] - 1))
df = df.drop_duplicates(subset=feature_columns)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,y
0,7.0,1.0,1.0,0.0,0.0,0.0,37.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1,11.7,1.0,1.0,0.0,0.0,0.0,38.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
2,5.6,1.0,2.0,1.0,1.0,0.0,37.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1
3,6.0,1.0,2.0,1.0,1.0,0.0,40.8,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
4,5.2,1.0,2.0,0.0,1.0,0.0,40.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
671,4.7,0.0,1.0,0.0,1.0,0.0,39.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1
672,3.3,1.0,0.0,0.0,1.0,1.0,38.5,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
673,4.6,1.0,1.0,0.0,1.0,0.0,39.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1
674,3.4,1.0,2.0,0.0,1.0,0.0,39.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0


In [7]:
# Work on a fixed random subset of 250 rows (seed 42).  The cell overwrites
# `df`, so re-running it samples from the already reduced frame.
df = df.sample(250, random_state=42)

In [8]:
# Inputs for the baseline model: feature matrix, label vector and the
# stratified 10-fold splitter.
data = df
X = data.drop(columns=['y']).values
y = data['y'].values
kf = StratifiedKFold(n_splits=10)

In [9]:
# Baseline: a random forest trained on the raw feature values, evaluated on
# every fold produced by `kf`.
accs = []

for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    # Fraction of correctly classified test rows in this fold.
    acc = (y_hat == y_test).sum() / len(y_test)
    print(f"{i}th fold, accuracy {acc}")
    accs.append(acc)

# Mean accuracy over all folds.
print(np.mean(accs))

0th fold, accuracy 0.64
1th fold, accuracy 0.72
2th fold, accuracy 0.68
3th fold, accuracy 0.84
4th fold, accuracy 0.56
5th fold, accuracy 0.64
6th fold, accuracy 0.6
7th fold, accuracy 0.72
8th fold, accuracy 0.72
9th fold, accuracy 0.52
0.664


# Whole pipeline

In [9]:
def test_method(filename, algorithm, rank_function, fitness_function, n_folds, rank_formula="-", recalibrate=True, sample=None, random_state=42):
    """Cross-validate a random forest on features discretised by the external optimiser.

    For every stratified fold the function computes thresholds on the
    training part, writes the boolean matrix to ``./data/bool.csv`` and the
    threshold weights to ``./data/ranks.csv``, runs ``./GeneticAlgorithm.o``
    and reads the selected threshold positions from the first line of
    ``./data/results.txt``.  It then encodes train and test rows as bin
    indices and scores a ``RandomForestClassifier`` on the test part.

    Parameters
    ----------
    filename : str
        CSV file with feature columns and a label column ``y``.
    algorithm, rank_function, fitness_function : str
        Passed through unchanged as command-line arguments of the optimiser.
    n_folds : int
        Number of stratified folds.
    rank_formula : str
        Forwarded to ``get_weights``.
    recalibrate : bool
        Forwarded to ``get_weights``.
    sample : int or None
        If truthy, evaluate on a random subset of this many rows.
    random_state : int
        Seeds NumPy's global generator, from which the per-fold optimiser
        seed is drawn.  Row sampling and the classifier always use seed 42.

    Returns
    -------
    float
        Mean test accuracy over the folds.

    Raises
    ------
    RuntimeError
        If the optimiser exits with a non-zero status.  Without this check a
        failed run would silently reuse a stale ``./data/results.txt``.
    """
    np.random.seed(random_state)
    df = pd.read_csv(filename)
    y = df.y
    df = pd.DataFrame(df.drop(columns=['y']).values)
    df['y'] = y.values
    # One row per distinct feature vector; the label is not part of the key.
    df = df.drop_duplicates(subset=list(range(df.shape[1] - 1)))
    if sample:
        # The subset is deliberately always drawn with seed 42, independent
        # of random_state, so every seed is evaluated on the same rows.
        df = df.sample(sample, random_state=42)
    y = df['y']

    kf = StratifiedKFold(n_splits=n_folds)

    accs = []
    for train_index, test_index in kf.split(df, y.values):
        seed = np.random.randint(0, 1e6)  # seed handed to the optimiser for this fold
        df_train = df.iloc[train_index].reset_index(drop=True)
        df_test = df.iloc[test_index].reset_index(drop=True)

        threshs, threshs_raveled = compute_thresholds(df_train)
        M = gen_bool_matrix(df_train, threshs, threshs_raveled)
        # Every pair of differently labelled rows must be separated by at
        # least one threshold.
        assert M.sum(axis=1).min() >= 1
        M.astype(int).to_csv('./data/bool.csv', index=False)
        weights = get_weights(df_train, threshs, M.columns, rank_formula=rank_formula, recalibrate=recalibrate)
        weights.to_csv('./data/ranks.csv', index=False, header=False)

        status = call(["./GeneticAlgorithm.o", str(seed), str(M.shape[0]), str(M.shape[1]), rank_function, fitness_function, algorithm])
        if status != 0:
            raise RuntimeError(f"./GeneticAlgorithm.o exited with status {status}")
        with open("./data/results.txt", 'r') as file:
            cols = list(map(int, file.readline().split()))

        df_train = get_encoded_df(M, cols, df_train, threshs, threshs_raveled, train=True)
        df_test = get_encoded_df(M, cols, df_test, threshs, threshs_raveled, train=False)
        X_train, y_train = df_train.drop(columns=['y']).values, df_train['y'].values
        X_test, y_test = df_test.drop(columns=['y']).values, df_test['y'].values

        model = RandomForestClassifier(random_state=42)
        model.fit(X_train, y_train)
        y_hat = model.predict(X_test)

        acc = (y_hat == y_test).sum() / len(y_test)
        accs.append(acc)

    return np.mean(accs)

In [27]:
def test_all(dataset, sample=None, n_folds=10, rank_formula="-", random_state=42):
    """Run ``test_method`` for the four compared configurations of one dataset.

    The dataset is read from ``../data/<dataset>/cleaned.csv`` and every
    configuration uses the ``"maxbinsnum"`` fitness function.

    Returns a list of four scores, in this order:
    greedy / elementwise / recalibrated, gencode / elementwise / recalibrated,
    gencode+ / elementwise / recalibrated, gencode+ / groupwise / not recalibrated.
    """
    print(f"Dataset: {dataset}, seed: {random_state}")

    # (algorithm, rank_function, recalibrate) of each configuration.
    configurations = (
        ("greedy", "elementwise", True),
        ("gencode", "elementwise", True),
        ("gencode+", "elementwise", True),
        ("gencode+", "groupwise", False),
    )
    return [
        test_method(
            filename=f"../data/{dataset}/cleaned.csv",
            algorithm=algorithm,
            rank_function=rank_function,
            fitness_function="maxbinsnum",
            n_folds=n_folds,
            rank_formula=rank_formula,
            recalibrate=recalibrate,
            sample=sample,
            random_state=random_state,
        )
        for algorithm, rank_function, recalibrate in configurations
    ]

In [28]:
def test_many_seeds(dataset, sample=None, n_folds=10, rank_formula="-", n_seeds=10):
    """Repeat ``test_all`` with ``random_state`` = 0 .. ``n_seeds - 1``.

    Returns a list with one ``test_all`` result per seed.
    """
    return [
        test_all(
            dataset=dataset,
            sample=sample,
            n_folds=n_folds,
            rank_formula=rank_formula,
            random_state=seed,
        )
        for seed in range(n_seeds)
    ]

In [32]:
# Collects the per-seed score lists of every dataset run below, in execution
# order.  Re-running this cell discards everything collected so far.
all_scores = []

In [33]:
# Titanic: 10 seeds, 5-fold CV on a 250-row sample, '+' rank formula.
# The printed vector is the mean over seeds for each test_all configuration.
scores = test_many_seeds(dataset="Titanic", sample=250, n_folds=5, rank_formula='+', n_seeds=10)
all_scores.append(scores)
print(np.asarray(scores).mean(axis=0))

Dataset: Titanic, seed: 0
Dataset: Titanic, seed: 1
Dataset: Titanic, seed: 2
Dataset: Titanic, seed: 3
Dataset: Titanic, seed: 4
Dataset: Titanic, seed: 5
Dataset: Titanic, seed: 6
Dataset: Titanic, seed: 7
Dataset: Titanic, seed: 8
Dataset: Titanic, seed: 9
[0.752  0.7272 0.7272 0.7372]


In [34]:
# Cars: 10 seeds, 5-fold CV on a 204-row sample, '+' rank formula.
# The printed vector is the mean over seeds for each test_all configuration.
scores = test_many_seeds(dataset="Cars", sample=204, n_folds=5, rank_formula='+', n_seeds=10)
all_scores.append(scores)
print(np.asarray(scores).mean(axis=0))

Dataset: Cars, seed: 0
Dataset: Cars, seed: 1
Dataset: Cars, seed: 2
Dataset: Cars, seed: 3
Dataset: Cars, seed: 4
Dataset: Cars, seed: 5
Dataset: Cars, seed: 6
Dataset: Cars, seed: 7
Dataset: Cars, seed: 8
Dataset: Cars, seed: 9
[0.82878049 0.85613415 0.85758537 0.85171951]


In [35]:
# Cameras: 10 seeds, 5-fold CV on a 250-row sample, '+' rank formula.
# The printed vector is the mean over seeds for each test_all configuration.
scores = test_many_seeds(dataset="Cameras", sample=250, n_folds=5, rank_formula='+', n_seeds=10)
all_scores.append(scores)
print(np.asarray(scores).mean(axis=0))

Dataset: Cameras, seed: 0
Dataset: Cameras, seed: 1
Dataset: Cameras, seed: 2
Dataset: Cameras, seed: 3
Dataset: Cameras, seed: 4
Dataset: Cameras, seed: 5
Dataset: Cameras, seed: 6
Dataset: Cameras, seed: 7
Dataset: Cameras, seed: 8
Dataset: Cameras, seed: 9
[0.736  0.7136 0.7088 0.7104]


In [36]:
# Airplane: 10 seeds, 5-fold CV on a 250-row sample, '+' rank formula.
# The printed vector is the mean over seeds for each test_all configuration.
scores = test_many_seeds(dataset="Airplane", sample=250, n_folds=5, rank_formula='+', n_seeds=10)
all_scores.append(scores)
print(np.asarray(scores).mean(axis=0))

Dataset: Airplane, seed: 0
Dataset: Airplane, seed: 1
Dataset: Airplane, seed: 2
Dataset: Airplane, seed: 3
Dataset: Airplane, seed: 4
Dataset: Airplane, seed: 5
Dataset: Airplane, seed: 6
Dataset: Airplane, seed: 7
Dataset: Airplane, seed: 8
Dataset: Airplane, seed: 9
[0.888  0.8916 0.8968 0.8972]


In [37]:
# Music: 10 seeds, 5-fold CV on a 250-row sample, '+' rank formula.
# The printed vector is the mean over seeds for each test_all configuration.
scores = test_many_seeds(dataset="Music", sample=250, n_folds=5, rank_formula='+', n_seeds=10)
all_scores.append(scores)
print(np.asarray(scores).mean(axis=0))

Dataset: Music, seed: 0
Dataset: Music, seed: 1
Dataset: Music, seed: 2
Dataset: Music, seed: 3
Dataset: Music, seed: 4
Dataset: Music, seed: 5
Dataset: Music, seed: 6
Dataset: Music, seed: 7
Dataset: Music, seed: 8
Dataset: Music, seed: 9
[0.732  0.7176 0.726  0.7252]


In [38]:
# Medicine: 10 seeds, 5-fold CV on a 250-row sample, '+' rank formula.
# The printed vector is the mean over seeds for each test_all configuration.
scores = test_many_seeds(dataset="Medicine", sample=250, n_folds=5, rank_formula='+', n_seeds=10)
all_scores.append(scores)
print(np.asarray(scores).mean(axis=0))

Dataset: Medicine, seed: 0
Dataset: Medicine, seed: 1
Dataset: Medicine, seed: 2
Dataset: Medicine, seed: 3
Dataset: Medicine, seed: 4
Dataset: Medicine, seed: 5
Dataset: Medicine, seed: 6
Dataset: Medicine, seed: 7
Dataset: Medicine, seed: 8
Dataset: Medicine, seed: 9
[0.664  0.6584 0.6588 0.6568]


In [40]:
# Per-seed scores of the fifth appended run.  With the cells above executed
# once and in order (Titanic, Cars, Cameras, Airplane, Music, Medicine) this
# is the Music run; the index is positional, so re-running a cell shifts it.
all_scores[4]

[[0.732, 0.7, 0.7280000000000001, 0.708],
 [0.732, 0.732, 0.712, 0.732],
 [0.732, 0.708, 0.716, 0.736],
 [0.732, 0.7200000000000001, 0.716, 0.7280000000000001],
 [0.732, 0.716, 0.72, 0.692],
 [0.732, 0.692, 0.772, 0.708],
 [0.732, 0.732, 0.7, 0.736],
 [0.732, 0.724, 0.7200000000000001, 0.732],
 [0.732, 0.732, 0.7319999999999999, 0.7319999999999999],
 [0.732, 0.72, 0.744, 0.7479999999999999]]

In [41]:
# Mean over seeds of the fifth appended run, one value per configuration.
np.mean(all_scores[4], axis=0)

array([0.732 , 0.7176, 0.726 , 0.7252])

Cars: gencode, gencode+ macro recalib -> gencode+ micro not recalib -> gencode+ macro not recalib, gencode+ micro recalib

Titanic: same

Weather: almost same (bad dataset)

Cameras: same, but macro recalib is good (micro recalib 2nd place)

Iris: bad dataset

In [47]:
# Build the boolean matrix for the whole 250-row Music sample (no CV split),
# write it to ./data/bool.csv and display its shape.
df = pd.read_csv("../data/Music/cleaned.csv")
y = df['y']
df = pd.DataFrame(df.drop(columns=['y']).values)
df['y'] = y.values

# One row per distinct feature vector, then the fixed-seed sample.
feature_columns = list(range(df.shape[1] - 1))
df = df.drop_duplicates(subset=feature_columns)
df = df.sample(250, random_state=42)  # the sampling seed is deliberately always 42

threshs, threshs_raveled = compute_thresholds(df)
M = gen_bool_matrix(df, threshs, threshs_raveled)

# Every pair of differently labelled rows must be separated by some threshold.
assert M.sum(axis=1).min() >= 1
M.astype(int).to_csv('./data/bool.csv', index=False)
M.shape

(20425, 1310)