In [298]:
from itertools import product
from subprocess import call

import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold

In [299]:
def compute_thresholds(df):
    """Find candidate split thresholds for every feature column.

    ``df`` must hold the features in columns labelled ``0 .. n-1`` followed by
    a last column named ``'y'`` with the class labels.

    For each feature the rows are sorted by that feature.  Rows that share a
    feature value but carry more than one distinct label are merged into a
    pseudo-class of their own (a unique negative label), because no threshold
    on this feature can tell them apart.  A threshold is then placed halfway
    between two consecutive rows whenever both the feature value and the
    (possibly merged) label change.

    Parameters
    ----------
    df : pd.DataFrame
        Feature columns ``0 .. n-1`` plus the label column ``'y'``.  The frame
        is not modified.

    Returns
    -------
    threshs : list of np.ndarray
        One array of thresholds per feature, in ascending order.
    threshs_raveled : list
        All thresholds concatenated feature by feature.
    """
    threshs = []
    threshs_raveled = []
    # Pseudo-class ids count downwards from -1 and are shared across features;
    # this assumes the real labels are non-negative.
    next_pseudo_label = -1
    for i in range(df.shape[1] - 1):
        df_sorted = df.sort_values(by=i).reset_index(drop=True)
        feature = df_sorted[i].to_numpy()
        # Private copy: the array returned by pandas may be a read-only view.
        labels = df_sorted['y'].to_numpy().copy()

        # One pass over the groups instead of re-filtering the frame per row.
        labels_per_value = df_sorted['y'].groupby(df_sorted[i]).nunique(dropna=False)
        mixed_values = labels_per_value.index[labels_per_value.to_numpy() > 1]
        for val in mixed_values:
            labels[feature == val] = next_pseudo_label
            next_pseudo_label -= 1

        # Position k is a boundary when both label and value differ from row k-1.
        boundary = (np.diff(labels) != 0) & (np.diff(feature) != 0)
        ids = np.flatnonzero(boundary) + 1
        threshs.append((feature[ids] + feature[ids - 1]) / 2)
        threshs_raveled += list(threshs[-1])
    return threshs, threshs_raveled

In [300]:
def gen_bool_matrix(df: pd.DataFrame, threshs, threshs_raveled):
    """Build the pair-separation matrix for a set of thresholds.

    Every row of the result corresponds to one pair of rows of ``df`` that
    carry different labels; every column corresponds to one threshold.  An
    entry is ``True`` when the threshold separates the pair, i.e. one row's
    feature value is ``<=`` the threshold and the other's is ``>`` it.

    Parameters
    ----------
    df : pd.DataFrame
        Feature columns followed by a last column named ``'y'``.
    threshs : sequence of array-like
        Thresholds per feature; position ``i`` belongs to the ``i``-th column.
    threshs_raveled : sequence of float
        The same thresholds flattened feature by feature.

    Returns
    -------
    pd.DataFrame
        Boolean frame whose columns repeat each feature's label once per
        threshold.  Pairs are ordered by ascending label pair, then by the
        row order within the first and second class.
    """
    unique_labels = sorted(df['y'].unique())

    # Positional index of the source feature for every threshold, label last.
    n_threshs = []
    for i in range(len(threshs)):
        n_threshs += [i] * len(threshs[i])
    n_threshs += [-1]  # label

    df_stretched = df.iloc[:, n_threshs]
    cuts = np.asarray(threshs_raveled)
    classes = [df_stretched[df_stretched['y'] == y].drop(columns=['y']).values for y in unique_labels]

    # Compare every row with every threshold once per class, not once per pair.
    below = [rows <= cuts for rows in classes]
    above = [rows > cuts for rows in classes]

    blocks = []
    # Index the per-class arrays by position: labels need not be 0..k-1.
    for a in range(len(classes)):
        for b in range(a + 1, len(classes)):
            separated = (
                (below[a][:, None, :] & above[b][None, :, :])
                | (above[a][:, None, :] & below[b][None, :, :])
            )
            n_pairs = len(classes[a]) * len(classes[b])
            blocks.append(separated.reshape(n_pairs, cuts.size))

    if blocks:
        res = np.concatenate(blocks)
    else:
        # Fewer than two classes: there are no pairs to separate.
        res = np.empty((0, cuts.size), dtype=bool)
    return pd.DataFrame(res, columns=df_stretched.columns[:-1])

In [301]:
def get_weights(df, threshs, column_names, recalibrate=True):
    """Compute one weight per threshold from the class counts around it.

    For threshold ``t_i`` of a feature, the "left" rows are those with values
    in ``(t_{i-1}, t_i]`` and the "right" rows those in ``[t_i, t_{i+1})``.
    For every ordered pair of distinct classes ``(cls1, cls2)`` present on
    either side, the score is::

        | KR / (KR + KL) * left_n  -  KL / (KR + KL) * right_n |

    where ``KL``/``KR`` are the total counts of ``cls1``/``cls2`` in ``df``,
    ``left_n`` is the number of ``cls1`` rows on the left and ``right_n`` the
    number of ``cls2`` rows on the right.  The threshold's weight is the mean
    of these scores.

    Parameters
    ----------
    df : pd.DataFrame
        Feature columns labelled ``0 .. n-1`` plus the label column ``'y'``.
    threshs : sequence of array-like
        ``threshs[feature]`` holds the ascending thresholds of that feature;
        it may be empty.
    column_names : sequence
        Column labels for the returned frame, one per threshold.
    recalibrate : bool, default True
        Rescale each feature's weights so that its mean equals the average of
        the per-feature means.  Features without thresholds, or whose mean
        weight is zero, are left untouched instead of producing NaN.

    Returns
    -------
    pd.DataFrame
        A single-row frame with the weights of all thresholds, feature by
        feature.

    Raises
    ------
    AssertionError
        If the bins of a feature do not partition the rows of ``df`` or one
        of the bins is empty.
    """
    class_totals = df['y'].value_counts().sort_index()
    weights = []
    for feature in df.columns:
        if feature == 'y':
            continue
        feature_ts = [-np.inf] + np.asarray(threshs[feature]).tolist() + [+np.inf]
        values = df[feature]
        w = []
        N = []  # rows per bin, only used by the sanity checks below

        in_right = None
        for i in range(1, len(feature_ts) - 1):
            in_left = (values > feature_ts[i - 1]) & (values <= feature_ts[i])
            in_right = (values < feature_ts[i + 1]) & (values >= feature_ts[i])
            left_counts = df.loc[in_left, 'y'].value_counts().sort_index()
            right_counts = df.loc[in_right, 'y'].value_counts().sort_index()

            # Both sides range over the union of classes seen around the
            # threshold; a class missing from one side counts as zero there.
            left_classes = list(left_counts.index) + [
                cls for cls in right_counts.index if cls not in left_counts.index
            ]
            right_classes = list(right_counts.index) + [
                cls for cls in left_counts.index if cls not in right_counts.index
            ]

            scores = []
            for cls1 in left_classes:
                for cls2 in right_classes:
                    if cls1 == cls2:
                        continue
                    KL = class_totals.loc[cls1]
                    KR = class_totals.loc[cls2]
                    left_n = left_counts.get(cls1, 0)
                    right_n = right_counts.get(cls2, 0)
                    scores.append(abs(KR / (KR + KL) * left_n - KL / (KR + KL) * right_n))
            w.append(np.mean(scores))

            N.append(in_left.sum())
        # Last bin.  A feature without thresholds has one bin holding all rows.
        N.append(df.shape[0] if in_right is None else in_right.sum())

        assert sum(N) == df.shape[0]
        assert 0 not in N
        assert len(w) + 1 == len(N)

        weights.append(np.array(w))

    # recalibrate
    if recalibrate:
        means = [w.mean() for w in weights if w.size]
        if means:
            mu = np.mean(means)
            weights = [
                w * (mu / w.mean()) if w.size and w.mean() != 0 else w
                for w in weights
            ]

    flat = np.concatenate(weights) if weights else np.array([])

    temp = pd.DataFrame(flat.reshape(1, -1))
    temp.columns = column_names

    return temp

In [445]:
def get_encoded_df(M, cols, df, thresh, threshs_raveled, train=True):
    """Replace feature values by the index of the bin they fall into.

    Parameters
    ----------
    M : pd.DataFrame
        Pair-separation matrix; ``M.columns[c]`` names the feature that
        threshold ``c`` belongs to.
    cols : iterable of int
        Positions of the selected thresholds (columns of ``M``).
    df : pd.DataFrame
        Frame with the raw features and the label column ``'y'``.
    thresh
        Unused; kept so existing callers keep working.
    threshs_raveled : sequence of float
        Threshold values, aligned with the columns of ``M``.
    train : bool, default True
        When true, assert that every bin of every feature is populated.

    Returns
    -------
    pd.DataFrame
        Integer frame with one column per feature of ``M`` (first-occurrence
        order) plus ``'y'``.  With ascending selected thresholds
        ``t_1 < ... < t_k``, bin ``j`` covers ``(t_j, t_{j+1}]``, which matches
        the ``<=`` / ``>`` convention used when the thresholds were scored.  A
        feature with no selected threshold is encoded as a single bin ``0``.
    """
    features_threshs = {}
    for c in cols:
        features_threshs.setdefault(M.columns[c], []).append(threshs_raveled[c])

    df_ = {}
    # M repeats a feature once per threshold; encode each feature only once.
    for feature in dict.fromkeys(M.columns):
        selected = sorted(features_threshs.get(feature, []))
        all_t = [-np.inf] + selected + [+np.inf]
        raw = df[feature].values
        feature_copy = raw.copy()
        for t_idx in range(1, len(all_t)):
            # Upper bound is inclusive so a value equal to a threshold (which
            # can happen on unseen data) still gets a bin code.
            cond = (raw > all_t[t_idx - 1]) & (raw <= all_t[t_idx])
            feature_copy[cond] = t_idx - 1
        df_[feature] = feature_copy
        if train:
            assert len(np.unique(feature_copy)) == len(selected) + 1, feature
    df_['y'] = df['y'].copy()
    df_ = pd.DataFrame(df_)

    return df_.astype(int)

# Dataset

In [544]:
# Load the cleaned Titanic table; the class label is stored in column `y`.
dataset_filename = "../../data/Titanic/cleaned.csv"
df = pd.read_csv(dataset_filename)

# Rename the feature columns to positions 0..n-1 and re-attach `y` as the last column.
y = df['y']
df = pd.DataFrame(df.drop(columns=['y']).to_numpy())
df['y'] = y.to_numpy()

# Keep a single row per distinct feature vector (the label is ignored when comparing).
df = df.drop_duplicates(subset=list(range(df.shape[1] - 1)))

df

Unnamed: 0,0,1,2,3,4,5,6,7,y
0,3.0,0.0,22.0,1.0,0.0,7.2500,0.0,0.0,0
1,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,1.0,1
2,3.0,1.0,26.0,0.0,0.0,7.9250,0.0,0.0,1
3,1.0,1.0,35.0,1.0,0.0,53.1000,1.0,0.0,1
4,3.0,0.0,35.0,0.0,0.0,8.0500,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...
883,3.0,1.0,39.0,0.0,5.0,29.1250,0.0,2.0,0
885,1.0,1.0,19.0,0.0,0.0,30.0000,6.0,0.0,1
886,3.0,1.0,28.0,1.0,2.0,23.4500,0.0,0.0,0
887,1.0,0.0,26.0,0.0,0.0,30.0000,1.0,1.0,1


In [545]:
# Work on a fixed, reproducible 250-row subsample.
df = df.sample(n=250, random_state=42)

In [546]:
# Feature matrix / label vector for the baseline run, plus the 10-fold splitter.
data = df
X = data.drop(columns=['y']).values
y = data['y'].values
kf = StratifiedKFold(n_splits=10)

In [547]:
# Baseline: random forest on the raw (un-binned) features, scored fold by fold.
accs = []

for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    # Fraction of correctly predicted test rows.
    acc = np.mean(y_hat == y_test)
    print(f"{i}th fold, accuracy {acc}")
    accs.append(acc)

print(np.mean(accs))

0th fold, accuracy 0.64
1th fold, accuracy 0.68
2th fold, accuracy 0.72
3th fold, accuracy 0.8
4th fold, accuracy 0.6
5th fold, accuracy 0.64
6th fold, accuracy 0.88
7th fold, accuracy 0.76
8th fold, accuracy 0.64
9th fold, accuracy 0.8
0.716


In [522]:
# One pass of the threshold-selection pipeline on the sampled frame.
threshs, threshs_raveled = compute_thresholds(df)
M = gen_bool_matrix(df, threshs, threshs_raveled)
# Every pair of differently-labelled rows must be separated by some threshold.
assert M.sum(axis=1).min() >= 1

# Hand the separation matrix and the threshold weights to the external solver.
M.astype(int).to_csv('./data/bool.csv', index=False)
weights = get_weights(df, threshs, M.columns, recalibrate=True)
weights.to_csv('./data/ranks.csv', index=False, header=False)

ga_status = call(["./GeneticAlgorithm.o", str(M.shape[0]), str(M.shape[1]), "elementwise", "maxbinsnum"])
if ga_status != 0:
    # Without this check a failed run would silently reuse a stale results.txt.
    raise RuntimeError(f"GeneticAlgorithm.o exited with status {ga_status}")

# The first line of the results file lists the selected column positions of M.
with open("./data/results.txt", 'r') as file:
    cols = list(map(int, file.readline().split()))

# Whole pipeline

In [548]:
def test_method(filename, rank_function, fitness_function, n_folds, recalibrate=True, sample=None, random_state=42):
    """Cross-validate a random forest on features binned by solver-selected thresholds.

    For every stratified fold the thresholds, the pair-separation matrix and
    the threshold weights are computed on the training part only, written to
    ``./data/bool.csv`` and ``./data/ranks.csv``, and passed to the external
    ``./GeneticAlgorithm.o`` binary.  The column positions it writes to
    ``./data/results.txt`` select the thresholds used to bin both the training
    and the test part before fitting and scoring the classifier.  Per-fold and
    mean accuracy are printed.

    Parameters
    ----------
    filename : str
        CSV file with the features and a label column named ``y``.
    rank_function, fitness_function : str
        Passed verbatim to the solver as its third and fourth arguments.
    n_folds : int
        Number of stratified folds.
    recalibrate : bool, default True
        Forwarded to ``get_weights``.
    sample : int or None, default None
        If truthy, number of rows to subsample before cross-validation.
    random_state : int, default 42
        Seed for the subsample and the classifier.

    Raises
    ------
    RuntimeError
        If the solver exits with a non-zero status.
    """
    df = pd.read_csv(filename)
    y = df.y
    # Feature columns are renamed to positions 0..n-1, with `y` kept last.
    df = pd.DataFrame(df.drop(columns=['y']).values)
    df['y'] = y.values
    # One row per distinct feature vector; the label is ignored when comparing.
    df = df.drop_duplicates(subset=list(range(df.shape[1] - 1)))
    if sample:
        df = df.sample(sample, random_state=random_state)
    y = df['y']

    kf = StratifiedKFold(n_splits=n_folds)

    accs = []
    for i, (train_index, test_index) in enumerate(kf.split(df, y.values)):
        df_train = df.iloc[train_index].reset_index(drop=True)
        df_test = df.iloc[test_index].reset_index(drop=True)

        threshs, threshs_raveled = compute_thresholds(df_train)
        M = gen_bool_matrix(df_train, threshs, threshs_raveled)
        # Every pair of differently-labelled rows must be separated by some threshold.
        assert M.sum(axis=1).min() >= 1
        M.astype(int).to_csv('./data/bool.csv', index=False)
        weights = get_weights(df_train, threshs, M.columns, recalibrate=recalibrate)
        weights.to_csv('./data/ranks.csv', index=False, header=False)

        status = call(["./GeneticAlgorithm.o", str(M.shape[0]), str(M.shape[1]), rank_function, fitness_function])
        if status != 0:
            # Without this check a failed run would silently reuse a stale results.txt.
            raise RuntimeError(f"GeneticAlgorithm.o exited with status {status}")
        with open("./data/results.txt", 'r') as file:
            cols = list(map(int, file.readline().split()))

        df_train = get_encoded_df(M, cols, df_train, threshs, threshs_raveled, train=True)
        df_test = get_encoded_df(M, cols, df_test, threshs, threshs_raveled, train=False)
        X_train, y_train = df_train.drop(columns=['y']).values, df_train['y'].values
        X_test, y_test = df_test.drop(columns=['y']).values, df_test['y'].values

        model = RandomForestClassifier(random_state=random_state)
        model.fit(X_train, y_train)
        y_hat = model.predict(X_test)

        acc = (y_hat == y_test).sum() / len(y_test)
        print(f"{i}th fold, accuracy {acc}")
        accs.append(acc)

    print(f"Mean accuracy {np.mean(accs)}")

In [549]:
# 10-fold CV on a 250-row Titanic sample; solver run with "elementwise" ranking.
test_method(
    "../../data/Titanic/cleaned.csv",
    rank_function="elementwise",
    fitness_function="maxbinsnum",
    n_folds=10,
    sample=250,
    recalibrate=True,
)

0th fold, accuracy 0.64
1th fold, accuracy 0.72
2th fold, accuracy 0.76
3th fold, accuracy 0.84
4th fold, accuracy 0.6
5th fold, accuracy 0.64
6th fold, accuracy 0.84
7th fold, accuracy 0.84
8th fold, accuracy 0.56
9th fold, accuracy 0.76
Mean accuracy 0.72


In [550]:
# Same setup as the elementwise run, but with "groupwise" ranking.
test_method(
    "../../data/Titanic/cleaned.csv",
    rank_function="groupwise",
    fitness_function="maxbinsnum",
    n_folds=10,
    sample=250,
    recalibrate=True,
)

0th fold, accuracy 0.56
1th fold, accuracy 0.72
2th fold, accuracy 0.84
3th fold, accuracy 0.84
4th fold, accuracy 0.6
5th fold, accuracy 0.72
6th fold, accuracy 0.88
7th fold, accuracy 0.8
8th fold, accuracy 0.56
9th fold, accuracy 0.68
Mean accuracy 0.72
