In [1]:
from ipynb.fs.full.Mimic import *
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

In [21]:
def generate_evil_data(size, num_clusters, num_dims, num_bias, drop_min=0.05, plot=False, seed=None, spread=40):
    """Search for a seed whose biased training set costs an SVC at least ``drop_min`` accuracy.

    For each candidate seed two point sets are drawn with ``generatePills``
    (one per class: label 0 for the first, label 1 for the second), each is
    split 70/30 into train/test, and the training parts are passed through
    ``generateBias``. An SVC is then fitted once on a 300-point sample of the
    unbiased training data and once on a 300-point sample of the biased
    training data; the seed is accepted when the test accuracy drops by at
    least ``drop_min``. Otherwise the seed is advanced by 4 (seed .. seed+3
    are consumed per attempt) and the procedure repeats.

    NOTE: there is no retry limit -- the loop runs until a seed qualifies.

    Parameters
    ----------
    size : total number of points requested; half is generated per class.
    num_clusters, num_dims : forwarded to ``generatePills``.
    num_bias : forwarded to ``generateBias``; clamped to ``num_clusters``.
    drop_min : minimum required accuracy drop (unbiased minus biased).
    plot : if True, call ``plotBias`` on the accepted dataset.
    seed : first seed to try; drawn at random from [1000, 9999) when None.
    spread : passed to ``generatePills`` as ``mean_low`` (and ``spread+1`` as
        ``mean_high``).

    Returns
    -------
    (train, l_train, test, l_test, train_b, l_b, deleted, seed) where
    ``train``/``test`` are the unbiased splits, ``train_b`` is the biased
    training data, the ``l_*`` arrays hold the 0/1 class labels, ``deleted``
    is the concatenation of both ``generateBias`` third return values (the
    second one offset by the size of the first training part), and ``seed``
    is the seed that was accepted.
    """
    num_bias = min(num_bias, num_clusters)
    if seed is None:
        seed = np.random.randint(1000, 9999)
    half = int(size/2)
    sample_size = 300  # points used for the quick accept/reject fits

    while True:
        clf = SVC(random_state=seed)
        rng = np.random.default_rng(seed)

        # generate dataset
        data1, labels1 = generatePills(half, num_clusters, num_dims, mean_low=spread, mean_high=spread+1, seed=seed)
        data2, labels2 = generatePills(half, num_clusters, num_dims, mean_low=spread, mean_high=spread+1, seed=seed+1)

        # split into test/train -- sample WITHOUT replacement so the test set
        # has no duplicate rows and really holds 30% of the points
        idcs_test1 = rng.choice(len(data1), int(0.3*len(data1)), replace=False)
        idcs_test2 = rng.choice(len(data2), int(0.3*len(data2)), replace=False)
        idcs_train1 = np.setdiff1d(range(len(data1)), idcs_test1)
        idcs_train2 = np.setdiff1d(range(len(data2)), idcs_test2)

        # bias training set
        d1, l1, deleted1 = generateBias(data1[idcs_train1], labels1[idcs_train1], num_bias, prob=0, seed=seed+2)
        d2, l2, deleted2 = generateBias(data2[idcs_train2], labels2[idcs_train2], num_bias, prob=0, seed=seed+3)

        # concatenate
        train = np.concatenate((data1[idcs_train1], data2[idcs_train2]))
        l_train = np.append([0]*len(idcs_train1), [1]*len(idcs_train2))
        test = np.concatenate((data1[idcs_test1], data2[idcs_test2]))
        l_test = np.append([0]*len(idcs_test1), [1]*len(idcs_test2))
        train_b = np.concatenate((d1, d2))
        l_b = np.append([0]*len(l1), [1]*len(l2))
        # indices from the second part are shifted so they address `train`
        deleted = np.append(deleted1, deleted2+len(idcs_train1))

        # calculate accuracies on subsamples drawn from the seeded generator,
        # so the accept/reject decision is reproducible for a given seed
        sample = rng.integers(0, len(l_train), sample_size)
        acc_full = clf.fit(train[sample], l_train[sample]).score(test, l_test)
        sample = rng.integers(0, len(l_b), sample_size)
        acc_bias = clf.fit(train_b[sample], l_b[sample]).score(test, l_test)

        if acc_full - acc_bias >= drop_min:
            if plot: plotBias(train, l_train, deleted, title="Accuracy drop: "+str(acc_full - acc_bias))
            print("Seed: ", seed)
            return train, l_train, test, l_test, train_b, l_b, deleted, seed
        seed += 4

In [42]:
# if we select the number of components via bic
def fit_gmm(data, max_clusters=50):
    """Fit Gaussian mixtures with 1 .. max_clusters-1 components and keep the lowest-BIC one.

    Parameters
    ----------
    data : samples to fit, passed unchanged to ``GaussianMixture.fit``.
    max_clusters : exclusive upper bound on the number of components tried.

    Returns
    -------
    list of ``(mean, covariance)`` tuples, one per component of the selected
    mixture.

    Raises
    ------
    ValueError
        If no candidate model could be selected (e.g. ``data`` is empty or
        ``max_clusters <= 1``).
    """
    # GaussianMixture refuses n_components > n_samples, so never try more
    # components than there are points.
    upper = min(max_clusters, len(data) + 1)

    best_bic = np.inf  # np.Inf alias was removed in NumPy 2.0
    best_gm = None
    for c in range(1, upper):
        gm = GaussianMixture(n_components=c, random_state=0).fit(data)
        bic = gm.bic(data)
        if bic < best_bic:
            best_bic = bic
            best_gm = gm

    if best_gm is None:
        raise ValueError("fit_gmm: no mixture could be fitted "
                         "(need at least one sample and max_clusters > 1)")

    # The fit is deterministic (random_state=0), so the stored model is the
    # same one a refit with the winning component count would produce.
    return list(zip(best_gm.means_, best_gm.covariances_))

In [35]:
def augment_multiClass_gmm(data, labels):
    """Augment each class on its own, using mixture parameters from ``fit_gmm``.

    For every distinct value in ``labels`` the rows of that class are fitted
    with ``fit_gmm`` and handed to ``augment`` (``plots=False, purge=False``);
    the first element of ``augment``'s result is collected as the new points.

    Returns
    -------
    (points, point_labels, clusters): the generated points of all classes
    concatenated, one class label per generated point, and the total number
    of mixture components summed over the classes.
    """
    new_points, new_labels = [], []
    n_components = 0
    for cls in np.unique(labels):
        mask = labels == cls
        params = fit_gmm(data[mask])
        generated, _ = augment(data[mask], params, plots=False, purge=False)
        new_points.append(generated)
        new_labels.append([cls] * len(generated))
        n_components += len(params)
    return np.concatenate(new_points), np.concatenate(new_labels), n_components

In [37]:
def augment_multiClass_mimic(data, labels):
    """Augment each class on its own, using the parameters returned by ``fit``.

    For every distinct value in ``labels`` the rows of that class are passed
    to ``fit`` (``plots=False``); the second element of its result is used as
    the parameter list for ``augment`` (``plots=False, purge=False``), whose
    first return value is collected as the new points.

    Returns
    -------
    (points, point_labels, clusters): the generated points of all classes
    concatenated, one class label per generated point, and the total number
    of fitted parameter entries summed over the classes.
    """
    new_points, new_labels = [], []
    n_components = 0
    for cls in np.unique(labels):
        mask = labels == cls
        _, params = fit(data[mask], plots=False)
        generated, _ = augment(data[mask], params, plots=False, purge=False)
        new_points.append(generated)
        new_labels.append([cls] * len(generated))
        n_components += len(params)
    return np.concatenate(new_points), np.concatenate(new_labels), n_components

In [38]:
def augment_multiClass_imi(data, labels):
    """Augment each class on its own with ``Imitate_augment``.

    Every class is passed to ``Imitate_augment`` separately, together with an
    all-zero label array of matching length, and the first element of the
    result is collected as the new points.

    Returns
    -------
    (points, point_labels, clusters): the generated points of all classes
    concatenated, one class label per generated point, and the number of
    distinct classes in ``labels``.
    """
    classes = np.unique(labels)
    new_points, new_labels = [], []
    for cls in classes:
        mask = labels == cls
        dummy_labels = np.array([0] * np.sum(mask))
        generated, _ = Imitate_augment(data[mask], dummy_labels)
        new_points.append(generated)
        new_labels.append([cls] * len(generated))
    return np.concatenate(new_points), np.concatenate(new_labels), len(classes)

# Spread

In [23]:
# # initialize file
# f = open("Results/exp_predictionAcc_spread.txt", "w") # a is append, w is write
# f.write("seed,num_clusters,num_dims,spread,clf,optimal,biased,method,augmented_acc,added_points")
# f.close()

In [30]:
def run_experiment_spread(num_clusters, num_dims, spread, filename, rep=10):
    """Run ``rep`` repetitions of the augmentation comparison and append the results to a file.

    Each repetition draws a dataset with ``generate_evil_data`` (5000 points,
    ``drop_min=0.10``), augments the biased training set with the GMM,
    Imitate and Mimic augmenters, and scores a decision tree, an SVM and a
    random forest on the test set after training on the original, biased and
    each augmented training set.

    Parameters
    ----------
    num_clusters, num_dims, spread : dataset settings forwarded to
        ``generate_evil_data`` (``num_clusters`` is also used as ``num_bias``).
    filename : results are appended to ``Results/<filename>.txt``.
    rep : number of repetitions.

    Output rows have the form
    ``seed,num_clusters,num_dims,spread,clf,optimal,biased,method,augmented_acc,added_points``
    and are written with a leading newline, three rows (one per method) per
    classifier. A repetition in which the Mimic augmentation raises is
    reported on stdout and skipped without writing anything.
    """
    clfs = [DecisionTreeClassifier(), SVC(), RandomForestClassifier()]
    clf_names = ['DT', 'SVM', 'RF']
    for _ in range(rep):
        seed = np.random.randint(1000, 9999)
        X_tr, y_tr, X_te, y_te, X_b, y_b, _, seed = generate_evil_data(5000, num_clusters, num_dims, num_clusters,
                                                                       drop_min=0.10, spread=spread, seed=seed)

        # The augmenters return (points, labels, cluster_count); the count is
        # not needed here.
        p_gmm, l_gmm, _ = augment_multiClass_gmm(X_b, y_b)
        X_gmm, y_gmm = np.concatenate((X_b, p_gmm)), np.append(y_b, l_gmm)

        p_imi, l_imi, _ = augment_multiClass_imi(X_b, y_b)
        X_imi, y_imi = np.concatenate((X_b, p_imi)), np.append(y_b, l_imi)

        try:
            p_mim, l_mim, _ = augment_multiClass_mimic(X_b, y_b)
            X_mim, y_mim = np.concatenate((X_b, p_mim)), np.append(y_b, l_mim)
        except Exception as exc:
            # Skip this repetition but let KeyboardInterrupt/SystemExit through
            # and show what actually went wrong.
            print(str(seed)+","+str(num_clusters)+","+str(num_dims), "broken", repr(exc))
            continue

        augmented = [
            ("GMM", X_gmm, y_gmm, l_gmm),
            ("Imitate", X_imi, y_imi, l_imi),
            ("Mimic (ours)", X_mim, y_mim, l_mim),
        ]

        # `with` guarantees the file is closed even if a classifier raises
        with open("Results/"+filename+".txt", "a") as f:
            for clf, clf_name in zip(clfs, clf_names):
                o = clf.fit(X_tr, y_tr).score(X_te, y_te)  # original
                b = clf.fit(X_b, y_b).score(X_te, y_te)  # biased
                scores = [clf.fit(X_aug, y_aug).score(X_te, y_te) for _, X_aug, y_aug, _ in augmented]

                fixed = "\n"+",".join(map(str, [seed, num_clusters, num_dims, spread, clf_name, o, b]))
                for (method, _, _, added), acc in zip(augmented, scores):
                    f.write(fixed+","+method+","+str(acc)+","+str(len(added)))

In [27]:
# Spread sweep: 1-5 clusters in 2 dimensions at several spread values.
# run_experiment_spread requires `filename`; results go to the file prepared by
# the (commented-out) initialisation cell, Results/exp_predictionAcc_spread.txt.
for cl in range(1, 6):
    for dims in [2]:
        for spread in [10, 50, 100, 150, 200]:
            print("Starting", cl, "clusters and", dims, "dimensions with", spread, "spread")
            run_experiment_spread(cl, dims, spread, rep=25, filename="exp_predictionAcc_spread")

Starting 1 clusters and 2 dimensions with 10 spread
Seed:  5333
Seed:  9557
Seed:  3515
Seed:  2023
Seed:  4975
Seed:  3022
Seed:  5636
Seed:  8638
Seed:  7862
7862,1,2 broken
Seed:  5773
Seed:  5398
Seed:  3567
Seed:  5952
Seed:  5986
Seed:  1160
Seed:  9299
Seed:  8905
Seed:  6542
Seed:  9902
Seed:  5803
5803,1,2 broken
Seed:  4250
Seed:  5098
Seed:  1564
1564,1,2 broken
Seed:  2438
Seed:  7418
Starting 1 clusters and 2 dimensions with 50 spread
Seed:  9407
Seed:  3355
Seed:  1102
Seed:  3477
Seed:  9457
Seed:  9823
Seed:  2690
Seed:  3963
Seed:  2134
Seed:  5198
5198,1,2 broken
Seed:  1646
Seed:  6590
Seed:  2702
Seed:  5371
Seed:  7280
Seed:  8904
Seed:  7257
Seed:  3997
Seed:  1691
Seed:  7417
Seed:  5745
Seed:  2990
Seed:  2987
2987,1,2 broken
Seed:  4705
Seed:  1289
Starting 1 clusters and 2 dimensions with 100 spread
Seed:  9179
Seed:  4380
Seed:  8824
Seed:  5630
Seed:  1560
Seed:  4557
Seed:  7138
Seed:  4787
4787,1,2 broken
Seed:  4658
Seed:  9373
Seed:  6203
6203,1,2 broken

Seed:  7371
Seed:  8201
Seed:  8946
Seed:  7954
Seed:  3881
Seed:  1248
Seed:  9950
Seed:  2033
Seed:  6919
Seed:  8239
Starting 5 clusters and 2 dimensions with 150 spread
Seed:  1248
Seed:  6307
Seed:  5450
Seed:  8512
Seed:  8084
Seed:  8748
Seed:  1222
Seed:  5205
Seed:  2843
Seed:  2755
Seed:  4785
Seed:  2309
Seed:  9528
Seed:  6774
Seed:  2387
Seed:  8005
Seed:  2109
Seed:  7494
Seed:  8084
Seed:  5440
Seed:  7505
Seed:  1259
Seed:  9328
Seed:  5021
Seed:  4960
Starting 5 clusters and 2 dimensions with 200 spread
Seed:  7288
Seed:  4959
Seed:  7566
Seed:  3429
Seed:  6885
Seed:  1876
Seed:  4043
Seed:  7727
Seed:  4959
Seed:  10417
Seed:  5034
Seed:  7079
Seed:  8718
Seed:  5958
Seed:  6927
Seed:  3598
Seed:  6470
Seed:  2879
Seed:  5746
Seed:  2454
2454,5,2 broken
Seed:  7433
Seed:  4825
Seed:  2775
Seed:  4932
Seed:  7288


# Dimensions

In [28]:
# # initialize file
# f = open("Results/exp_predictionAcc_dims.txt", "w") # a is append, w is write
# f.write("seed,num_clusters,num_dims,spread,clf,optimal,biased,method,augmented_acc,added_points")
# f.close()

In [31]:
# Dimensionality sweep: 1-5 clusters, 2-5 dimensions, spread fixed at 100.
# Every configuration is repeated 30 times and appended to
# Results/exp_predictionAcc_dims.txt.
for cl in (1, 2, 3, 4, 5):
    for dims in range(2, 6):
        for spread in (100,):
            print("Starting", cl, "clusters and", dims, "dimensions with", spread, "spread")
            run_experiment_spread(
                cl, dims, spread,
                filename="exp_predictionAcc_dims",
                rep=30,
            )

Starting 1 clusters and 2 dimensions with 100 spread
Seed:  8217
Seed:  7546
Seed:  7987
Seed:  8902
Seed:  8574
8574,1,2 broken
Seed:  9130
Seed:  1443
Seed:  4345
Seed:  8249
Seed:  1779
Seed:  1965
Seed:  1467
Seed:  1965
Seed:  9061
Seed:  9159
Seed:  8185
Seed:  8184
Seed:  8184
Seed:  6745
Seed:  2242
Seed:  9377
Seed:  2314
Seed:  9244
Seed:  9958
Seed:  8184
Seed:  8574
Seed:  1670
Seed:  5437
Seed:  1948
Seed:  6375
Starting 1 clusters and 3 dimensions with 100 spread
Seed:  9972
Seed:  4279
Seed:  8076
Seed:  4674
4674,1,3 broken
Seed:  8449
Seed:  4749
4749,1,3 broken
Seed:  5602
Seed:  4810
Seed:  4345
Seed:  4618
Seed:  2112
Seed:  6620
6620,1,3 broken
Seed:  5069
5069,1,3 broken
Seed:  10090
Seed:  1171
Seed:  6918
Seed:  8551
Seed:  7570
7570,1,3 broken
Seed:  1895
1895,1,3 broken
Seed:  7989
Seed:  3183
Seed:  7465
Seed:  3361
3361,1,3 broken
Seed:  4217
4217,1,3 broken
Seed:  3319
Seed:  8609
Seed:  8076
8076,1,3 broken
Seed:  7539
7539,1,3 broken
Seed:  8968
Seed:  35

Seed:  7748
Seed:  6516
Seed:  5431
Seed:  6990
Seed:  3146
Seed:  9382
Seed:  5020
Seed:  1891
Seed:  7264
Seed:  6760
Seed:  5203
Seed:  8216
Seed:  3744
Seed:  5758
Seed:  5315
Seed:  1135
Seed:  4031
Seed:  5697
Seed:  5450
Seed:  6787
Starting 5 clusters and 4 dimensions with 100 spread
Seed:  10233
Seed:  8207
Seed:  9229
Seed:  7994
Seed:  4873
Seed:  1427
Seed:  9746
Seed:  1714
Seed:  4083
Seed:  8312
Seed:  4435
Seed:  9388
Seed:  5530
Seed:  6409
Seed:  12030
Seed:  8768
Seed:  8352
Seed:  2453
2453,5,4 broken
Seed:  9828
Seed:  2247
Seed:  9290
Seed:  6306
6306,5,4 broken
Seed:  8884
8884,5,4 broken
Seed:  7994
Seed:  4920
Seed:  2428
Seed:  4920
Seed:  4248
4248,5,4 broken
Seed:  4975
Seed:  6786
Starting 5 clusters and 5 dimensions with 100 spread
Seed:  11003
Seed:  19227
Seed:  9496
9496,5,5 broken
Seed:  8615
Seed:  7706
Seed:  8184
Seed:  14819
Seed:  5937
Seed:  2888
2888,5,5 broken
Seed:  14899
14899,5,5 broken
Seed:  7652
Seed:  24445
24445,5,5 broken
Seed:  17446
