In [1]:
import pandas as pd
import numpy as np
import h5py
import math

from keras.models import Sequential, model_from_json
from keras.layers import Dense, Dropout, Activation, regularizers, Flatten

from sklearn import ensemble, preprocessing, multiclass
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score, train_test_split


from collections import Counter

Using TensorFlow backend.


In [2]:
# Transformation

def sqrt(col):
    return list(map(np.sqrt, col));

def freq(col):
    col = np.floor(col)
    counter = Counter(col)
    return [counter.get(elem) for elem in col]


In [181]:
# Globals

# Datasets
dids = np.load("datasets/indexes.npy")

# RF model parameters
seed = 67
transformations = [sqrt, freq]
transformations_name = ["sqrt", "freq"]
#trans2target1 = {}
#trans2target2 = {}
#trans2target3 = {}

# Comrpessed Dataset paramters
qsa_representation = []
num_bin = 10
too_big = 100000

# Neural Nets Parameters and Variables
#MLP_LFE_Nets = {}
inp_shape = (2,num_bin)
dropout = 0.2
norm = (0, 10)
pred_threshold = 0.65


In [4]:
#def binarize_dataset():

def load_dataset(id):
    X = np.load("datasets/" + str(id) + "-data.npy")
    y = np.load("datasets/" + str(id) + "-target.npy")
    categorical = np.load("datasets/" + str(id) + "-categorical.npy")
    return X,y,categorical

    
def evaluate_model(X, y, categorical):
    imp = Imputer(missing_values="NaN")
    X = imp.fit_transform(X)
    enc = preprocessing.OneHotEncoder(categorical_features=categorical)
    X = enc.fit_transform(X)
    clf = ensemble.RandomForestClassifier(random_state=seed)
    #clf_ovsr = multiclass.OneVsRestClassifier(clf, n_jobs=-1)
    
    return cross_val_score(clf, X, y,cv=10)
    
def is_positive(X,y,categorical,base_score,transformation,feature):
    transformed_feature = np.array(transformation(X[:,feature]))
    X = np.c_[X,transformed_feature]
    categorical = np.append(categorical,False)
    new_score = evaluate_model(X,y,categorical).mean()
    
    return 1 if(base_score <= (new_score - 0.01)) else 0

def is_positive_2(X, y, categorical, base_score, transformation, feature):
    transformed_feature = np.array(transformation(X[:,feature]))
    new_score = evaluate_model(transformed_feature.reshape(-1,1),y,[False]).mean()
    
    return 1 if(base_score <= (new_score - 0.005)) else 0

def is_positive_3(X, y, categorical, base_score, transformation, feature):
    transformed_feature = np.array(transformation(X[:,feature]))
    new_score = evaluate_model(transformed_feature.reshape(-1,1),y,[False]).mean()
    
    return 1 if(new_score > base_score*1.01) else 0
    
    

In [5]:
# Build the target for the compressed feature
bad_datasets = []

def build_target_for_compressed(dids):

    for transf in transformations:
        trans2target1[transf] = []
        trans2target2[transf] = []
        trans2target3[transf] = []


    
    for did in [161, 162, 464, 724, 741, 772, 774, 795, 811, 814, 827, 860, 914,
       923, 925, 931, 934, 948, 983, 997, 1169, 1460, 1462, 1464,
       1470, 1502, 40698, 40704]:
        print("Start dataset number", did)
        
    ## try:

        X, y, categorical = load_dataset(did)       

        new_indexes = []

        if(X.shape[0] > too_big):
            new_indexes = np.random.choice(X.shape[0], too_big, replace=False)
            X = X[new_indexes]
            y = y[new_indexes]

        base_score = evaluate_model(X, y, categorical).mean()

        # Find the indexes of numeric attributes
        numerical_indexes = np.where(np.invert(categorical))[0]

        for i,transf in enumerate(transformations):
            for feature in numerical_indexes:

                print("\tEvaluating feature " + str(feature))

                mlp_target_1 = is_positive(X,y,categorical,base_score,transf, feature)
                mlp_target_2 = is_positive_2(X,y,categorical,base_score,transf, feature)
                mlp_target_3 = is_positive_3(X,y,categorical,base_score,transf, feature)

                print("\t\t" + str(mlp_target_1), str(mlp_target_2), str(mlp_target_3))

                trans2target1[transf].append((did,feature,mlp_target_1))
                trans2target2[transf].append((did,feature,mlp_target_2))
                trans2target3[transf].append((did,feature,mlp_target_3))

    #except:
            #print("The evaluation of dataset " + str(did) + " failed")
            #bad_datasets.append(did)
            #continue

In [6]:
# Save the result
def save_target_for_compressed(path):

    for transf, name in zip(transformations, transformations_name):
        np.save(path + name + "1", trans2target1[transf])
        np.save(path + name + "2", trans2target2[transf])
        np.save(path + name + "3", trans2target3[transf])


In [7]:
def normalize_Rx(matrix):
    
    Rxc = np.zeros(shape=matrix.shape)
    
    for i,row in enumerate(matrix):
        max_c = np.amax(row)
        min_c = np.amin(row)
        bin_width = (max_c-min_c)/(norm[1]-norm[0])
        Rxc[i] = np.apply_along_axis(lambda x : np.floor((x-min_c)/(bin_width)+norm[0]), 0, row)
    
    return Rxc

def to_quantile_sketch_array(did, col, targets, bins, t_class, index):
    max_c = np.nanmax(col)
    min_c = np.nanmin(col)
    bin_width = (max_c-min_c)/num_bin
    Rx = np.zeros(shape=(2,num_bin))
    
    if(bin_width == 0):
        return
    
    for val,y in zip(col,targets):
        if not np.isnan(val):
            bin_value = int(np.floor((val-min_c)/bin_width))
            bin_value = np.clip(bin_value, 0, num_bin-1)
            my_class = 0 if t_class == y else 1
            Rx[my_class][bin_value] = Rx[my_class][bin_value] + 1
            
    Rx = normalize_Rx(Rx)

    qsa_representation.append(np.insert(Rx.flatten(), 0, [did,index]))

In [42]:
# Build the compressed dataset
def build_compressed_dataset(dids):

    qsa_representation = []

    for did in [161, 162, 464, 724, 741, 772, 774, 795, 811, 814, 827, 860, 914,
       923, 925, 931, 934, 948, 983, 997, 1169, 1460, 1462, 1464,
       1470, 1502, 40698, 40704]:
        print("Start dataset number", did)

        try:
            X, y, categorical = load_dataset(did)
        except:
            print("Dataset " + str(did) + " not found")
            continue;

        new_indexes = []

        if(X.shape[0] > too_big):
            new_indexes = np.random.choice(X.shape[0], too_big, replace=False)
            X = X[new_indexes]
            y = y[new_indexes]

        numerical_indexes = np.where(np.invert(categorical))[0]

        classes = set(y)

        for t_class in classes:
            for index in numerical_indexes:
                to_quantile_sketch_array(did,X[:,index], y, num_bin, t_class, index)
        
    

In [9]:
# Save the compressed datasets
def save_compressed_dataset(path):
    np.save(path + "compressed.npy", qsa_representation)

In [10]:
# CREATING THE NEURAL NETS

def initialize_MLPs():
    
    for transf in transformations_name:
        model = Sequential()

        model.add(Dense(64, input_shape=inp_shape, W_regularizer=regularizers.l2(0.01)))
        model.add(Activation('softmax'))
        model.add(Dropout(dropout))
        model.add(Flatten())
        model.add(Dense(output_dim=1))
        model.add(Activation('relu'))

        # For a binary classification problem
        model.compile(optimizer='rmsprop',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        MLP_LFE_Nets[transf] = model
    

In [163]:

def load_compressed_ds():
    data = np.load("datasets/compressed/compressed.npy")
    return pd.DataFrame(data)


def assemble_training_set(compressed, transformation_targets):
    targetDf = pd.DataFrame(transformation_targets)
    merged = pd.merge(compressed, targetDf, how='left', on=[0, 1])
    class_1 = merged.ix[:,4:num_bin + 4].values
    class_2 = merged.ix[:,num_bin + 4:-1].values
    target = np.array(merged.ix[:,-1].values)
    meta_inf = np.array(merged.ix[:,:2].values)
    meta_target = np.c_[target, meta_inf]
    X = []
    
    for c1, c2 in zip(class_1, class_2):
        X.append([c1,c2])    

    return np.array(X), np.array(meta_target)


def split_training_test():
    compressed_ds = load_compressed_ds()

    for transf, name in zip(transformations, transformations_name):
        transformation_targets = np.load("datasets/compressed/" + name + "1.npy")
        X,y = assemble_training_set(compressed_ds, transformation_targets)
        X_s_tr, X_s_test, y_s_tr, y_s_test = train_test_split(X, y, test_size=0.3)
        
        # Dropping the meta-info from training set
        y_s_tr = y_s_tr[:,:1]
        
        np.save("datasets/training/" + name + "-data_split",X_s_tr)
        np.save("datasets/training/" + name + "-target_split",y_s_tr)
        np.save("datasets/test/" + name + "-data_split",X_s_test)
        np.save("datasets/test/" + name + "-target_split",y_s_test)
 
        
def load_training_set(transf):
    
    X = np.load("datasets/training/" + transf + "-data_split.npy")
    y = np.load("datasets/training/" + transf + "-target_split.npy")
    
    return X,y


In [12]:
def balance_dataset(X, y, pos_perc = 0.5):
    X = np.array(X)
    y = np.array(y)

    cnt = Counter(y)

    neg_num = cnt[0]
    pos_num = cnt[1]

    neg_index = (y == 0)
    pos_index = (y == 1)

    X_pos = X[pos_index]
    X_neg = X[neg_index]
    y_pos = y[pos_index]
    y_neg = y[neg_index]

    if pos_perc >= 0.5:
        if pos_num <= neg_num:
            sample_neg_num = int(X_pos.shape[0] * (1/pos_perc - 1))
            neg_sampled_indexes = np.random.choice(X_neg.shape[0], sample_neg_num, replace=False)
            X_neg = X_neg[neg_sampled_indexes]
            y_neg = y_neg[neg_sampled_indexes]

        else:
            sample_neg_ideal_size = int(X_pos.shape[0] * (1/pos_perc - 1))

            if(sample_neg_ideal_size > X_neg.shape[0]):
                sample_pos_num = int(X_neg.shape[0] * (1/(1-pos_perc) - 1))
                pos_sampled_indexes = np.random.choice(X_pos.shape[0], sample_pos_num, replace=False)
                X_pos = X_pos[pos_sampled_indexes]
                y_pos = y_pos[pos_sampled_indexes]
            else:
                neg_sampled_indexes = np.random.choice(X_neg.shape[0], sample_neg_ideal_size, replace=False) 
                X_neg = X_neg[neg_sampled_indexes]
                y_neg = y_neg[neg_sampled_indexes]
    else:
        if pos_num <= neg_num:
            sample_pos_ideal_size = int(X_neg.shape[0] * (1/(1-pos_perc) - 1))

            if(sample_pos_ideal_size > X_pos.shape[0]):
                sample_neg_num = int(X_pos.shape[0] * (1/pos_perc - 1))
                neg_sampled_indexes = np.random.choice(X_neg.shape[0], sample_neg_num, replace=False)
                X_neg = X_neg[neg_sampled_indexes]
                y_neg = y_neg[neg_sampled_indexes]
            else:
                pos_sampled_indexes = np.random.choice(X_pos.shape[0], sample_pos_ideal_size, replace=False) 
                X_pos = X_pos[pos_sampled_indexes]
                y_pos = y_pos[pos_sampled_indexes]
        else:
            sample_pos_num = int(X_neg.shape[0] * (1/(1-pos_perc) - 1))
            pos_sampled_indexes = np.random.choice(X_pos.shape[0], sample_pos_num, replace=False)
            X_pos = X_pos[pos_sampled_indexes]
            y_pos = y_pos[pos_sampled_indexes]




    X = np.concatenate((X_pos, X_neg), axis=0)
    y = np.concatenate((y_pos, y_neg) , axis=0)      

    shuffle_index = np.random.choice(X.shape[0], X.shape[0], replace=False)

    X = X[shuffle_index]
    y = y[shuffle_index]

    return X,y

In [168]:
# Training the nets

def train_MLPs():

    for transf, name in zip(transformations, transformations_name):
        
        X, y = load_training_set(name)
        
        X, y = balance_dataset(X, y.reshape(y.shape[0]), pos_perc=0.5)
        
        y = y.reshape(y.shape[0],1)

        MLP_LFE_Nets[name].summary()
        print ("Inputs: {}".format(MLP_LFE_Nets[name].input_shape))
        print ("Outputs: {}".format(MLP_LFE_Nets[name].output_shape))
        print ("Actual input: {}".format(X.shape))
        print ("Actual output: {}".format(y.shape))

        MLP_LFE_Nets[name].fit(X, y, nb_epoch=500)    


In [14]:

def save_MLPs():
    for transf in transformations_name:
        MLP_LFE_Nets[transf].save_weights("datasets/MLPs/" + transf + "-weights")
        model_json = MLP_LFE_Nets[transf].to_json()
        with open("datasets/MLPs/" + transf + "-net_model", "w") as f:
            f.write(model_json)

def load_MLPs():
    
    for name in transformations_name:
        json_file = open('datasets/MLPs/' + name + "-net_model", 'r')
        loaded_model_json = json_file.read()
        json_file.close()
        loaded_model = model_from_json(loaded_model_json)
        
        # load weights into new model
        loaded_model.load_weights("datasets/MLPs/" + name + "-weights")

        # evaluate loaded model on test data
        loaded_model.compile(optimizer='rmsprop',
                             loss='binary_crossentropy',
                             metrics=['accuracy'])
        
        MLP_LFE_Nets[name] = loaded_model


    

In [15]:
def load_test_set():
    
    X = np.load("datasets/test/" + transformations_name[0] + "-data_split.npy")
    y_meta = np.load("datasets/test/" + transformations_name[0] + "-target_split.npy")
    t = np.full((y_meta.shape[0],1), 0)
    y_meta = np.concatenate((y_meta, t), axis=1)
    
    for i,name in enumerate(transformations_name[1:]):
        X = np.concatenate((X, np.load("datasets/test/" + name + "-data_split.npy")), axis=0)
        y_meta_tmp = np.load("datasets/test/" + name + "-target_split.npy")
        t = np.full((y_meta_tmp.shape[0],1),i+1)
        y_meta_tmp = np.concatenate((y_meta_tmp, t), axis=1)
        y_meta = np.concatenate((y_meta, y_meta_tmp), axis=0)
    
    return X,y_meta

In [122]:
# Test the accuracy on a dataset

def evaluate_transformation_classifier():
    
    # Number of prediction on features
    num_of_prediction = 0
    # Number of correct prediction on features
    num_of_correct_prediction = 0
    # Number of dataset which received a prediction
    good_predicted_dids = set()
    num_of_predicted_dataset = 0
    
    pred_mat = []
    
    X, y_meta = load_test_set()
    
    for transf in transformations_name:
        pred_mat.append(MLP_LFE_Nets[transf].predict(X))
        
    pred_mat = np.array(pred_mat).transpose()
    
    for predictions,did,feature in zip(pred_mat[0],y_meta[:,1], y_meta[:,2]):
        pmax = np.amax(predictions)
        
        print(predictions)
        
        if pmax > pred_threshold:
            index = np.where(predictions==pmax)[0][0]
            print(index)
            num_of_prediction += 1
            
            # Select the target for the transformation and the dataset
            positive_example_found =  np.where((y_meta[:,0] == 1)       &  \
                                               (y_meta[:,1] == did)     &  \
                                               (y_meta[:,2] == feature) &  \
                                               (y_meta[:,3] == index))     \
                                                                       [0].shape[0] > 0
            
            if(positive_example_found):
                good_predicted_dids.add(did) 
                print("Correct")
                num_of_correct_prediction += 1
    
    if(num_of_prediction == 0):
        print("No predictions have been made")
        return
    
    print("Number of prediction:", num_of_prediction)
    print("Number of Correct prediciton:", num_of_correct_prediction)
    print("Accuracy:", num_of_correct_prediction/num_of_prediction)
    print("Number of datasets who received a good prediction:", len(good_predicted_dids))
    


# Build and Preprocess the Dataset

In [None]:

build_target_for_compressed(dids)
save_target_for_compressed("datasets/compressed/")
    

In [43]:

build_compressed_dataset(dids)
save_compressed_dataset("datasets/compressed/")


Start dataset number 161
Start dataset number 162
Start dataset number 464
Start dataset number 724
Start dataset number 741
Start dataset number 772
Start dataset number 774
Start dataset number 795
Start dataset number 811
Start dataset number 814
Start dataset number 827
Start dataset number 860
Start dataset number 914
Start dataset number 923
Start dataset number 925
Start dataset number 931
Start dataset number 934
Start dataset number 948
Start dataset number 983
Start dataset number 997
Start dataset number 1169
Start dataset number 1460
Start dataset number 1462
Start dataset number 1464
Start dataset number 1470
Start dataset number 1502
Start dataset number 40698
Start dataset number 40704


# Train Test Split

In [115]:

split_training_test()



# Train the MLPs


In [169]:

initialize_MLPs()
train_MLPs()
#save_MLPs()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_81 (Dense)                 (None, 2, 64)         704         dense_input_43[0][0]             
____________________________________________________________________________________________________
activation_81 (Activation)       (None, 2, 64)         0           dense_81[0][0]                   
____________________________________________________________________________________________________
dropout_41 (Dropout)             (None, 2, 64)         0           activation_81[0][0]              
____________________________________________________________________________________________________
flatten_41 (Flatten)             (None, 128)           0           dropout_41[0][0]                 
___________________________________________________________________________________________

# Load the nets

In [103]:

load_MLPs()



# Test the nets


In [180]:

evaluate_transformation_classifier()


[ 0.45828593  0.        ]
[ 0.09083163  0.        ]
[ 0.23010738  0.69536591]
[ 0.4448393  1.464782 ]
1
[ 0.          0.18542013]
[ 0.92810386  1.28592622]
1
[ 1.07312238  1.47007704]
1
Correct
[ 0.32598239  1.14436543]
1
[ 0.8989346   1.30614913]
1
[ 0.36133778  0.25625238]
[ 0.79033488  0.61864543]
[ 0.  0.]
[ 0.68025494  0.69206548]
[ 0.76492941  1.34006071]
1
[ 0.  0.]
[ 0.95536643  0.77660787]
0
[ 1.06063747  0.80023301]
0
[ 0.02293015  0.        ]
[ 0.  0.]
[ 0.  0.]
[ 0.  0.]
[ 0.24550438  0.        ]
[ 0.          0.34021586]
[ 0.46725941  0.        ]
[ 1.09537458  1.03687024]
0
Correct
[ 0.  0.]
[ 0.23465164  0.88540453]
1
Correct
[ 0.02293015  0.        ]
[ 0.38218644  0.        ]
[ 0.36622822  0.32268614]
[ 0.04427072  0.        ]
[ 0.63212168  0.9040913 ]
1
[ 0.11455671  0.        ]
[ 0.92083752  0.08255829]
0
[ 0.24568889  0.        ]
[ 0.88007027  1.42343712]
1
Correct
[ 0.63526958  0.76962018]
[ 0.31418219  0.28013247]
[ 0.  0.]
[ 0.09745377  0.        ]
[ 0.          0.

In [None]:
yt =pd.DataFrame(np.load("datasets/test/freq-target_split_1.npy"))

In [None]:
%matplotlib

yt[yt[0]==1][1].values

In [None]:
pos_dids

In [45]:
compressed = pd.DataFrame(np.load("datasets/compressed/compressed.npy"))   
transformation_targets = np.load("datasets/compressed/" + "freq" + "1.npy")
targetDf = pd.DataFrame(transformation_targets)
merged = pd.merge(compressed, targetDf, how='right', on=[0, 1])


In [49]:
merged.describe()

Unnamed: 0,0,1,0_x,1_x,2,3,4,5,6,7,...,13,14,15,16,17,18,19,20,21,2.1
count,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,...,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0
mean,3736.225352,1.28169,3736.225352,1.28169,3.605634,2.274648,3.760563,3.964789,4.133803,4.140845,...,2.274648,3.760563,3.964789,4.133803,4.140845,3.309859,2.950704,1.93662,2.330986,0.197183
std,10216.431461,1.35984,10216.431461,1.35984,4.123893,2.915219,3.222108,3.632813,3.980856,3.597855,...,2.915219,3.222108,3.632813,3.980856,3.597855,3.343636,3.384,3.022879,3.864224,0.39928
min,161.0,0.0,161.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,795.0,0.0,795.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,925.0,1.0,925.0,1.0,1.5,1.0,3.0,3.0,3.5,4.0,...,1.0,3.0,3.0,3.5,4.0,2.0,1.0,0.0,0.0,0.0
75%,1460.0,2.0,1460.0,2.0,8.75,3.0,6.0,7.0,8.0,7.0,...,3.0,6.0,7.0,8.0,7.0,6.0,6.0,3.0,3.0,0.0
max,40704.0,6.0,40704.0,6.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0


In [None]:
m.fit()