## Dimensionality reduction/increase of data using autoencoders.

In [None]:
# Loading Data.
import pandas as pd
import numpy as np

# Read the training table; no column of the file is used as the index.
train = pd.read_csv("train.csv", index_col=None)

# Labels come from the "Class" column; features are every column after the first.
Y = train["Class"]
X = train.iloc[:, 1:]

# Preview the first ten rows of the table.
train.head(10)

### Create the shallow neural network autoencoder.

In [None]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import RMSprop, SGD, Adamax

# Simple autoencoder, based off the code from the following source:
########################################
# Title: Building Autoencoders in Keras
# Author: Francois Chollet
# Date: 14/05/2016
# Available: https://blog.keras.io/building-autoencoders-in-keras.html
########################################

def _default_training_data(x_test=None):
    """Pick training/validation data for get_encoder() from module-level names.

    Prefers the module-level ``X_train`` (and ``X_test``) when they exist,
    since those are the names this code historically read. Otherwise falls
    back to the full feature table ``X`` with whatever ``x_test`` was passed.

    Raises:
        NameError: if neither ``X_train`` nor ``X`` is defined at module level.
    """
    scope = globals()
    if 'X_train' in scope:
        if x_test is None:
            x_test = scope.get('X_test')
        return scope['X_train'], x_test
    if 'X' in scope:
        return scope['X'], x_test
    raise NameError(
        "get_encoder() needs training data: pass x_train, or define X_train "
        "(or X) at module level"
    )


def get_encoder(encoding_dim=32, x_train=None, x_test=None, input_dim=1024,
                epochs=500, batch_size=200):
    """Train a single-hidden-layer autoencoder and return its encoder half.

    The autoencoder maps ``input_dim`` features to ``encoding_dim`` units
    (ReLU) and back to ``input_dim`` outputs (sigmoid). It is trained to
    reconstruct its own input with SGD and a binary cross-entropy loss.
    NOTE(review): sigmoid outputs with binary cross-entropy presumably expect
    features scaled to [0, 1] -- confirm against the data.

    Args:
        encoding_dim: Width of the encoded representation. It may be smaller
            or larger than ``input_dim``.
        x_train: Data the autoencoder learns to reconstruct. When omitted, the
            module-level ``X_train`` is used if defined, otherwise the
            module-level ``X``.
        x_test: Optional held-out data used only for validation during
            fitting. When omitted, the module-level ``X_test`` is used if
            ``X_train`` was taken from module level. If none is available the
            model is fitted without validation data.
        input_dim: Number of input features (columns of the training data).
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during training.

    Returns:
        The Keras ``Model`` that maps an input to its encoded representation.
        It shares its layer with the trained autoencoder.

    Raises:
        NameError: if no training data is passed and none is defined at module
            level.
    """
    if x_train is None:
        x_train, x_test = _default_training_data(x_test)

    # Input placeholder, encoded representation and lossy reconstruction.
    input_dat = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='relu')(input_dat)
    decoded = Dense(input_dim, activation='sigmoid')(encoded)

    # Full model (input -> reconstruction) and its first half (input -> code).
    autoencoder = Model(input_dat, decoded)
    encoder = Model(input_dat, encoded)

    optimiser = SGD(lr=0.5, decay=0.0)
    autoencoder.compile(optimizer=optimiser, loss='binary_crossentropy', metrics=['accuracy'])

    # The target equals the input: the network is trained to reproduce it.
    fit_options = {'epochs': epochs, 'batch_size': batch_size, 'shuffle': True}
    if x_test is not None:
        fit_options['validation_data'] = (x_test, x_test)
    autoencoder.fit(x_train, x_train, **fit_options)

    return encoder
    

# Functions intended to run in a separate process (they are used as multiprocessing.Process targets).
def reduce_dims():
    """Encode the module-level X into 512 features and save them with the labels.

    Trains an encoder via get_encoder(512), prints the shape of the encoded
    data, and writes the module-level Y followed by the encoded columns to
    "reduced_dims.csv" (without the index).
    """
    compressed = get_encoder(512).predict(X)
    print(compressed.shape)

    # Label column first, encoded features after it (joined on the row index).
    labelled = pd.DataFrame(Y).join(pd.DataFrame(compressed))
    labelled.to_csv("reduced_dims.csv", index=False)
    
def increase_dims():
    """Encode the module-level X into 2048 features and save them with the labels.

    Trains an encoder via get_encoder(2048) and writes the module-level Y
    followed by the encoded columns to "increased_dims.csv" (without the index).
    """
    expanded = get_encoder(2048).predict(X)

    # Label column first, encoded features after it (joined on the row index).
    labelled = pd.DataFrame(Y).join(pd.DataFrame(expanded))
    labelled.to_csv("increased_dims.csv", index=False)

In [None]:
import multiprocessing

if __name__ == "__main__":
    # Run each encoding job in its own child process (not a thread), one after
    # the other, so that the memory a job used is released when its process exits.
    for job in (reduce_dims, increase_dims):
        p = multiprocessing.Process(target=job)
        p.start()
        p.join()

        # An exception in the child does not propagate to this process. Without
        # this check a failed job would go unnoticed and later cells would read
        # a stale or missing CSV file.
        if p.exitcode != 0:
            raise RuntimeError(
                "%s failed in its child process (exit code %s)" % (job.__name__, p.exitcode)
            )

## Test the effectiveness of the encoded output using Random Forest classifier.

In [None]:
# Building a Random Forest classifier pipeline.
#
# Based on the code from the following source:
########################################
# Title: Model Comparison using Pipelines
# Author: Chuan Lu
# Date: 06/04/2017
# Code Version: 47c58c0
# Available: https://github.com/aberML/CSM6420/blob/master/Tutorial%203-Titanic5%20Building%20Pipelines%20and%20Model%20Comparison.ipynb
#######################################
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

from sklearn.ensemble import RandomForestClassifier

from sklearn.grid_search import GridSearchCV

# Loading Data.
import pandas as pd
import numpy as np

# Number of cross-validation folds, shared by the grid search and the ROC evaluation.
kfolds = 8

# Univariate feature selection (scored with f_regression) feeding a Random Forest.
anova_filter = SelectKBest(f_regression)
clf = RandomForestClassifier()

# Step names are the prefixes used in parameter grids ('anova__k', 'rf__...').
pipeline_rf = Pipeline(steps=[('anova', anova_filter), ('rf', clf)])

def load_file(file):
    """Load a CSV data set and split it into features and labels.

    Args:
        file: Path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        A ``(features, labels)`` tuple: every column after the first as a
        DataFrame, and the "Class" column as a Series.
    """
    frame = pd.read_csv(file, index_col=None)
    features = frame.iloc[:, 1:]
    labels = frame["Class"]
    return features, labels

In [None]:
def test_encoded(file = "reduced_dims.csv"):
    """Grid-search the Random Forest pipeline on an encoded data set and report it.

    Loads features and labels from ``file`` with load_file(), runs a
    cross-validated grid search (``kfolds`` folds, 8 parallel jobs) over the
    number of selected features, the number of trees and the tree depth of
    the module-level ``pipeline_rf``. It then prints every grid point ordered
    from worst to best mean validation score, followed by the best score and
    the best parameter set.

    Args:
        file: Path of the CSV file to evaluate.
    """
    X, Y = load_file(file)
    parameter_grid_rf = {
                'anova__k': [10, 100, 'all'],
                'rf__n_estimators': [10, 100, 1000],
                'rf__max_depth': [5, 10, 50, 100, None],
            }
    grid_search = GridSearchCV(pipeline_rf, parameter_grid_rf, cv=kfolds, verbose=3, n_jobs=8)
    grid_search.fit(X, Y)

    # The ranked scores used to be computed and then thrown away; show them.
    ranked = sorted(grid_search.grid_scores_, key=lambda x: x.mean_validation_score)
    for grid_point in ranked:
        print(grid_point)
    print(grid_search.best_score_)
    print(grid_search.best_params_)
    
# Evaluate both encodings: the reduced one first, then the increased one.
for encoded_csv in ("reduced_dims.csv", "increased_dims.csv"):
    test_encoded(encoded_csv)

### Plot ROC curves.

In [None]:
%matplotlib inline 
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC plot and AUC statistic, based on the code from the following source:
########################################
# Title: Cross-Validation and ROC curve analysis
# Author: Chuan Lu
# Date: 06/04/2017
# Available: https://github.com/aberML/CSM6420/
########################################
# (Hacked together, but does the job ;-) )
# Module-level accumulators shared by ROC_AUC() and display_plot().
mean_acc = mean_auc = 0.0
all_tpr, all_acc, all_auc = [], [], []

# Number of folds scored so far; used to label each ROC curve.
i = 0

def ROC_AUC(y_test, y_prob):
    """Score one fold and add its ROC curve to the current matplotlib figure.

    Increments the module-level fold counter ``i``, prints the fold's accuracy
    and area under the ROC curve, plots the curve with a legend label naming
    the fold, and appends both scores to the module-level lists ``all_acc``
    and ``all_auc``.

    Args:
        y_test: True labels of the fold.
        y_prob: Predicted scores for the positive class, one per sample.
    """
    global i
    i += 1

    # Hard class labels: the scores rounded to the nearest integer.
    y_prediction = np.around(y_prob, decimals=0)

    # Accuracy: fraction of samples whose rounded score equals the true label.
    n_correct = np.sum(y_test == y_prediction)
    acc = n_correct*1./len(y_test)
    print("Prediction accuracy:", acc)

    # ROC curve and the area under it, computed from the raw scores.
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    print("Area under ROC curve (AUC):", roc_auc)

    fold_label = 'ROC fold %d (area = %0.2f)' % (i, roc_auc)
    plt.plot(fpr, tpr, lw=1, label=fold_label)

    all_acc.append(acc)
    all_auc.append(roc_auc)

def display_plot(title = 'Receiver operating characteristic example'):
    """Finish the ROC figure, print the cross-validation summary and reset state.

    Adds the dashed diagonal reference line, axis limits, axis labels, title
    and legend to the current figure. Then prints the per-fold accuracies and
    AUCs collected by ROC_AUC(), each followed by its mean and a
    +/- 1.96 * standard deviation figure. Finally it clears the module-level
    accumulators so the next experiment starts from scratch.

    Args:
        title: Title placed on the figure.
    """
    global i, mean_acc, mean_auc, all_tpr, all_acc, all_auc

    # Dashed grey diagonal, drawn without a legend entry.
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")

    # Per-fold scores, then mean and 1.96 * std across the folds.
    summaries = (
        (all_acc, "Mean Accuracy: %0.3f (+/- %0.3f)"),
        (all_auc, "Mean AUC: %0.3f (+/- %0.3f)"),
    )
    for values, template in summaries:
        scores = np.asarray(values)
        print(scores)
        print(template % (scores.mean(), scores.std() * 1.96))

    # Reset values for re-use.
    mean_acc = mean_auc = 0.0
    all_tpr, all_acc, all_auc = [], [], []
    i = 0

### Check ROC results.

In [None]:
from sklearn.cross_validation import StratifiedKFold as SKFold

# Cross-validated ROC evaluation of the Random Forest pipeline on both encoded
# data sets, using one fixed parameter set per data set.
random_seed = 1234
# The folds are built once, from the Y that is bound when this line runs (i.e.
# before the loop below rebinds Y), and are reused for both data sets.
# NOTE(review): shuffling is not requested here, so random_state may have no
# effect on the folds -- confirm against the StratifiedKFold documentation.
scv = SKFold(y=Y, n_folds=kfolds, random_state=random_seed)

# Row and column count of the X bound at this point; not used again in this cell.
input_len, input_sz = X.shape[0], X.shape[1]

# train_frac = 0.5
# sep = int(input_len*train_frac)
# Each entry: CSV base name, figure title, pipeline parameters to evaluate.
for enc, title, params in [("reduced_dims",
                            "Random Forest Dimension Reduction ROC",
                            {'anova__k': 'all', 'rf__max_depth': 50, 'rf__n_estimators': 100}),
                           ("increased_dims",
                            "Random Forest Dimension Increase ROC", 
                            {'anova__k': 100, 'rf__max_depth': 100, 'rf__n_estimators': 100})]:
    
    # NOTE: X, Y, X_train and X_test are module-level names, so this loop
    # overwrites whatever earlier cells bound to them.
    X, Y = load_file(enc+".csv")
    # One new figure per data set; ROC_AUC() adds one curve per fold to it.
    plt.figure()
    for training_set, test_set in scv:  
        # training_set / test_set are positional row indices for this fold.
        X_train = X.iloc[training_set]
        y_train = Y.iloc[training_set]
        X_test = X.iloc[test_set]
        y_test = Y.iloc[test_set]
        print("Shape of training:")
        print("X:", X_train.shape, "y:", y_train.shape)
        print("Shape of testing:")
        print("X:", X_test.shape, "y:", y_test.shape)

        # pipeline_rf.set_params(**{'anova__k': 'all', 'rf__max_depth': 50, 'rf__n_estimators': 1000})
        # pipeline_rf.fit(X_train, y_train)

        # Configure the shared pipeline for this data set and refit it on the fold.
        pipeline_rf.set_params(**params)
        pipeline_rf.fit(X_train, y_train)

        # Second column of predict_proba: the score passed to ROC_AUC as y_prob.
        y_pred = pipeline_rf.predict_proba(X_test)[:,1]


        # Prints the fold's accuracy/AUC and plots its ROC curve.
        ROC_AUC(y_test, y_pred)


    # Finish the figure, print the summary over all folds, then save it as PDF.
    display_plot(title)
    plt.savefig("rf_" + enc + ".pdf")

In [2]:
import sys
import sklearn
import keras
import tensorflow
import pandas as pd

# Report the interpreter and library versions this notebook was run with.
versions = (
    ('Python: ', sys.version_info),
    ('Pandas: ', pd.__version__),
    ('Sklearn: ', sklearn.__version__),
    ('Keras: ', keras.__version__),
    ('tensorflow: ', tensorflow.__version__),
)
for label, version in versions:
    print(label, version)

Python:  sys.version_info(major=3, minor=6, micro=5, releaselevel='final', serial=0)
Pandas:  0.22.0
Sklearn:  0.19.1
Keras:  2.1.6
tensorflow:  1.8.0
