In [None]:
# are the future functions actually necessary?
from __future__ import absolute_import, division, print_function, unicode_literals
import imp

from globalConstants import Const
from dataset import Dataset
from pipeline import Pipeline
from autoEncoder import AutoEncoder 
from corrector import Corrector
from stepper import Stepper

# The star imports deliberately stay above the plain imports below so that
# the explicit names (np, tf, plt, pickle, ...) keep taking precedence.
from plotter import *
from data_read import *
from helperFunctions import *
from losses import *

import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import matplotlib as mpl

import pickle
import random
import sys
import time
from collections import Counter

np.set_printoptions(precision=3, suppress=True)
# Allows for automatic reloading of imports and makes it unnecessary to restart the kernel
# whenever a function is changed.
%load_ext autoreload
%autoreload 2

In [None]:
# Dataset selector; the only accepted values are "DW", "ZP" and "MH".
dataSetType = "DW"
assert dataSetType in ("DW", "ZP", "MH"), \
    "dataSetType needs to be set to 'DW', 'ZP' or 'MH'."

In [None]:
# Build the constants object for the selected dataset type.
c = Const(dataSetType)

In [None]:
# Pick the dataset fraction, the train/val/test builder and the path getter
# that belong to the chosen dataset type.
if dataSetType in ("DW", "ZP"):
    dataset_frac = c.used_toy_frac
    train_val_test_function = make_train_val_test_from_toy
    get_paths_function = get_toy_paths
elif dataSetType == "MH":
    dataset_frac = c.used_TIS_frac
    train_val_test_function = make_train_val_test_from_TIS_and_TPS
    get_paths_function = get_TPS_and_TIS_paths

In [None]:
# Load the cached train/validation datasets; if the cache cannot be read,
# regenerate both datasets from the raw data and write the cache.
train_data_path = "datasets/{}_trainData_{}.p".format(
    dataSetType, dataset_frac)
val_data_path = "datasets/{}_valData_{}.p".format(
    dataSetType, dataset_frac)
try:
    # NOTE(review): pickle.load can execute code embedded in the file; only
    # load caches that were written by this notebook.
    # `with` closes the file handles, which the bare open(...) calls never did.
    with open(train_data_path, "rb") as train_file:
        trainData = pickle.load(train_file)
    with open(val_data_path, "rb") as val_file:
        valData = pickle.load(val_file)
except (OSError, EOFError, pickle.UnpicklingError,
        AttributeError, ImportError, IndexError) as error:
    # Only missing/unreadable files and the errors the pickle documentation
    # lists for unpickling trigger regeneration; other errors (e.g. a
    # NameError from a typo) are no longer silently swallowed.
    print("Processed dataset files not found."
          +"\nGenerating datasets from raw data.")
    print("Reason: {!r}".format(error))
    trainData, valData, _ = Dataset\
        .initialize_train_val_test_datasets(
            *train_val_test_function(c))
    print("Saving datasets for future use.")
    with open(train_data_path, "wb") as train_file:
        pickle.dump(trainData, train_file)
    with open(val_data_path, "wb") as val_file:
        pickle.dump(valData, val_file)

In [None]:
#print_coverage(list_var_names, trainData)

In [None]:
# Build the pipeline from the constants and the training snapshots, then
# print what get_size reports for it.
pipeline = Pipeline(c, trainData.snapshots)
print(get_size(pipeline))

In [None]:
# Disabled experiment: the Stepper.iter_top_down call below is wrapped in a
# bare string literal, so it is never executed.
# NOTE(review): it refers to `reduced_list_var_names`, which is not defined in
# the visible cells -- confirm before re-enabling.
# The trailing `pass` presumably keeps Jupyter from echoing the string.
"""
c = Const(dataSetType)
c.bottleneck_size = 1
Stepper.iter_top_down(
    pipeline=pipeline,
    train_dataset=trainData,
    val_dataset=valData,
    used=reduced_list_var_names,
    param_limit=1,
    epochs=1,
    repetitions=1,
    const=c)
"""
pass

In [None]:

# Prepare the train/validation tuples and the 1D/2D means of the training
# data and cache each of them as a pickle under dumps/.

def _dump_path(tag):
    """Return the dumps/ pickle path for `tag` under the current settings.

    The name encodes the dataset type, the tag, the number of used
    variables, the dataset fraction and the resolution.
    """
    return "dumps/{}_{}_{}_{}_{}.p".format(
        dataSetType,
        tag,
        len(c.used_variable_names),
        dataset_frac,
        c.resolution)


def _dump_pickle(obj, tag):
    """Pickle `obj` to the path for `tag` and close the file afterwards."""
    # The original passed a bare open(...) to pickle.dump and never closed it.
    with open(_dump_path(tag), "wb") as dump_file:
        pickle.dump(obj, dump_file)


snapshots, pBs, hcb_weights, minima, maxima, g_snapshots = \
    pipeline.prepare_dataset_pickle(
        c.used_variable_names, trainData)
_dump_pickle(
    (snapshots, pBs, hcb_weights, minima, maxima),
    "train_datasets_tuple")

# The sixth return value is only kept for the training data (g_snapshots).
snapshots, pBs, hcb_weights, minima, maxima, _ = \
    pipeline.prepare_dataset_pickle(
        c.used_variable_names, valData)
_dump_pickle(
    (snapshots, pBs, hcb_weights, minima, maxima),
    "val_datasets_tuple")

print("1D")
train_corrected_1D = pipeline.get_1D_means(g_snapshots)
_dump_pickle(train_corrected_1D, "train_corrected_1D")

# Progress marker now printed right before the 2D step it announces.
print("2D")
train_corrected_2D = pipeline.get_2D_means(g_snapshots)
# g_snapshots is not needed any more; drop the reference before dumping.
del g_snapshots
_dump_pickle(train_corrected_2D, "train_corrected_2D")


In [None]:
# Disabled alternative: the pipeline.prepare_prediction_plotter calls below
# are wrapped in a bare string literal, so they are never executed.
"""
train_ds, train_bnrn_minima, train_bnrn_maxima, \
    train_corrected_1D, train_corrected_2D = \
    pipeline.prepare_prediction_plotter(
        c.used_variable_names,
        trainData)

val_ds, _, _, _, _ = \
    pipeline.prepare_prediction_plotter(
        c.used_variable_names,
        valData)
"""

In [None]:
# Reload the cached tuples and means from dumps/ and pack the train and
# validation tf datasets.

def _load_pickle(tag):
    """Load the dumps/ pickle for `tag` under the current settings.

    The file is closed as soon as it has been read; the original left every
    open(...) handle dangling.
    """
    path = "dumps/{}_{}_{}_{}_{}.p".format(
        dataSetType,
        tag,
        len(c.used_variable_names),
        dataset_frac,
        c.resolution)
    with open(path, "rb") as dump_file:
        return pickle.load(dump_file)


snapshots, pBs, hcb_weights, \
    train_bnrn_minima, train_bnrn_maxima = \
        _load_pickle("train_datasets_tuple")

# Both weight arrays are all ones here; to use the loaded hcb_weights instead,
# pass hcb_weights as prediction_weights and reconstruction_weights.
train_ds = pipeline.pack_tf_dataset(
            snapshots=snapshots,
            labels=pBs,
            prediction_weights=np.ones(len(snapshots)),
            reconstruction_weights=np.ones(len(snapshots)))

snapshots, pBs, hcb_weights, minima, maxima = \
    _load_pickle("val_datasets_tuple")
val_ds = pipeline.pack_tf_dataset(
            snapshots=snapshots,
            labels=pBs,
            prediction_weights=np.ones(len(snapshots)),
            reconstruction_weights=np.ones(len(snapshots)))

# The raw arrays are no longer needed once both datasets are packed.
del snapshots
del pBs
del hcb_weights

train_corrected_1D = _load_pickle("train_corrected_1D")
train_corrected_2D = _load_pickle("train_corrected_2D")


In [None]:
# Start from fresh constants, then override the bottleneck size and the
# number of epochs for this run.
c = Const(dataSetType)
c.bottleneck_size = 1
c.epochs = 10

(autoencoder, autoencoder_1, autoencoder_2,
 encoder, decoder_1, decoder_2) = AutoEncoder.make_models(
    len(c.used_variable_names), c)

# Fit on the training dataset, validating on val_ds; early stopping watches
# "val_loss" with a patience of 3.
history = autoencoder.fit(
    x=train_ds,
    epochs=c.epochs,
    validation_data=val_ds,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3),
    ])

In [None]:
# Store the weights of all six models under a path that carries the dataset
# type and the model stamp.
model_weights_path = "results/{}_model_weights_{}".format(
    dataSetType, c.model_stamp)
store_model_weights(
    model_weights_path,
    autoencoder,
    autoencoder_1,
    autoencoder_2,
    encoder,
    decoder_1,
    decoder_2)

In [None]:

# Rebuild the six models and load their stored weights.
# The path now matches the one the store_model_weights cell writes to
# ("results/{dataSetType}_model_weights_{model_stamp}"); the original read
# from "results/model_weights", which that cell never writes.
# NOTE(review): assumes c.model_stamp is the same as when the weights were
# stored -- confirm if c was modified in between.
c.bottleneck_size = 1
autoencoder, autoencoder_1, autoencoder_2, \
    encoder, decoder_1, decoder_2 = \
        load_model_weights(
            "results/{}_model_weights_{}"\
                .format(dataSetType, c.model_stamp),
            *AutoEncoder.make_models(
                len(c.used_variable_names),
                c))

pass

In [None]:
# Single map for x_int=0 / y_int=1 produced with
# calc_represented_map_generated and autoencoder_1.
plot_single_map(
    method=calc_represented_map_generated,
    model=autoencoder_1,
    const=c,
    pipeline=pipeline,
    x_int=0,
    y_int=1,
    minima=train_bnrn_minima,
    maxima=train_bnrn_maxima,
    representations=train_corrected_2D,
    stamp="x1x2_Prediction_" + c.model_stamp + c.data_stamp)

In [None]:
# Compute the relative encoder importances once and reuse them for both the
# printed table and the plot (the original evaluated them twice).
relative_importances = list(
    get_relative_encoder_importances(encoder, c.used_variable_names))

# One row per component: its name followed by the values rounded to 4 digits.
for component in relative_importances:
    print("{:8s}\t{}".format(
        component[0],
        "\t".join(str(round(value, 4)) for value in component[1:])))

# zip(*rows) turns the rows into columns, which are passed as separate
# positional arguments.
plot_relative_importances(*zip(*relative_importances))

In [None]:
# Plot example paths with the encoder as model and keep the two returned
# values as the latent minimum and maximum.
latent_minimum, latent_maximum = plot_projected_example_paths(
    model=encoder,
    const=c,
    pipeline=pipeline,
    get_paths_function=get_paths_function,
    steps=20,
    pre_stamp=dataSetType)

In [None]:
# Super map from autoencoder_1 with the 2D representations.
plot_super_map(
    method=calc_represented_map_generated,
    model=autoencoder_1,
    const=c,
    pipeline=pipeline,
    minima=train_bnrn_minima,
    maxima=train_bnrn_maxima,
    representations=train_corrected_2D,
    pre_stamp="CorrelatedMean_map")

# Super scatter from autoencoder_2 with the 1D representations.
plot_super_scatter(
    method=calc_represented_scatter_generated,
    model=autoencoder_2,
    const=c,
    pipeline=pipeline,
    minima=train_bnrn_minima,
    maxima=train_bnrn_maxima,
    representations=train_corrected_1D,
    max_row_len=4,
    pre_stamp="CorrelatedMean_scatter")
# Keeps the cell from ending on an expression, so nothing is echoed.
pass

In [None]:
# Reconstruction plot over the latent range found by the example-path plot,
# using decoder_2 and 11 steps.
plot_reconstruction_from_latent_space(
    recon_decoder=decoder_2,
    reduced_list_var_names=c.used_variable_names,
    latent_minimum=latent_minimum,
    latent_maximum=latent_maximum,
    steps=11,
    pre_stamp=dataSetType)

In [None]:
# Same example-path plot, this time with autoencoder_1 as model; both
# returned values are discarded.
_, _ = plot_projected_example_paths(
    model=autoencoder_1,
    const=c,
    pipeline=pipeline,
    get_paths_function=get_paths_function,
    steps=20,
    pre_stamp=dataSetType + "_Committor")

In [None]:
# Ground-truth grid snapshots, labels and weights for the training data.
(train_grid_snapshots,
 train_labels,
 train_weights) = pipeline.prepare_groundTruth(
    c.used_variable_names, trainData)

In [None]:
# Cache the training ground-truth tuple under dumps/.
# The tuple is built before the file is opened, so a missing variable cannot
# leave a truncated dump behind, and `with` closes the handle, which the
# original bare open(...) never did.
groundtruth_tuple = (train_grid_snapshots, train_labels, train_weights)
groundtruth_path = "dumps/{}_train_groundtruth_tuple_{}_{}_{}_{}.p"\
    .format(
        dataSetType,
        len(c.used_variable_names),
        dataset_frac,
        c.resolution,
        c.outlier_cutoff)
with open(groundtruth_path, "wb") as dump_file:
    pickle.dump(groundtruth_tuple, dump_file)

In [None]:
# Reset the constants and reload the cached training ground-truth tuple.
# NOTE: this replaces `c`, so overrides made on the previous object (such as
# bottleneck_size or epochs) do not carry over.
c = Const(dataSetType)
groundtruth_path = "dumps/{}_train_groundtruth_tuple_{}_{}_{}_{}.p"\
    .format(
        dataSetType,
        len(c.used_variable_names),
        dataset_frac,
        c.resolution,
        c.outlier_cutoff)
# `with` closes the file after loading; the original left the handle open.
with open(groundtruth_path, "rb") as dump_file:
    train_grid_snapshots, train_labels, train_weights = \
        pickle.load(dump_file)

In [None]:

# Single map for x_int=0 / y_int=1 computed by calc_map_given from the
# training ground-truth tuple.
plot_single_map(
    method=calc_map_given,
    const=c,
    pipeline=pipeline,
    x_int=0,
    y_int=1,
    grid_snapshots=train_grid_snapshots,
    labels=train_labels,
    weights=train_weights,
    stamp="{}_x1x2_GroundTruth_".format(c.dataSetType) + c.data_stamp)

# Disabled variants: the two plot_single_map calls below (configurational
# density, the second one with PES_function=inject_PES) are wrapped in bare
# string literals, so they are never executed.
# The trailing `pass` presumably keeps Jupyter from echoing the last string.
"""
plot_single_map(
    x_int=0,
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_map_given_configurational_density,
    grid_snapshots=train_grid_snapshots, 
    weights=train_weights,
    stamp="{}_x1x2_ConfDensity_".format(c.dataSetType)\
          + c.data_stamp)"""
"""
plot_single_map(
    x_int=0,
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_map_given_configurational_density,
    grid_snapshots=train_grid_snapshots, 
    weights=train_weights,
    stamp="{}_x1x2_ConfDensity_Overlay".format(c.dataSetType)\
          + c.data_stamp,
    PES_function=inject_PES)
"""
pass

In [None]:
# Configurational-density super map of the training ground truth.
plot_super_map(
    method=calc_map_given_configurational_density,
    const=c,
    pipeline=pipeline,
    grid_snapshots=train_grid_snapshots,
    weights=train_weights,
    pre_stamp="{}_ConfDensity_Train".format(c.dataSetType))

# Ground-truth plot of the same training tuple.
plot_ground_truth(
    const=c,
    pipeline=pipeline,
    grid_snapshots=train_grid_snapshots,
    labels=train_labels,
    weights=train_weights,
    pre_stamp="{}_GroundTruth_Train".format(c.dataSetType))

In [None]:
# Encoder/decoder plot for the packed train and validation datasets.
plot_encoder_decoder(
    pipeline=pipeline,
    const=c,
    train_ds=train_ds,
    val_ds=val_ds)
# Keeps the cell from ending on an expression, so nothing is echoed.
pass

In [None]:
# Keyword arguments shared by both ground-truth maps with a dividing line.
ground_truth_line_kwargs = dict(
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_map_given,
    grid_snapshots=train_grid_snapshots,
    labels=train_labels,
    weights=train_weights,
    line_function=inject_dividing_line)

# x_int=0 with the MCG/BigCage line formula.
plot_single_map(
    x_int=0,
    stamp="MCG_BigCage_GroundTruth_" + c.data_stamp,
    line_formula=calculate_slope_MCG_BigCage,
    **ground_truth_line_kwargs)

# x_int=6 with the NoW/BigCage line formula.
plot_single_map(
    x_int=6,
    stamp="NoW_BigCage_GroundTruth_" + c.data_stamp,
    line_formula=calculate_slope_now_BigCage,
    **ground_truth_line_kwargs)


In [None]:
# Keyword arguments shared by both model-generated maps with a dividing line.
generated_line_kwargs = dict(
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_represented_map_generated,
    model=autoencoder_1,
    minima=train_bnrn_minima,
    maxima=train_bnrn_maxima,
    representations=train_corrected_2D,
    line_function=inject_dividing_line)

# x_int=0 with the MCG/BigCage line formula.
plot_single_map(
    x_int=0,
    stamp="MCG_BigCage_Train_" + c.model_stamp + c.data_stamp,
    line_formula=calculate_slope_MCG_BigCage,
    **generated_line_kwargs)

# x_int=6 with the NoW/BigCage line formula.
plot_single_map(
    x_int=6,
    stamp="NoW_BigCage_Train_" + c.model_stamp + c.data_stamp,
    line_formula=calculate_slope_now_BigCage,
    **generated_line_kwargs)


In [None]:
# Read the shooting points and wrap them in a Dataset flagged "Shooting";
# every shooting point gets a weight of one.
shooting_points, shooting_labels = \
    read_shooting_points("total_data_till_982mc_280K.txt")

shootingData = Dataset(
    shooting_points,
    shooting_labels,
    np.ones(len(shooting_labels)),
    flag="Shooting")

# Ground-truth grid snapshots, labels and weights for the shooting data.
(shoot_grid_snapshots,
 shoot_labels,
 shoot_weights) = pipeline.prepare_groundTruth(
    c.used_variable_names, shootingData)


In [None]:
# Keyword arguments shared by both shooting-point maps with a dividing line.
shooting_line_kwargs = dict(
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_map_given,
    grid_snapshots=shoot_grid_snapshots,
    labels=shoot_labels,
    weights=shoot_weights,
    line_function=inject_dividing_line)

# x_int=0 with the MCG/BigCage line formula.
plot_single_map(
    x_int=0,
    stamp="MCG_BigCage_Shooting_" + c.data_stamp,
    line_formula=calculate_slope_MCG_BigCage,
    **shooting_line_kwargs)

# x_int=6 with the NoW/BigCage line formula.
plot_single_map(
    x_int=6,
    stamp="NoW_BigCage_Shooting_" + c.data_stamp,
    line_formula=calculate_slope_now_BigCage,
    **shooting_line_kwargs)

In [None]:
def get_percentage_of_range_retained(outlier_cutoff, snapshots=None):
    """Print and return the mean fraction of each column's range that is kept.

    For every column (axis 0 is reduced) the full span (max - min) is
    compared with the span between the `outlier_cutoff`-th and the
    (100 - `outlier_cutoff`)-th percentile; the ratios are averaged over all
    columns.

    Parameters
    ----------
    outlier_cutoff : float
        Percentile (0-100) cut away at each end.
    snapshots : array-like, optional
        Data to analyse. Defaults to the notebook-level
        ``trainData.snapshots`` (the only behaviour of the original).

    Returns
    -------
    The mean ratio, which is also printed. The original only printed it.

    Note: a column with zero span leads to a division by zero, exactly as in
    the original.
    """
    if snapshots is None:
        snapshots = trainData.snapshots
    span = np.amax(snapshots, axis=0) - np.amin(snapshots, axis=0)
    percentile_span = np.percentile(snapshots, 100 - outlier_cutoff, axis=0) \
        - np.percentile(snapshots, outlier_cutoff, axis=0)
    retained_fraction = np.mean(percentile_span/span)
    print(retained_fraction)
    return retained_fraction
    
def estimate_reduction_on_AA_and_AB():
    """Print, per label, the fraction of frames clipped at the upper bound.

    The training snapshots are reduced and bounded; a frame counts as
    clipped when the first entry of its bounded snapshot equals
    ``bounder.upper_bound[0]``. Two ratios are printed: clipped / all frames
    with label 0 (called AA in the code) and the same for label 1 (AB).
    """
    # NOTE(review): `reduced_list_var_names`, `Reducer` and `Bounder` are not
    # defined or imported by name in the visible cells -- presumably they
    # come from one of the star imports; confirm.
    reducer = Reducer(
        reduced_list_var_names,
        c.name_to_list_position)
    reduced_snapshots = reducer.reduce_snapshots(trainData.snapshots)
    bounder = Bounder(reduced_snapshots, c.outlier_cutoff)
    bound_snapshots = bounder.bound_snapshots(reduced_snapshots)

    all_AA_frames = 0
    all_AB_frames = 0
    bound_AA_frames = 0
    bound_AB_frames = 0
    # Single pass over the labels instead of four list comprehensions.
    for index, label in enumerate(trainData.labels):
        if label == 0:
            all_AA_frames += 1
            if bound_snapshots[index][0] == bounder.upper_bound[0]:
                bound_AA_frames += 1
        if label == 1:
            all_AB_frames += 1
            if bound_snapshots[index][0] == bounder.upper_bound[0]:
                bound_AB_frames += 1

    print(bound_AA_frames/all_AA_frames)
    print(bound_AB_frames/all_AB_frames)

In [None]:
# Print the name and the input/output names of the full autoencoder.
model = autoencoder
for attribute in ("name", "input_names", "output_names"):
    print(getattr(model, attribute))

In [None]:
# Transposed view of the training grid snapshots; mean_pB_attributes indexes
# it by column number.
# presumably train_grid_snapshots is (n_snapshots, n_variables) -- TODO confirm
train_grid_columns = np.transpose(train_grid_snapshots)
def rec_cols(used, unused, lim, p):
    """Collect in `p` every way to extend `used` to length `lim`.

    Elements are taken from `unused` in their given order, each at most
    once, so with ``used == []`` this appends all `lim`-element combinations
    of `unused` (as lists, in lexicographic index order) to `p`.
    Nothing is returned; `p` is modified in place.
    """
    if len(used) != lim:
        # Still too short: try every remaining element as the next entry and
        # only allow elements after it further down.
        for position, candidate in enumerate(unused):
            rec_cols(used + [candidate], unused[position + 1:], lim, p)
        return
    p.append(used)

def mean_pB_attributes(dimensions, train_grid_columns, dataO, tries,
                       n_columns=22):
    """Average pB statistics over random column subsets of a fixed size.

    For each of `tries` repetitions, `dimensions` distinct column indices are
    drawn from ``range(n_columns)``, the matching entries of
    `train_grid_columns` are stacked and transposed, and
    ``gridO.approximate_pB`` is evaluated on them with
    ``dataO.train_labels`` and ``dataO.train_weights``.

    Parameters
    ----------
    dimensions : int
        Number of columns per subset.
    train_grid_columns : sequence
        Column-wise grid snapshots, indexed by column number.
    dataO : object
        Must provide ``train_labels`` and ``train_weights``.
    tries : int
        Number of random subsets to evaluate.
    n_columns : int, optional
        Number of columns to draw from; defaults to 22, the value that was
        hard-coded before.

    Returns
    -------
    tuple of four means over all tries:
        (len(pB_dict) / len(pBs),
         mean of the pB_dict values,
         mean of pBs,
         number of zero-valued pB_dict entries / len(pBs)).

    NOTE(review): `gridO` is not defined in the visible cells -- presumably a
    notebook-level object; confirm.
    """
    pB_uniques = []
    pB_unique_means = []
    pB_means = []
    pB_unique_zeroes = []
    for _ in range(tries):
        # A sorted random sample is one uniformly chosen combination. The
        # original enumerated every combination (up to C(22, 11) = 705432
        # lists) on each call just to pick one of them.
        choice = sorted(random.sample(range(n_columns), dimensions))
        short_grid_snapshots = [train_grid_columns[j] for j in choice]

        pB_dict, pBs = gridO.approximate_pB(
            np.transpose(short_grid_snapshots),
            dataO.train_labels,
            dataO.train_weights)
        unique_labels = list(pB_dict.values())
        pB_uniques.append(len(pB_dict)/len(pBs))
        pB_unique_means.append(np.mean(unique_labels))
        pB_means.append(np.mean(pBs))
        pB_unique_zeroes.append(
            len([label for label in unique_labels if label == 0])/len(pBs))
    return np.mean(pB_uniques), np.mean(pB_unique_means), np.mean(pB_means), np.mean(pB_unique_zeroes)
# Sweep the subset size from 1 to 22 with 5 tries each and keep the four
# averaged statistics per size.
over_list = []
for dimension_count in range(1, 23):
    print(dimension_count)
    attributes = mean_pB_attributes(
        dimension_count, train_grid_columns, dataO, 5)
    over_list.append(list(attributes))

print(over_list)

In [None]:
# Hard-coded replacement for over_list: 22 rows of 4 values each.
# Presumably a saved result of an earlier run of the sweep over
# mean_pB_attributes (rows = subset sizes 1..22, columns in the order of that
# function's return values) -- TODO confirm provenance.
over_list = [[1.4504122192394765e-05, 0.4575422359289151, 0.35037916491845283, 0.0], 
             [8.823341000373481e-05, 0.31197805032254855, 0.24989161958688205, 1.4987592932141258e-05], 
             [0.0011390570628427355, 0.254336987822713, 0.21065742919045424, 0.00041312574711337754], 
             [0.0031104090041590574, 0.18133452313340712, 0.2170178744045542, 0.0019471784043289969], 
             [0.019666622751407806, 0.14815356079849162, 0.1799223783174298, 0.016739932628352418], 
             [0.03723860852286394, 0.0766339470321534, 0.15529480428303702, 0.03335392112900087], 
             [0.10585688540897394, 0.04434703300207507, 0.12188629749425983, 0.10030736652279384], 
             [0.14584233293970758, 0.03858060089555661, 0.12155443609723238, 0.13996139486143125], 
             [0.2389070660457291, 0.030955704907381622, 0.08891275517831804, 0.2316479946238054], 
             [0.3461106592002669, 0.02109568034865818, 0.07299244764109925, 0.33821316466650786], 
             [0.40314715278037977, 0.023028707168210764, 0.06465460885978427, 0.3932804819236334], 
             [0.5168345720256579, 0.020024200833715332, 0.055430137658731636, 0.5062564739753744], 
             [0.6097532969682758, 0.01770556773880911, 0.04620075248938048, 0.5986738397608754], 
             [0.649903849756633, 0.019642215099454723, 0.040862941059977745, 0.6368426459870116], 
             [0.648240710411905, 0.018966941544334908, 0.04145155090339449, 0.6356276840180286], 
             [0.694312812820677, 0.019123793821338247, 0.037413562972544265, 0.6808039151460504], 
             [0.7417976167309884, 0.0178504232145612, 0.03518109760382279, 0.7283716342882285], 
             [0.7338169652299931, 0.018954433483644333, 0.035358386024052696, 0.7197073551612314], 
             [0.7123308003495493, 0.01780718726971691, 0.03705239598119732, 0.6994197142446192], 
             [0.7675257538819679, 0.01880599290810691, 0.03314824301337975, 0.7529500780200907], 
             [0.7685473275550522, 0.02063167015576706, 0.032021440958897376, 0.752516404766538], 
             [0.7835114721563158, 0.021141123648464055, 0.031053932448914478, 0.766812392805472]
            ]

In [None]:
# One row per statistic, one column per subset size (1..22).
over_columns = np.transpose(over_list)
dimension_counts = list(range(1, 23))

# Mean pB of the unique entries (row 1).
plt.scatter(dimension_counts, over_columns[1])
plt.ylim(-0.1, 1.1)
plt.xlim(0, 23)
plt.xlabel("Dimensions")
plt.ylabel("Mean")
plt.title("Mean pB value of all unique entries")
plt.show()
plt.close()

# Mean pB of all entries (row 2).
plt.scatter(dimension_counts, over_columns[2])
plt.ylim(-0.1, 1.1)
plt.xlim(0, 23)
plt.xlabel("Dimensions")
plt.ylabel("Mean")
plt.title("Mean pB value of all entries")
plt.show()
plt.close()

# Fraction of unique entries (row 0) next to the fraction of unique entries
# that are zero (row 3).
plt.figure()
plt.scatter(dimension_counts, over_columns[0], label="Unique entries")
plt.scatter(dimension_counts, over_columns[3], label="Unique entries = 0")
plt.xlabel("Dimensions")
plt.ylabel("Fraction")
plt.ylim(-0.1, 1)
plt.xlim(0, 23)
plt.legend(loc="lower right")
plt.title("Fraction of unique entries of all entries")
plt.show()

# Share of the unique entries that are non-zero: (row 0 - row 3) / row 0.
plt.figure()
non_zero_share = (over_columns[0] - over_columns[3]) / over_columns[0]
plt.scatter(dimension_counts, non_zero_share)
plt.ylim(-0.1, 1.1)
plt.xlim(0, 23)
plt.xlabel("Dimensions")
plt.ylabel("Fraction")
plt.title("Fraction of non-zero entries of all unique entries")
plt.show()
plt.close()

In [None]:
# Distribution plots for the untrimmed grid snapshots and two trimmed key sets.
# NOTE(review): `gridO`, `var_names`, `trimmed_keys` and `trimmed_back_keys`
# are not defined in the visible cells -- this cell presumably relies on
# state from an earlier version of the notebook; confirm before running.
gridO.plot_distribution(train_grid_snapshots,6,20,var_names,"untrimmed")
gridO.plot_distribution(trimmed_keys,6,20,var_names,"trimmed_both")
gridO.plot_distribution(trimmed_back_keys,6,20,var_names,"trimmed_back")

In [None]:
#print(len(pB_dict))
#print(max([label for key, label in pB_dict.items()]))
#print(min([label for key, label in pB_dict.items()]))
#print(len([label for key, label in pB_dict.items() if label > 0.0]))
#print(len([label for key, label in pB_dict.items() if label == 0.0]))
#print(len([label for key, label in pB_dict.items() if label < 0.25]))
#print(len([label for key, label in pB_dict.items() if label > 0.25]))

def broken_hist(xs, bins, y_lower_1, y_upper_1, y_lower_2, y_upper_2, filename):
    """Draw a histogram of `xs` with a broken y-axis, save it and show it.

    The same histogram is drawn on two vertically stacked axes sharing the
    x-axis. The lower axes shows the y-range [`y_lower_1`, `y_upper_1`]
    ("most of the data"), the upper axes shows [`y_lower_2`, `y_upper_2`]
    ("outliers only"). Small diagonal marks indicate the break.

    Parameters
    ----------
    xs : array-like
        Values to histogram.
    bins : int or sequence
        Passed on as the bins argument of ``hist``.
    y_lower_1, y_upper_1 : float
        y-limits of the lower axes.
    y_lower_2, y_upper_2 : float
        y-limits of the upper axes.
    filename : str
        Where the figure is saved.
    """
    fig, (ax_top, ax_bottom) = plt.subplots(2, 1, sharex=True)
    ax_top.hist(xs, bins)
    ax_bottom.hist(xs, bins)
    ax_top.set_ylim(y_lower_2, y_upper_2)  # outliers only
    ax_bottom.set_ylim(y_lower_1, y_upper_1)  # most of the data

    # Hide the spines between the two axes so they read as one broken plot.
    ax_top.spines['bottom'].set_visible(False)
    ax_bottom.spines['top'].set_visible(False)
    ax_top.xaxis.tick_top()
    ax_top.tick_params(labeltop=False)  # don't put tick labels at the top
    ax_bottom.xaxis.tick_bottom()

    d = .015  # how big to make the diagonal lines in axes coordinates
    # arguments to pass to plot, just so we don't keep repeating them
    kwargs = dict(transform=ax_top.transAxes, color='k', clip_on=False)
    ax_top.plot((-d, +d), (-d, +d), **kwargs)        # top-left diagonal
    ax_top.plot((1 - d, 1 + d), (-d, +d), **kwargs)  # top-right diagonal

    kwargs.update(transform=ax_bottom.transAxes)  # switch to the bottom axes
    ax_bottom.plot((-d, +d), (1 - d, 1 + d), **kwargs)  # bottom-left diagonal
    ax_bottom.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)  # bottom-right diagonal

    # Save through the figure handle so the right figure is written
    # regardless of which figure is current.
    fig.savefig(filename)
    plt.show()
    # Release the figure; without this every call left one more figure open.
    plt.close(fig)
    

# Plain and broken-axis histograms of the pB values and of the trimmed labels.
# NOTE(review): `train_pBs`, `trimmed_labels` and `trimmed_back_labels` are
# not defined in the visible cells -- presumably left over from an earlier
# version of the notebook; confirm before running.
plt.figure()
plt.hist(train_pBs, 10)
plt.savefig("pB_untrimmed.png")
plt.show()

# Lower axes 0-20000, upper axes 100000-850000.
broken_hist(train_pBs, 10, 0, 20000, 100000, 850000, "pBs.png")

plt.figure()
plt.hist(trimmed_labels, 10)
plt.savefig("trimmed_labels.png")
plt.show()
    
# Lower axes 0-8000, upper axes 10000-400000.
broken_hist(trimmed_back_labels, 10, 0, 8000, 10000, 400000, "Hist_scaled_labels.png")
