In [None]:
# are the future functions actually necessary?
from __future__ import absolute_import, division, print_function, unicode_literals
import imp

from globalConstants import Const
from dataset import Dataset
from pipeline import Pipeline
from autoEncoder import AutoEncoder
from corrector import Corrector
from stepper import Stepper

from plotter import *
from data_read import *
from helperFunctions import *
from losses import *

import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import matplotlib as mpl

import pickle
import random
import time
from collections import Counter
import sys

np.set_printoptions(precision=3, suppress=True)
# allows for automatic reloading of imports and makes it unnecessary to restart the kernel
# whenever a function is changed
%load_ext autoreload
%autoreload 2

In [None]:
# Select the data set this notebook works on; only the three listed
# identifiers are accepted.
dataSetType = "DW" #"DW", "ZP", or "MH"
assert dataSetType in ("DW", "ZP", "MH"), \
    "dataSetType needs to be set to 'DW', 'ZP' or 'MH'."

In [None]:
# Build the project-defined constants container for the selected data set type.
c = Const(dataSetType)

In [None]:
# Choose the reader helpers matching the selected data set type:
# "MH" uses the TIS/TPS readers, "DW" and "ZP" use the toy-data readers.
if dataSetType == "MH":
    train_val_test_function = make_train_val_test_from_TIS_and_TPS
    get_paths_function = get_TPS_and_TIS_paths
elif dataSetType in ("DW", "ZP"):
    train_val_test_function = make_train_val_test_from_toy
    get_paths_function = get_toy_paths

In [None]:
# Load the cached train/validation datasets; if that fails, build them from
# the raw data and cache them for the next run.
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file, so only load dataset files that were written by this notebook.
train_data_path = "datasets/{}_trainData_{}.p".format(
    dataSetType, c.used_dataset_fraction)
val_data_path = "datasets/{}_valData_{}.p".format(
    dataSetType, c.used_dataset_fraction)
try:
    # Context managers close the file handles even if unpickling fails.
    with open(train_data_path, "rb") as dataset_file:
        trainData = pickle.load(dataset_file)
    with open(val_data_path, "rb") as dataset_file:
        valData = pickle.load(dataset_file)
except Exception:
    # Deliberately broad (as before): any failure to read the cache, not only
    # a missing file, falls back to regenerating the datasets.
    print("Processed dataset files not found."
          +"\nGenerating datasets from raw data.")
    trainData, valData, _ = Dataset\
        .initialize_train_val_test_datasets(
            *train_val_test_function(c))
    print("Saving datasets for future use.")
    with open(train_data_path, "wb") as dataset_file:
        pickle.dump(trainData, dataset_file)
    with open(val_data_path, "wb") as dataset_file:
        pickle.dump(valData, dataset_file)

In [None]:
#print_coverage(list_var_names, trainData)

In [None]:
# Construct the project pipeline from the constants and the training
# snapshots, then print whatever get_size reports for it.
pipeline = Pipeline(c, trainData.snapshots)
print(get_size(pipeline))

In [None]:
def _dump_to_cache(obj, kind):
    """Pickle ``obj`` into the dumps/ directory.

    The file name encodes the data set type, the artifact ``kind`` (e.g.
    "train_datasets_tuple"), the number of used variables, the used dataset
    fraction and the resolution, matching the names read back later in the
    notebook.
    """
    path = "dumps/{}_{}_{}_{}_{}.p".format(
        dataSetType,
        kind,
        len(c.used_variable_names),
        c.used_dataset_fraction,
        c.resolution)
    # The context manager guarantees the file is flushed and closed.
    with open(path, "wb") as dump_file:
        pickle.dump(obj, dump_file)


# Preprocess the training and validation data once and cache the results.
snapshots, pBs, g_snapshots = pipeline.prepare_dataset_pickle(trainData)
_dump_to_cache((snapshots, pBs), "train_datasets_tuple")

snapshots, pBs, _ = pipeline.prepare_dataset_pickle(valData)
_dump_to_cache((snapshots, pBs), "val_datasets_tuple")

print("1D")
train_corrected_1D = pipeline.get_1D_means(g_snapshots)
_dump_to_cache(train_corrected_1D, "train_corrected_1D")

print("2D")
train_corrected_2D = pipeline.get_2D_means(g_snapshots)
# g_snapshots is only needed for the two mean computations above.
del g_snapshots
_dump_to_cache(train_corrected_2D, "train_corrected_2D")


In [None]:
# Disabled alternative: the code inside the triple-quoted string below is
# never executed; the following cell loads the cached dumps instead.
"""
train_ds, train_corrected_1D, train_corrected_2D = \
    pipeline.prepare_prediction_plotter(trainData)

val_ds, _, _ = \
    pipeline.prepare_prediction_plotter(valData)
"""

In [None]:
def _load_from_cache(kind):
    """Unpickle and return one artifact from the dumps/ directory.

    ``kind`` names the artifact (e.g. "train_datasets_tuple"); the rest of
    the file name is built from the data set type, the number of used
    variables, the used dataset fraction and the resolution.
    """
    path = "dumps/{}_{}_{}_{}_{}.p".format(
        dataSetType,
        kind,
        len(c.used_variable_names),
        c.used_dataset_fraction,
        c.resolution)
    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # only read dumps that were written by this notebook.
    with open(path, "rb") as dump_file:
        return pickle.load(dump_file)


# Rebuild the training and validation tf datasets from the cached tuples,
# using unit weights for both prediction and reconstruction.
snapshots, pBs = _load_from_cache("train_datasets_tuple")
train_ds = pipeline.pack_tf_dataset(
            snapshots=snapshots,
            labels=pBs,
            prediction_weights=np.ones(len(snapshots)),
            reconstruction_weights=np.ones(len(snapshots)))

snapshots, pBs = _load_from_cache("val_datasets_tuple")
val_ds = pipeline.pack_tf_dataset(
            snapshots=snapshots,
            labels=pBs,
            prediction_weights=np.ones(len(snapshots)),
            reconstruction_weights=np.ones(len(snapshots)))

# Drop the raw arrays once they have been packed.
del snapshots
del pBs

train_corrected_1D = _load_from_cache("train_corrected_1D")
train_corrected_2D = _load_from_cache("train_corrected_2D")


In [None]:
# Recreate the constants object and override its epoch count for this run.
c = Const(dataSetType)
c.epochs = 10
# make_models returns six models; only `autoencoder` is fitted in this cell.
autoencoder, autoencoder_1, autoencoder_2, \
    encoder, decoder_1, decoder_2 = \
    AutoEncoder.make_models(c)
# Train on train_ds, validating on val_ds; EarlyStopping watches "val_loss"
# with a patience of 3 epochs.
history = autoencoder.fit(
    x=train_ds,
    epochs=c.epochs,
    validation_data=val_ds,
    callbacks=[tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=3)])

In [None]:
# Hand all six models to store_model_weights together with a path built from
# the data set type and c.model_stamp.
store_model_weights(
    "results/{}_model_weights_{}"\
        .format(dataSetType, c.model_stamp),
    autoencoder, autoencoder_1,
    autoencoder_2, encoder, decoder_1, decoder_2)

In [None]:
# Build fresh models and restore their weights from disk.
# The path is the same one the store_model_weights cell writes to; the
# previous hard-coded "results/model_weights" did not match it.
autoencoder, autoencoder_1, autoencoder_2, \
    encoder, decoder_1, decoder_2 = \
        load_model_weights(
            "results/{}_model_weights_{}"\
                .format(dataSetType, c.model_stamp),
            *AutoEncoder.make_models(c))
pass

In [None]:
# Single map for x_int=0 / y_int=1, computed with
# calc_represented_map_generated from autoencoder_1 and the cached 2D
# representations; the stamp combines the model and data stamps.
plot_single_map(
    x_int=0,
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_represented_map_generated,
    model=autoencoder_1, 
    minmax_container=pipeline,
    representations=train_corrected_2D,
    stamp="x1x2_Prediction_" + c.model_stamp + c.data_stamp)

In [None]:
# Relative-importance plot for the encoder (project helper).
make_relative_importance_plot(encoder, c)

In [None]:
# Projected-path plot using the encoder, with steps=20 and the data set type
# as file-name prefix.
make_projected_path_plot(
    pipeline=pipeline, steps=20, pre_stamp=dataSetType, model=encoder)

In [None]:
# Two overview plots from the trained models. The `#"""` lines are
# commented-out string delimiters, so both calls below are active.
#"""
# Map plot: autoencoder_1 with the cached 2D representations.
make_super_map_plot(
    method=calc_represented_map_generated,
    pipeline=pipeline,
    pre_stamp="CorrelatedMean_map",
    model=autoencoder_1,
    minmax_container=pipeline,
    representations=train_corrected_2D)
#"""
#"""
# Scatter plot: autoencoder_2 with the cached 1D representations.
make_super_scatter_plot(
    method=calc_represented_scatter_generated,
    pipeline=pipeline,
    pre_stamp="CorrelatedMean_scatter",
    model=autoencoder_2,
    minmax_container=pipeline,
    representations=train_corrected_1D,
    max_row_len=4)
#"""
# Trailing `pass` keeps the cell from echoing the last call's return value.
pass

In [None]:
# Obtain the latent-space minimum and maximum from the encoder (steps=20) ...
latent_minimum, latent_maximum = \
    get_projected_minimum_and_maximum(pipeline, model=encoder, steps=20)

# ... and plot reconstructions from decoder_2 between them using steps=11.
plot_reconstruction_from_latent_space(
    const=c,
    latent_minimum=latent_minimum,
    latent_maximum=latent_maximum,
    steps=11,
    recon_decoder=decoder_2,
    pre_stamp=dataSetType)

In [None]:
# Same projected-path plot as for the encoder, but with autoencoder_1 as the
# model and "_comm" appended to the file-name prefix.
make_projected_path_plot(
    pipeline=pipeline, steps=20, pre_stamp=dataSetType + "_comm", model=autoencoder_1)

In [None]:
# Compute the ground-truth tuple (grid snapshots, labels, weights) for the
# training data via the pipeline.
train_grid_snapshots, train_labels, train_weights = \
    pipeline.prepare_groundTruth(
        trainData)

In [None]:
# Cache the ground-truth tuple; the file name encodes the data set type,
# number of used variables, dataset fraction, resolution and outlier cutoff.
# The context manager guarantees the file is flushed and closed.
with open("dumps/{}_train_groundtruth_tuple_{}_{}_{}_{}.p"\
          .format(
              dataSetType,
              len(c.used_variable_names),
              c.used_dataset_fraction,
              c.resolution,
              c.outlier_cutoff),
          "wb") as dump_file:
    pickle.dump(
        (train_grid_snapshots, train_labels, train_weights),
        dump_file)

In [None]:
# Recreate the constants and read the cached ground-truth tuple back.
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# read dumps that were written by this notebook.
c = Const(dataSetType)
with open("dumps/{}_train_groundtruth_tuple_{}_{}_{}_{}.p"\
          .format(
              dataSetType,
              len(c.used_variable_names),
              c.used_dataset_fraction,
              c.resolution,
              c.outlier_cutoff),
          "rb") as dump_file:
    # The context manager closes the handle even if unpickling fails.
    train_grid_snapshots, train_labels, train_weights = \
        pickle.load(dump_file)

In [None]:

# Ground-truth map for x_int=0 / y_int=1 from the training grid snapshots,
# labels and weights (method: calc_map_given).
plot_single_map(
    x_int=0,
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_map_given,
    grid_snapshots=train_grid_snapshots, 
    labels=train_labels, 
    weights=train_weights,
    stamp="{}_x1x2_GroundTruth_".format(c.dataSetType)\
           + c.data_stamp)

# Disabled variant (configurational density); the triple-quoted string below
# is never executed.
"""
plot_single_map(
    x_int=0,
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_map_given_configurational_density,
    grid_snapshots=train_grid_snapshots, 
    weights=train_weights,
    stamp="{}_x1x2_ConfDensity_".format(c.dataSetType)\
          + c.data_stamp)"""
# Trailing `pass` keeps the cell from echoing the string above as output.
pass

In [None]:
# Overview map of the configurational density for the training data
# (no labels are passed to this method).
make_super_map_plot(
    method=calc_map_given_configurational_density,
    pipeline=pipeline,
    pre_stamp="{}_ConfDensity_Train".format(c.dataSetType),
    grid_snapshots=train_grid_snapshots,
    weights=train_weights)
pass

# Overview map of the ground truth for the training data (labels included).
make_super_map_plot(
    method=calc_map_given,
    pipeline=pipeline,
    pre_stamp=f"{c.dataSetType}_GroundTruth_Train",
    grid_snapshots=train_grid_snapshots,
    labels=train_labels,
    weights=train_weights)

In [None]:
# Ground-truth maps with an injected dividing line.
# NOTE(review): the stamps and slope helpers refer to "MCG"/"NoW"/"BigCage"
# and the second call uses x_int=6 -- presumably this cell is meant for the
# "MH" data set only; confirm before running with "DW" or "ZP".
plot_single_map(
    x_int=0,
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_map_given,
    grid_snapshots=train_grid_snapshots, 
    labels=train_labels, 
    weights=train_weights,
    stamp="MCG_BigCage_GroundTruth_" + c.data_stamp,
    line_function=inject_dividing_line,
    line_formula=calculate_slope_MCG_BigCage)

plot_single_map(
    x_int=6,
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_map_given,
    grid_snapshots=train_grid_snapshots, 
    labels=train_labels, 
    weights=train_weights,
    stamp="NoW_BigCage_GroundTruth_" + c.data_stamp,
    line_function=inject_dividing_line,
    line_formula=calculate_slope_now_BigCage)


In [None]:
# Model-generated maps (autoencoder_1 + cached 2D representations) with an
# injected dividing line, for the same two variable pairs as the
# ground-truth cell.
# NOTE(review): presumably intended for the "MH" data set only (x_int=6,
# "BigCage" slope helpers) -- confirm.
plot_single_map(
    x_int=0,
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_represented_map_generated,
    model=autoencoder_1, 
    minmax_container=pipeline,
    representations=train_corrected_2D,
    stamp="MCG_BigCage_Train_" + c.model_stamp + c.data_stamp,
    line_function=inject_dividing_line,
    line_formula=calculate_slope_MCG_BigCage)

plot_single_map(
    x_int=6,
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_represented_map_generated,
    model=autoencoder_1, 
    minmax_container=pipeline,
    representations=train_corrected_2D,
    stamp="NoW_BigCage_Train_" + c.model_stamp + c.data_stamp,
    line_function=inject_dividing_line,
    line_formula=calculate_slope_now_BigCage)


In [None]:
# Read the shooting points and their labels from a hard-coded text file
# (path is relative to the notebook's working directory).
shooting_points, shooting_labels = read_shooting_points(
    "total_data_till_982mc_280K.txt")

# Wrap them in a Dataset with a weight of one per label and the flag
# "Shooting".
shootingData = Dataset(
    shooting_points,
    shooting_labels,
    np.ones(len(shooting_labels)),
    flag="Shooting")

# Run the same ground-truth preparation that is used for the training data.
shoot_grid_snapshots, shoot_labels, shoot_weights = \
    pipeline.prepare_groundTruth(shootingData)


In [None]:
# Ground-truth maps of the shooting data with an injected dividing line, for
# the same two variable pairs as the training ground-truth cell.
# NOTE(review): presumably intended for the "MH" data set only (x_int=6,
# "BigCage" slope helpers) -- confirm.
plot_single_map(
    x_int=0,
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_map_given,
    grid_snapshots=shoot_grid_snapshots, 
    labels=shoot_labels, 
    weights=shoot_weights,
    stamp="MCG_BigCage_Shooting_" + c.data_stamp,
    line_function=inject_dividing_line,
    line_formula=calculate_slope_MCG_BigCage)

plot_single_map(
    x_int=6,
    y_int=1,
    const=c,
    pipeline=pipeline,
    method=calc_map_given,
    grid_snapshots=shoot_grid_snapshots, 
    labels=shoot_labels, 
    weights=shoot_weights,
    stamp="NoW_BigCage_Shooting_" + c.data_stamp,
    line_function=inject_dividing_line,
    line_formula=calculate_slope_now_BigCage)

In [None]:
# Overview ground-truth map for the shooting data (labels included).
make_super_map_plot(
    method=calc_map_given,
    pipeline=pipeline,
    pre_stamp=f"{c.dataSetType}_GroundTruth_Shoot",
    grid_snapshots=shoot_grid_snapshots,
    labels=shoot_labels,
    weights=shoot_weights)

In [None]:
# Distribution of the inputs on the training grid snapshots.
plot_input_distribution(c, train_grid_snapshots, 5, pipeline)

# `train_pBs` was never defined in this notebook, so this cell failed with a
# NameError on a fresh kernel. The training pBs are the second element of the
# cached (snapshots, pBs) tuple written by the preprocessing cell.
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# read dumps that were written by this notebook.
with open("dumps/{}_train_datasets_tuple_{}_{}_{}.p"\
          .format(
              dataSetType,
              len(c.used_variable_names),
              c.used_dataset_fraction,
              c.resolution),
          "rb") as dump_file:
    train_pBs = pickle.load(dump_file)[1]

# Histogram of the training pBs, written to results/pB_distribution.png.
plot_histogram_with_broken_axes(
    train_pBs, 10, 0, 500, 1000, 250000, "results/pB_distribution.png")

In [None]:
def get_percentage_of_range_retained(outlier_cutoff, snapshots=None):
    """Print and return the mean fraction of the value range kept per column.

    For every column the distance between the ``outlier_cutoff``-th and the
    ``100 - outlier_cutoff``-th percentile is divided by the full min-max
    span; the mean of these ratios over all columns is reported.

    Args:
        outlier_cutoff: Percentile (0-100) cut off at both ends.
        snapshots: Optional array-like with one row per snapshot. Defaults to
            ``trainData.snapshots`` from the notebook namespace.

    Returns:
        The mean ratio, or nan when every column is constant.
    """
    if snapshots is None:
        snapshots = trainData.snapshots
    snapshots = np.asarray(snapshots)
    span = np.atleast_1d(
        np.amax(snapshots, axis=0) - np.amin(snapshots, axis=0))
    percentile_span = np.atleast_1d(
        np.percentile(snapshots, 100 - outlier_cutoff, axis=0)
        - np.percentile(snapshots, outlier_cutoff, axis=0))
    # Constant columns have zero span; dividing would give 0/0 = nan and
    # poison the mean, so they are left out of the average.
    varying = span > 0
    if np.any(varying):
        retained = np.mean(percentile_span[varying] / span[varying])
    else:
        retained = float("nan")
    print(retained)
    return retained
    
def estimate_reduction_on_AA_and_AB():
    """Print the share of label-0 (AA) and label-1 (AB) frames at the upper bound.

    The training snapshots are reduced and bounded; a frame counts as bounded
    when its first bounded component equals the first upper bound. Two ratios
    are printed: bounded AA frames over all AA frames, then the same for AB.

    NOTE(review): Reducer, Bounder and reduced_list_var_names are not defined
    in this notebook; presumably they come from the star imports -- confirm.
    """
    reducer = Reducer(reduced_list_var_names, c.name_to_list_position)
    reduced = reducer.reduce_snapshots(trainData.snapshots)
    bounder = Bounder(reduced, c.outlier_cutoff)
    bounded = bounder.bound_snapshots(reduced)

    def count_frames(state, bounded_only=False):
        # Number of frames carrying `state`; optionally only those whose
        # first component sits exactly on the upper bound.
        return sum(
            1
            for index, label in enumerate(trainData.labels)
            if label == state
            and (not bounded_only
                 or bounded[index][0] == bounder.upper_bound[0]))

    total_AA = count_frames(0)
    total_AB = count_frames(1)
    bounded_AA = count_frames(0, bounded_only=True)
    bounded_AB = count_frames(1, bounded_only=True)

    print(bounded_AA/total_AA)
    print(bounded_AB/total_AB)