In [1]:
# We will extract the predicted values from the trained models. Something to note here: with the ReLU activation and the current architecture, this model predicts 0 for 21 of the 32 features. We need to keep this in mind, and we may need to train better models in stage 1, since the inputs to stage 2 need to be precise in order for the BLAR model to give accurate predictions. It might also make sense to look at the actual values to see if the problem is there.

In [3]:
import tensorflow as tf
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

Get predicted test features

In [7]:
# Load the trained stage-1 model (non-overlapping variant) from its saved .keras file
model_1_path = '../CNN_seq2seq_model/models/CNN_seq2seq_non_overlapping.keras'
model_1_non_overlapping = tf.keras.models.load_model(model_1_path)

In [8]:
# Print the layer-by-layer summary of the loaded model to confirm its architecture
model_1_non_overlapping.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 13, None, None, 3)   0         []                            
                             ]                                                                    
                                                                                                  
 time_distributed (TimeDist  (None, 13, 32)               71808     ['input_1[0][0]']             
 ributed)                                                                                         
                                                                                                  
 lstm (LSTM)                 [(None, 64),                 24832     ['time_distributed[0][0]']    
                              (None, 64),                                                   

In [9]:
# Where's the data that goes into the model?
# Folder holding the test input arrays (one .npy file per block)
input_features_loc = '../CNN_seq2seq_model/data/test_input_sub_images'
# Keep only the .npy files, sorted by name, so that stray directory entries
# (e.g. a .ipynb_checkpoints folder) can never be handed to np.load or to the
# block-id parsing in the prediction loop.
input_contents = sorted(
    name for name in os.listdir(input_features_loc) if name.endswith('.npy')
)

In [10]:
# Show the sorted list of input file names
input_contents

['test_data_blk_0103.npy',
 'test_data_blk_0104.npy',
 'test_data_blk_0105.npy',
 'test_data_blk_0106.npy',
 'test_data_blk_0201.npy',
 'test_data_blk_0202.npy',
 'test_data_blk_0205.npy',
 'test_data_blk_0206.npy',
 'test_data_blk_0302.npy',
 'test_data_blk_0303.npy',
 'test_data_blk_0304.npy',
 'test_data_blk_0305.npy',
 'test_data_blk_0306.npy']

In [11]:
# We first need to load the npy files, use the trained model to extract features, and store these so that they can be used later, along with the train features and the corresponding densities, to train the BLAR model

In [14]:
%%time
# Run the stage-1 model on every test input file and write the predictions to
# disk, one .npy per block. The in-memory predictions are also kept in
# `for_sanity_check` so the stored files can be verified afterwards.
pred_out_dir = 'data/predicted_sequences_from_stage_1/model_1/'
# np.save does not create missing folders, so make sure the target exists
# (keeps the notebook runnable on a fresh checkout).
os.makedirs(pred_out_dir, exist_ok=True)

for_sanity_check = []
for input_name in input_contents:
    # load the np file
    load_np_file = np.load(os.path.join(input_features_loc, input_name))
    # print shape of the loaded file
    print(load_np_file.shape)
    # predicted_values
    predicted_values = model_1_non_overlapping.predict(load_np_file)
    print(predicted_values.shape)
    for_sanity_check.append(predicted_values)
    # The block id is the last four characters of the file stem,
    # e.g. 'test_data_blk_0103.npy' -> '0103'
    blk_id = os.path.splitext(input_name)[0][-4:]
    loc_name = os.path.join(pred_out_dir, 'pred_values_blk_' + blk_id + '.npy')
    np.save(loc_name, predicted_values)

(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
CPU times: user 53.9 s, sys: 3.05 s, total: 57 s
Wall time: 10.4 s


In [15]:
# Perform a sanity check: this is the folder the stage-1 predictions were written to;
# the saved files are re-read from here and compared with the in-memory predictions
loc_path = 'data/predicted_sequences_from_stage_1/model_1/'

In [17]:
# Name-sorted listing of the stored prediction files
loc_contents = sorted(os.listdir(loc_path))

In [18]:
# Show the sorted list of stored prediction file names
loc_contents

['pred_values_blk_0103.npy',
 'pred_values_blk_0104.npy',
 'pred_values_blk_0105.npy',
 'pred_values_blk_0106.npy',
 'pred_values_blk_0201.npy',
 'pred_values_blk_0202.npy',
 'pred_values_blk_0205.npy',
 'pred_values_blk_0206.npy',
 'pred_values_blk_0302.npy',
 'pred_values_blk_0303.npy',
 'pred_values_blk_0304.npy',
 'pred_values_blk_0305.npy',
 'pred_values_blk_0306.npy']

In [20]:
%%time
# Verify that every file written to disk holds exactly the predictions that
# were kept in memory. Files and in-memory arrays are paired by position, so a
# stray or missing file would silently shift the pairing -- check the counts first.
assert len(loc_contents) == len(for_sanity_check), (
    f"{len(loc_contents)} stored files vs {len(for_sanity_check)} in-memory predictions"
)
for i, stored_name in enumerate(loc_contents):
    load_stored_preds = np.load(os.path.join(loc_path, stored_name))
    # Fraction of matching elements; 1.0 means the arrays are identical
    print(np.mean(load_stored_preds == for_sanity_check[i]))
    # Fail loudly instead of relying on someone reading the printed fractions
    assert np.array_equal(load_stored_preds, for_sanity_check[i]), (
        f"{stored_name} differs from the in-memory predictions"
    )

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
CPU times: user 13.9 ms, sys: 5.98 ms, total: 19.9 ms
Wall time: 20.1 ms


In [21]:
# We have correctly stored the predictions

In [22]:
# now what about the actual targets? Should we take a look? Where are the targets?

In [23]:
# Test targets: folder with the true target arrays, and its name-sorted file listing
out_targets_loc = '../CNN_seq2seq_model/data/test_out_targets'
out_contents = sorted(os.listdir(out_targets_loc))

In [24]:
# Show the sorted list of target file names
out_contents

['test_targets_blk_0103.npy',
 'test_targets_blk_0104.npy',
 'test_targets_blk_0105.npy',
 'test_targets_blk_0106.npy',
 'test_targets_blk_0201.npy',
 'test_targets_blk_0202.npy',
 'test_targets_blk_0205.npy',
 'test_targets_blk_0206.npy',
 'test_targets_blk_0302.npy',
 'test_targets_blk_0303.npy',
 'test_targets_blk_0304.npy',
 'test_targets_blk_0305.npy',
 'test_targets_blk_0306.npy']

In [25]:
# Inspect a single block only: load the targets stored in the first file of the sorted listing
first_targets_path = os.path.join(out_targets_loc, out_contents[0])
true_targets_blk_0103 = np.load(first_targets_path)

In [26]:
# Check the dimensions of the loaded target array
true_targets_blk_0103.shape

(910, 7, 32)

In [27]:
# Take the first entry along axis 0 (all remaining axes kept in full)
test_in_0_true = true_targets_blk_0103[0]

In [28]:
# Check the dimensions of the selected entry
test_in_0_true.shape

(7, 32)

In [33]:
# Wrap the selected true targets in a DataFrame so they can be inspected as a table.
# Observation: the true values don't seem that different from the target values.
# NOTE(review): "true" and "target" refer to the same data here -- presumably the
# comparison meant is against the *predicted* values; confirm.
test_in_0_true_df = pd.DataFrame(test_in_0_true)

In [34]:
# Display all columns: lift the pandas column-display limit inside this `with` block only
with pd.option_context('display.max_columns', None):
    display(test_in_0_true_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,0.0,0.0,0.0,0.0,0.0,1.363772,0.0,1.245406,0.0,0.0,0.588365,0.0,0.0,1.61744,0.0,0.0,0.0,1.680196,0.0,1.59113,1.439219,0.0,0.0,0.0,1.391284,0.0,1.45495,0.0,0.0,1.398869,1.337006,0.0
1,0.0,0.0,0.0,0.0,0.0,1.441939,0.0,1.33163,0.0,0.0,0.61921,0.0,0.0,1.706845,0.0,0.0,0.0,1.776458,0.0,1.680459,1.531679,0.0,0.0,0.0,1.479376,0.0,1.534767,0.0,0.0,1.499093,1.433885,0.0
2,0.0,0.0,0.0,0.0,0.0,0.808372,0.0,0.73682,0.0,0.0,0.281132,0.0,0.0,0.956608,0.0,0.0,0.0,1.003784,0.0,0.946936,0.858345,0.0,0.0,0.0,0.834624,0.0,0.881518,0.0,0.0,0.832148,0.78665,0.0
3,0.0,0.0,0.0,0.0,0.0,1.388119,0.0,1.282966,0.0,0.0,0.592222,0.0,0.0,1.649319,0.0,0.0,0.0,1.724459,0.0,1.632454,1.477105,0.0,0.0,0.0,1.420537,0.0,1.487596,0.0,0.0,1.432754,1.372558,0.0
4,0.0,0.0,0.0,0.0,0.0,1.551632,0.0,1.410785,0.0,0.0,0.687335,0.0,0.0,1.841874,0.0,0.0,0.0,1.902592,0.0,1.817208,1.645934,0.0,0.0,0.0,1.588776,0.0,1.652073,0.0,0.0,1.603272,1.535132,0.0
5,0.0,0.0,0.0,0.0,0.0,0.96482,0.0,0.883131,0.0,0.0,0.364582,0.0,0.0,1.145905,0.0,0.0,0.0,1.191485,0.0,1.123184,1.018102,0.0,0.0,0.0,0.990711,0.0,1.03938,0.0,0.0,0.98639,0.937631,0.0
6,0.0,0.0,0.0,0.0,0.0,0.800389,0.0,0.752234,0.0,0.0,0.311395,0.0,0.0,0.974476,0.0,0.0,0.0,1.019076,0.0,0.967141,0.873649,0.0,0.0,0.0,0.829445,0.0,0.843752,0.0,0.0,0.841774,0.811647,0.0


In [35]:
# Flag the columns whose every row equals 0, then count how many were flagged
zero_cols = test_in_0_true_df.eq(0).all(axis=0)
num_zero_cols = zero_cols.sum()
print(f"Number of all-zero columns: {num_zero_cols}")

Number of all-zero columns: 21


So both the train and test data seem to have 21 all-zero columns, at exactly the same features. But how is this so different from our previous data? We may need to take a look at the earlier preprocessed data to find out.