In [1]:
# We will extract the predicted values from the trained models. Note that, with the ReLU activation and the current architecture, this model predicts 21 of the 32 features to be 0. We need to keep this in mind; we may need to train better stage 1 models, because the stage 2 inputs need to be precise in order for the BLAR model to give accurate predictions. It might also make sense to look at the actual (target) values to see whether the problem is already present there.

In [2]:
import tensorflow as tf
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

2025-07-23 15:34:07.282064: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-23 15:34:07.311537: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Get predicted test features

In [3]:
# Locate the model
model_1_non_overlapping = tf.keras.models.load_model('../CNN_seq2seq_model/models/CNN_seq2seq_non_overlapping.keras')

In [4]:
model_1_non_overlapping.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 13, None, None, 3)   0         []                            
                             ]                                                                    
                                                                                                  
 time_distributed (TimeDist  (None, 13, 32)               71808     ['input_1[0][0]']             
 ributed)                                                                                         
                                                                                                  
 lstm (LSTM)                 [(None, 64),                 24832     ['time_distributed[0][0]']    
                              (None, 64),                                                   

In [5]:
# Where's the data that goes into the model?
# Folder holding the input features (one .npy file per test block)
input_features_loc = '../CNN_seq2seq_model/data/test_input_sub_images'
# Sorted listing so the files are always visited in the same order
input_contents = sorted(os.listdir(input_features_loc))

In [6]:
input_contents

['test_data_blk_0103.npy',
 'test_data_blk_0104.npy',
 'test_data_blk_0105.npy',
 'test_data_blk_0106.npy',
 'test_data_blk_0201.npy',
 'test_data_blk_0202.npy',
 'test_data_blk_0205.npy',
 'test_data_blk_0206.npy',
 'test_data_blk_0302.npy',
 'test_data_blk_0303.npy',
 'test_data_blk_0304.npy',
 'test_data_blk_0305.npy',
 'test_data_blk_0306.npy']

In [7]:
# We first need to load the npy files, use the trained model to extract features, and store these so that they can be used later, along with the train features and the corresponding densities, to train the BLAR model

In [8]:
%%time
# Keep every file's predictions in memory so the saved copies can be verified afterwards
for_sanity_check = []
# Destination folder for the stage 1 predictions; create it up front because
# np.save does not create missing directories
pred_save_dir = 'data/predicted_sequences_from_stage_1/model_1/'
os.makedirs(pred_save_dir, exist_ok=True)
for input_file in input_contents:
    # load the np file
    load_np_file = np.load(os.path.join(input_features_loc, input_file))
    # print shape of the loaded file
    print(load_np_file.shape)
    # predicted_values
    predicted_values = model_1_non_overlapping.predict(load_np_file)
    print(predicted_values.shape)
    for_sanity_check.append(predicted_values)
    # The last four characters of the file name (without extension) identify the block, e.g. '0103'
    block_id = os.path.splitext(input_file)[0][-4:]
    loc_name = os.path.join(pred_save_dir, 'pred_values_blk_' + block_id + '.npy')
    np.save(loc_name, predicted_values)

(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
(910, 13, 30, 30, 3)
(910, 7, 32)
CPU times: user 53.8 s, sys: 4.39 s, total: 58.2 s
Wall time: 16.7 s


In [9]:
# perform a sanity check
loc_path = 'data/predicted_sequences_from_stage_1/model_1/'

In [10]:
# Sorted listing of the saved prediction files
loc_contents = sorted(os.listdir(loc_path))

In [11]:
loc_contents

['pred_values_blk_0103.npy',
 'pred_values_blk_0104.npy',
 'pred_values_blk_0105.npy',
 'pred_values_blk_0106.npy',
 'pred_values_blk_0201.npy',
 'pred_values_blk_0202.npy',
 'pred_values_blk_0205.npy',
 'pred_values_blk_0206.npy',
 'pred_values_blk_0302.npy',
 'pred_values_blk_0303.npy',
 'pred_values_blk_0304.npy',
 'pred_values_blk_0305.npy',
 'pred_values_blk_0306.npy']

In [12]:
%%time
# Saved files and in-memory predictions are paired by position, so both lists must have the
# same length; stale or extra files in the folder would otherwise shift or break the pairing
assert len(loc_contents) == len(for_sanity_check), (
    f"Found {len(loc_contents)} saved files but {len(for_sanity_check)} in-memory prediction arrays"
)
for saved_file, expected_preds in zip(loc_contents, for_sanity_check):
    load_stored_preds = np.load(os.path.join(loc_path, saved_file))
    # Fraction of elements that are identical after the save/load round trip (1.0 = exact copy)
    print(np.mean(load_stored_preds == expected_preds))

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
CPU times: user 5.73 ms, sys: 5.07 ms, total: 10.8 ms
Wall time: 10.1 ms


In [13]:
# We have correctly stored the predictions

In [14]:
# now what about the actual targets? Should we take a look? Where are the targets?

In [15]:
# test targets: folder with the target .npy files
out_targets_loc = '../CNN_seq2seq_model/data/test_out_targets'
# Sorted listing so the files are always visited in the same order
out_contents = sorted(os.listdir(out_targets_loc))

In [16]:
out_contents

['test_targets_blk_0103.npy',
 'test_targets_blk_0104.npy',
 'test_targets_blk_0105.npy',
 'test_targets_blk_0106.npy',
 'test_targets_blk_0201.npy',
 'test_targets_blk_0202.npy',
 'test_targets_blk_0205.npy',
 'test_targets_blk_0206.npy',
 'test_targets_blk_0302.npy',
 'test_targets_blk_0303.npy',
 'test_targets_blk_0304.npy',
 'test_targets_blk_0305.npy',
 'test_targets_blk_0306.npy']

In [17]:
# just do this to one npy file
true_targets_blk_0103 = np.load(os.path.join(out_targets_loc, out_contents[0]))

In [18]:
true_targets_blk_0103.shape

(910, 7, 32)

In [19]:
test_in_0_true = true_targets_blk_0103[0,:,:]

In [20]:
test_in_0_true.shape

(7, 32)

In [21]:
# Doesn't seem the true values are that different from the target values
test_in_0_true_df = pd.DataFrame(test_in_0_true)

In [22]:
# display all columns
with pd.option_context('display.max_columns', None):
    display(test_in_0_true_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,0.0,0.0,0.0,0.0,0.0,1.363772,0.0,1.245406,0.0,0.0,0.588365,0.0,0.0,1.61744,0.0,0.0,0.0,1.680196,0.0,1.59113,1.439219,0.0,0.0,0.0,1.391284,0.0,1.45495,0.0,0.0,1.398869,1.337006,0.0
1,0.0,0.0,0.0,0.0,0.0,1.441939,0.0,1.33163,0.0,0.0,0.61921,0.0,0.0,1.706845,0.0,0.0,0.0,1.776458,0.0,1.680459,1.531679,0.0,0.0,0.0,1.479376,0.0,1.534767,0.0,0.0,1.499093,1.433885,0.0
2,0.0,0.0,0.0,0.0,0.0,0.808372,0.0,0.73682,0.0,0.0,0.281132,0.0,0.0,0.956608,0.0,0.0,0.0,1.003784,0.0,0.946936,0.858345,0.0,0.0,0.0,0.834624,0.0,0.881518,0.0,0.0,0.832148,0.78665,0.0
3,0.0,0.0,0.0,0.0,0.0,1.388119,0.0,1.282966,0.0,0.0,0.592222,0.0,0.0,1.649319,0.0,0.0,0.0,1.724459,0.0,1.632454,1.477105,0.0,0.0,0.0,1.420537,0.0,1.487596,0.0,0.0,1.432754,1.372558,0.0
4,0.0,0.0,0.0,0.0,0.0,1.551632,0.0,1.410785,0.0,0.0,0.687335,0.0,0.0,1.841874,0.0,0.0,0.0,1.902592,0.0,1.817208,1.645934,0.0,0.0,0.0,1.588776,0.0,1.652073,0.0,0.0,1.603272,1.535132,0.0
5,0.0,0.0,0.0,0.0,0.0,0.96482,0.0,0.883131,0.0,0.0,0.364582,0.0,0.0,1.145905,0.0,0.0,0.0,1.191485,0.0,1.123184,1.018102,0.0,0.0,0.0,0.990711,0.0,1.03938,0.0,0.0,0.98639,0.937631,0.0
6,0.0,0.0,0.0,0.0,0.0,0.800389,0.0,0.752234,0.0,0.0,0.311395,0.0,0.0,0.974476,0.0,0.0,0.0,1.019076,0.0,0.967141,0.873649,0.0,0.0,0.0,0.829445,0.0,0.843752,0.0,0.0,0.841774,0.811647,0.0


In [23]:
# Flag the columns in which every row is exactly 0, then count them
zero_cols = test_in_0_true_df.eq(0).all(axis=0)
num_zero_cols = zero_cols.sum()
print(f"Number of all-zero columns: {num_zero_cols}")

Number of all-zero columns: 21


So both the train and the test data seem to have 21 all-zero columns, at exactly the same features. But why is this so different from our previous data? We may need to take a look at the earlier preprocessed data to find out.

In [25]:
# Took a look at the previous feature extraction and density map creation: the features have been extracted correctly and the density maps have been created correctly. We may need to check whether the feature extraction in the current work uses exactly the same model as before.

In [26]:
# Seems like we are doing exactly the right thing with the feature extraction. The only difference I can see that could explain the very different extracted features in the previous and the current work is the size of the sub-windows ((300, 300, 3) earlier vs (30, 30, 3) now). Maybe this has a significant impact, so let's set it aside for a moment and follow through with the feature extraction. If some of the 32 features are all 0s in both the train and test time periods, maybe we can drop them before fitting the BLAR model?

In [27]:
# Okay, so let's proceed with the rest of the data preprocessing.

In [28]:
# Have we stored all the predictions for the test sequences? - Seems like it

# Please make sure we have the correct inputs to extract the features for the test data and train data and everything before moving forward tomorrow?

In [29]:
# Okay, what should our inputs for the BLAR model be? We need 910 csv files (data frames). Each csv file will correspond to a sub-window, in order of appearance. What will each csv file look like?

# There will be 33 columns: the first 32 will hold the extracted features, and the last column will hold the density of tassels for that particular sub-window. Each row will be a time point - there will be 20 such rows. The first 13 rows of the df will be extracted as in the earlier implementation for comps, since those images do exist in the way we have formulated the problem. The last 7 rows will come from the features we have extracted above using the CNN seq2seq model (stage 1 model) and its variants.

In [30]:
# Now that we know what the data frames (csvs) should look like, what do we need? We need to extract the features for the train time steps of all the test blocks - the first 13 images in the test blocks - using the original feature extracting model.

# We also need the target densities for both train and test time points (ideally we will not have the densities for the test time points during deployment, but at this point we still have these, so we can use them) - Let's look at the generation of the targets later, but should we get the features extracted for the train images/sub-images?

In [31]:
# Should we get this original feature extraction model here the first thing?

In [32]:
# where is this model?
fine_tuned_model = tf.keras.models.load_model("../../Spring_2024/Bayes_for_comps/TS_bayes_implementation_for_TN/models/trained_gmp_model_dense_32_new.h5")

In [33]:
fine_tuned_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, None, 3)]   0         
                                                                 
 conv2d (Conv2D)             (None, None, None, 32)    896       
                                                                 
 activation (Activation)     (None, None, None, 32)    0         
                                                                 
 conv2d_1 (Conv2D)           (None, None, None, 32)    9248      
                                                                 
 activation_1 (Activation)   (None, None, None, 32)    0         
                                                                 
 max_pooling2d (MaxPooling2  (None, None, None, 32)    0         
 D)                                                              
                                                           

In [35]:
# Define the feature extractor model: a sub-model that shares the input of fine_tuned_model
# and outputs the activations of one of its intermediate layers

# feature extractor input
feat_ext_input = fine_tuned_model.input

# feature extractor output: the fourth layer from the end of fine_tuned_model
# NOTE(review): the printed model summary is truncated, so which layer index -4 refers to
# cannot be confirmed from the notebook - verify it is the intended 32-feature layer
feat_ext_output = fine_tuned_model.layers[-4].output

# define the model
feature_extractor_model = tf.keras.models.Model(inputs = feat_ext_input, outputs = feat_ext_output)

In [36]:
feature_extractor_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, None, 3)]   0         
                                                                 
 conv2d (Conv2D)             (None, None, None, 32)    896       
                                                                 
 activation (Activation)     (None, None, None, 32)    0         
                                                                 
 conv2d_1 (Conv2D)           (None, None, None, 32)    9248      
                                                                 
 activation_1 (Activation)   (None, None, None, 32)    0         
                                                                 
 max_pooling2d (MaxPooling2  (None, None, None, 32)    0         
 D)                                                              
                                                             

In [38]:
# Okay - now what do we need to do?

# I think we do have the inputs stored and arranged in a previous exercise, may be we can use these?

# Where is this location?

sub_windows_of_images_loc = '../CNN_seq2seq_model/data/test_input_sub_images/'

In [39]:
contents_here = os.listdir(sub_windows_of_images_loc)

In [41]:
contents_here.sort()

In [42]:
# Let's just try this for a single block, and maybe write a function so that it could be done for the rest?

In [43]:
load_blk_0103_data = np.load(os.path.join(sub_windows_of_images_loc, contents_here[0]))

In [44]:
load_blk_0103_data.shape

(910, 13, 30, 30, 3)

In [46]:
time_1 = load_blk_0103_data[:,0,:,:,:]

In [47]:
time_1.shape

(910, 30, 30, 3)

In [48]:
# I think we can get preds for this?
extracted_features_t1 = feature_extractor_model.predict(time_1)



In [50]:
extracted_features_t1.shape

(910, 32)

In [51]:
# just convert this to a df to verify something
extracted_features_t1_df = pd.DataFrame((extracted_features_t1))

In [57]:
# extracted_features_t1_df

In [58]:
# see how many all-zero columns we have?

In [55]:
# display all columns
with pd.option_context('display.max_columns', None):
    display(extracted_features_t1_df.head(100))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,0.0,0.000000,0.000000,0.0,0.00000,1.369269,0.0,1.248844,0.000000,0.000000,0.592686,0.0,0.000000,1.624723,0.000000,0.000000,0.000000,1.682279,0.0,1.598236,1.450064,0.000000,0.000000,0.000000,1.399071,0.000000,1.462036,0.000000,0.000000,1.405704,1.345099,0.0
1,0.0,0.000000,0.000000,0.0,0.00000,1.132936,0.0,1.032808,0.000000,0.000000,0.480186,0.0,0.000000,1.357083,0.000000,0.000000,0.000000,1.394248,0.0,1.330586,1.200220,0.000000,0.000000,0.000000,1.155771,0.000000,1.217091,0.000000,0.000000,1.172076,1.108077,0.0
2,0.0,0.000000,0.000000,0.0,0.00000,1.213863,0.0,1.114427,0.000000,0.000000,0.500377,0.0,0.000000,1.433805,0.000000,0.000000,0.000000,1.499915,0.0,1.423099,1.288628,0.000000,0.000000,0.000000,1.238002,0.000000,1.303378,0.000000,0.000000,1.253887,1.189286,0.0
3,0.0,0.000000,0.000000,0.0,0.00000,1.465182,0.0,1.338707,0.000000,0.000000,0.648936,0.0,0.000000,1.749635,0.000000,0.000000,0.000000,1.782776,0.0,1.714970,1.547786,0.000000,0.000000,0.000000,1.496574,0.000000,1.564507,0.000000,0.000000,1.526324,1.450341,0.0
4,0.0,0.000000,0.000000,0.0,0.00000,0.798903,0.0,0.730617,0.000000,0.000000,0.279807,0.0,0.000000,0.947793,0.000000,0.000000,0.000000,0.995947,0.0,0.938399,0.852347,0.000000,0.000000,0.000000,0.818034,0.000000,0.851462,0.000000,0.000000,0.832226,0.800566,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.000000,0.000000,0.0,0.00000,0.736419,0.0,0.670524,0.000000,0.000000,0.243270,0.0,0.000000,0.880048,0.000000,0.000000,0.000000,0.907017,0.0,0.857415,0.779001,0.000000,0.000000,0.000000,0.751881,0.000000,0.797463,0.000000,0.000000,0.752230,0.715544,0.0
96,0.0,0.000000,0.000000,0.0,0.00000,0.792569,0.0,0.723093,0.000000,0.000000,0.290050,0.0,0.000000,0.951232,0.000000,0.000000,0.000000,0.974730,0.0,0.927692,0.832897,0.000000,0.000000,0.000000,0.797694,0.000000,0.858229,0.000000,0.000000,0.819866,0.771400,0.0
97,0.0,0.000000,0.000000,0.0,0.00000,0.680296,0.0,0.620987,0.000000,0.000000,0.216857,0.0,0.000000,0.815890,0.000000,0.000000,0.000000,0.856667,0.0,0.795978,0.726802,0.000000,0.000000,0.000000,0.690363,0.000000,0.712925,0.000000,0.000000,0.686644,0.670018,0.0
98,0.0,0.119545,0.145148,0.0,0.14259,0.000000,0.0,0.000000,0.099046,0.155396,0.000000,0.0,0.180927,0.000000,0.193183,0.060701,0.071701,0.000000,0.0,0.000000,0.000000,0.039592,0.155912,0.171989,0.000000,0.139521,0.000000,0.105747,0.144852,0.000000,0.000000,0.0


In [56]:
# Notice that here we do not have any all-zero columns. That makes sense: this frame holds
# all 910 sub-windows at a single time point, not one sub-window followed over time
zero_cols = extracted_features_t1_df.eq(0).all(axis=0)
num_zero_cols = zero_cols.sum()
print(f"Number of all-zero columns: {num_zero_cols}")

Number of all-zero columns: 0


In [63]:
# Run the feature extractor once per time step (axis 1 of the block array) and collect the results
catch_all_preds_block_0103 = []
n_time_steps = load_blk_0103_data.shape[1]
for i in range(n_time_steps):
    # all sub-windows of the block at time step i
    time_wise_data = load_blk_0103_data[:, i]
    extracted_features = feature_extractor_model.predict(time_wise_data)
    catch_all_preds_block_0103.append(extracted_features)



In [64]:
catch_all_preds_block_0103[0].shape

(910, 32)

In [70]:
# stack all these together? - maybe to be of shape 910, 13, 32
stacked_features_0103 = np.stack(catch_all_preds_block_0103, axis = 1)

In [71]:
stacked_features_0103.shape

(910, 13, 32)

In [72]:
sub_0 = stacked_features_0103[0,:,:]

In [74]:
sub_0_df = pd.DataFrame(sub_0)

In [76]:
with pd.option_context('display.max_columns', None):
    display(sub_0_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,0.0,0.0,0.0,0.0,0.0,1.369269,0.0,1.248844,0.0,0.0,0.592686,0.0,0.0,1.624723,0.0,0.0,0.0,1.682279,0.0,1.598236,1.450064,0.0,0.0,0.0,1.399071,0.0,1.462036,0.0,0.0,1.405704,1.345099,0.0
1,0.0,0.0,0.0,0.0,0.0,0.545594,0.0,0.490542,0.0,0.0,0.161817,0.0,0.0,0.632805,0.0,0.0,0.0,0.686555,0.0,0.644817,0.567092,0.0,0.0,0.0,0.556151,0.0,0.599482,0.0,0.0,0.572482,0.528357,0.0
2,0.0,0.0,0.0,0.0,0.0,1.469633,0.0,1.342258,0.0,0.0,0.647264,0.0,0.0,1.747006,0.0,0.0,0.0,1.801751,0.0,1.715402,1.552778,0.0,0.0,0.0,1.500075,0.0,1.571111,0.0,0.0,1.515108,1.442261,0.0
3,0.0,0.0,0.0,0.0,0.0,1.237221,0.0,1.126303,0.0,0.0,0.516621,0.0,0.0,1.469283,0.0,0.0,0.0,1.51876,0.0,1.44617,1.307163,0.0,0.0,0.0,1.264671,0.0,1.32869,0.0,0.0,1.272606,1.214003,0.0
4,0.0,0.0,0.0,0.0,0.0,1.304077,0.0,1.193108,0.0,0.0,0.558597,0.0,0.0,1.557889,0.0,0.0,0.0,1.604493,0.0,1.51994,1.384442,0.0,0.0,0.0,1.335352,0.0,1.402343,0.0,0.0,1.355031,1.28865,0.0
5,0.0,0.0,0.0,0.0,0.0,1.398598,0.0,1.274066,0.0,0.0,0.606516,0.0,0.0,1.660823,0.0,0.0,0.0,1.720388,0.0,1.633357,1.477843,0.0,0.0,0.0,1.424209,0.0,1.491625,0.0,0.0,1.43392,1.369575,0.0
6,0.0,0.0,0.0,0.0,0.0,0.303054,0.0,0.278769,0.0,0.0,0.014354,0.0,0.0,0.355323,0.0,0.0,0.0,0.391059,0.0,0.359449,0.325105,0.0,0.0,0.0,0.301922,0.0,0.336012,0.0,0.0,0.313626,0.291782,0.0
7,0.0,0.0,0.0,0.0,0.0,0.997213,0.0,0.908011,0.0,0.0,0.380849,0.0,0.0,1.178452,0.0,0.0,0.0,1.229886,0.0,1.166759,1.056643,0.0,0.0,0.0,1.02036,0.0,1.077106,0.0,0.0,1.02879,0.974916,0.0
8,0.0,0.0,0.0,0.0,0.0,0.467198,0.0,0.432392,0.0,0.0,0.09958,0.0,0.0,0.556453,0.0,0.0,0.0,0.584768,0.0,0.545438,0.500881,0.0,0.0,0.0,0.48664,0.0,0.516942,0.0,0.0,0.489528,0.457301,0.0
9,0.0,0.0,0.0,0.0,0.0,1.17312,0.0,1.066786,0.0,0.0,0.480801,0.0,0.0,1.39141,0.0,0.0,0.0,1.442701,0.0,1.369239,1.23594,0.0,0.0,0.0,1.195756,0.0,1.257762,0.0,0.0,1.203079,1.146309,0.0


In [77]:
# Count the feature columns of sub-window 0 in which every row is exactly 0
zero_cols = sub_0_df.eq(0).all(axis=0)
num_zero_cols = zero_cols.sum()
print(f"Number of all-zero columns: {num_zero_cols}")

Number of all-zero columns: 21


In [78]:
# There are all 0 columns - but is this true for all sub windows?

In [79]:
sub_98 = stacked_features_0103[98,:,:]

In [80]:
sub_98_df = pd.DataFrame(sub_98)

In [81]:
with pd.option_context('display.max_columns', None):
    display(sub_98_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,0.0,0.119545,0.145148,0.0,0.14259,0.0,0.0,0.0,0.099046,0.155396,0.0,0.0,0.180927,0.0,0.193183,0.060701,0.071701,0.0,0.0,0.0,0.0,0.039592,0.155912,0.171989,0.0,0.139521,0.0,0.105747,0.144852,0.0,0.0,0.0
1,0.0,0.036691,0.025579,0.0,0.038598,0.009914,0.0,0.0,0.009917,0.061608,0.0,0.0,0.057069,0.0,0.107086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046288,0.0,0.017956,0.0,0.0,0.0,0.027006,0.0,0.0,0.0
2,0.0,0.075862,0.112421,0.0,0.121383,0.0,0.0,0.0,0.058554,0.124238,0.0,0.000163,0.147197,0.0,0.167313,0.081811,0.039921,0.0,0.0,0.0,0.0,0.054744,0.116728,0.130066,0.0,0.104291,0.0,0.075486,0.125931,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.115046,0.0,0.105339,0.0,0.0,0.0,0.0,0.0,0.155586,0.0,0.0,0.0,0.161335,0.0,0.149993,0.136979,0.0,0.0,0.0,0.11324,0.0,0.132858,0.0,0.0,0.105571,0.118664,0.0
4,0.0,0.038395,0.029195,0.0,0.060244,0.029192,0.0,0.008932,0.0,0.073409,0.0,0.0,0.075384,0.007525,0.078443,0.0,0.0,0.03264,0.0,0.036982,0.003811,0.0,0.0567,0.031333,0.03744,0.030496,0.032622,0.0,0.050327,0.021465,0.012152,0.0
5,0.0,0.0,0.0,0.0,0.0,0.054232,0.0,0.046725,0.0,0.0,0.0,0.0,0.0,0.059294,0.016813,0.0,0.0,0.077573,0.0,0.06698,0.062831,0.0,0.0,0.0,0.05553,0.0,0.066062,0.0,0.0,0.043269,0.042933,0.0
6,0.0,0.0,0.0,0.0,0.0,0.129388,0.0,0.123665,0.0,0.0,0.0,0.0,0.0,0.17478,0.0,0.0,0.0,0.167175,0.0,0.154384,0.145695,0.0,0.0,0.0,0.136214,0.0,0.151416,0.0,0.0,0.130013,0.132924,0.0
7,0.045548,0.11926,0.147498,0.029857,0.143269,0.0,0.0,0.0,0.11442,0.134741,0.0,0.034608,0.178917,0.0,0.195689,0.124939,0.104401,0.0,0.011549,0.0,0.0,0.093392,0.138638,0.149571,0.0,0.112263,0.0,0.113159,0.143052,0.0,0.0,0.020877
8,0.0,0.0,0.0,0.0,0.0,0.321841,0.0,0.29604,0.0,0.0,0.070336,0.0,0.0,0.418375,0.0,0.0,0.0,0.402779,0.0,0.39848,0.355007,0.0,0.0,0.0,0.335629,0.0,0.352987,0.0,0.0,0.341777,0.323506,0.0
9,0.0,0.0,0.0,0.0,0.0,0.272825,0.0,0.247361,0.0,0.0,0.0,0.0,0.0,0.324527,0.0,0.0,0.0,0.352623,0.0,0.324389,0.285961,0.0,0.0,0.0,0.271037,0.0,0.292364,0.0,0.0,0.259464,0.253495,0.0


In [82]:
# Are there any all-zero columns for sub-window 98?
zero_cols = sub_98_df.eq(0).all(axis=0)
num_zero_cols = zero_cols.sum()
print(f"Number of all-zero columns: {num_zero_cols}")

Number of all-zero columns: 1


In [83]:
# There is just 1. So seems like some sub-windows will have all 0 columns - and some will not

In [84]:
# should we save these stacked data for future use? I guess yes. And then we will write a function to do this for the rest of the blocks, and call it a day for the work on dissertation

In [85]:
stacked_features_0103.shape

(910, 13, 32)

In [86]:
train_save_path = 'data/train_features_non_overlapping/'

In [87]:
np.save(os.path.join(train_save_path, 'train_features_block_0103.npy'), stacked_features_0103)

In [89]:
san_check_blk_0103 = np.load('data/train_features_non_overlapping/train_features_block_0103.npy')

In [90]:
np.mean(stacked_features_0103 == san_check_blk_0103)

1.0

In [91]:
# Okay, so let's define a function for this
sub_windows_of_images_loc

'../CNN_seq2seq_model/data/test_input_sub_images/'

In [93]:
train_save_path

'data/train_features_non_overlapping/'

In [97]:
def store_train_extracted_features(path_to_inputs, input_feature_file, save_path, feature_extractor=None):
    """Extract features for every time step of one input file, save them, and return them.

    The array stored in ``input_feature_file`` is sliced along axis 1 (one slice per
    time step). Each slice is passed to ``feature_extractor.predict`` and the results
    are stacked back along axis 1. The stacked array is written to ``save_path`` as
    ``train_features_block_<id>.npy``, where ``<id>`` is the last four characters of
    the input file name without its extension. The saved file is then reloaded and the
    fraction of elements equal to the in-memory array is printed (1.0 = exact copy).

    Parameters
    ----------
    path_to_inputs : str
        Directory that contains ``input_feature_file``.
    input_feature_file : str
        Name of the ``.npy`` file to load. In this notebook the files hold arrays of
        shape (910, 13, 30, 30, 3).
    save_path : str
        Directory the stacked features are written to; it is created if missing.
    feature_extractor : object with a ``predict`` method, optional
        Model applied to each time slice. Defaults to the notebook-level
        ``feature_extractor_model`` so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        The per-time-step predictions stacked along axis 1.
    """
    # Fall back to the notebook-level model only when no extractor is passed explicitly
    if feature_extractor is None:
        feature_extractor = feature_extractor_model

    # load the file
    loaded_input_file = np.load(os.path.join(path_to_inputs, input_feature_file))

    # One predict call per time step (axis 1 of the loaded array)
    catch_all_preds = [
        feature_extractor.predict(loaded_input_file[:, step])
        for step in range(loaded_input_file.shape[1])
    ]

    # Put the time axis back at position 1
    stacked_features = np.stack(catch_all_preds, axis = 1)

    # np.save does not create missing directories, so make sure the target exists
    os.makedirs(save_path, exist_ok=True)
    block_id = os.path.splitext(input_feature_file)[0][-4:]
    save_file = os.path.join(save_path, 'train_features_block_' + block_id + '.npy')
    np.save(save_file, stacked_features)

    # Sanity check: reload what was just written and compare it with the in-memory array
    print(np.mean(np.load(save_file) == stacked_features))
    return stacked_features
    

In [98]:
# see if this works for block 0103?
stack_0103 = store_train_extracted_features(sub_windows_of_images_loc, contents_here[0], train_save_path)

1.0


In [99]:
# Okay, so seems to be working

In [100]:
# Do this for the rest of the blocks as well?

In [101]:
%%time
# Easier to do it in a for loop - but verify this tomorrow
# Extract, save and collect the stacked features of every block file, in sorted order
all_stacks = []
for block_file in contents_here:
    stack = store_train_extracted_features(sub_windows_of_images_loc, block_file, train_save_path)
    all_stacks.append(stack)
    

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
CPU times: user 1min 58s, sys: 8 s, total: 2min 6s
Wall time: 33.8 s
