In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

2024-07-15 17:12:53.973821: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-15 17:12:54.309714: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-15 17:12:54.309766: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-15 17:12:54.386750: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-15 17:12:54.491002: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Load the trained Keras model from its HDF5 file; the feature extractor built later in this notebook taps into it.
gmp_model = tf.keras.models.load_model('models/trained_gmp_model_dense_32_new.h5')

2024-07-15 17:13:08.661484: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31141 MB memory:  -> device: 0, name: Tesla V100S-PCIE-32GB, pci bus id: 0000:06:00.0, compute capability: 7.0


In [3]:
gmp_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, None, 3)]   0         
                                                                 
 conv2d (Conv2D)             (None, None, None, 32)    896       
                                                                 
 activation (Activation)     (None, None, None, 32)    0         
                                                                 
 conv2d_1 (Conv2D)           (None, None, None, 32)    9248      
                                                                 
 activation_1 (Activation)   (None, None, None, 32)    0         
                                                                 
 max_pooling2d (MaxPooling2  (None, None, None, 32)    0         
 D)                                                              
                                                           

In [4]:
# Check which GPUs TensorFlow can see
tf.config.list_physical_devices("GPU")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
# Feature-extractor input: reuse the trained model's own input tensor, so the extractor is fed exactly like the trained model.
feat_ext_input = gmp_model.input

In [6]:
# Feature-extractor output: tap the trained model at layers[-4].
# The original note identifies this layer as the ReLU activation that precedes dropout.
# At inference time dropout is a no-op, so tapping the activation or the dropout layer
# yields the same features.
# NOTE(review): the index is positional - confirm against gmp_model.summary() that
# layers[-4] is still that activation layer if the architecture ever changes.
feat_ext_output = gmp_model.layers[-4].output

In [7]:
# Sub-model that maps the trained model's input to the tapped intermediate output (feat_ext_output).
feature_extractor_model = tf.keras.models.Model(inputs = feat_ext_input, outputs = feat_ext_output)

In [8]:
feature_extractor_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, None, 3)]   0         
                                                                 
 conv2d (Conv2D)             (None, None, None, 32)    896       
                                                                 
 activation (Activation)     (None, None, None, 32)    0         
                                                                 
 conv2d_1 (Conv2D)           (None, None, None, 32)    9248      
                                                                 
 activation_1 (Activation)   (None, None, None, 32)    0         
                                                                 
 max_pooling2d (MaxPooling2  (None, None, None, 32)    0         
 D)                                                              
                                                             

In [9]:
# Caution: if the features were taken after an active dropout layer, some of them would be set to zero. Workaround: extract the features from the Dense layer that sits before the prediction head (and before the dropout layer).

In [10]:
def extract_features_and_get_counts(model, preprocessed_path, all_in_folder, digits_image, digits_dense, character, store_path):
    """Extract features for every image of one sub-window and pair them with counts.

    For the sub-window identified by ``character`` the function selects the
    matching sub-image files and density-map files from ``all_in_folder``,
    runs ``model.predict`` on each image (as a batch of one), sums each density
    map into a single count, and writes the combined table to
    ``store_path + 'extracted_features_sub_window_' + character + '.csv'``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(batch, verbose=0)``; it is called once per
        image and its outputs are stacked row-wise with ``np.vstack``.
    preprocessed_path : str
        Directory containing the ``.npy`` files named in ``all_in_folder``.
    all_in_folder : list of str
        File names (not full paths) to select from.
    digits_image : int
        Number of trailing characters of a file stem compared against
        ``'_' + character`` (the notebook passes 2 for one-digit ids and 3 for
        two-digit ids).
    digits_dense : int
        Number of trailing characters of a file stem compared against
        ``'_' + character + '_density'`` (10 and 11 in the notebook).
    character : str
        Sub-window id as a string.
    store_path : str
        Prefix prepended verbatim to the CSV file name, so a directory must be
        given with its trailing path separator.

    Returns
    -------
    pandas.DataFrame
        One row per image with columns ``feature_0 .. feature_{k-1}`` followed
        by ``tassel_count``. The same frame is written to CSV without index.

    Raises
    ------
    ValueError
        If no image matches the sub-window, or if the sorted image names and
        density-map names do not line up one-to-one. Previously a mismatch was
        only printed, and features were then paired with the wrong counts.
    """
    image_suffix = '_' + character
    density_suffix = image_suffix + '_density'

    necessary_sub_images = sorted(
        item for item in all_in_folder
        if item.split(".")[0][-digits_image:] == image_suffix
    )
    necessary_sub_dense_maps = sorted(
        item for item in all_in_folder
        if item.split(".")[0][-digits_dense:] == density_suffix
    )

    if not necessary_sub_images:
        raise ValueError(
            "no sub-image in all_in_folder matches sub-window '" + character
            + "' with digits_image=" + str(digits_image)
        )
    print(necessary_sub_images[0])

    # check if the image and density maps tally: after stripping '_density'
    # the two sorted name lists must be identical (same length, same order)
    image_stems = [item.split('.')[0] for item in necessary_sub_images]
    density_stems = [item.split(".")[0][:-len('_density')] for item in necessary_sub_dense_maps]
    names_match = image_stems == density_stems
    print(np.mean(names_match))
    if not names_match:
        raise ValueError(
            "sub-images and density maps for sub-window '" + character
            + "' do not pair up one-to-one; refusing to build a mislabelled table"
        )

    collect_extracted_features = []
    collect_corresponding_counts = []

    # collect the subwindow-wise features and corresponding counts
    for image_name, density_name in zip(necessary_sub_images, necessary_sub_dense_maps):
        load_image = np.load(os.path.join(preprocessed_path, image_name))
        # add a leading batch axis so the model sees a batch of one image
        extracted_features = model.predict(load_image[None, ...], verbose = 0)
        collect_extracted_features.append(extracted_features)

        # the count for this image is the sum over its density map
        load_count_map = np.load(os.path.join(preprocessed_path, density_name))
        collect_corresponding_counts.append(np.sum(load_count_map))

    # stack once and reuse for both the data and the column names
    stacked_features = np.vstack(collect_extracted_features)
    Features_data = pd.DataFrame(
        stacked_features,
        columns = ['feature_' + str(i) for i in range(stacked_features.shape[1])],
    )
    counts_subwindow = pd.DataFrame(collect_corresponding_counts, columns = ['tassel_count'])
    combined_df = pd.concat((Features_data, counts_subwindow), axis = 1)

    save_path_name = store_path + 'extracted_features_sub_window_' + character + '.csv'
    combined_df.to_csv(save_path_name, index = False)

    return combined_df

In [None]:
# One more check before closing this notebook: the sums of the density maps must match the counts stored earlier under all_preprocessed_data/Block_0103/sub_count_dfs/. TODO: do this in a separate notebook?

Block 0105

In [None]:
prev_preprocessed_image_loc_0105 = "all_preprocessed_data/Block_0105/sub_images_and_counts"

In [None]:
# Sorted file names found in the Block 0105 preprocessed folder
folder_contents_0105 = sorted(os.listdir(prev_preprocessed_image_loc_0105))

In [None]:
# folder_contents_0105

In [None]:
folder_contents_0105[0]

In [None]:
s_path_0105 = 'all_preprocessed_data/Block_0105//TS_ready_data_frames/'

In [None]:
# Sub-window ids '0'..'9' as strings. Single-digit and double-digit ids are handled
# in separate passes because their file-name suffixes differ in length.
character_list = [str(sub_window_id) for sub_window_id in range(10)]

In [None]:
%%time

# Single-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 2 and 10 characters long, which is what digits_image / digits_dense encode.
catch_all_dfs_0105 = []
for character in character_list:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0105,
        all_in_folder=folder_contents_0105,
        digits_image=2,
        digits_dense=10,
        character=character,
        store_path=s_path_0105,
    )
    catch_all_dfs_0105.append(features_df)

In [None]:
# Double-digit sub-window ids ('10' and '11') as strings
character_list_double = [str(sub_window_id) for sub_window_id in range(10, 12)]

In [None]:
%%time

# Double-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 3 and 11 characters long here.
# NOTE(review): the list is re-initialised, so the single-digit frames collected
# in the previous cell are no longer referenced afterwards - confirm this is intended.
catch_all_dfs_0105 = []
for character in character_list_double:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0105,
        all_in_folder=folder_contents_0105,
        digits_image=3,
        digits_dense=11,
        character=character,
        store_path=s_path_0105,
    )
    catch_all_dfs_0105.append(features_df)

In [None]:
# catch_all_dfs_0105

Block 0106

In [None]:
prev_preprocessed_image_loc_0106 = "all_preprocessed_data/Block_0106/sub_images_and_counts"

In [None]:
# Sorted file names found in the Block 0106 preprocessed folder
folder_contents_0106 = sorted(os.listdir(prev_preprocessed_image_loc_0106))

In [None]:
# folder_contents_0106

In [None]:
folder_contents_0106[0]

In [None]:
s_path_0106 = 'all_preprocessed_data/Block_0106//TS_ready_data_frames/'

In [None]:
# Sub-window ids '0'..'9' as strings. Single-digit and double-digit ids are handled
# in separate passes because their file-name suffixes differ in length.
character_list = [str(sub_window_id) for sub_window_id in range(10)]

In [None]:
%%time

# Single-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 2 and 10 characters long, which is what digits_image / digits_dense encode.
catch_all_dfs_0106 = []
for character in character_list:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0106,
        all_in_folder=folder_contents_0106,
        digits_image=2,
        digits_dense=10,
        character=character,
        store_path=s_path_0106,
    )
    catch_all_dfs_0106.append(features_df)

In [None]:
# Double-digit sub-window ids ('10' and '11') as strings
character_list_double = [str(sub_window_id) for sub_window_id in range(10, 12)]

In [None]:
%%time

# Double-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 3 and 11 characters long here.
# NOTE(review): the list is re-initialised, so the single-digit frames collected
# in the previous cell are no longer referenced afterwards - confirm this is intended.
catch_all_dfs_0106 = []
for character in character_list_double:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0106,
        all_in_folder=folder_contents_0106,
        digits_image=3,
        digits_dense=11,
        character=character,
        store_path=s_path_0106,
    )
    catch_all_dfs_0106.append(features_df)

In [None]:
# catch_all_dfs_0106

Block 0201

In [None]:
prev_preprocessed_image_loc_0201 = "all_preprocessed_data/Block_0201/sub_images_and_counts"

In [None]:
# Sorted file names found in the Block 0201 preprocessed folder
folder_contents_0201 = sorted(os.listdir(prev_preprocessed_image_loc_0201))

In [None]:
# folder_contents_0201

In [None]:
folder_contents_0201[0]

In [None]:
s_path_0201 = 'all_preprocessed_data/Block_0201//TS_ready_data_frames/'

In [None]:
# Sub-window ids '0'..'9' as strings. Single-digit and double-digit ids are handled
# in separate passes because their file-name suffixes differ in length.
character_list = [str(sub_window_id) for sub_window_id in range(10)]

In [None]:
%%time

# Single-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 2 and 10 characters long, which is what digits_image / digits_dense encode.
catch_all_dfs_0201 = []
for character in character_list:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0201,
        all_in_folder=folder_contents_0201,
        digits_image=2,
        digits_dense=10,
        character=character,
        store_path=s_path_0201,
    )
    catch_all_dfs_0201.append(features_df)

In [None]:
# Double-digit sub-window ids ('10' and '11') as strings
character_list_double = [str(sub_window_id) for sub_window_id in range(10, 12)]

In [None]:
%%time

# Double-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 3 and 11 characters long here.
# NOTE(review): the list is re-initialised, so the single-digit frames collected
# in the previous cell are no longer referenced afterwards - confirm this is intended.
catch_all_dfs_0201 = []
for character in character_list_double:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0201,
        all_in_folder=folder_contents_0201,
        digits_image=3,
        digits_dense=11,
        character=character,
        store_path=s_path_0201,
    )
    catch_all_dfs_0201.append(features_df)

In [None]:
# catch_all_dfs_0201

Block 0202

In [None]:
prev_preprocessed_image_loc_0202 = "all_preprocessed_data/Block_0202/sub_images_and_counts"

In [None]:
# Sorted file names found in the Block 0202 preprocessed folder
folder_contents_0202 = sorted(os.listdir(prev_preprocessed_image_loc_0202))

In [None]:
# folder_contents_0202

In [None]:
folder_contents_0202[0]

In [None]:
s_path_0202 = 'all_preprocessed_data/Block_0202//TS_ready_data_frames/'

In [None]:
# Sub-window ids '0'..'9' as strings. Single-digit and double-digit ids are handled
# in separate passes because their file-name suffixes differ in length.
character_list = [str(sub_window_id) for sub_window_id in range(10)]

In [None]:
%%time

# Single-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 2 and 10 characters long, which is what digits_image / digits_dense encode.
catch_all_dfs_0202 = []
for character in character_list:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0202,
        all_in_folder=folder_contents_0202,
        digits_image=2,
        digits_dense=10,
        character=character,
        store_path=s_path_0202,
    )
    catch_all_dfs_0202.append(features_df)

In [None]:
# Double-digit sub-window ids ('10' and '11') as strings
character_list_double = [str(sub_window_id) for sub_window_id in range(10, 12)]

In [None]:
%%time

# Double-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 3 and 11 characters long here.
# NOTE(review): the list is re-initialised, so the single-digit frames collected
# in the previous cell are no longer referenced afterwards - confirm this is intended.
catch_all_dfs_0202 = []
for character in character_list_double:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0202,
        all_in_folder=folder_contents_0202,
        digits_image=3,
        digits_dense=11,
        character=character,
        store_path=s_path_0202,
    )
    catch_all_dfs_0202.append(features_df)

In [None]:
# catch_all_dfs_0202

Block 0205

In [None]:
prev_preprocessed_image_loc_0205 = "all_preprocessed_data/Block_0205/sub_images_and_counts"

In [None]:
# Sorted file names found in the Block 0205 preprocessed folder
folder_contents_0205 = sorted(os.listdir(prev_preprocessed_image_loc_0205))

In [None]:
# folder_contents_0205

In [None]:
folder_contents_0205[0]

In [None]:
s_path_0205 = 'all_preprocessed_data/Block_0205//TS_ready_data_frames/'

In [None]:
# Sub-window ids '0'..'9' as strings. Single-digit and double-digit ids are handled
# in separate passes because their file-name suffixes differ in length.
character_list = [str(sub_window_id) for sub_window_id in range(10)]

In [None]:
%%time

# Single-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 2 and 10 characters long, which is what digits_image / digits_dense encode.
catch_all_dfs_0205 = []
for character in character_list:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0205,
        all_in_folder=folder_contents_0205,
        digits_image=2,
        digits_dense=10,
        character=character,
        store_path=s_path_0205,
    )
    catch_all_dfs_0205.append(features_df)

In [None]:
# Double-digit sub-window ids ('10' and '11') as strings
character_list_double = [str(sub_window_id) for sub_window_id in range(10, 12)]

In [None]:
%%time

# Double-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 3 and 11 characters long here.
# NOTE(review): the list is re-initialised, so the single-digit frames collected
# in the previous cell are no longer referenced afterwards - confirm this is intended.
catch_all_dfs_0205 = []
for character in character_list_double:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0205,
        all_in_folder=folder_contents_0205,
        digits_image=3,
        digits_dense=11,
        character=character,
        store_path=s_path_0205,
    )
    catch_all_dfs_0205.append(features_df)

Block 0206

In [None]:
prev_preprocessed_image_loc_0206 = "all_preprocessed_data/Block_0206/sub_images_and_counts"

In [None]:
# Sorted file names found in the Block 0206 preprocessed folder
folder_contents_0206 = sorted(os.listdir(prev_preprocessed_image_loc_0206))

In [None]:
# folder_contents_0206

In [None]:
folder_contents_0206[0]

In [None]:
s_path_0206 = 'all_preprocessed_data/Block_0206//TS_ready_data_frames/'

In [None]:
# Sub-window ids '0'..'9' as strings. Single-digit and double-digit ids are handled
# in separate passes because their file-name suffixes differ in length.
character_list = [str(sub_window_id) for sub_window_id in range(10)]

In [None]:
%%time

# Single-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 2 and 10 characters long, which is what digits_image / digits_dense encode.
catch_all_dfs_0206 = []
for character in character_list:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0206,
        all_in_folder=folder_contents_0206,
        digits_image=2,
        digits_dense=10,
        character=character,
        store_path=s_path_0206,
    )
    catch_all_dfs_0206.append(features_df)

In [None]:
# Double-digit sub-window ids ('10' and '11') as strings
character_list_double = [str(sub_window_id) for sub_window_id in range(10, 12)]

In [None]:
%%time

# Double-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 3 and 11 characters long here.
# NOTE(review): the list is re-initialised, so the single-digit frames collected
# in the previous cell are no longer referenced afterwards - confirm this is intended.
catch_all_dfs_0206 = []
for character in character_list_double:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0206,
        all_in_folder=folder_contents_0206,
        digits_image=3,
        digits_dense=11,
        character=character,
        store_path=s_path_0206,
    )
    catch_all_dfs_0206.append(features_df)

Block 0302

In [None]:
prev_preprocessed_image_loc_0302 = "all_preprocessed_data/Block_0302/sub_images_and_counts"

In [None]:
# Sorted file names found in the Block 0302 preprocessed folder
folder_contents_0302 = sorted(os.listdir(prev_preprocessed_image_loc_0302))

In [None]:
# folder_contents_0302

In [None]:
folder_contents_0302[0]

In [None]:
s_path_0302 = 'all_preprocessed_data/Block_0302//TS_ready_data_frames/'

In [None]:
# Sub-window ids '0'..'9' as strings. Single-digit and double-digit ids are handled
# in separate passes because their file-name suffixes differ in length.
character_list = [str(sub_window_id) for sub_window_id in range(10)]

In [None]:
%%time

# Single-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 2 and 10 characters long, which is what digits_image / digits_dense encode.
catch_all_dfs_0302 = []
for character in character_list:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0302,
        all_in_folder=folder_contents_0302,
        digits_image=2,
        digits_dense=10,
        character=character,
        store_path=s_path_0302,
    )
    catch_all_dfs_0302.append(features_df)

In [None]:
# Double-digit sub-window ids ('10' and '11') as strings
character_list_double = [str(sub_window_id) for sub_window_id in range(10, 12)]

In [None]:
%%time

# Double-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 3 and 11 characters long here.
# NOTE(review): the list is re-initialised, so the single-digit frames collected
# in the previous cell are no longer referenced afterwards - confirm this is intended.
catch_all_dfs_0302 = []
for character in character_list_double:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0302,
        all_in_folder=folder_contents_0302,
        digits_image=3,
        digits_dense=11,
        character=character,
        store_path=s_path_0302,
    )
    catch_all_dfs_0302.append(features_df)

Block 0303

In [None]:
prev_preprocessed_image_loc_0303 = "all_preprocessed_data/Block_0303/sub_images_and_counts"

In [None]:
# Sorted file names found in the Block 0303 preprocessed folder
folder_contents_0303 = sorted(os.listdir(prev_preprocessed_image_loc_0303))

In [None]:
# folder_contents_0303

In [None]:
folder_contents_0303[0]

In [None]:
s_path_0303 = 'all_preprocessed_data/Block_0303//TS_ready_data_frames/'

In [None]:
# Sub-window ids '0'..'9' as strings. Single-digit and double-digit ids are handled
# in separate passes because their file-name suffixes differ in length.
character_list = [str(sub_window_id) for sub_window_id in range(10)]

In [None]:
%%time

# Single-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 2 and 10 characters long, which is what digits_image / digits_dense encode.
catch_all_dfs_0303 = []
for character in character_list:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0303,
        all_in_folder=folder_contents_0303,
        digits_image=2,
        digits_dense=10,
        character=character,
        store_path=s_path_0303,
    )
    catch_all_dfs_0303.append(features_df)

In [None]:
# Double-digit sub-window ids ('10' and '11') as strings
character_list_double = [str(sub_window_id) for sub_window_id in range(10, 12)]

In [None]:
%%time

# Double-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 3 and 11 characters long here.
# NOTE(review): the list is re-initialised, so the single-digit frames collected
# in the previous cell are no longer referenced afterwards - confirm this is intended.
catch_all_dfs_0303 = []
for character in character_list_double:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0303,
        all_in_folder=folder_contents_0303,
        digits_image=3,
        digits_dense=11,
        character=character,
        store_path=s_path_0303,
    )
    catch_all_dfs_0303.append(features_df)

Block 0304

In [None]:
prev_preprocessed_image_loc_0304 = "all_preprocessed_data/Block_0304/sub_images_and_counts"

In [None]:
# Sorted file names found in the Block 0304 preprocessed folder
folder_contents_0304 = sorted(os.listdir(prev_preprocessed_image_loc_0304))

In [None]:
# folder_contents_0304

In [None]:
folder_contents_0304[0]

In [None]:
s_path_0304 = 'all_preprocessed_data/Block_0304//TS_ready_data_frames/'

In [None]:
# Sub-window ids '0'..'9' as strings. Single-digit and double-digit ids are handled
# in separate passes because their file-name suffixes differ in length.
character_list = [str(sub_window_id) for sub_window_id in range(10)]

In [None]:
%%time

# Single-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 2 and 10 characters long, which is what digits_image / digits_dense encode.
catch_all_dfs_0304 = []
for character in character_list:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0304,
        all_in_folder=folder_contents_0304,
        digits_image=2,
        digits_dense=10,
        character=character,
        store_path=s_path_0304,
    )
    catch_all_dfs_0304.append(features_df)

In [None]:
# Double-digit sub-window ids ('10' and '11') as strings
character_list_double = [str(sub_window_id) for sub_window_id in range(10, 12)]

In [None]:
%%time

# Double-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 3 and 11 characters long here.
# NOTE(review): the list is re-initialised, so the single-digit frames collected
# in the previous cell are no longer referenced afterwards - confirm this is intended.
catch_all_dfs_0304 = []
for character in character_list_double:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0304,
        all_in_folder=folder_contents_0304,
        digits_image=3,
        digits_dense=11,
        character=character,
        store_path=s_path_0304,
    )
    catch_all_dfs_0304.append(features_df)

Block 0305

In [None]:
prev_preprocessed_image_loc_0305 = "all_preprocessed_data/Block_0305/sub_images_and_counts"

In [None]:
# Sorted file names found in the Block 0305 preprocessed folder
folder_contents_0305 = sorted(os.listdir(prev_preprocessed_image_loc_0305))

In [None]:
# folder_contents_0305

In [None]:
folder_contents_0305[0]

In [None]:
s_path_0305 = 'all_preprocessed_data/Block_0305//TS_ready_data_frames/'

In [None]:
# Sub-window ids '0'..'9' as strings. Single-digit and double-digit ids are handled
# in separate passes because their file-name suffixes differ in length.
character_list = [str(sub_window_id) for sub_window_id in range(10)]

In [None]:
%%time

# Single-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 2 and 10 characters long, which is what digits_image / digits_dense encode.
catch_all_dfs_0305 = []
for character in character_list:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0305,
        all_in_folder=folder_contents_0305,
        digits_image=2,
        digits_dense=10,
        character=character,
        store_path=s_path_0305,
    )
    catch_all_dfs_0305.append(features_df)

In [None]:
# Double-digit sub-window ids ('10' and '11') as strings
character_list_double = [str(sub_window_id) for sub_window_id in range(10, 12)]

In [None]:
%%time

# Double-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 3 and 11 characters long here.
# NOTE(review): the list is re-initialised, so the single-digit frames collected
# in the previous cell are no longer referenced afterwards - confirm this is intended.
catch_all_dfs_0305 = []
for character in character_list_double:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0305,
        all_in_folder=folder_contents_0305,
        digits_image=3,
        digits_dense=11,
        character=character,
        store_path=s_path_0305,
    )
    catch_all_dfs_0305.append(features_df)

Block 0306

In [11]:
prev_preprocessed_image_loc_0306 = "all_preprocessed_data/Block_0306/sub_images_and_counts"

In [12]:
# Sorted file names found in the Block 0306 preprocessed folder
folder_contents_0306 = sorted(os.listdir(prev_preprocessed_image_loc_0306))

In [13]:
# folder_contents_0306

In [14]:
folder_contents_0306[0]

'Block0306_2020_08_03_0.npy'

In [15]:
s_path_0306 = 'all_preprocessed_data/Block_0306//TS_ready_data_frames/'

In [16]:
# Sub-window ids '0'..'9' as strings. Single-digit and double-digit ids are handled
# in separate passes because their file-name suffixes differ in length.
character_list = [str(sub_window_id) for sub_window_id in range(10)]

In [17]:
%%time

# Single-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 2 and 10 characters long, which is what digits_image / digits_dense encode.
catch_all_dfs_0306 = []
for character in character_list:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0306,
        all_in_folder=folder_contents_0306,
        digits_image=2,
        digits_dense=10,
        character=character,
        store_path=s_path_0306,
    )
    catch_all_dfs_0306.append(features_df)

Block0306_2020_08_03_0.npy
1.0


2024-07-15 17:13:09.860851: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907


Block0306_2020_08_03_1.npy
1.0
Block0306_2020_08_03_2.npy
1.0
Block0306_2020_08_03_3.npy
1.0
Block0306_2020_08_03_4.npy
1.0
Block0306_2020_08_03_5.npy
1.0
Block0306_2020_08_03_6.npy
1.0
Block0306_2020_08_03_7.npy
1.0
Block0306_2020_08_03_8.npy
1.0
Block0306_2020_08_03_9.npy
1.0
CPU times: user 11.7 s, sys: 1.75 s, total: 13.4 s
Wall time: 44.2 s


In [18]:
# Double-digit sub-window ids ('10' and '11') as strings
character_list_double = [str(sub_window_id) for sub_window_id in range(10, 12)]

In [19]:
%%time

# Double-digit sub-windows: the stem suffixes '_<id>' and '_<id>_density' are
# 3 and 11 characters long here.
# NOTE(review): the list is re-initialised, so the single-digit frames collected
# in the previous cell are no longer referenced afterwards - confirm this is intended.
catch_all_dfs_0306 = []
for character in character_list_double:
    features_df = extract_features_and_get_counts(
        model=feature_extractor_model,
        preprocessed_path=prev_preprocessed_image_loc_0306,
        all_in_folder=folder_contents_0306,
        digits_image=3,
        digits_dense=11,
        character=character,
        store_path=s_path_0306,
    )
    catch_all_dfs_0306.append(features_df)

Block0306_2020_08_03_10.npy
1.0
Block0306_2020_08_03_11.npy
1.0
CPU times: user 2.36 s, sys: 299 ms, total: 2.66 s
Wall time: 8.45 s
