## Setup

In [10]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator,load_img, img_to_array
import tensorflow_addons as tfa
import pandas as pd
import os

In [11]:
import warnings
# Install a blanket 'ignore' filter: no category, message or module restriction
# is given, so every warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation and numerical warnings that may be
# useful while debugging -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [12]:
def get_filepaths(directory):
    """Recursively list every file found below ``directory``.

    Parameters
    ----------
    directory : str
        Root of the tree to walk with ``os.walk``.

    Returns
    -------
    tuple[list[str], list[str]]
        Two parallel lists: the full path of each file (containing folder
        joined with the file name), and the part of each file name that
        precedes its first ``'.'``.
    """
    full_paths, name_stems = [], []

    for parent, _subdirs, names in os.walk(directory):
        # Both lists are extended from the same `names` sequence, so they
        # stay aligned entry for entry.
        full_paths.extend(os.path.join(parent, name) for name in names)
        name_stems.extend(name.split('.')[0] for name in names)

    return full_paths, name_stems

In [13]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that loads images from disk one batch at a time.

    Parameters
    ----------
    df_X : pandas.DataFrame
        Frame whose first column (``iloc[:, 0]``) holds the image file paths.
    arr_Y : array-like
        Targets, aligned by position with the rows of ``df_X``.
    batch_size : int, default 32
        Number of samples per batch.
    shuffle : bool, default False
        When true, the row order is reshuffled on construction and at the end
        of every epoch.
    image_size : int, default 128
        Images are loaded as RGB and resized to ``image_size x image_size``.
    """

    def __init__(self, df_X, arr_Y, batch_size=32,shuffle=False,image_size=128):
        self.batch_size = batch_size
        self.df_X = df_X
        self.arr_Y = arr_Y
        # Index labels of df_X; only their count is used (see n()/__len__).
        # Rows are always addressed by position, never by these labels.
        self.indices = self.df_X.index.tolist()
        self.shuffle = shuffle
        self.image_size = image_size
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return len(self.indices) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair for batch number ``index``."""
        start = index * self.batch_size
        # self.index holds row positions in the (possibly shuffled) epoch order.
        positions = self.index[start:start + self.batch_size]
        return self.__get_data(positions)

    def n(self):
        """Total number of samples (rows of ``df_X``)."""
        return len(self.indices)

    def on_epoch_end(self):
        """Reset the visiting order to 0..n-1 and reshuffle it if requested."""
        self.index = np.arange(len(self.indices))
        if self.shuffle:
            np.random.shuffle(self.index)

    def __get_data(self, batch):
        """Load the images and targets located at the row positions in ``batch``.

        Returns
        -------
        tuple[numpy.ndarray, numpy.ndarray]
            Images as a float32 array with one entry per position, and the
            matching targets reshaped to a single column.
        """
        X = []
        y = []

        for pos in batch:
            # Positional lookup for both the path and the target keeps them in
            # sync regardless of what index labels df_X carries.
            path = self.df_X.iloc[pos, 0]
            img = load_img(path, color_mode='rgb',
                           target_size=(self.image_size, self.image_size),
                           interpolation='nearest')

            X.append(img_to_array(img).astype(np.float32))
            y.append(self.arr_Y[pos])

        # -1 lets the column vector follow the actual batch length instead of
        # assuming exactly batch_size samples.
        return np.array(X), np.array(y).reshape(-1, 1)

In [14]:
class StatsRecorder:
    """Running mean and standard deviation over successive batches of data.

    Every batch is treated as a 2-D array of shape
    ``(nobservations, ndimensions)``. The statistics are scalars computed
    over *all* elements seen so far (not one value per dimension), and the
    standard deviation is the population one (NumPy's default ``ddof=0``).

    Attributes
    ----------
    mean, std : float or None
        Current statistics; ``None`` until the first non-empty batch arrives.
    nobservations : int
        Number of rows accumulated so far.
    ndimensions : int or None
        Number of columns every batch must have; fixed by the first batch.
    """

    def __init__(self, data=None):
        """
        data: ndarray, shape (nobservations, ndimensions), optional
            First batch; when omitted the recorder starts empty.
        """
        # Always define the public attributes so an empty recorder can be
        # inspected without raising AttributeError.
        self.mean = None
        self.std = None
        self.nobservations = 0
        self.ndimensions = None
        if data is not None:
            self.update(data)

    def update(self, data):
        """
        Fold a new batch into the running statistics.

        data: ndarray, shape (nobservations, ndimensions)

        Raises ValueError if the column count differs from earlier batches.
        """
        data = np.atleast_2d(data)
        n = data.shape[0]

        if self.nobservations == 0:
            if n == 0:
                # Nothing to record yet; stay empty.
                return
            self.mean = data.mean()
            self.std = data.std()
            self.nobservations = n
            self.ndimensions = data.shape[1]
            return

        if data.shape[1] != self.ndimensions:
            raise ValueError("Data dims don't match prev observations.")
        if n == 0:
            # An empty batch has NaN mean/std and would poison the running
            # values, so it is skipped.
            return

        newmean = data.mean()
        newstd = data.std()

        m = self.nobservations * 1.0
        prev_mean = self.mean

        # Weighting by row counts is equivalent to weighting by element
        # counts because every batch has the same number of columns.
        self.mean = m/(m+n)*prev_mean + n/(m+n)*newmean
        # Pooled variance of the two groups plus the between-group term.
        variance = m/(m+n)*self.std**2 + n/(m+n)*newstd**2 +\
                   m*n/(m+n)**2 * (prev_mean - newmean)**2
        self.std = np.sqrt(variance)

        self.nobservations += n

## Prepare the data

In [15]:
# Read train.csv (path is relative to the notebook's working directory).
data = pd.read_csv('train.csv')
# Optional filtering on the `bolts` column, currently disabled. If it is
# re-enabled, keep the reset_index line so the frame has a 0..n-1 index again.
#data = data.loc[data.bolts <= 15]
#data = data.loc[data.bolts > 0]
#data.reset_index(drop=True,inplace=True)
# Show (rows, columns) of the loaded frame.
print(data.shape)

(569, 2)


## Configure the hyperparameters

In [16]:
# Number of images per batch passed to DataGenerator.
batch_size = 1
# Side length in pixels; DataGenerator resizes every image to image_size x image_size.
image_size = 128

In [17]:
# Batches of (images, targets) built from the training table; the `bolts`
# column supplies the targets.
train_generator = DataGenerator(df_X=data, arr_Y=data.bolts.values, batch_size=batch_size, shuffle=True,image_size=image_size)
STEP_SIZE_TRAIN = train_generator.n()//train_generator.batch_size

mystats = StatsRecorder()

# Stream every batch through the recorder to obtain the overall pixel mean/std.
for images, _targets in train_generator:

    # One row per image with all pixel values flattened. The result gets its
    # own name so the `data` DataFrame is not overwritten and this cell can be
    # re-run on its own.
    flat_pixels = images.reshape((images.shape[0], -1))

    mystats.update(flat_pixels)


In [18]:
# Mean and standard deviation accumulated by mystats over all processed batches.
print(mystats.mean)
print(mystats.std)

136.78040649936997
107.97868633912864
