# Data preprocessing

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import os

tf.__version__

In [None]:
# import tarfile
# tar_train_val = tarfile.open('./data/VOCtrainval_06-Nov-2007.tar')
# tar_train_val.extractall('./data/')

In [None]:
def load_classes_from_txt(file_path):
    '''
    Load a single per-class label file into a two-column DataFrame.

    The class name is taken from the part of the file name before the first
    underscore (e.g. 'aeroplane_train.txt' -> 'aeroplane').

    Args:
        file_path -- path to a whitespace-separated .txt file with two
                     columns and no header: image id and class flag

    Returns:
        DataFrame with columns ['file_name', <class name>], where every
        flag equal to -1 has been replaced by 0. Column dtypes are inferred
        by pandas, so purely numeric image ids are read as integers.
    '''

    file_name = os.path.basename(file_path)
    class_name = file_name.split('_')[0]

    # Raw-string regex: fields are separated by one or more whitespace
    # characters (the flag column is padded with a variable number of spaces).
    df = pd.read_csv(file_path, sep=r'\s+', engine='python', header=None)

    df.columns = ['file_name', class_name]

    # collapse the -1 flag into 0 so the column only marks "present" with 1
    df.loc[df[class_name] == -1, class_name] = 0

    return df
    

In [None]:
# data = load_classes_from_txt('./data/VOCdevkit/VOC2007/ImageSets/Main/aeroplane_train.txt')
# data.head()

In [None]:
# file_name = os.path.basename('./data/VOCdevkit/VOC2007/ImageSets/Main/aeroplane_train.txt')
# file_name.split('_')[0]

In [None]:
def get_dataframe_with_classes_in_images(folder_path, which_set='train'):
    
    '''
    Returns DataFrame with class annotations for every image in chosen set.

    Reads '<which_set>.txt' (the list of image ids) from folder_path and adds
    one column for every '<class>_<which_set>.txt' file found directly in
    that folder. Class columns are ordered alphabetically after 'file_name'.
    
    Args:
        folder_path -- path to folder containing .txt files with labels
                       (with or without a trailing path separator)
        which_set -- 'train', 'trainval', 'val', 'test'

    Returns:
        DataFrame whose first column 'file_name' holds the image ids as
        strings, followed by one column per class.

    Raises:
        FileNotFoundError -- if folder_path is not an existing directory
    '''
    
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(f'Folder with label files not found: {folder_path}')
    
    # image ids of the chosen set; read as strings so leading zeros are kept
    df_all_classes = pd.read_csv(os.path.join(folder_path, f'{which_set}.txt'),
                                 header=None, dtype=str)
    df_all_classes.columns = ['file_name']
    
    wanted_suffix = f'{which_set}.txt'
    class_columns = []
    
    # only look at files directly inside folder_path
    for file_name in sorted(os.listdir(folder_path)):
        
        file_path = os.path.join(folder_path, file_name)
        
        # per-class files are named '<class>_<set>.txt'; anything else
        # (e.g. the plain split lists like 'train.txt') carries no class info
        name_parts = file_name.split('_')
        if len(name_parts) < 2 or name_parts[1] != wanted_suffix:
            continue
        if not os.path.isfile(file_path):
            continue
        
        # parse only the files that belong to the chosen set
        df = load_classes_from_txt(file_path=file_path)
        
        # sanity check: rows of the class file must follow the same image
        # order as the split list (explicit int64 avoids platform-dependent
        # integer width in the comparison)
        if not df_all_classes['file_name'].astype('int64').equals(df['file_name']):
            print('File name columns are not equal!')
        
        class_columns.append(df.iloc[:, 1])
    
    # attach all class columns in a single concatenation
    df_all_classes = pd.concat([df_all_classes] + class_columns, axis=1)
    
    # 'file_name' first, then the class columns in alphabetical order
    cols = ['file_name'] + sorted(df_all_classes.columns.tolist()[1:])
    
    return df_all_classes[cols]



In [None]:
# Label files for the train / trainval / val splits.
folder_path = './data/VOCdevkit/VOC2007/ImageSets/Main/'

df_train, df_trainval, df_val = (
    get_dataframe_with_classes_in_images(folder_path, which_set=split_name)
    for split_name in ('train', 'trainval', 'val')
)

# Label files for the test split are stored under a separate folder tree.
test_path = './data/VOCtest_06-Nov-2007/VOCdevkit/VOC2007/ImageSets/Main/'

df_test = get_dataframe_with_classes_in_images(test_path, which_set='test')

In [None]:
df_train

In [None]:
df_trainval

In [None]:
df_val

In [None]:
df_test

## Graphs

In [None]:
def display_graph_of_images_count(dataframe):
    '''
    Draw a bar chart of image counts with a value label above every bar.

    Args:
        dataframe -- pandas dataframe; the index is plotted on the x axis
                     and every column becomes one group of bars
    '''
    
    plot_options = dict(kind='bar', figsize=(20, 15), title='Images count per class',
                        xlabel='Class', ylabel='Count', legend=False, fontsize=12)
    ax = dataframe.plot(**plot_options)
    
    # the legend is added separately so that its font size can be set
    ax.legend(fontsize=12)

    # one bar container per plotted column: write each bar's value above it
    label_options = dict(label_type='edge', rotation=90, fontsize=12, padding=3)
    for column_position, _ in enumerate(dataframe.columns):
        ax.bar_label(ax.containers[column_position], **label_options)

    # extra vertical room so the rotated labels fit
    ax.margins(y=0.1)

In [None]:

# Number of images containing each class, per split. The 'file_name' column
# is dropped *before* summing: summing it would concatenate all the file-name
# strings and turn the whole result into object dtype.
df_train_class_count = df_train.iloc[:, 1:].sum().to_frame(name='train')

df_val_class_count = df_val.iloc[:, 1:].sum().to_frame(name='val')

df_test_class_count = df_test.iloc[:, 1:].sum().to_frame(name='test')

# one row per class, one column per split
df_class_count = pd.concat([df_train_class_count, df_val_class_count, df_test_class_count], axis=1)

# sum across train/val/test count
df_class_count['total'] = df_class_count.sum(axis=1).astype('int')


In [None]:
df_class_count = df_class_count.sort_values(by=['total'], ascending=False)

In [None]:
df_class_count

In [None]:
display_graph_of_images_count(df_class_count)

By far the most common class found in the images is 'person'. Next are car, chair and dog.

The rarest class is sheep, which is depicted in only 193 of the roughly 10k images! The other classes are not faring much better.

The classes are distributed almost evenly across the train/val/test sets.
E.g. the class 'person' appears in:

* 1025 images in the train set
* 985 images in the validation set
* 2007 images in the test set, which contains 50% of all images

## Load images

In [None]:
df_train.iloc[:, 1:]

In [None]:
df_train.iloc[:, 0].loc[0]

In [None]:
# classes in image 000012.jpg
ds = df_train.iloc[0, 1:]
ds

In [None]:
file_name = df_train.iloc[0, 0]
file_name

In [None]:
classes = list(df_train.iloc[:, 1:])
NUM_CLASSES = len(classes)
classes

In [None]:
ds_as_list = list(ds)
ds_as_list

In [None]:
def to_categorical(labels_list, n_classes=NUM_CLASSES):
    '''
    Replace every label equal to 1 by its position and one-hot encode the result.

    Args:
        labels_list -- sequence of labels (one entry per class); it is
                       not modified
        n_classes -- width of the one-hot encoding, forwarded to Keras as
                     num_classes

    Returns:
        Array produced by tf.keras.utils.to_categorical, one row per entry
        of labels_list.
    '''
    
    # Build a new list instead of overwriting the caller's one.
    # NOTE(review): an entry equal to 1 at position 0 becomes 0 and therefore
    # cannot be told apart from an absent class - confirm this is intended.
    class_indices = [i if cls == 1 else cls for i, cls in enumerate(labels_list)]
    
    Y = tf.keras.utils.to_categorical(class_indices, num_classes=n_classes)
    
    return Y
    

In [None]:
cat_arr = to_categorical(ds_as_list, 20)
cat_arr

## Create Dataset

In [None]:
IMG_SIZE = 299

CHANNELS = 3

In [None]:
def parse_image(file_path, label):
    """
    Load one JPEG image and return it, resized and rescaled, with its label.
    
    Args:
        file_path: string representing path to image
        label: labels belonging to the image; passed through unchanged

    Returns:
        Tuple (image, label) where image has spatial size IMG_SIZE x IMG_SIZE,
        CHANNELS channels and pixel values divided by 255.
    """
    
    # raw file content -> decoded pixel tensor
    raw_bytes = tf.io.read_file(file_path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=CHANNELS)
    
    # bring every image to the same fixed size
    target_size = [IMG_SIZE, IMG_SIZE]
    resized = tf.image.resize(pixels, target_size)
    
    # scale pixel values from [0, 255] down to [0.0, 1.0]
    # NOTE(review): pretrained Keras backbones usually ship their own
    # preprocess_input scaling - confirm [0, 1] is what the model expects.
    return resized / 255.0, label

In [None]:
BATCH_SIZE = 32

SHUFFLE_BUFFER_SIZE = 512

AUTOTUNE = tf.data.AUTOTUNE

In [None]:
def get_img_paths_and_labels(dataframe, images_folder_path): 
    
    '''
    Get lists of paths to images and their corresponding labels
    
    Args:
        dataframe: pandas dataframe; the first column holds the image ids
            (file names without extension), the remaining columns the labels
        images_folder_path: path to folder with images, with or without a
            trailing path separator

    Returns:
        Tuple (set_paths, set_labels): list of '<folder>/<image id>.jpg'
        paths and list of per-image label lists, both in row order.
    '''
    
    image_ids = dataframe.iloc[:, 0]
    
    # os.path.join inserts the separator only when the folder path lacks one
    set_paths = [os.path.join(images_folder_path, f'{image_id}.jpg')
                 for image_id in image_ids]
    
    # all label rows at once instead of one iloc lookup per row
    set_labels = dataframe.iloc[:, 1:].to_numpy().tolist()
    
    return set_paths, set_labels

In [None]:
def create_dataset(dataframe, images_folder_path, is_training=True):
    """
    Build a batched, prefetched tf.data pipeline of parsed images and labels.
    
    Args:
        dataframe: pandas dataframe with image ids in the first column and
            labels in the remaining columns
        images_folder_path: path to folder with images
        is_training: boolean to indicate training mode; when true the parsed
            samples are additionally cached and shuffled

    Returns:
        tf.data.Dataset yielding batches of BATCH_SIZE parsed samples.
    """
    
    filepaths, labels = get_img_paths_and_labels(dataframe, images_folder_path)
    
    # Create a first dataset of file paths and labels
    dataset = tf.data.Dataset.from_tensor_slices((filepaths, labels))
    
    # Parse and preprocess observations in parallel
    dataset = dataset.map(parse_image, num_parallel_calls=AUTOTUNE)
    
    if is_training:
    
        # Keep the parsed samples cached so files are decoded only once.
        dataset = dataset.cache()
        
        # Shuffle within a sliding buffer of SHUFFLE_BUFFER_SIZE samples
        dataset = dataset.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE)
        
    # Batch the data for multiple steps
    dataset = dataset.batch(BATCH_SIZE)
    
    # Fetch batches in the background while the model is training.
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    
    return dataset

In [None]:
# Folders holding the JPEG images. The trailing '/' keeps the paths valid
# when a file name is appended to them by plain string concatenation.
trainval_images_path = './data/VOCdevkit/VOC2007/JPEGImages/'
test_images_path = './data/VOCtest_06-Nov-2007/VOCdevkit/VOC2007/JPEGImages/'

# for trainval_root, _, _ in os.walk(trainval_images_path):
#     pass

# for test_root, _, _ in os.walk(test_images_path):
#     pass    

In [None]:
train_ds = create_dataset(df_train, trainval_images_path)
train_ds

In [None]:
# Evaluation splits: is_training=False skips the caching and shuffling that
# create_dataset applies only in training mode.
val_ds = create_dataset(df_val, trainval_images_path, is_training=False)
test_ds = create_dataset(df_test, test_images_path, is_training=False)

# Deep learning model 

In [None]:
# Xception convolutional base with ImageNet weights and without its top
# classification layers.
conv_base = tf.keras.applications.Xception(
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    include_top=False,
    weights='imagenet',
)

# Freeze the base so its weights are not updated during training.
conv_base.trainable = False

# Weight counts after freezing.
print("weights:", len(conv_base.weights))
print("trainable_weights:", len(conv_base.trainable_weights))
print("non_trainable_weights:", len(conv_base.non_trainable_weights))

In [None]:
# Classification head stacked on top of the convolutional base.
model = tf.keras.models.Sequential([
    
    conv_base,
    
    layers.Flatten(),
    
    layers.Dense(512, activation='relu'),
    
    # One output unit per class (NUM_CLASSES is the number of label columns).
    # NOTE(review): the labels look multi-hot (an image may contain several
    # classes); softmax assumes exactly one class per image - confirm whether
    # sigmoid outputs with a binary cross-entropy loss are intended instead.
    layers.Dense(NUM_CLASSES, activation='softmax', name='output')
])

In [None]:
# Configure training: RMSprop with a small learning rate, categorical
# cross-entropy loss, accuracy as the reported metric.
# NOTE(review): the label vectors appear to be multi-hot (one 0/1 flag per
# class); categorical cross-entropy assumes a single class per image -
# confirm that this loss/metric pair is what is wanted.
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0001),
             loss='categorical_crossentropy',
             metrics=['accuracy'])

In [None]:
EPOCHS = 10

history = model.fit(train_ds,
                   validation_data=val_ds,
                   epochs=EPOCHS)

# Model evaluation