# Data Exploration: Image Pre-Processing
Let's preprocess the images so that they are:
- center cropped
- NxN dimensions (same height and width)

Finally, we output the images into two folders: `train` and `test`.
Each file will be a 137x236 PNG image with the `image_id` as the filename.

## NOTE: each image is normalized individually (scaled by its own maximum value), not per batch or across the whole dataset!

In [1]:
# Notebook setup: (re)load the autoreload extension, reload all modules before
# each cell executes (mode 2), and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
from pathlib import Path
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

In [3]:
# Root directory that holds the parquet shards, CSV files and output folders.
path = Path('./data')
# Display the names of everything inside the data directory, alphabetically.
sorted(entry.name for entry in path.iterdir())

['bengaliai-cv19.zip',
 'class_map.csv',
 'sample_submission.csv',
 'test',
 'test.csv',
 'test_image_data_0.parquet',
 'test_image_data_1.parquet',
 'test_image_data_2.parquet',
 'test_image_data_3.parquet',
 'train',
 'train.csv',
 'train_image_data_0.parquet',
 'train_image_data_1.parquet',
 'train_image_data_2.parquet',
 'train_image_data_3.parquet']

In [4]:
# Dimensions used to reshape each flattened pixel row back into a 2-D image.
HEIGHT, WIDTH = 137, 236

# Each split is stored as four parquet shards, numbered 0 through 3.
TRAIN_DATASETS = [
    path/'train_image_data_{}.parquet'.format(shard) for shard in range(4)
]
TEST_DATASETS = [
    path/'test_image_data_{}.parquet'.format(shard) for shard in range(4)
]

In [5]:
def get_images_and_labels(df, height=HEIGHT, width=WIDTH):
    """Split a raw dataframe into its first column and a stack of 2-D images.

    Note that, despite the function's name, the first returned value is the
    dataframe's first column (the caller uses it as the output filename) and
    the second is the image data.

    Args:
        df: dataframe whose first column identifies each row and whose
            remaining columns hold that row's flattened pixel values.
        height: number of rows in each reshaped image.
        width: number of columns in each reshaped image.

    Returns:
        A tuple ``(first_column, images)`` where ``first_column`` is a pandas
        Series and ``images`` is an array of shape ``(-1, height, width)``.
    """
    first_column = df.iloc[:, 0]
    flat_pixels = df.iloc[:, 1:].values
    image_stack = flat_pixels.reshape(-1, height, width)
    return first_column, image_stack

def preprocess_img(img):
    """Invert an image and stretch it so its brightest pixel becomes 255.

    The image is first inverted (``255 - img``) and then multiplied by
    ``255 / max`` of the inverted image, so every image is normalized on its
    own rather than against a batch or dataset statistic.

    Args:
        img: numpy array of pixel intensities (the caller passes one 2-D
            image at a time).

    Returns:
        A floating-point array of the same shape. If the inverted image is
        entirely zero (a blank input), an all-zero float array is returned
        instead of dividing by zero.
    """
    inverted = 255 - img
    peak = inverted.max()
    if peak == 0:
        # A blank image has nothing to stretch; dividing by the zero peak
        # would turn every pixel into NaN, so return float zeros instead.
        return inverted.astype(float)
    return inverted * (255. / peak)

In [8]:
def gen_preprocessed_data(dataset_paths, save_dir):
    """Preprocess every image in the given parquet files and save it as a PNG.

    Each parquet file is read in turn, split into filenames and images with
    ``get_images_and_labels``, and every image is passed through
    ``preprocess_img``, converted to RGB and written to
    ``<save_dir>/<filename>.png``. Progress and timing are printed as it goes.

    Args:
        dataset_paths: iterable of parquet file paths to process.
        save_dir: directory to write the PNG files into (``str`` or ``Path``).
            It is created, including parents, if it does not exist yet.

    Returns:
        None. The function is used for its side effects (files and prints).
    """
    start_time = time.time()

    # Accept plain strings as well as Path objects, and make sure the target
    # directory exists so the first save cannot fail on a fresh checkout.
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    for data_path in dataset_paths:
        df_start_time = time.time()

        df = pd.read_parquet(data_path)
        filename, images = get_images_and_labels(df)
        assert len(filename) == len(images)
        n_images = len(images)

        for i, (fn, img) in enumerate(zip(filename, images)):
            img = preprocess_img(img)
            # preprocess_img returns a float array; the RGB conversion is what
            # brings it back to 8-bit channels before saving.
            img = Image.fromarray(img).convert('RGB')
            img.save('{}.png'.format(save_dir/fn))
            if i % 1000 == 0:
                # '{:.4f}' fixes the original '{:4f}', which set a field
                # width of 4 rather than 4 decimal places.
                print('Completed {:.4f}% in {:.4f}'.format(
                    i / n_images * 100,
                    time.time() - df_start_time
                ))

        print('Total time for df: ', time.time() - df_start_time)
        print()

    print('Total time: ', time.time() - start_time)

In [9]:
# Preprocess every training parquet shard and write the PNGs under path/'train'.
gen_preprocessed_data(TRAIN_DATASETS, path/'train')

Completed 0.000000% in 4.300104
Completed 1.991635% in 8.181643
Completed 3.983270% in 12.008147
Completed 5.974905% in 15.995915
Completed 7.966541% in 19.827554
Completed 9.958176% in 23.740606
Completed 11.949811% in 27.641182
Completed 13.941446% in 31.585071
Completed 15.933081% in 35.473633
Completed 17.924716% in 39.366803
Completed 19.916351% in 43.180285
Completed 21.907986% in 47.074324
Completed 23.899622% in 50.921019
Completed 25.891257% in 54.783370
Completed 27.882892% in 58.644429
Completed 29.874527% in 62.554107
Completed 31.866162% in 66.437678
Completed 33.857797% in 70.320444
Completed 35.849432% in 74.148338
Completed 37.841068% in 78.021273
Completed 39.832703% in 81.804364
Completed 41.824338% in 85.654381
Completed 43.815973% in 89.502108
Completed 45.807608% in 93.395722
Completed 47.799243% in 97.233616
Completed 49.790878% in 101.115839
Completed 51.782513% in 105.168096
Completed 53.774149% in 109.148994
Completed 55.765784% in 113.163586
Completed 57.75741

In [14]:
# Preprocess every test parquet shard and write the PNGs under path/'test'.
gen_preprocessed_data(TEST_DATASETS, path/'test')

Completed 0.000000% in 0.881459
Total time for df:  0.8862061500549316

Completed 0.000000% in 0.830535
Total time for df:  0.8388051986694336

Completed 0.000000% in 0.893340
Total time for df:  0.9014194011688232

Completed 0.000000% in 0.892469
Total time for df:  0.8984432220458984

Total time:  3.525055170059204
