## T-Shirt Images - Data Transformations


This notebook reads all the datasets received from Threadless and selects (copies) only those images that are listed in 'AllData.xlsx', the file that identifies the images (~10k) to be analyzed. Once the necessary images have been selected and moved to a separate folder, the following preprocessing steps are performed: <br> <br>
i) Image resizing (256 * 256; note that the transformation cell below currently sets `desired_size = 512`) <br>
ii) Padding the images so as to maintain the aspect ratio while keeping the final image size at 256 * 256 <br>
    * Padding with black might not help much with classification, so the images are padded with the majority R, G and B values
    * There are cases where the majority R, G and B values do not come from the background of the image but from its contents. So, only the outer pixels should be considered when selecting the major background color (note that the transformation cell below currently takes the histogram of the whole resized image)



In [1]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import scipy
import PIL
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters


In [2]:
### Reading in the transformed version of the 'AllData.xlsx' file
# Major_Background_labels.csv has been transformed to carry the background tag for each image.
#
# Image_index is read as text on purpose: the later cells compare it against
# image numbers taken from file names (strings). If pandas inferred an integer
# dtype for this column, those comparisons would silently match nothing.

one_hot_output = pd.read_csv('Major_Background_labels.csv', dtype={'Image_index': str})

## Moving Images

### Moving all the images from different folders to one consolidated folder

The code cells below have been commented out to prevent them from executing again (they take a long time to run and are a one-time process).

In [3]:
# dst = "C:\\Users\\arvra\\Documents\\UVa files\\Research Assistantship\\Final data\\"
# for subdir, dirs, files in os.walk("C:\\Users\\arvra\\Documents\\data"):
#     #print(subdir)
#     if(len(os.listdir(subdir)) < 100 ):
#         continue
#     for each in os.listdir(subdir):
#         if(each.split(".")[0] in one_hot_output.Index.values):
#             shutil.copy(subdir+"\\"+each, dst+each)

In [35]:
# from collections import Counter
# all_files = [each.split(".")[0] for each in os.listdir(dst)]
# all_files_counter = Counter(all_files)

### Code: Moving only those images which are present in AllData.xlsx file

In [4]:
### Destination location ###
# ### final_data = "C:\\Users\\arvra\\Documents\\UVa files\\Research Assistantship\\Final data Analysis\\"

# for each in os.listdir(dst):
#     file_name = each.split(".")[0]
#     if(all_files_counter[file_name] > 1):
        
#         continue
        
#     else:
#         shutil.copy(dst+each, final_data+each)

In [37]:
# os.stat(dst+each).st_size

In [38]:
# NOTE(review): this cell is disabled (one-time copy step). For every image number that
# occurs more than once in `dst`, it picks the smallest file and copies one file to `final_data`.
# Two things to check before re-enabling it:
#   * the final copy uses `dst+each` (the last file iterated) as its source, although
#     `selected_image` is the file chosen as the smallest one -- the source should
#     presumably be `dst+selected_image`;
#   * `shutil` is used here but is not imported in the visible import cell.
# for each_file in all_files_counter.keys():
#     if all_files_counter[each_file] > 1:
        
#         similar_images = [file for file in os.listdir(dst) if each_file == file.split(".")[0]]
        
#         selected_image = []
#         size = 10000000000
#         for each in similar_images:
#             if( os.stat(dst+each).st_size < size):
#                 size = os.stat(dst+each).st_size
#                 selected_image = each
#         shutil.copy(dst+each, final_data+selected_image)

## Transforming images

### Converting the images to 256 x 256

In [6]:
#### Root folder that holds every input/output directory used below.
# The default is the original author's location; set the TSHIRT_PROJECT_DIR
# environment variable to run the notebook against a different copy of the data.
project_dir = os.environ.get(
    "TSHIRT_PROJECT_DIR",
    "C:\\Users\\arvra\\Documents\\UVa files\\Research Assistantship",
)

#### Directories that receive the transformed images (split into train and validation).
# The empty last component makes os.path.join append a trailing separator, which the
# later cells rely on when they concatenate a sub-folder name directly onto these paths.
image_data_out_train = os.path.join(project_dir, "Final data Folders", "train", "")
image_data_out_val = os.path.join(project_dir, "Final data Folders", "val", "")

#### Raw input images that will be used for our analysis ####
image_data = os.path.join(project_dir, "Final data Analysis")
# os.listdir returns entries in arbitrary order; sorting keeps every later step deterministic.
images = sorted(os.listdir(image_data))

In [7]:
### Recover the bare image numbers by removing the known image extensions from each file name
known_extensions = (".jpg", ".png", ".gif")
image_numbers = []
for file_name in images:
    stem = file_name
    for extension in known_extensions:
        stem = stem.replace(extension, "")
    image_numbers.append(stem)

### Keep only the image numbers that also occur in the labels table
labelled_index = one_hot_output.Image_index.values
available_images = [number for number in image_numbers if number in labelled_index]

In [8]:
## Remove unnecessary images
## Removing '176194' since we have a file, but we do not have an image for this file
remove_imgs = ['176194']

## Image numbers present in the labels table.
## Built once as a set: the previous version re-created list(one_hot_output.Image_index)
## for every single file inside the comprehension, which is quadratic in the number of images.
labelled_ids = set(one_hot_output.Image_index)

## (image number, file name) pairs; the image number is the part of the name before the first "."
image_numbers = [(each.split(".")[0], each) for each in images]
available_files = [(avail, files) for avail, files in image_numbers
                   if avail in labelled_ids and avail not in remove_imgs]

## Available images (image numbers only)
available_images = [img for img, files in available_files]

## Matching file names, in the same order as available_images
file_names = [files for img, files in available_files]

In [12]:
### Loading necessary libraries

from PIL import Image, ImageOps
from PIL import ImageFile
import random
# Global Pillow switch: allow truncated image files to be loaded.
# NOTE(review): presumably set because some of the source images are incomplete -- confirm.
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [13]:
#### Splitting the data to train (70%) and validation (remaining 30%)
# A dedicated, seeded generator plus a sorted input makes the split reproducible
# across kernel restarts without touching the global `random` state.
SPLIT_SEED = 42
train_files = random.Random(SPLIT_SEED).sample(sorted(available_files),
                                               int(len(available_files) * 0.7))

# Set membership keeps the complement linear instead of scanning train_files once per file.
train_lookup = set(train_files)
val_files = [each for each in available_files if each not in train_lookup]

In [95]:
### Edge length (in pixels) of the square output images
desired_size = 512

# Containers kept from the original cell so that any later cell referring to them still works.
# Only `image_with_aspect` is filled below.
# NOTE(review): `image_with_aspect` keeps every padded image in memory; drop the append in
# write_padded_images() if nothing downstream needs the in-memory copies.
images_dict = {}
images_dict_new = {}
image_array = []
image_with_aspect = []
sizes = []


def pad_to_square(im, size):
    """Scale `im` to fit inside a `size` x `size` square and pad it to exactly that size.

    The aspect ratio is preserved: the longer edge becomes `size` pixels and the
    shorter edge is scaled by the same factor. The scaled image is centred on a
    new RGB canvas whose fill colour is made of the most frequent red, green and
    blue values (each channel taken independently) of the scaled image.

    Parameters
    ----------
    im : PIL.Image.Image
        Source image, in any mode.
    size : int
        Edge length of the square result, in pixels.

    Returns
    -------
    PIL.Image.Image
        New RGB image of size (`size`, `size`).
    """
    old_size = im.size  # (width, height)

    ## Getting the ratio and setting the new size
    ratio = float(size) / max(old_size)
    # Clamp to 1 pixel: for an extremely elongated image the short edge could
    # otherwise round down to 0, which resize() rejects.
    new_size = tuple(max(1, int(edge * ratio)) for edge in old_size)

    # The previous code passed Image.ADAPTIVE, which is a palette constant and not a
    # resampling filter; it shares the integer value of LANCZOS, so LANCZOS is the
    # filter that was effectively requested. Spell it out.
    resized = im.resize(new_size, Image.LANCZOS)

    ############### PICKING THE PADDING COLOR ###########################
    # histogram() of an RGB image is the concatenation of three 256-bin
    # histograms in band order: red, green, blue.
    # NOTE(review): the histogram covers the whole image, not only its outer pixels.
    histogram = resized.convert('RGB').histogram()
    r = int(np.argmax(histogram[0:256]))      # most frequent red value
    g = int(np.argmax(histogram[256:512]))    # most frequent green value
    b = int(np.argmax(histogram[512:768]))    # most frequent blue value

    # create a new image and paste the resized one on it, centred
    padded = Image.new("RGB", (size, size), (r, g, b))
    padded.paste(resized, ((size - new_size[0]) // 2,
                           (size - new_size[1]) // 2))
    return padded


def write_padded_images(files, out_root):
    """Pad every image in `files` to a square and save it under `out_root`/<label>/.

    Parameters
    ----------
    files : iterable of (image number, file name) tuples
        Images to process; the file names are looked up inside `image_data`.
    out_root : str
        Root output directory. One sub-folder per 'Major_backgroup' value found
        in `one_hot_output` is created on demand.
    """
    for image, file in files:
        ## Reading the input file; the context manager closes the file handle afterwards
        with Image.open(os.path.join(image_data, file)) as im:
            new_im = pad_to_square(im, desired_size)
        image_with_aspect.append(new_im)

        # Label of this image decides the sub-folder it is written to
        subfolder = one_hot_output[one_hot_output.Image_index == image]['Major_backgroup'].values
        directory = os.path.join(out_root, str(subfolder[0]))
        os.makedirs(directory, exist_ok=True)

        new_im.save(os.path.join(directory, file))


############################## CREATING FOLDER FOR TRAINING IMAGES ##########################################
print("CREATING TRAINING FOLDER")
write_padded_images(train_files, image_data_out_train)

############################## CREATING FOLDER FOR VALIDATION IMAGES ########################################
print("CREATING VALIDATION FOLDER")
write_padded_images(val_files, image_data_out_val)

CREATING TRAINING FOLDER
CREATING VALIDATION FOLDER
