# Database, data preparation

This notebook contains the data clean-up and preparation steps for model training.

In [1]:
from skimage import io
import numpy as np
import pandas as pd

In [2]:
# Load one sample image as grayscale to inspect its dimensions.
img = io.imread(
    'fashion/images/1163.jpg',
    as_gray=True,
)

## A single image has a standard shape of 80 x 60. However, there are some exceptions...

In [4]:
# Show the dimensions of the sample image array.
img.shape

(80, 60)

In [5]:
# Read the semicolon-separated metadata table.
# A forward slash is used (as for the image paths elsewhere in this notebook)
# so the path works on every OS; the previous "fashion\\fashion.csv" only
# resolved on Windows.
clothes_csv = pd.read_csv("fashion/fashion.csv", sep=";")

In [6]:
# Use the product id as the row label so rows can be looked up and dropped by id.
clothes_csv = clothes_csv.set_index(keys="id")

In [7]:
import os

# Names of all files in the image directory (resolved against the current
# working directory).
images_dir = os.path.join(os.getcwd(), 'fashion', 'images')
file_names = os.listdir(images_dir)

## Removing records that don't have images

In [8]:
# Collect the ids of records that have no matching "<id>.jpg" image file.
# A set gives constant-time membership tests; the previous list.count() call
# rescanned every file name for every record.
available_images = set(file_names)

toRemove = []
# Only the index (the ids) is needed, so iterate it directly instead of
# building a row Series per record with iterrows().
for image_id in clothes_csv.index:
    if f"{image_id}.jpg" not in available_images:
        print('not found: ', image_id)
        toRemove.append(image_id)

not found:  39403
not found:  39410
not found:  39401
not found:  39425
not found:  12347


In [9]:
# Drop all records without an image in a single call. The previous per-id loop
# copied the whole frame once for every removed record.
clothes_csv = clothes_csv.drop(index=toRemove)

In [10]:
# Preview the first five remaining records (head() defaults to n=5).
clothes_csv.head()

Unnamed: 0_level_0,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


## Removing records where images have dimensions different than 80 x 60     
I could either remove or rescale these images; I chose to remove them.
### Flattening images to one-dimensional arrays

In [11]:
# Load every remaining image as grayscale. Images with the standard shape are
# kept as flattened 1-D arrays; records whose image has any other shape are
# reported and removed from the table.
EXPECTED_SHAPE = (80, 60)

allImages = []  # (id, flattened image) pairs, in clothes_csv row order
toRemove = []   # fresh list: the cell no longer depends on an earlier cell's variable

# Only the ids are needed, so iterate the index rather than iterrows().
for image_id in clothes_csv.index:
    img = io.imread(f'fashion/images/{image_id}.jpg', as_gray=True)
    if img.shape != EXPECTED_SHAPE:
        print("bad shape: ", image_id)
        toRemove.append(image_id)
    else:
        allImages.append((image_id, np.array(img).flatten()))

# One drop call instead of copying the frame once per rejected id.
clothes_csv = clothes_csv.drop(index=toRemove)

bad shape:  56624
bad shape:  28092
bad shape:  1801
bad shape:  28492
bad shape:  35915
bad shape:  59606
bad shape:  1800
bad shape:  14776
bad shape:  11160
bad shape:  50908
bad shape:  11151
bad shape:  57730
bad shape:  25943
bad shape:  5408
bad shape:  29519
bad shape:  52166
bad shape:  25299
bad shape:  56128
bad shape:  1799
bad shape:  59593
bad shape:  2311
bad shape:  25958
bad shape:  44101


In [31]:
# Stack the flattened images (second element of each (id, image) pair) into a
# single array with one entry per image.
length = len(allImages)
imagesNumpyArray = np.array([pair[1] for pair in allImages])

## Saving transformed images to numpy file.   
### Numpy's file size is smaller than csv.

In [33]:
# Persist the image array; np.save appends the ".npy" extension to the name.
np.save(file="ImagesTxt", arr=imagesNumpyArray)

In [36]:
# Write the filtered table (index included) as a semicolon-separated file.
clothes_csv.to_csv("clothes_processed.csv", sep=";")

In [42]:
# The ids are the frame's index; read them directly instead of iterating
# iterrows(), which built a full row Series per record only to discard it.
imagesLabels = clothes_csv.index.to_numpy()

In [44]:
# Persist the ids; np.save appends the ".npy" extension to the name.
np.save(file="saved_ids", arr=imagesLabels)

In [51]:
# Unbind the in-memory frame (del is a statement, so no parentheses are needed).
del clothes_csv

In [52]:
# Reload the processed table from disk; 'id' comes back as a regular column.
clothes_csv = pd.read_csv(
    filepath_or_buffer="clothes_processed.csv",
    sep=";",
)

## Creating target column based on masterCategory and subCategory     
I decided that my target column will consist of masterCategory values; however, for 'Apparel' I wanted to be more specific and chose the subcategories Topwear and Bottomwear as targets.    
This way the model remains useful while the classes stay relatively easy to distinguish.

In [53]:
# Target is the masterCategory, except for Apparel rows, which use the more
# specific subCategory instead.
is_apparel = clothes_csv['masterCategory'] == 'Apparel'
clothes_csv['target'] = np.where(
    is_apparel,
    clothes_csv['subCategory'],
    clothes_csv['masterCategory'],
)

In [64]:
# Display the frame to inspect the new 'target' column.
clothes_csv

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,target
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,Topwear
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,Bottomwear
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch,Accessories
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants,Bottomwear
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt,Topwear
...,...,...,...,...,...,...,...,...,...,...,...
44413,17036,Men,Footwear,Shoes,Casual Shoes,White,Summer,2013.0,Casual,Gas Men Caddy Casual Shoe,Footwear
44414,6461,Men,Footwear,Flip Flops,Flip Flops,Red,Summer,2011.0,Casual,Lotto Men's Soccer Track Flip Flop,Footwear
44415,18842,Men,Apparel,Topwear,Tshirts,Blue,Fall,2011.0,Casual,Puma Men Graphic Stellar Blue Tshirt,Topwear
44416,46694,Women,Personal Care,Fragrance,Perfume and Body Mist,Blue,Spring,2017.0,Casual,Rasasi Women Blue Lady Perfume,Personal Care


## Deleting records where masterCategory is Apparel and subcategory is neither Topwear nor Bottomwear

In [69]:
# Index by id so the rows to delete can be addressed by their id labels.
clothes_csv = clothes_csv.set_index("id")

# Select Apparel rows whose target is neither Topwear nor Bottomwear.
is_apparel = clothes_csv['masterCategory'] == 'Apparel'
is_kept_target = clothes_csv['target'].isin(['Topwear', 'Bottomwear'])
indexDel = clothes_csv.loc[is_apparel & ~is_kept_target].index
indexDel

Index([51832, 17871, 32138, 59607, 17885, 39716, 25349, 55802, 28032, 39140,
       ...
       25590, 28881,  2697, 37267, 45647, 57382, 56250, 27491, 15761, 32143],
      dtype='int64', name='id', length=3297)

In [70]:
# Remove the selected Apparel rows, by id label, from the frame in place.
clothes_csv.drop(index=indexDel, inplace=True)

## Deleting irrelevant columns
I have already extracted the values I need into the target column; the remaining columns won't be used.

In [75]:
# Keep only the 'target' column; every other metadata column is dropped in place.
# NOTE(review): the output recorded for this cell looks like clothes_csv.info(),
# which this cell does not call - confirm whether that call was removed.
unused_columns = [
    'masterCategory',
    'gender',
    'subCategory',
    'articleType',
    'baseColour',
    'season',
    'year',
    'usage',
    'productDisplayName',
]
clothes_csv.drop(columns=unused_columns, inplace=True)

<class 'pandas.core.frame.DataFrame'>
Index: 41121 entries, 15970 to 51623
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  41121 non-null  object
dtypes: object(1)
memory usage: 642.5+ KB


In [77]:
# Display the frame, which now holds only the 'target' column indexed by id.
clothes_csv

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
15970,Topwear
39386,Bottomwear
59263,Accessories
21379,Bottomwear
53759,Topwear
...,...
17036,Footwear
6461,Footwear
18842,Topwear
46694,Personal Care


## Saving cleaned database

In [78]:
# Write the cleaned id -> target table as a semicolon-separated file.
clothes_csv.to_csv("cleaned_fashion.csv", sep=";")