In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import glob, os, warnings, pickle

import cv2

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

### Top categories

In [2]:
# Load the product metadata. A handful of rows in styles.csv contain more
# fields than the header declares; they are skipped (with a warning) instead
# of aborting the whole read.
styles_csv_path = "../Data/styles.csv"
try:
    # Spelling for pandas >= 1.3; `error_bad_lines` no longer exists in pandas 2.x.
    df_styles = pd.read_csv(styles_csv_path, on_bad_lines = 'warn')
except TypeError:
    # Older pandas rejects the unknown `on_bad_lines` keyword, so fall back
    # to the legacy flag with the same skip-and-warn behaviour.
    df_styles = pd.read_csv(styles_csv_path, error_bad_lines = False)

# To reuse a previously cached sample instead of re-running the sampling
# cells below, load it with:
#     df_stratified_top = pickle.load(open("../Data/stratified_data.p", 'rb'))

b'Skipping line 6044: expected 10 fields, saw 11\nSkipping line 6569: expected 10 fields, saw 11\nSkipping line 7399: expected 10 fields, saw 11\nSkipping line 7939: expected 10 fields, saw 11\nSkipping line 9026: expected 10 fields, saw 11\nSkipping line 10264: expected 10 fields, saw 11\nSkipping line 10427: expected 10 fields, saw 11\nSkipping line 10905: expected 10 fields, saw 11\nSkipping line 11373: expected 10 fields, saw 11\nSkipping line 11945: expected 10 fields, saw 11\nSkipping line 14112: expected 10 fields, saw 11\nSkipping line 14532: expected 10 fields, saw 11\nSkipping line 15076: expected 10 fields, saw 12\nSkipping line 29906: expected 10 fields, saw 11\nSkipping line 31625: expected 10 fields, saw 11\nSkipping line 33020: expected 10 fields, saw 11\nSkipping line 35748: expected 10 fields, saw 11\nSkipping line 35962: expected 10 fields, saw 11\nSkipping line 37770: expected 10 fields, saw 11\nSkipping line 38105: expected 10 fields, saw 11\nSkipping line 38275: ex

'\n    Use the following code-line to read data\n    ## df_stratified_top = pickle.load(open("../Data/stratified_data.p", \'rb\')) ##\n'

In [3]:
# Preview the first rows of the loaded metadata.
df_styles.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


In [4]:
# Number of article types to keep.
top_n = 5

# Count products per (subCategory, articleType) pair ...
pair_counts = df_styles.groupby(['subCategory', 'articleType'])['id'].agg(['count'])
# ... keep the `top_n` most frequent pairs ...
largest_pairs = pair_counts.nlargest(n = top_n, columns = 'count').reset_index()
# ... and retain just their article-type names, most frequent first.
top_n_list = largest_pairs['articleType'].tolist()

In [5]:
# Display the selected article types.
top_n_list

['Tshirts', 'Shirts', 'Casual Shoes', 'Watches', 'Sports Shoes']

In [6]:
# Boolean mask: True for rows whose article type is one of the selected ones.
is_top_article = df_styles['articleType'].isin(top_n_list)
# Restrict the metadata to those rows.
df_top_n = df_styles.loc[is_top_article]

In [7]:
# Draw at most 1000 rows per article type so that no class exceeds that size.
# The fixed seed (same value as the train/test split uses) makes the sample
# reproducible after a kernel restart; without it every run picks other rows.
df_stratified_top = (
    df_top_n
    .groupby('articleType', group_keys=False)
    .apply(lambda group: group.sample(min(len(group), 1000), random_state = 1769))
    .reset_index(drop = True)
)

In [8]:
# Cache the sampled frame on disk. The context manager closes (and therefore
# flushes) the file handle even if pickling raises.
with open("../Data/stratified_data.p", 'wb') as cache_file:
    pickle.dump(df_stratified_top, cache_file)

### Structuring the image dataset

In [9]:
import shutil

from sklearn.model_selection import train_test_split

In [10]:
# Convert the article type to a categorical once and reuse it for both columns.
article_categories = df_stratified_top['articleType'].astype('category')
df_stratified_top['articleType'] = article_categories
# Integer code of each category becomes the class label.
df_stratified_top['label'] = article_categories.cat.codes

# Only the image id and its label are needed from here on.
df_stratified_sliced = df_stratified_top[['id', 'label']]

In [11]:
# Inputs are the image ids, targets are the integer class labels.
image_ids = df_stratified_sliced['id']
class_labels = df_stratified_sliced['label']

# 70/30 split with a fixed seed so the partition is repeatable.
split_options = {'test_size': 0.3, 'random_state': 1769}
X_train, X_test, y_train, y_test = train_test_split(image_ids, class_labels, **split_options)

In [12]:
# Template for the per-split image folder ('train' or 'test').
folder_path = "..//Data//images//{}//"

# Build <images>/<split>/<label>/ for every label present in each split.
# `os.makedirs(..., exist_ok=True)` creates missing parents and is a no-op
# when the folder already exists, so the cell can be re-run safely and no
# separate existence check is needed.
for split_name, split_labels in (('train', y_train), ('test', y_test)):
    split_dir = folder_path.format(split_name)
    os.makedirs(split_dir, exist_ok = True)
    for label in split_labels.unique():
        os.makedirs(split_dir + str(label), exist_ok = True)

In [None]:
# Destination templates: first slot is the label folder, second the image id.
train_path = "..//Data//images//train//{}//{}.jpg"
test_path = "..//Data//images//test//{}//{}.jpg"
# NOTE(review): machine-specific absolute path — point this at the local copy
# of the image dataset before running.
source_path = "D://Data//fashion-product-images-dataset//fashion-dataset//fashion-dataset//images//{}.jpg"


def _copy_split_images(image_ids, labels, destination_template, source_template):
    """Copy the images of one split into their per-label folders.

    Parameters
    ----------
    image_ids : iterable
        Image ids; each one is substituted into ``source_template``.
    labels : iterable
        Label of each image, aligned with ``image_ids``; selects the sub-folder.
    destination_template : str
        Format string with two slots: label, then image id.
    source_template : str
        Format string with one slot for the image id.

    Returns
    -------
    int
        Number of files copied.

    Raises
    ------
    OSError
        Propagated from ``shutil.copyfile``, e.g. when a source image or a
        destination folder is missing.
    """
    copied = 0
    for image_id, label in zip(image_ids, labels):
        shutil.copyfile(
            source_template.format(image_id),
            destination_template.format(str(label), str(image_id))
        )
        copied += 1
    return copied


# One summary line per split instead of one printed counter per file.
print ("Structuring TRAIN dataset")
print ("Copied {} images".format(_copy_split_images(X_train, y_train, train_path, source_path)))

print ("Structuring TEST dataset")
print ("Copied {} images".format(_copy_split_images(X_test, y_test, test_path, source_path)))

### ImageDataGenerator

In [13]:
from keras.preprocessing.image import ImageDataGenerator

from keras.models import Sequential
from keras.layers import Conv2D, MaxPool2D, Flatten, Dropout, Dense

from keras.optimizers import adam

Using TensorFlow backend.


In [14]:
# Training generator: random horizontal and vertical flips are the only
# augmentation; all other transforms are switched off (zero ranges, False or
# None), and pixel values are not rescaled.
datagen = ImageDataGenerator(
    featurewise_center = False, 
    samplewise_center = False, 
    featurewise_std_normalization = False, 
    samplewise_std_normalization = False, 
    zca_whitening = False, 
    zca_epsilon = 1e-06, 
    rotation_range = 0, 
    width_shift_range = 0.0, 
    height_shift_range = 0.0, 
    brightness_range = None, 
    shear_range = 0.0, 
    zoom_range = 0.0, 
    channel_shift_range = 0.0, 
    fill_mode = 'nearest', 
    cval = 0.0, 
    horizontal_flip = True, 
    vertical_flip = True, 
    rescale = None, 
    preprocessing_function = None, 
    data_format = 'channels_last', 
    validation_split = 0.0, 
    interpolation_order = 1, 
    dtype='float32'
)

# Evaluation generator: identical pixel handling but NO random flips, so
# validation metrics are measured on the unmodified test images rather than
# on randomly augmented copies.
test_datagen = ImageDataGenerator(
    horizontal_flip = False,
    vertical_flip = False,
    rescale = None,
    data_format = 'channels_last',
    dtype = 'float32'
)

# One sub-folder per class under each split; labels are one-hot encoded.
folder_path = '..//Data//images//{}/'
train_it = datagen.flow_from_directory(folder_path.format('train'), class_mode = 'categorical', batch_size = 32)
test_it = test_datagen.flow_from_directory(folder_path.format('test'), class_mode = 'categorical', batch_size = 32)

Found 3500 images belonging to 5 classes.
Found 1500 images belonging to 5 classes.


In [15]:
# Convolutional classifier for 256x256 RGB images and 5 classes.
model = Sequential()

# Feature extractor: four conv blocks, each followed by 3x3 max-pooling with
# stride 3; filter count doubles per block (64 -> 512).
model.add(Conv2D(64, (3, 3), input_shape = (256, 256, 3), activation='relu'))
model.add(MaxPool2D(3, 3))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPool2D(3, 3))
model.add(Dropout(0.2))
model.add(Conv2D(256, (3, 3), activation='relu'))
model.add(MaxPool2D(3, 3))
model.add(Conv2D(512, (3, 3), activation='relu'))
model.add(MaxPool2D(3, 3))

# Classifier head.
model.add(Flatten())
model.add(Dense(1024, activation = 'relu'))
model.add(Dense(512, activation = 'relu'))
model.add(Dense(256, activation = 'relu'))
# The output layer must emit a probability distribution over the 5 classes:
# categorical cross-entropy is defined on probabilities, so the activation is
# softmax (a relu output is unnormalised and can be all zeros).
model.add(Dense(5, activation = 'softmax'))

model.summary()

model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy']
)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 254, 254, 64)      1792      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 84, 84, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 82, 82, 128)       73856     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 27, 27, 128)       0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 27, 27, 128)       0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 25, 25, 256)       295168    
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 8, 8, 256)        

In [16]:
# First training run, without validation data.
# `steps_per_epoch` has to be an integer batch count: `len(train_it)` is the
# number of batches the iterator needs to yield every training image once
# (the final batch may be smaller than 32).
model.fit_generator(
    train_it,
    steps_per_epoch = len(train_it),
    epochs = 10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x24b213d94a8>

In [17]:
# Persist the trained model. The context manager closes the file handle even
# if pickling raises.
# NOTE(review): pickling a Keras model is fragile across library versions, and
# unpickling runs arbitrary code; Keras' own `model.save(...)` is the supported
# persistence API. The pickle file name is kept in case other code loads it.
with open("../Data/local_model.p", 'wb') as model_file:
    pickle.dump(model, model_file)

In [18]:
# Second training run on the same model, now also evaluating on the test
# iterator. Floor division means a trailing partial batch is not counted in
# either step total.
fit_options = dict(
    steps_per_epoch = train_it.samples // 32,
    validation_data = test_it,
    validation_steps = test_it.samples // 32,
    epochs = 10,
)
model.fit_generator(train_it, **fit_options)

Epoch 1/10

KeyboardInterrupt: 