In [3]:
import os
import pandas as pd
import numpy as np
from shutil import copyfile
from sklearn.model_selection import train_test_split

In [4]:
# Do not forget to seed the random state or the directories will change every time you
# run this code
np.random.seed(42)

# Import data

In [5]:
data = pd.read_csv("data/flipkart_com-ecommerce_sample_1050.csv")
data = data.set_index('uniq_id')

In [6]:
# We first need to extract the categories from the data
data['categ'] = data['product_category_tree'].str.split(pat='>>').str[0]
data['categ'] = data['categ'].str.split(pat='"').str[1]
# Let's get the second categ, we'll see if we need it later
data['categ2'] = data['product_category_tree'].str.split(pat='>>').str[1]

# Setting the right path for each category

The architecture will be as follow : <br/>
<li><ul> Train 
    <ul> Cat1 </ul>
    <ul> Cat2 </ul></ul>
    <ul> Validation 
    <ul> Cat1 </ul>
    <ul> Cat2 </ul></ul></li>
    <ul> Test 
    <ul> Cat1 </ul>
    <ul> Cat2 </ul></ul></li>

## Create the train, validation and test dataset

In [7]:
x = data.copy()
y = data[['categ']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=42)

Let's rebuild a single dataset with the labeled data

In [28]:
x_train.loc[:,'label'] = "Train"
x_val.loc[:,'label'] = "Validation"
x_test.loc[:,'label'] = "Test"
data = pd.concat([x_train, x_val])
data = pd.concat([data, x_test])

y_train.loc[:,'label'] = "Train"
y_val.loc[:,'label'] = "Validation"
y_test.loc[:,'label'] = "Test"

target = pd.concat([y_train, y_val])
target = pd.concat([target, y_test])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


## Get the different categories

In [8]:
categ = data['categ'].unique()

## Set up functions to create the directories

In [16]:
def createDirectory(categ, direct=['Train', 'Validation', 'Test']):
    for cat in categ:
        for d in direct:
            if os.path.isdir("data/" + d + "/" + cat):
                print("The directory :", "data/" + d + "/" + cat, "was already created")
            else:
                os.makedirs("data/" + d + "/" + cat)
                print("The directory :", "data/" + d + "/" + cat, "was created successfully")

In [17]:
createDirectory(categ)

The directory : data/Train/Home Furnishing  was created successfully
The directory : data/Validation/Home Furnishing  was created successfully
The directory : data/Test/Home Furnishing  was created successfully
The directory : data/Train/Baby Care  was created successfully
The directory : data/Validation/Baby Care  was created successfully
The directory : data/Test/Baby Care  was created successfully
The directory : data/Train/Watches  was created successfully
The directory : data/Validation/Watches  was created successfully
The directory : data/Test/Watches  was created successfully
The directory : data/Train/Home Decor & Festive Needs  was created successfully
The directory : data/Validation/Home Decor & Festive Needs  was created successfully
The directory : data/Test/Home Decor & Festive Needs  was created successfully
The directory : data/Train/Kitchen & Dining  was created successfully
The directory : data/Validation/Kitchen & Dining  was created successfully
The directory : data

## Put all images into the right directory

In [18]:
def moveImages():
    path_train = []
    path_test = []
    path_val = []
    for index in x_train.index:
        try:
            cat = x_train.loc[index, 'categ']
            path_ = "data/Train/" + cat.strip() + "/" + x_train.loc[index, 'image']
            copyfile("data/Images/" + x_train.loc[index, 'image'], path_)
            path_train.append(path_)
        except:
            print("Picture not in the path")
            path_ = " "
            path_train.append(path_)
            pass
    path_train = pd.DataFrame(path_train, index=x_train.index, columns=['path'])

    for index in x_val.index:
        try:
            cat = x_val.loc[index, 'categ'] 
            path_ = "data/Validation/" + cat.strip() + "/" + x_val.loc[index, 'image']
            copyfile("data/Images/" + x_val.loc[index, 'image'], path_)
            path_val.append(path_)
        except:
            print("Picture not in the path")
            path_ = " "
            path_val.append(path_)
            pass
    path_val = pd.DataFrame(path_val, index=x_val.index, columns=['path'])

    for index in x_test.index:
        try:
            cat = x_test.loc[index, 'categ'] 
            path_ = "data/Test/" + cat.strip() + "/" + x_test.loc[index, 'image']
            copyfile("data/Images/" + x_test.loc[index, 'image'], path_)
            path_test.append(path_)
        except:
            print("Picture not in the path")
            path_ = " "
            path_test.append(path_)
            pass
    path_test = pd.DataFrame(path_test, index=x_test.index, columns=['path'])
    
    return path_train, path_val, path_test

In [19]:
path_train, path_val, path_test = moveImages()

In [20]:
x_train = pd.merge(x_train, path_train, left_index=True, right_index=True, how='inner')
x_val = pd.merge(x_val, path_val, left_index=True, right_index=True, how='inner')
x_test = pd.merge(x_test, path_test, left_index=True, right_index=True, how='inner')

In [31]:
data.to_csv("data/data.csv")
target.to_csv("data/target.csv")