In [1]:
import pandas as pd
import os
from measures import wrangling

# Import dataset and prepare data for neural network input

In [2]:
# Read the cleaned dataset from disk; header=0 takes the column labels from the CSV's first row.
source_csv = './data/clean/data_v0.1.csv'
dataset = pd.read_csv(source_csv, header=0)

In [3]:
# Show the column labels of the freshly loaded frame (pandas abbreviates a long Index with '...').
column_labels = dataset.columns
print(column_labels)

Index(['key', 'pid_x', 'size_x', 'color', 'brand', 'rrp', 'mainCategory',
       'category', 'subCategory', 'stock',
       ...
       '2018-01-22_sales', '2018-01-23_sales', '2018-01-24_sales',
       '2018-01-25_sales', '2018-01-26_sales', '2018-01-27_sales',
       '2018-01-28_sales', '2018-01-29_sales', '2018-01-30_sales',
       '2018-01-31_sales'],
      dtype='object', length=285)


In [4]:
# Drop February price-columns.
# NOTE(review): remove_price_cols comes from the project's measures.wrangling module, which is
# not visible here; presumably both bounds are inclusive (the recorded output shows the column
# count going from 285 to 257, i.e. 28 columns removed) -- confirm against the helper.
february_first, february_last = '2018-02-01', '2018-02-28'
dataset = wrangling.remove_price_cols(dataset, february_first, february_last)
print(dataset.columns)

Index(['key', 'pid_x', 'size_x', 'color', 'brand', 'rrp', 'mainCategory',
       'category', 'subCategory', 'stock',
       ...
       '2018-01-22_sales', '2018-01-23_sales', '2018-01-24_sales',
       '2018-01-25_sales', '2018-01-26_sales', '2018-01-27_sales',
       '2018-01-28_sales', '2018-01-29_sales', '2018-01-30_sales',
       '2018-01-31_sales'],
      dtype='object', length=257)


In [12]:
# Separate Y labels from X variables.
# Training labels cover 2017-10-01 .. 2017-12-31, test labels cover 2018-01-01 .. 2018-01-31.
train_first_day, train_last_day = '2017-10-01', '2017-12-31'
test_first_day, test_last_day = '2018-01-01', '2018-01-31'

Y_train = wrangling.sales_cols(dataset, train_first_day, train_last_day)
Y_test = wrangling.sales_cols(dataset, test_first_day, test_last_day)

# NOTE(review): presumably this strips the sales columns of both periods from the inputs --
# confirm against measures.wrangling.remove_sales_cols.
X_dataset = wrangling.remove_sales_cols(dataset, train_first_day, test_last_day)

In [13]:
# Drop more unnecessary columns from X.
# Note: running this cell a second time raises a KeyError, because the columns are already gone.
unneeded_columns = ['pid_x', 'size_x', 'stock']
X_dataset = X_dataset.drop(unneeded_columns, axis='columns')

In [14]:
# Flatten X so that each row is 1 day; we expect 12,824*123 = 1,577,352 rows as result.
# Every column that is not listed in `cols` is melted into a (date, price) pair, so `cols`
# must name all the per-item attributes that are repeated on each daily row.
cols = [
    'key', 'color', 'brand', 'rrp',
    'mainCategory', 'category', 'subCategory', 'releaseDate',
]

# After melting, 'date' holds the former column labels (e.g. '2018-01-27_price'). They begin
# with an ISO date, so ordering them as strings within a key also orders them chronologically.
X_flat = (
    pd.melt(X_dataset, id_vars=cols, var_name='date', value_name='price')
    .sort_values(['key', 'date'])
    .reset_index(drop=True)
)

print(X_flat.shape)
print(X_flat.tail())  # Quick check of the result

(1577352, 10)
            key color   brand     rrp  mainCategory  category  subCategory  \
1577347  22881S  grau  adidas  139.63             1         7         31.0   
1577348  22881S  grau  adidas  139.63             1         7         31.0   
1577349  22881S  grau  adidas  139.63             1         7         31.0   
1577350  22881S  grau  adidas  139.63             1         7         31.0   
1577351  22881S  grau  adidas  139.63             1         7         31.0   

        releaseDate              date   price  
1577347  2017-10-01  2018-01-27_price  104.72  
1577348  2017-10-01  2018-01-28_price  104.72  
1577349  2017-10-01  2018-01-29_price  104.72  
1577350  2017-10-01  2018-01-30_price  104.72  
1577351  2017-10-01  2018-01-31_price  104.72  


In [15]:
# Flatten Y similarly, so that the rows of Y correspond to that of X: one row per (key, day),
# ordered by key and then by date label, with a fresh 0..n-1 index.
# Note: this cell overwrites Y_train / Y_test with their long form, so it must run exactly once
# per kernel session; a second run would melt the already-flattened frames again.
Y_train, Y_test = (
    pd.melt(wide_labels, id_vars='key', var_name='date', value_name='sales')
    .sort_values(['key', 'date'])
    .reset_index(drop=True)
    for wide_labels in (Y_train, Y_test)
)

In [16]:
# Separate X to training and test data; test data should have 12,824*31 = 397,544 rows.
# The 'date' values start with the ISO date of the day they describe, so the leading year is
# enough to tell the 2017 training days from the 2018 test days.
X_train = X_flat.loc[X_flat['date'].str.startswith('2017')].reset_index(drop=True)
X_test = X_flat.loc[X_flat['date'].str.startswith('2018')].reset_index(drop=True)
print(X_test.shape)
print(Y_test.shape)

# The rows of Y are meant to correspond to the rows of X (both were sorted by key and date).
# Comparing two printed shapes by eye only covers the test split and only the row count, so
# check the correspondence explicitly for both splits: same length, same key order and same
# day order. Only the first 10 characters (YYYY-MM-DD) of 'date' are compared, which works
# whether or not the '_price' / '_sales' suffixes have been trimmed yet.
for _split, _X, _Y in (('train', X_train, Y_train), ('test', X_test, Y_test)):
    assert len(_X) == len(_Y), \
        '{}: X has {} rows but Y has {}'.format(_split, len(_X), len(_Y))
    assert _X['key'].equals(_Y['key']), \
        '{}: key order of X and Y differs'.format(_split)
    assert _X['date'].str[0:10].equals(_Y['date'].str[0:10]), \
        '{}: day order of X and Y differs'.format(_split)

(397544, 10)
(397544, 3)


In [19]:
# Clean 'date' columns to keep only YYYY-MM-DD part, i.e. the first 10 characters of each label
# (this cuts off suffixes such as '_price'). Re-running the cell is harmless, because an
# already-trimmed value is exactly 10 characters long.
for frame in (X_train, X_test, Y_train, Y_test):
    frame['date'] = frame['date'].str.slice(0, 10)

print(Y_test.head())
print(X_test.head())

                   key        date  sales
0  10000XL ( 158-170 )  2018-01-01    0.0
1  10000XL ( 158-170 )  2018-01-02    0.0
2  10000XL ( 158-170 )  2018-01-03    0.0
3  10000XL ( 158-170 )  2018-01-04    0.0
4  10000XL ( 158-170 )  2018-01-05    0.0
                   key  color brand    rrp  mainCategory  category  \
0  10000XL ( 158-170 )  gruen  Nike  25.33             1         7   
1  10000XL ( 158-170 )  gruen  Nike  25.33             1         7   
2  10000XL ( 158-170 )  gruen  Nike  25.33             1         7   
3  10000XL ( 158-170 )  gruen  Nike  25.33             1         7   
4  10000XL ( 158-170 )  gruen  Nike  25.33             1         7   

   subCategory releaseDate        date  price  
0         25.0  2017-10-01  2018-01-01  18.99  
1         25.0  2017-10-01  2018-01-02  18.99  
2         25.0  2017-10-01  2018-01-03  18.99  
3         25.0  2017-10-01  2018-01-04  18.99  
4         25.0  2017-10-01  2018-01-05  18.99  


In [20]:
# Store dataframes to csv.
out_directory = './data/clean'

# exist_ok=True creates the directory when it is missing and does nothing when it is already
# there, which avoids the separate exists()-then-makedirs() check (and the window between them).
os.makedirs(out_directory, exist_ok=True)

# NOTE(review): to_csv is left at its defaults, so each file also gets the frame's 0..n-1 index
# as an unnamed first column. That matches what this cell has always written; pass index=False
# here only after confirming that whatever reads these files does not rely on that column.
for file_stem, frame in (
    ('nn_X_train', X_train),
    ('nn_X_test', X_test),
    ('nn_Y_train', Y_train),
    ('nn_Y_test', Y_test),
):
    frame.to_csv('{}/{}.csv'.format(out_directory, file_stem))