In [1]:
import pandas as pd
import os
import pickle

from measures import wrangling

# Import dataset and flatten for neural network input

In [2]:
dataset = pd.read_csv('./data/clean/data_v0.1.csv', header=0)

In [3]:
cols = pd.DataFrame([dataset.columns.tolist()])
cols.to_csv('./data/clean/data_v0.1_columns.csv')

In [4]:
# Drop February price-columns
dataset = wrangling.remove_price_cols(dataset, '2018-02-01', '2018-02-28')
print(dataset.columns)

Index(['key', 'pid_x', 'size_x', 'color', 'brand', 'rrp', 'mainCategory',
       'category', 'subCategory', 'stock',
       ...
       '2018-01-22_sales', '2018-01-23_sales', '2018-01-24_sales',
       '2018-01-25_sales', '2018-01-26_sales', '2018-01-27_sales',
       '2018-01-28_sales', '2018-01-29_sales', '2018-01-30_sales',
       '2018-01-31_sales'],
      dtype='object', length=257)


In [5]:
# Separate Y labels from X variables
Y_dataset = wrangling.sales_cols(dataset, '2017-10-01', '2018-01-31') 
X_dataset = wrangling.remove_sales_cols(dataset, '2017-10-01', '2018-01-31')

In [6]:
X_dataset = X_dataset.drop('stock', axis=1)
print(X_dataset.columns)

Index(['key', 'pid_x', 'size_x', 'color', 'brand', 'rrp', 'mainCategory',
       'category', 'subCategory', 'releaseDate',
       ...
       '2018-01-22_price', '2018-01-23_price', '2018-01-24_price',
       '2018-01-25_price', '2018-01-26_price', '2018-01-27_price',
       '2018-01-28_price', '2018-01-29_price', '2018-01-30_price',
       '2018-01-31_price'],
      dtype='object', length=133)


In [7]:
# Flatten X so that each row is 1 day; we expect 12,824*123 = 1,577,352 rows as result
cols = ['key', 'pid_x', 'size_x', 'color', 'brand', 'rrp', 'mainCategory', 'category', 'subCategory', 'releaseDate']
X_flat = pd.melt(X_dataset, id_vars=cols, var_name='date', value_name='price')
X_flat = X_flat.sort_values(['key', 'date']).reset_index(drop=True)
print(X_flat.shape)
print(X_flat.tail()) # Quick check of the result

(1577352, 12)
            key  pid_x size_x color   brand     rrp  mainCategory  category  \
1577347  22881S  22881      S  grau  adidas  139.63             1         7   
1577348  22881S  22881      S  grau  adidas  139.63             1         7   
1577349  22881S  22881      S  grau  adidas  139.63             1         7   
1577350  22881S  22881      S  grau  adidas  139.63             1         7   
1577351  22881S  22881      S  grau  adidas  139.63             1         7   

         subCategory releaseDate              date   price  
1577347         31.0  2017-10-01  2018-01-27_price  104.72  
1577348         31.0  2017-10-01  2018-01-28_price  104.72  
1577349         31.0  2017-10-01  2018-01-29_price  104.72  
1577350         31.0  2017-10-01  2018-01-30_price  104.72  
1577351         31.0  2017-10-01  2018-01-31_price  104.72  


In [8]:
# Flatten Y similarly, so that the rows of Y correspond to that of X
Y_flat = pd.melt(Y_dataset, id_vars='key', var_name='date', value_name='sales')
Y_flat = Y_flat.sort_values(['key', 'date']).reset_index(drop=True)
print(Y_flat.shape)
print(Y_flat.tail())

(1577352, 3)
            key              date  sales
1577347  22881S  2018-01-27_sales    0.0
1577348  22881S  2018-01-28_sales    0.0
1577349  22881S  2018-01-29_sales    0.0
1577350  22881S  2018-01-30_sales    1.0
1577351  22881S  2018-01-31_sales    1.0


# Cleaning data

In [9]:
# Clean 'date' columns to keep only YYYY-MM-DD part
X_flat['date'] = X_flat['date'].str[0:10]
Y_flat['date'] = Y_flat['date'].str[0:10]

In [10]:
X_flat['subCategory'] = X_flat['subCategory'].fillna(0) # Fill blank sub-category with 0
X_flat['size_x'] = X_flat['size_x'].fillna('NA') # Fill blank sizes with 'NA' string
X_flat['price'] = X_flat['price'].fillna(method='bfill') # Fill blank prices with earliest given price

# Feature engineering

In [11]:
print(X_flat.head())

                   key  pid_x          size_x  color brand    rrp  \
0  10000XL ( 158-170 )  10000  XL ( 158-170 )  gruen  Nike  25.33   
1  10000XL ( 158-170 )  10000  XL ( 158-170 )  gruen  Nike  25.33   
2  10000XL ( 158-170 )  10000  XL ( 158-170 )  gruen  Nike  25.33   
3  10000XL ( 158-170 )  10000  XL ( 158-170 )  gruen  Nike  25.33   
4  10000XL ( 158-170 )  10000  XL ( 158-170 )  gruen  Nike  25.33   

   mainCategory  category  subCategory releaseDate        date  price  
0             1         7         25.0  2017-10-01  2017-10-01  18.99  
1             1         7         25.0  2017-10-01  2017-10-02  18.99  
2             1         7         25.0  2017-10-01  2017-10-03  18.99  
3             1         7         25.0  2017-10-01  2017-10-04  18.99  
4             1         7         25.0  2017-10-01  2017-10-05  18.99  


In [12]:
# Add last N day's sales, up to 30 days
X_flat['last_1_day_sales'] = 0
X_flat['last_2_day_sales'] = 0
X_flat['last_3_day_sales'] = 0
X_flat['last_4_day_sales'] = 0
X_flat['last_5_day_sales'] = 0
X_flat['last_6_day_sales'] = 0
X_flat['last_7_day_sales'] = 0
X_flat['last_8_day_sales'] = 0
X_flat['last_9_day_sales'] = 0
X_flat['last_10_day_sales'] = 0
X_flat['last_11_day_sales'] = 0
X_flat['last_12_day_sales'] = 0
X_flat['last_13_day_sales'] = 0
X_flat['last_14_day_sales'] = 0
X_flat['last_15_day_sales'] = 0
X_flat['last_16_day_sales'] = 0
X_flat['last_17_day_sales'] = 0
X_flat['last_18_day_sales'] = 0
X_flat['last_19_day_sales'] = 0
X_flat['last_20_day_sales'] = 0
X_flat['last_21_day_sales'] = 0
X_flat['last_22_day_sales'] = 0
X_flat['last_23_day_sales'] = 0
X_flat['last_24_day_sales'] = 0
X_flat['last_25_day_sales'] = 0
X_flat['last_26_day_sales'] = 0
X_flat['last_27_day_sales'] = 0
X_flat['last_28_day_sales'] = 0

In [14]:
for i in range(12824):
    X_flat['last_1_day_sales'].iloc[i*123+1:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+122].as_matrix()
    X_flat['last_2_day_sales'].iloc[i*123+2:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+121].as_matrix()
    X_flat['last_3_day_sales'].iloc[i*123+3:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+120].as_matrix()
    X_flat['last_4_day_sales'].iloc[i*123+4:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+119].as_matrix()
    X_flat['last_5_day_sales'].iloc[i*123+5:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+118].as_matrix()
    X_flat['last_6_day_sales'].iloc[i*123+6:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+117].as_matrix()
    X_flat['last_7_day_sales'].iloc[i*123+7:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+116].as_matrix()
    X_flat['last_8_day_sales'].iloc[i*123+8:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+115].as_matrix()
    X_flat['last_9_day_sales'].iloc[i*123+9:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+114].as_matrix()
    X_flat['last_10_day_sales'].iloc[i*123+10:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+113].as_matrix()
    X_flat['last_11_day_sales'].iloc[i*123+11:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+112].as_matrix()
    X_flat['last_12_day_sales'].iloc[i*123+12:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+111].as_matrix()
    X_flat['last_13_day_sales'].iloc[i*123+13:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+110].as_matrix()
    X_flat['last_14_day_sales'].iloc[i*123+14:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+109].as_matrix()
    X_flat['last_15_day_sales'].iloc[i*123+15:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+108].as_matrix()
    X_flat['last_16_day_sales'].iloc[i*123+16:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+107].as_matrix()
    X_flat['last_17_day_sales'].iloc[i*123+17:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+106].as_matrix()
    X_flat['last_18_day_sales'].iloc[i*123+18:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+105].as_matrix()
    X_flat['last_19_day_sales'].iloc[i*123+19:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+104].as_matrix()
    X_flat['last_20_day_sales'].iloc[i*123+20:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+103].as_matrix()
    X_flat['last_21_day_sales'].iloc[i*123+21:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+102].as_matrix()
    X_flat['last_22_day_sales'].iloc[i*123+22:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+101].as_matrix()
    X_flat['last_23_day_sales'].iloc[i*123+23:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+100].as_matrix()
    X_flat['last_24_day_sales'].iloc[i*123+24:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+99].as_matrix()
    X_flat['last_25_day_sales'].iloc[i*123+25:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+98].as_matrix()
    X_flat['last_26_day_sales'].iloc[i*123+26:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+97].as_matrix()
    X_flat['last_27_day_sales'].iloc[i*123+27:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+96].as_matrix()
    X_flat['last_28_day_sales'].iloc[i*123+28:i*123+123] = Y_flat['sales'].iloc[i*123:i*123+95].as_matrix()
    if i%128 == 0:
        print('{} products finished'.format(str(i)))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


0 products finished


128 products finished


256 products finished


384 products finished


512 products finished


640 products finished


768 products finished


896 products finished


1024 products finished


1152 products finished


1280 products finished


1408 products finished


1536 products finished


1664 products finished


1792 products finished


1920 products finished


2048 products finished


2176 products finished


2304 products finished


2432 products finished


2560 products finished


2688 products finished


2816 products finished


2944 products finished


3072 products finished


3200 products finished


3328 products finished


3456 products finished


3584 products finished


3712 products finished


3840 products finished


3968 products finished


4096 products finished


4224 products finished


4352 products finished


4480 products finished


4608 products finished


4736 products finished


4864 products finished


4992 products finished


5120 products finished


5248 products finished


5376 products finished


5504 products finished


5632 products finished


5760 products finished


5888 products finished


6016 products finished


6144 products finished


6272 products finished


6400 products finished


6528 products finished


6656 products finished


6784 products finished


6912 products finished


7040 products finished


7168 products finished


7296 products finished


7424 products finished


7552 products finished


7680 products finished


7808 products finished


7936 products finished


8064 products finished


8192 products finished


8320 products finished


8448 products finished


8576 products finished


8704 products finished


8832 products finished


8960 products finished


9088 products finished


9216 products finished


9344 products finished


9472 products finished


9600 products finished


9728 products finished


9856 products finished


9984 products finished


10112 products finished


10240 products finished


10368 products finished


10496 products finished


10624 products finished


10752 products finished


10880 products finished


11008 products finished


11136 products finished


11264 products finished


11392 products finished


11520 products finished


11648 products finished


11776 products finished


11904 products finished


12032 products finished


12160 products finished


12288 products finished


12416 products finished


12544 products finished


12672 products finished


12800 products finished


In [16]:
def is_eleventh(row):
    if row['date'][-2:] == '11':
        return 1
    else: 
        return 0

X_flat['is_eleventh'] = X_flat.apply(is_eleventh, axis=1)

In [18]:
def is_crazy_day(row):
    if row['date'] == '2017-11-24': # Black Friday
        return 1
    return 0

X_flat['is_crazy_day'] = X_flat.apply(is_crazy_day, axis=1)

In [20]:
# One-hot encoding days of week
X_flat['day_of_week'] = pd.to_datetime(X_flat['date']).dt.weekday_name

In [22]:
days = pd.get_dummies(X_flat['day_of_week'], prefix='day')
X_flat = X_flat.join(days)

ValueError: columns overlap but no suffix specified: Index(['day_Friday', 'day_Monday', 'day_Saturday', 'day_Sunday',
       'day_Thursday', 'day_Tuesday', 'day_Wednesday'],
      dtype='object')

In [24]:
X_flat['days_since_release'] = (pd.to_datetime(X_flat['date']) - pd.to_datetime(X_flat['releaseDate'])).dt.days

In [25]:
X_flat['price_diff'] = X_flat['price'] - X_flat['rrp']

In [26]:
# One-hot encoding everything else except sub-category (too many of them)
colors = pd.get_dummies(X_flat['color'], prefix='color')
brands = pd.get_dummies(X_flat['brand'], prefix='brand')
main_cats = pd.get_dummies(X_flat['mainCategory'], prefix='maincat')
cats = pd.get_dummies(X_flat['category'], prefix='cat')
sub_cats = pd.get_dummies(X_flat['subCategory'], prefix='subcat')

X_flat = X_flat.join(colors)
X_flat = X_flat.join(brands)
X_flat = X_flat.join(main_cats)
X_flat = X_flat.join(cats)

In [28]:
def marketing_activity(row):
    marketingactivities = ['2017-10-11','2017-10-16','2017-11-04','2017-11-11','2017-11-23','2017-11-24', 
                           '2017-11-25','2017-11-27','2017-12-03','2017-12-27','2017-12-28','2017-12-31', 
                           '2018-01-14','2018-01-22','2018-01-23','2018-01-30','2018-02-06','2018-02-07', 
                           '2018-02-20','2018-02-22','2018-02-23']
    if row['date'] in marketingactivities:
        return 1
    else:
        return 0
    
X_flat['marketing_activity'] = X_flat.apply(marketing_activity, axis=1)

In [30]:
X_flat.columns

Index(['key', 'pid_x', 'size_x', 'color', 'brand', 'rrp', 'mainCategory',
       'category', 'subCategory', 'releaseDate',
       ...
       'cat_7', 'cat_10', 'cat_16', 'cat_18', 'cat_24', 'cat_30', 'cat_33',
       'cat_36', 'cat_37', 'marketing_activity'],
      dtype='object', length=108)

# Dump data files to directory

In [31]:
# Dump to pickle
pickle.dump(X_flat, open('./data/clean/X_flat.pkl', 'wb'))
pickle.dump(Y_flat, open('./data/clean/Y_flat.pkl', 'wb'))
# Read the file later by: X_flat = pickle.load(open('path/to/X_flat.pkl', "rb"))