In [117]:
import pandas as pd
import os
from measures import wrangling

# Import dataset and flatten for neural network input

In [147]:
# Load the v0.1 dataset; row 0 of the CSV supplies the column names.
dataset = pd.read_csv(
    './data/clean/data_v0.1.csv',
    header=0,
)

In [148]:
# Persist the dataset's column names as a single-row CSV for later reference.
column_names = list(dataset.columns)
cols = pd.DataFrame([column_names])
cols.to_csv('./data/clean/data_v0.1_columns.csv')

In [149]:
# Remove the price columns for February 2018 (delegated to wrangling.remove_price_cols)
feb_start, feb_end = '2018-02-01', '2018-02-28'
dataset = wrangling.remove_price_cols(dataset, feb_start, feb_end)
print(dataset.columns)

Index(['key', 'pid_x', 'size_x', 'color', 'brand', 'rrp', 'mainCategory',
       'category', 'subCategory', 'stock',
       ...
       '2018-01-22_sales', '2018-01-23_sales', '2018-01-24_sales',
       '2018-01-25_sales', '2018-01-26_sales', '2018-01-27_sales',
       '2018-01-28_sales', '2018-01-29_sales', '2018-01-30_sales',
       '2018-01-31_sales'],
      dtype='object', length=257)


In [150]:
# Split the table into labels (Y: sales columns) and inputs (X: everything else),
# using the same date window for both helper calls.
sales_window = ('2017-10-01', '2018-01-31')
Y_dataset = wrangling.sales_cols(dataset, *sales_window)
X_dataset = wrangling.remove_sales_cols(dataset, *sales_window)

In [151]:
# Exclude the 'stock' column from the input features.
X_dataset = X_dataset.drop(labels='stock', axis='columns')
print(X_dataset.columns)

Index(['key', 'pid_x', 'size_x', 'color', 'brand', 'rrp', 'mainCategory',
       'category', 'subCategory', 'releaseDate',
       ...
       '2018-01-22_price', '2018-01-23_price', '2018-01-24_price',
       '2018-01-25_price', '2018-01-26_price', '2018-01-27_price',
       '2018-01-28_price', '2018-01-29_price', '2018-01-30_price',
       '2018-01-31_price'],
      dtype='object', length=133)


In [152]:
# Reshape X from wide to long so that each row is one (key, day) pair;
# we expect 12,824*123 = 1,577,352 rows as result.
cols = [
    'key', 'pid_x', 'size_x', 'color', 'brand',
    'rrp', 'mainCategory', 'category', 'subCategory', 'releaseDate',
]
X_flat = (
    pd.melt(X_dataset, id_vars=cols, var_name='date', value_name='price')
    .sort_values(['key', 'date'])   # deterministic order shared with Y_flat
    .reset_index(drop=True)
)
print(X_flat.shape)
print(X_flat.tail())  # Quick check of the result

(1577352, 12)
            key  pid_x size_x color   brand     rrp  mainCategory  category  \
1577347  22881S  22881      S  grau  adidas  139.63             1         7   
1577348  22881S  22881      S  grau  adidas  139.63             1         7   
1577349  22881S  22881      S  grau  adidas  139.63             1         7   
1577350  22881S  22881      S  grau  adidas  139.63             1         7   
1577351  22881S  22881      S  grau  adidas  139.63             1         7   

         subCategory releaseDate              date   price  
1577347         31.0  2017-10-01  2018-01-27_price  104.72  
1577348         31.0  2017-10-01  2018-01-28_price  104.72  
1577349         31.0  2017-10-01  2018-01-29_price  104.72  
1577350         31.0  2017-10-01  2018-01-30_price  104.72  
1577351         31.0  2017-10-01  2018-01-31_price  104.72  


In [154]:
# Reshape Y the same way (one row per key and day) and apply the same
# (key, date) ordering, so row i of Y lines up with row i of X.
Y_flat = (
    pd.melt(Y_dataset, id_vars='key', var_name='date', value_name='sales')
    .sort_values(['key', 'date'])
    .reset_index(drop=True)
)
print(Y_flat.shape)
print(Y_flat.tail())

(1577352, 3)
            key              date  sales
1577347  22881S  2018-01-27_sales    0.0
1577348  22881S  2018-01-28_sales    0.0
1577349  22881S  2018-01-29_sales    0.0
1577350  22881S  2018-01-30_sales    1.0
1577351  22881S  2018-01-31_sales    1.0


# Cleaning data

In [155]:
# The melted 'date' values still carry a '_price' / '_sales' suffix;
# keep only the leading YYYY-MM-DD part (first 10 characters) in both frames.
for frame in (X_flat, Y_flat):
    frame['date'] = frame['date'].str[:10]

In [156]:
X_flat['subCategory'] = X_flat['subCategory'].fillna(0)  # Fill blank sub-category with 0
X_flat['size_x'] = X_flat['size_x'].fillna('NA')  # Fill blank sizes with 'NA' string
# Fill blank prices with the earliest given price of the *same* product.
# X_flat is ordered by (key, date), so an ungrouped back-fill would copy the
# first price of the next key into any trailing blanks of the previous key.
# Grouping by 'key' keeps the fill inside each product; blanks that have no
# later price within their own key are left as NaN instead of borrowing one.
X_flat['price'] = X_flat.groupby('key')['price'].bfill()

# Feature engineering

In [158]:
print(X_flat.head())

                   key  pid_x          size_x  color brand    rrp  \
0  10000XL ( 158-170 )  10000  XL ( 158-170 )  gruen  Nike  25.33   
1  10000XL ( 158-170 )  10000  XL ( 158-170 )  gruen  Nike  25.33   
2  10000XL ( 158-170 )  10000  XL ( 158-170 )  gruen  Nike  25.33   
3  10000XL ( 158-170 )  10000  XL ( 158-170 )  gruen  Nike  25.33   
4  10000XL ( 158-170 )  10000  XL ( 158-170 )  gruen  Nike  25.33   

   mainCategory  category  subCategory releaseDate        date  price  
0             1         7         25.0  2017-10-01  2017-10-01  18.99  
1             1         7         25.0  2017-10-01  2017-10-02  18.99  
2             1         7         25.0  2017-10-01  2017-10-03  18.99  
3             1         7         25.0  2017-10-01  2017-10-04  18.99  
4             1         7         25.0  2017-10-01  2017-10-05  18.99  


In [159]:
def is_eleventh(row):
    """Return 1 if the last two characters of ``row['date']`` are '11', else 0.

    ``row`` is any mapping-like record (e.g. a DataFrame row) whose 'date'
    entry is a string; with YYYY-MM-DD dates this flags the 11th of a month.
    """
    day_part = row['date'][-2:]
    return 1 if day_part == '11' else 0

# Vectorised form of the is_eleventh check: compare the last two characters of
# every date string at once instead of calling a Python function per row.
# Yields the same 0/1 integers, and also works when the frame is empty.
X_flat['is_eleventh'] = (X_flat['date'].str[-2:] == '11').astype(int)

In [160]:
def is_crazy_day(row):
    """Return 1 if ``row['date']`` is Black Friday 2017 ('2017-11-24'), else 0."""
    black_friday = '2017-11-24'
    return 1 if row['date'] == black_friday else 0

# Vectorised form of the is_crazy_day check: flag Black Friday (2017-11-24)
# with one column-wide comparison instead of a Python call per row.
# Yields the same 0/1 integers, and also works when the frame is empty.
X_flat['is_crazy_day'] = (X_flat['date'] == '2017-11-24').astype(int)

In [161]:
# One-hot encoding days
def is_day(day, row=None):
    """Return 1 if ``row['day_of_week']`` equals ``day``, otherwise 0.

    Parameters
    ----------
    day : str
        Weekday name to test for, e.g. 'Monday'.
    row : mapping or pandas.Series
        Record exposing a 'day_of_week' entry. The previous body read an
        undefined name ``row``, so every call raised NameError; the record is
        now passed in explicitly. It defaults to None only so that the
        one-argument signature stays valid.

    Raises
    ------
    TypeError
        If ``row`` is not supplied.
    """
    if row is None:
        raise TypeError("is_day() requires the row to inspect as its second argument")
    return 1 if row['day_of_week'] == day else 0

# English weekday name ('Monday' ... 'Sunday') for every row's date.
# Series.dt.weekday_name was removed in pandas 1.0; dt.day_name() is the
# replacement and returns the same English names by default.
X_flat['day_of_week'] = pd.to_datetime(X_flat['date']).dt.day_name()

In [162]:
# One indicator column per weekday name ('day_<name>'), appended to X_flat
# by an index-aligned left join.
days = pd.get_dummies(X_flat['day_of_week'], prefix='day')
X_flat = X_flat.join(days, how='left')

In [163]:
# Whole days elapsed between the product's release date and the row's date
# (negative for rows dated before the release).
observed_on = pd.to_datetime(X_flat['date'])
released_on = pd.to_datetime(X_flat['releaseDate'])
X_flat['days_since_release'] = (observed_on - released_on).dt.days

In [164]:
# Gap between the day's price and 'rrp'; negative when the price is below rrp.
rrp_values = X_flat['rrp']
X_flat['price_diff'] = X_flat['price'] - rrp_values

In [165]:
# One-hot encode the remaining categorical columns. Only colour, brand, main
# category and category are appended to X_flat; sub-category is left out of
# the join (too many of them).
# NOTE(review): sub_cats is still built here although nothing visible uses it.
colors, brands, main_cats, cats, sub_cats = (
    pd.get_dummies(X_flat[column], prefix=prefix)
    for column, prefix in [
        ('color', 'color'),
        ('brand', 'brand'),
        ('mainCategory', 'maincat'),
        ('category', 'cat'),
        ('subCategory', 'subcat'),
    ]
)

X_flat = X_flat.join(colors).join(brands).join(main_cats).join(cats)

In [166]:
# Display the column index of X_flat to check which engineered and one-hot
# columns are present after the joins above.
X_flat.columns

Index(['key', 'pid_x', 'size_x', 'color', 'brand', 'rrp', 'mainCategory',
       'category', 'subCategory', 'releaseDate', 'date', 'price',
       'is_eleventh', 'is_crazy_day', 'day_of_week', 'day_Friday',
       'day_Monday', 'day_Saturday', 'day_Sunday', 'day_Thursday',
       'day_Tuesday', 'day_Wednesday', 'days_since_release', 'price_diff',
       'color_beige', 'color_blau', 'color_braun', 'color_gelb', 'color_gold',
       'color_grau', 'color_gruen', 'color_khaki', 'color_lila',
       'color_orange', 'color_pink', 'color_rosa', 'color_rot',
       'color_schwarz', 'color_silber', 'color_tuerkis', 'color_weiss',
       'brand_Asics', 'brand_Cinquestelle', 'brand_Converse', 'brand_Diadora',
       'brand_Erima', 'brand_FREAM', 'brand_Hummel', 'brand_Jako',
       'brand_Jordan', 'brand_KangaROOS', 'brand_Kempa', 'brand_Lotto',
       'brand_Mizuno', 'brand_New Balance', 'brand_Nike', 'brand_Onitsuka',
       'brand_PUMA', 'brand_Reebok', 'brand_Reusch', 'brand_Sells',
       'b

# Dump data files to directory

In [167]:
# Store dataframes to csv
out_directory = './data/clean'
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, without the check-then-create race of a separate exists() test.
os.makedirs(out_directory, exist_ok=True)

# to_csv is called with its defaults, so the row index is written as the
# first (unnamed) column of each file.
X_flat.to_csv('{}/nn_X.csv'.format(out_directory))
Y_flat.to_csv('{}/nn_Y.csv'.format(out_directory))