In [132]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime


In [133]:

# =============================================================================
# 0.2 Load the raw data
# =============================================================================

# low_memory=False makes pandas read each file in one pass instead of in
# chunks, so column dtypes are inferred from the whole column.
train = pd.read_csv('data/train.csv', low_memory=False)
store = pd.read_csv('data/store.csv', low_memory=False)

#%% 1. Attach store-level attributes to the training rows

# =============================================================================
# 1.1 / 1.2 Store columns that can be joined onto `train` without any
#           preprocessing
# =============================================================================

# Join key ('Store') plus the three attributes that are used as-is.
easy = store[['Store', 'StoreType', 'Assortment', 'CompetitionDistance']]

# Every training row picks up the attributes of its store.
# NOTE: the default join type is 'inner', so training rows whose 'Store' value
# has no counterpart in `store` are dropped by this step.
expanded_store = train.merge(easy, on='Store')


In [134]:
# Work on an independent copy: the following cells assign columns on `expanded`
# in place (e.g. the 'Date' conversion), and a plain `expanded = expanded_store`
# would only alias the merged frame, silently modifying it as well and making
# the cells non-idempotent on re-run.
expanded = expanded_store.copy()

In [136]:
# Parse the 'Date' column into pandas datetimes.
parsed_dates = pd.to_datetime(expanded['Date'])
expanded['Date'] = parsed_dates

# Display only: set_index returns a new frame and the result is not assigned,
# so `expanded` itself keeps 'Date' as an ordinary column.
expanded.set_index('Date')

Unnamed: 0_level_0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013-01-01,1115.0,2.0,0.0,0.0,0.0,0.0,a,1.0,d,c,5350.0
2013-01-02,1115.0,3.0,3697.0,305.0,1.0,0.0,0,1.0,d,c,5350.0
2013-01-03,1115.0,4.0,4297.0,300.0,1.0,0.0,0,1.0,d,c,5350.0
2013-01-04,1115.0,5.0,4540.0,326.0,1.0,0.0,0,1.0,d,c,5350.0
2013-01-05,1115.0,6.0,4771.0,339.0,1.0,0.0,0,1.0,d,c,5350.0
...,...,...,...,...,...,...,...,...,...,...,...
2014-06-26,127.0,4.0,3335.0,480.0,1.0,0.0,0,0.0,d,a,1350.0
2014-06-27,127.0,5.0,4724.0,591.0,1.0,0.0,0,0.0,d,a,1350.0
2014-06-28,127.0,6.0,5398.0,716.0,1.0,0.0,0,0.0,d,a,1350.0
2014-06-29,127.0,7.0,0.0,0.0,0.0,0.0,0,0.0,d,a,1350.0


In [137]:
# Derive calendar features from the parsed 'Date' column.
date_parts = expanded['Date'].dt

expanded['dayofweek'] = date_parts.dayofweek  # pandas convention: Monday=0 ... Sunday=6
expanded['quarter'] = date_parts.quarter
expanded['month'] = date_parts.month
expanded['year'] = date_parts.year
expanded['dayofyear'] = date_parts.dayofyear
expanded['dayofmonth'] = date_parts.day

# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in pandas 2.0.
# Its replacement is the ISO week from `isocalendar()`; the old accessor is kept
# only as a fallback for pandas versions that predate `isocalendar()`.
if hasattr(date_parts, 'isocalendar'):
    iso_week = date_parts.isocalendar().week
    # isocalendar() returns a nullable UInt32 column; convert to a plain numpy
    # dtype (float only if missing dates are present, so NaN can be stored).
    expanded['weekofyear'] = iso_week.astype(
        'float64' if iso_week.isna().any() else 'int64'
    )
else:
    expanded['weekofyear'] = date_parts.weekofyear

In [138]:
# One-hot encode in two passes:
#   1. get_dummies without `columns` encodes the object/category-typed columns
#      (StateHoliday, StoreType and Assortment in the recorded output);
#   2. the integer-valued calendar features are only encoded when they are
#      named explicitly.
calendar_features = ['dayofweek', 'dayofmonth', 'quarter', 'month']
categoricals_encoded = pd.get_dummies(expanded)
expanded = pd.get_dummies(categoricals_encoded, columns=calendar_features)

# Display only: the Date-indexed view is not assigned back to `expanded`.
expanded.set_index('Date')


Unnamed: 0_level_0,Store,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday,CompetitionDistance,year,dayofyear,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,1115.0,2.0,0.0,0.0,0.0,0.0,1.0,5350.0,2013,1,...,0,0,0,0,0,0,0,0,0,0
2013-01-02,1115.0,3.0,3697.0,305.0,1.0,0.0,1.0,5350.0,2013,2,...,0,0,0,0,0,0,0,0,0,0
2013-01-03,1115.0,4.0,4297.0,300.0,1.0,0.0,1.0,5350.0,2013,3,...,0,0,0,0,0,0,0,0,0,0
2013-01-04,1115.0,5.0,4540.0,326.0,1.0,0.0,1.0,5350.0,2013,4,...,0,0,0,0,0,0,0,0,0,0
2013-01-05,1115.0,6.0,4771.0,339.0,1.0,0.0,1.0,5350.0,2013,5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-06-26,127.0,4.0,3335.0,480.0,1.0,0.0,0.0,1350.0,2014,177,...,0,0,0,1,0,0,0,0,0,0
2014-06-27,127.0,5.0,4724.0,591.0,1.0,0.0,0.0,1350.0,2014,178,...,0,0,0,1,0,0,0,0,0,0
2014-06-28,127.0,6.0,5398.0,716.0,1.0,0.0,0.0,1350.0,2014,179,...,0,0,0,1,0,0,0,0,0,0
2014-06-29,127.0,7.0,0.0,0.0,0.0,0.0,0.0,1350.0,2014,180,...,0,0,0,1,0,0,0,0,0,0


In [139]:
# Re-create the plain (non-dummy) calendar columns from 'Date'.
# The one-hot encoding step replaced dayofweek / dayofmonth / quarter / month
# with indicator columns; assigning them again keeps the original integer
# features available next to the dummies.
date_parts = expanded['Date'].dt

expanded['dayofweek'] = date_parts.dayofweek  # pandas convention: Monday=0 ... Sunday=6
expanded['quarter'] = date_parts.quarter
expanded['month'] = date_parts.month
expanded['year'] = date_parts.year
expanded['dayofyear'] = date_parts.dayofyear
expanded['dayofmonth'] = date_parts.day

# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in pandas 2.0.
# Its replacement is the ISO week from `isocalendar()`; the old accessor is kept
# only as a fallback for pandas versions that predate `isocalendar()`.
if hasattr(date_parts, 'isocalendar'):
    iso_week = date_parts.isocalendar().week
    # isocalendar() returns a nullable UInt32 column; convert to a plain numpy
    # dtype (float only if missing dates are present, so NaN can be stored).
    expanded['weekofyear'] = iso_week.astype(
        'float64' if iso_week.isna().any() else 'int64'
    )
else:
    expanded['weekofyear'] = date_parts.weekofyear

# Display only: set_index returns a new frame that is not assigned back.
expanded.set_index('Date')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 618473 entries, 0 to 618472
Data columns (total 81 columns):
Date                   618473 non-null datetime64[ns]
Store                  618473 non-null float64
DayOfWeek              600036 non-null float64
Sales                  600028 non-null float64
Customers              599957 non-null float64
Open                   599832 non-null float64
Promo                  599791 non-null float64
SchoolHoliday          599686 non-null float64
CompetitionDistance    616838 non-null float64
year                   618473 non-null int64
dayofyear              618473 non-null int64
weekofyear             618473 non-null int64
StateHoliday_0         618473 non-null uint8
StateHoliday_a         618473 non-null uint8
StateHoliday_b         618473 non-null uint8
StateHoliday_c         618473 non-null uint8
StoreType_a            618473 non-null uint8
StoreType_b            618473 non-null uint8
StoreType_c            618473 non-null uint8
StoreType

In [140]:
# Average sales per customer for each store: total sales over total customers
# across all of that store's rows.
df = expanded.groupby(by=['Store']).agg({'Sales': 'sum', 'Customers': 'sum'})
df['av_SalesPerCustomer'] = df['Sales'] / df['Customers']

# Attach the per-store average to every row of that store. `df` is indexed by
# 'Store', so the merge key resolves to its index level.
# Any 'av_SalesPerCustomer' column left over from a previous run of this cell is
# dropped first; otherwise re-running would produce '_x' / '_y' suffixed
# duplicates instead of a single column.
# (The former `df.sort_values(...)` call was removed: its result was neither
# assigned nor displayed, so it had no effect.)
expanded = pd.merge(
    expanded.drop(columns=['av_SalesPerCustomer'], errors='ignore'),
    df['av_SalesPerCustomer'],
    on=['Store'],
)

# Sales per customer for each store and day.
# Rows with zero customers yield NaN (0/0) or inf (x/0) rather than an error.
expanded['SalesPerCustomer'] = expanded['Sales'] / expanded['Customers']


Index(['Date', 'Store', 'DayOfWeek', 'Sales', 'Customers', 'Open', 'Promo',
       'SchoolHoliday', 'CompetitionDistance', 'year', 'dayofyear',
       'weekofyear', 'StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b',
       'StateHoliday_c', 'StoreType_a', 'StoreType_b', 'StoreType_c',
       'StoreType_d', 'Assortment_a', 'Assortment_b', 'Assortment_c',
       'dayofweek_0', 'dayofweek_1', 'dayofweek_2', 'dayofweek_3',
       'dayofweek_4', 'dayofweek_5', 'dayofweek_6', 'dayofmonth_1',
       'dayofmonth_2', 'dayofmonth_3', 'dayofmonth_4', 'dayofmonth_5',
       'dayofmonth_6', 'dayofmonth_7', 'dayofmonth_8', 'dayofmonth_9',
       'dayofmonth_10', 'dayofmonth_11', 'dayofmonth_12', 'dayofmonth_13',
       'dayofmonth_14', 'dayofmonth_15', 'dayofmonth_16', 'dayofmonth_17',
       'dayofmonth_18', 'dayofmonth_19', 'dayofmonth_20', 'dayofmonth_21',
       'dayofmonth_22', 'dayofmonth_23', 'dayofmonth_24', 'dayofmonth_25',
       'dayofmonth_26', 'dayofmonth_27', 'dayofmonth_28', 'dayof

In [27]:
# Expose the feature frame under the name `data_new`.
# Plain assignment binds a second name to the same DataFrame; no copy is made
# and no rows are removed by this statement.
data_new=expanded
# Disabled step: drop every row that contains at least one null value.
# NOTE(review): the output recorded under this cell lists 513606 fully non-null
# rows, which suggests this line was active when the cell was last executed —
# confirm whether it should be re-enabled.
#data_new=data_new.dropna(axis = 0, how ='any') 
# Disabled step: CSV export.
# NOTE(review): `expanded_new` is not defined in the visible cells — presumably
# `data_new` was meant; verify before re-enabling.
#expanded_new.to_csv('data/train_all_10_Oct.csv')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 513606 entries, 0 to 618472
Data columns (total 81 columns):
Date                   513606 non-null datetime64[ns]
Store                  513606 non-null float64
DayOfWeek              513606 non-null float64
Sales                  513606 non-null float64
Customers              513606 non-null float64
Open                   513606 non-null float64
Promo                  513606 non-null float64
SchoolHoliday          513606 non-null float64
CompetitionDistance    513606 non-null float64
year                   513606 non-null int64
dayofyear              513606 non-null int64
weekofyear             513606 non-null int64
StateHoliday_0         513606 non-null uint8
StateHoliday_a         513606 non-null uint8
StateHoliday_b         513606 non-null uint8
StateHoliday_c         513606 non-null uint8
StoreType_a            513606 non-null uint8
StoreType_b            513606 non-null uint8
StoreType_c            513606 non-null uint8
StoreType