In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime


In [2]:
#%% 0. Housekeeping 

# =============================================================================
# 0.1 Import packages
# =============================================================================

import pandas as pd
import datetime as dt
from datetime import datetime
   
# =============================================================================
# 0.2 Import data
# =============================================================================

train = pd.read_csv('data/train.csv', low_memory=False)
store = pd.read_csv('data/store.csv', low_memory=False)

#%% 1. Merging store to train data

# =============================================================================
# 1.1 Since data has to have the same size as 'train', a container is created
# =============================================================================

# Creating dataframe
expanded_store = train

# =============================================================================
# 1.2 Merging variables which do not have to be changed
# =============================================================================

# Variables which can be merged right away
easy = store.loc[:,['Store','StoreType','Assortment','CompetitionDistance']]

# Variables which can be simply merged
expanded_store = pd.merge(expanded_store, easy, on=['Store'])

#%% 2. Creating a dummy variable since for competition is open for every store

# =============================================================================
# 2.1 Checking whether there is actually competition
# =============================================================================

# Dataset of the variables which have to be transformed
competition = store.loc[:,['Store',
                           'CompetitionOpenSinceMonth','CompetitionOpenSinceYear']]

# For easier looping 
competition.set_index('Store', inplace=True)

# Finding stores which have competition openend
open_comp = competition.any(axis=1)

# Stores with competition
open_stores = competition.loc[open_comp].index.unique()

# =============================================================================
# 2.2 Generating dummy for the time competition is existing
# =============================================================================

# Generating Date when opened
for store_id in open_stores: 
    year = competition.loc[store_id,'CompetitionOpenSinceYear'].astype(int)
    month = competition.loc[store_id,'CompetitionOpenSinceMonth'].astype(int)
    competition.loc[store_id,'CompetitionDate'] = datetime(year,month,1)

# Creating a dummy variable for whether competition openend for each store
expanded_store.loc[:,'CompetitionOpened'] = 0

for store_id in open_stores: 
    
    # Getting the date when competition openend
    date = competition.loc[store_id,'CompetitionDate']
    date_str= date.strftime("%Y-%m-%d")
    
    store_number = expanded_store.loc[:,'Store'] == store_id
    
    maximum_date = expanded_store.loc[store_number,'Date'].max()
    
    # Whether it falls in time frame
    competition_existing = (expanded_store.loc[:,'Date'].between(date_str,maximum_date)) &  (expanded_store.loc[:,'Store'] == store_id)
    
    # Indicating whether competition is around
    expanded_store.loc[competition_existing, 'CompetitionOpened'] = 1


#%% 3. Creating a dummy variable for Promo
 
# =============================================================================
# 3.1 Since data has to have the same size as 'train', a container is created
# =============================================================================

# Create dataframe
promo2 = store.loc[:,['Store',
                     'Promo2','Promo2SinceWeek','Promo2SinceYear','PromoInterval']]

# =============================================================================
# 3.2 Create dummy for whether a promo2 is running 
# =============================================================================

# Getting date from which promo started
length = promo2.shape[0]
for i in range(length): 
    if promo2.loc[i,'Promo2']:
        week = promo2.loc[i,'Promo2SinceWeek'].astype(int)
        year = promo2.loc[i,'Promo2SinceYear'].astype(int)
        promo2.loc[i,'promo2start'] = dt.datetime.strptime(f'{year}-W{int(week )- 1}-1', "%Y-W%W-%w").date()

# Merge it with the train file
expanded_promo = pd.merge(expanded_store, promo2, on=['Store'])


# Empty container with no promo indicator
expanded_promo.loc[:,'Promo2GoingOn'] = 0

# Month indication
expanded_promo.loc[:,'Date_str'] = pd.to_datetime(expanded_promo.loc[:,'Date'],).dt.strftime('%Y-%b-%d')
expanded_promo.loc[:,'Date'] = pd.to_datetime(expanded_promo.loc[:,'Date'],)
expanded_promo.loc[:,'month'] = expanded_promo.loc[:,'Date_str'].str[5:8]


months = expanded_promo.loc[:,'month'].unique()

for month in months:
    month_boolean = expanded_promo.loc[:,'PromoInterval'].str.contains(month, na=False)
    expanded_promo.loc[month_boolean,'Promo2GoingOn'] = 1

In [3]:

#%% 4. Creating time since competition opened

exp_comp = pd.merge(expanded_promo, competition, on=['Store'])

date_current = pd.to_datetime(exp_comp.loc[:,'Date'])
date_openend = pd.to_datetime(exp_comp.loc[:,'CompetitionDate'])

exp_comp.loc[:,'comp_open_since'] = (date_current - date_openend).astype('timedelta64[D]')

future_comp = (exp_comp.loc[:,'comp_open_since'] < 0)
no_comp = exp_comp.loc[:,'comp_open_since'].isna()

exp_comp.loc[future_comp, 'comp_open_since'] = 0
exp_comp.loc[no_comp, 'comp_open_since'] = 0

expanded_promo.loc[:,'comp_open_since'] = exp_comp.loc[:,'comp_open_since']

#%% 5. Decaying competition factor

expanded_promo.loc[:,'IntervalList'] = expanded_promo.loc[:,'PromoInterval'].str.split(pat = ',')

Interval = {'First': 0,
'Second' : 1,
'Third' : 2,
'Fourth': 3}

for element, value in Interval.items():
    expanded_promo.loc[:,element] = expanded_promo.loc[:,'IntervalList'].str[value]

year = pd.to_datetime(expanded_promo.loc[:,'Date']).dt.year.astype(str)

Interval = {'Interval1': 'First',
'Interval2' : 'Second',
'Interval3' : 'Third',
'Interval4': 'Fourth'}

expanded_promo.loc[:,'Date_Actual'] = pd.to_datetime(expanded_promo.loc[:,'Date'])

for element, value in Interval.items():

    ### New year stuff
    expanded_promo.loc[:,'Date_Str'] = '1' + '-' + expanded_promo.loc[:,value] + '-' + year
    dates = pd.to_datetime(expanded_promo.loc[:,'Date_Str'])

    expanded_promo.loc[:,element] = (expanded_promo.loc[:,'Date_Actual'] - dates).astype('timedelta64[D]')

    negative = expanded_promo.loc[:,element] < 0
    expanded_promo.loc[negative,element] = np.nan

    ### Last year stuff
    expanded_promo.loc[:,'Date_Str'] = '1' + '-' + expanded_promo.loc[:,value] + '-' + (year.astype(int)-1).astype(str)
    dates = pd.to_datetime(expanded_promo.loc[:,'Date_Str'])

    expanded_promo.loc[:,element + 'before'] = (expanded_promo.loc[:,'Date_Actual'] - dates).astype('timedelta64[D]')

    negative = expanded_promo.loc[:,element + 'before'] < 0
    expanded_promo.loc[negative,element + 'before'] = np.nan

all_versions = expanded_promo.loc[:,['Interval1','Interval2','Interval3','Interval4',
'Interval1before','Interval2before','Interval3before','Interval4before',]]
minimum_distance = all_versions.min(axis=1, skipna=True)

expanded_promo.loc[:,'DaysFromPromotion'] = minimum_distance

expanded_promo.loc[:,'Decay'] = np.exp(- 0.05 * minimum_distance)

In [4]:
expanded_promo=expanded_promo[['Date', 'Store', 'DayOfWeek', 'Sales', 'Customers', 'Open', 'Promo','StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment','CompetitionDistance', 'CompetitionOpened', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'Promo2GoingOn', 'comp_open_since', 'DaysFromPromotion','Decay']]
expanded=expanded_promo.copy()
expanded_promo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 618473 entries, 0 to 618472
Data columns (total 20 columns):
Date                   618473 non-null datetime64[ns]
Store                  618473 non-null float64
DayOfWeek              600036 non-null float64
Sales                  600028 non-null float64
Customers              599957 non-null float64
Open                   599832 non-null float64
Promo                  599791 non-null float64
StateHoliday           599873 non-null object
SchoolHoliday          599686 non-null float64
StoreType              618473 non-null object
Assortment             618473 non-null object
CompetitionDistance    616838 non-null float64
CompetitionOpened      618473 non-null int64
Promo2                 618473 non-null int64
Promo2SinceWeek        314828 non-null float64
Promo2SinceYear        314828 non-null float64
Promo2GoingOn          618473 non-null int64
comp_open_since        618473 non-null float64
DaysFromPromotion      314828 non-null float6

In [5]:
# Convert Dates to Date time
expanded=expanded_promo.copy()

expanded['Date']=pd.to_datetime(expanded['Date'])


In [6]:
#add in variables for day of month etc
expanded['dayofweek'] = expanded['Date'].dt.dayofweek
expanded['quarter'] = expanded['Date'].dt.quarter
expanded['month'] = expanded['Date'].dt.month
expanded['year'] = expanded['Date'].dt.year
expanded['dayofyear'] = expanded['Date'].dt.dayofyear
expanded['dayofmonth'] = expanded['Date'].dt.day
expanded['weekofyear'] = expanded['Date'].dt.weekofyear

In [7]:
#create dummy variables for day of week etc and categorical variables

expanded= pd.get_dummies(expanded,columns=['dayofweek','dayofmonth','quarter','month','StateHoliday','StoreType','Assortment'])



In [8]:
#Re-add original day of month variable etc.
expanded['dayofweek'] = expanded['Date'].dt.dayofweek
expanded['quarter'] = expanded['Date'].dt.quarter
expanded['month'] = expanded['Date'].dt.month
expanded['year'] = expanded['Date'].dt.year
expanded['dayofyear'] = expanded['Date'].dt.dayofyear
expanded['dayofmonth'] = expanded['Date'].dt.day
expanded['weekofyear'] = expanded['Date'].dt.weekofyear




In [9]:
expanded.columns

Index(['Date', 'Store', 'DayOfWeek', 'Sales', 'Customers', 'Open', 'Promo',
       'SchoolHoliday', 'CompetitionDistance', 'CompetitionOpened', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'Promo2GoingOn',
       'comp_open_since', 'DaysFromPromotion', 'Decay', 'year', 'dayofyear',
       'weekofyear', 'dayofweek_0', 'dayofweek_1', 'dayofweek_2',
       'dayofweek_3', 'dayofweek_4', 'dayofweek_5', 'dayofweek_6',
       'dayofmonth_1', 'dayofmonth_2', 'dayofmonth_3', 'dayofmonth_4',
       'dayofmonth_5', 'dayofmonth_6', 'dayofmonth_7', 'dayofmonth_8',
       'dayofmonth_9', 'dayofmonth_10', 'dayofmonth_11', 'dayofmonth_12',
       'dayofmonth_13', 'dayofmonth_14', 'dayofmonth_15', 'dayofmonth_16',
       'dayofmonth_17', 'dayofmonth_18', 'dayofmonth_19', 'dayofmonth_20',
       'dayofmonth_21', 'dayofmonth_22', 'dayofmonth_23', 'dayofmonth_24',
       'dayofmonth_25', 'dayofmonth_26', 'dayofmonth_27', 'dayofmonth_28',
       'dayofmonth_29', 'dayofmonth_30', 'dayofmonth_31', 

In [10]:
expanded.dropna(axis = 0, how ='any',inplace=True)
expanded=expanded[expanded['Sales'] >0 ]

In [11]:

from datetime import timedelta
import datetime
df=expanded.copy()
date_range_days=(df['Date'].max() - df['Date'].min()).days
split_date=df['Date'].min() + timedelta(date_range_days*0.8) #train set 80% of full population
#randomly creating train and test subsets. may need to refine this 
df_early,df_later = df.loc[df['Date'] <= split_date], df.loc[df['Date'] > split_date]

In [12]:
df_early.shape

(176068, 89)

In [13]:
#create average sales per customer
aggs=df_early.groupby(by=['Store']).agg({'Sales':'sum','Customers':'sum'})
aggs['av_SalesPerCustomer']=aggs['Sales']/aggs['Customers']
aggs.sort_values(by='av_SalesPerCustomer',ascending=False)
df_early= pd.merge(df_early, aggs['av_SalesPerCustomer'], on=['Store'])
df_later= pd.merge(df_later, aggs['av_SalesPerCustomer'], on=['Store'])
df= pd.merge(df, aggs['av_SalesPerCustomer'], on=['Store'])
#create sales per customer for each store and day
#expanded['SalesPerCustomer']=expanded['Sales']/expanded['Customers']



In [14]:
#create av sales per day of the week
aggs=df_early.groupby(by=['Store','dayofweek']).agg({'Sales':'sum','Customers':'sum'})
aggs['av_SalesPerCustomer_dayofweek']=aggs['Sales']/aggs['Customers']
aggs.sort_values(by='av_SalesPerCustomer_dayofweek',ascending=False)
df_early= pd.merge(df_early, aggs['av_SalesPerCustomer_dayofweek'], on=['Store'])
df_later= pd.merge(df_later, aggs['av_SalesPerCustomer_dayofweek'], on=['Store'])
df= pd.merge(df, aggs['av_SalesPerCustomer_dayofweek'], on=['Store'])
#create sales per customer for each store and day
#expanded['SalesPerCustomer']=expanded['Sales']/expanded['Customers']

In [None]:
#create av sales per day of month

aggs=df_early.groupby(by=['Store','dayofmonth']).agg({'Sales':'sum','Customers':'sum'})
aggs['av_SalesPerCustomer_dayofmonth']=aggs['Sales']/aggs['Customers']
aggs.sort_values(by='av_SalesPerCustomer_dayofmonth',ascending=False)
df_early= pd.merge(df_early, aggs['av_SalesPerCustomer_dayofmonth'], on=['Store'])
df_later= pd.merge(df_later, aggs['av_SalesPerCustomer_dayofmonth'], on=['Store'])
df= pd.merge(df, aggs['av_SalesPerCustomer_dayofmonth'], on=['Store'])
#create sales per customer for each store and day
#expanded['SalesPerCustomer']=expanded['Sales']/expanded['Customers']

In [17]:
#drop if any other values null

df2=df_early.combine(df_later)

#X, y = df.loc[:, :-['Sales'],df.iloc[:, -['Sales']]
#X_train, X_test, y_train, y_test = df_early.loc[:,:-['Sales']], df_later.iloc[:,:-['Sales']], df_early.iloc[:,-['Sales']], df_later.iloc[:,-['Sales']] 

#data_new.set_index('Date',inplace=True)

#expanded_new.to_csv('data/train_all_10_Oct.csv')

TypeError: combine() missing 1 required positional argument: 'func'

In [46]:
%store data_new

Stored 'data_new' (DataFrame)


Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpened,Promo2,...,Assortment_a,Assortment_b,Assortment_c,dayofweek,quarter,month,dayofmonth,av_SalesPerCustomer,av_SalesPerCustomer_dayofweek,av_SalesPerCustomer_dayofmonth
count,216185.0,216185.0,216185.0,216185.0,216185.0,216185.0,216185.0,216185.0,216185.0,216185.0,...,216185.0,216185.0,216185.0,216185.0,216185.0,216185.0,216185.0,216185.0,216185.0,216185.0
mean,560.361561,3.519083,6450.173023,679.430293,1.0,0.433064,0.18425,4320.19141,0.54483,1.0,...,0.527604,0.008368,0.464028,2.519083,2.210385,5.579069,15.81977,9.952542,9.941649,9.956964
std,326.25035,1.72049,2709.995491,323.561257,0.0,0.4955,0.387689,5349.235579,0.497987,0.0,...,0.499239,0.091093,0.498706,1.72049,1.061716,3.253345,8.673782,2.051448,2.105917,2.194007
min,2.0,1.0,297.0,36.0,1.0,0.0,0.0,20.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,3.420756,3.252171,3.058925
25%,280.0,2.0,4570.0,475.0,1.0,0.0,0.0,570.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,3.0,8.0,8.296476,8.362405,8.32706
50%,553.0,3.0,5958.0,611.0,1.0,0.0,0.0,2220.0,1.0,1.0,...,1.0,0.0,0.0,2.0,2.0,5.0,16.0,9.841821,9.822343,9.832271
75%,848.0,5.0,7780.0,799.0,1.0,1.0,0.0,5630.0,1.0,1.0,...,1.0,0.0,1.0,4.0,3.0,8.0,23.0,11.469913,11.492476,11.52021
max,1115.0,7.0,37646.0,4246.0,1.0,1.0,1.0,27190.0,1.0,1.0,...,1.0,1.0,1.0,6.0,4.0,12.0,31.0,16.308164,17.273445,20.524705
