In [1]:
import pandas as pd

# Load the three raw tables. Only the training file is read with
# low_memory=False, i.e. in a single pass rather than in chunks.
store_df = pd.read_csv('../data/store.csv')
test_df = pd.read_csv('../data/test.csv')
train_df = pd.read_csv('../data/train.csv', low_memory=False)

# Show (rows, columns) of each table on one line: train, test, store.
print(*(frame.shape for frame in (train_df, test_df, store_df)))

(1017209, 9) (41088, 8) (1115, 10)


In [2]:
# Attach the per-store attributes to every sales row (default inner join on
# the shared 'Store' key).
train_df = train_df.merge(store_df, on='Store')
test_df = test_df.merge(store_df, on='Store')

In [3]:



# Gather some features
def build_features(features, data):
    """Engineer model features on ``data`` in place and record their names.

    Parameters
    ----------
    features : list
        Extended in place with the names of the feature columns, in the
        order in which they are produced.
    data : pandas.DataFrame
        Frame holding the columns read below (``Open``, ``Date``,
        ``StoreType``, ``Assortment``, ``StateHoliday``,
        ``CompetitionOpenSince*``, ``Promo2Since*``, ``PromoInterval``, ...).
        It is modified in place: NaNs are filled and new columns are added.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    # remove NaNs
    # A missing 'Open' flag is treated as "open" (1). This has to happen
    # before the blanket fill below, otherwise those rows become 0 and the
    # null check can never match.
    data['Open'] = data['Open'].fillna(1)
    data.fillna(0, inplace=True)

    # Use some properties directly
    features.extend(['CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Label encode some features
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    for column in ('StoreType', 'Assortment', 'StateHoliday'):
        # Assign the result back instead of calling replace(inplace=True) on
        # the Series returned by attribute access: that Series may be a copy,
        # in which case the frame would silently stay unencoded.
        data[column] = data[column].replace(mappings)

    # Calendar features derived from the date. Note that 'DayOfWeek' is
    # overwritten with pandas' 0-based value (Monday == 0).
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    data['Date'] = pd.to_datetime(data.Date)
    data['Year'] = data.Date.dt.year
    data['Month'] = data.Date.dt.month
    data['Day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek
    data['WeekOfYear'] = data.Date.dt.isocalendar().week

    # Calculate time competition open time in months
    # NOTE(review): rows whose CompetitionOpenSinceYear was missing (filled
    # with 0 above) end up with a huge value here (e.g. 24157); unlike
    # PromoOpen below, no reset to 0 is applied. Confirm whether intended.
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
                              (data.Month - data.CompetitionOpenSinceMonth)

    # Promo open time in months (weeks are converted with 4 weeks per month)
    features.append('PromoOpen')
    promo_open = 12 * (data.Year - data.Promo2SinceYear) + \
                 (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    # Negative durations (promo not started yet) are floored at 0.
    data['PromoOpen'] = promo_open.clip(lower=0).astype(float)
    # Rows without a Promo2 start year (filled with 0) get no promo duration.
    data.loc[data.Promo2SinceYear == 0, 'PromoOpen'] = 0

    # Indicate that sales on that day are in promo interval
    features.append('IsPromoMonth')
    # 'Sept' (not 'Sep') is deliberate: it is the spelling used inside the
    # PromoInterval strings, e.g. "Mar,Jun,Sept,Dec".
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    data['monthStr'] = data.Month.map(month2str)
    # Missing intervals were filled with 0 above; normalise them to ''.
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    for interval in data.PromoInterval.unique():
        if interval != '':
            for month in interval.split(','):
                data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1

    return data


# Run the feature engineering on both splits. build_features mutates the
# frame it is given and returns that same object, so rebinding the names
# keeps them pointing at the identical (now enriched) frames. Only the
# training call records the feature names.
feature_list = list()

train_df = build_features(feature_list, train_df)
test_df = build_features(list(), test_df)
print('Done')


Done


In [4]:
# Display the merged training frame after build_features has added its columns.
train_df

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth
0,1,4,2015-07-31,5263,555,1,1,0,1,3,...,0.0,,2015,7,31,31,82.0,0.00,Jul,0
1,1,3,2015-07-30,5020,546,1,1,0,1,3,...,0.0,,2015,7,30,31,82.0,0.00,Jul,0
2,1,2,2015-07-29,4782,523,1,1,0,1,3,...,0.0,,2015,7,29,31,82.0,0.00,Jul,0
3,1,1,2015-07-28,5011,560,1,1,0,1,3,...,0.0,,2015,7,28,31,82.0,0.00,Jul,0
4,1,0,2015-07-27,6102,612,1,1,0,1,3,...,0.0,,2015,7,27,31,82.0,0.00,Jul,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,1115,5,2013-01-05,4771,339,1,0,0,1,4,...,2012.0,"Mar,Jun,Sept,Dec",2013,1,5,1,24157.0,6.75,Jan,0
1017205,1115,4,2013-01-04,4540,326,1,0,0,1,4,...,2012.0,"Mar,Jun,Sept,Dec",2013,1,4,1,24157.0,6.75,Jan,0
1017206,1115,3,2013-01-03,4297,300,1,0,0,1,4,...,2012.0,"Mar,Jun,Sept,Dec",2013,1,3,1,24157.0,6.75,Jan,0
1017207,1115,2,2013-01-02,3697,305,1,0,0,1,4,...,2012.0,"Mar,Jun,Sept,Dec",2013,1,2,1,24157.0,6.75,Jan,0


In [5]:
from sklearn.preprocessing import OneHotEncoder

# Learn the category levels from the training frame only, then encode both
# splits with the same fitted encoder. feature_names_in_ holds the column
# names seen during fit, so the column list is written just once.
encoder = OneHotEncoder().fit(
    train_df[['DayOfWeek', 'Month', 'StoreType', 'Assortment', 'StateHoliday']]
)
train_one_hot = encoder.transform(train_df[encoder.feature_names_in_.tolist()])
test_one_hot = encoder.transform(test_df[encoder.feature_names_in_.tolist()])



In [6]:
# Convert the encoded training matrix to a dense array to eyeball the 0/1 layout.
train_one_hot.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.]])

In [7]:
# Assemble the modelling tables: the pass-through columns side by side with
# the one-hot encoded categoricals. The test table has no 'Sales' target.
one_hot_columns = encoder.get_feature_names_out().tolist()

train_df_clean = pd.concat([
    train_df[['Sales', "CompetitionDistance", "Promo", "Promo2", "SchoolHoliday", 'Year', 'PromoOpen', 'IsPromoMonth']],
    # Reuse the source index so the column-wise concat aligns row for row.
    pd.DataFrame(train_one_hot.toarray(), columns=one_hot_columns, index=train_df.index)
], axis=1)

test_df_clean = pd.concat([
    test_df[["CompetitionDistance", "Promo", "Promo2", "SchoolHoliday", 'Year', 'PromoOpen', 'IsPromoMonth']],
    pd.DataFrame(test_one_hot.toarray(), columns=one_hot_columns, index=test_df.index)
    # axis=1 is required here too: without it the two pieces are stacked
    # row-wise, doubling the row count and filling the gaps with NaN.
], axis=1)

train_df_clean.head(5)

Unnamed: 0,Sales,CompetitionDistance,Promo,Promo2,SchoolHoliday,Year,PromoOpen,IsPromoMonth,DayOfWeek_0,DayOfWeek_1,...,StoreType_2,StoreType_3,StoreType_4,Assortment_1,Assortment_2,Assortment_3,StateHoliday_0,StateHoliday_1,StateHoliday_2,StateHoliday_3
0,5263,1270.0,1,0,1,2015,0.0,0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,5020,1270.0,1,0,1,2015,0.0,0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,4782,1270.0,1,0,1,2015,0.0,0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,5011,1270.0,1,0,1,2015,0.0,0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,6102,1270.0,1,0,1,2015,0.0,0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [8]:
from sklearn.model_selection import train_test_split

# Hold out a validation set with as many rows as the test set (41088 for the
# files loaded above) instead of hard-coding that count; the seed keeps the
# split reproducible.
X_train, X_valid = train_test_split(
    train_df_clean,
    test_size=len(test_df),
    random_state=42,
)

In [9]:
import json

from pathlib import Path

# Persist the three tables for the training step. The target directory is
# created first because to_csv does not create missing parent directories.
output_dir = Path('../data/for_training')
output_dir.mkdir(parents=True, exist_ok=True)

X_train.to_csv(output_dir / 'train.csv', index=False)
X_valid.to_csv(output_dir / 'valid.csv', index=False)
test_df_clean.to_csv(output_dir / 'test.csv', index=False)


In [10]:

# Describe the exported tables for the training step: the input columns
# (taken from the test table, which has no target) and the target name.
features = dict(features=list(test_df_clean.columns), target='Sales')

with open('../data/for_training/features.json', mode='w') as f:
    f.write(json.dumps(features, indent=4))