In [None]:
import datetime as dt
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import clear_output
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, TimeSeriesSplit

# Preprocessing

In [None]:
# Switch for the optional preprocessing cells below: while True, the weather
# imputation (CloudCover, Max_Gust_SpeedKm_h) and the categorical encoding
# (Events, StoreType, AssortmentType, Region) are skipped entirely.
ignore_unused_features = True

In [None]:
# load both datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Stack train and test into one dataframe.
# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index;
# without it the row labels of train and test overlap, which breaks every
# label-based access (df.loc[row, ...]) and index-aligned assignment later on.
df = pd.concat([train, test], ignore_index=True)
del train
del test

In [None]:
# preview the first rows of the combined train + test dataframe
df.head()

In [None]:
# show one sample row (the first one) as a column -> value listing
df.iloc[0]

### Missing values

In [None]:
## Missing Values
if not ignore_unused_features:
    # cloud coverage: 0 if no events, 8 if events
    # Vectorized with boolean masks: no per-row Python loop, and isnull()
    # recognises every kind of missing value (not only the np.nan singleton).
    cloud_missing = df['CloudCover'].isnull()
    no_events = df['Events'].isnull()
    df.loc[cloud_missing & no_events, 'CloudCover'] = 0
    df.loc[cloud_missing & ~no_events, 'CloudCover'] = 8

    # max gust speed = max wind speed
    df['Max_Gust_SpeedKm_h'] = df['Max_Gust_SpeedKm_h'].fillna(df['Max_Wind_SpeedKm_h'])

### Categorical features

In [None]:
if not ignore_unused_features:
    # Ordinal "severity" scale for the weather events.
    # No-Events (NaN) are considered as sunny days, with lowest value (0) on the events scale
    event_severity = {
        'Rain': 1, 'Thunderstorm': 1, 'Fog': 1,
        'Snow': 2, 'Fog-Rain': 2, 'Rain-Thunderstorm': 2, 'Rain-Snow': 2, 'Fog-Snow': 2,
        'Fog-Rain-Snow': 3, 'Rain-Hail': 3, 'Snow-Hail': 3, 'Rain-Snow-Hail': 3,
        'Fog-Rain-Hail': 3, 'Fog-Thunderstorm': 3,
        'Fog-Rain-Thunderstorm': 4, 'Fog-Snow-Hail': 4, 'Fog-Rain-Snow-Hail': 4,
        'Rain-Snow-Thunderstorm': 4, 'Rain-Hail-Thunderstorm': 4,
        'Fog-Rain-Hail-Thunderstorm': 4, 'Rain-Snow-Hail-Thunderstorm': 4,
    }
    # Apply the mapping to the Events column only, so that identical strings
    # in any other column can never be rewritten by accident.
    df['Events'] = df['Events'].fillna(0).replace(event_severity)

    # Prepare the data for the regression tree

    # One-Hot Encoding (dummy columns are named '<column>_<value>')
    df = pd.get_dummies(df, columns=['StoreType', 'AssortmentType', 'Region'])

### Add date features
Add new features:
- day of the week 
- month 
- week of the year 
- quarter of the year 

In [None]:
## Date Features
# convert date to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# add features
date_parts = df['Date'].dt
df['DayOfWeek'] = date_parts.dayofweek
df['Month'] = date_parts.month
# ISO week of the year. `.dt.weekofyear` was removed in pandas 2.0; use
# `.dt.isocalendar().week` where it exists and fall back on older versions.
if hasattr(date_parts, 'isocalendar'):
    # isocalendar() yields a nullable UInt32 column; cast to plain int64
    # (assumes every Date parsed successfully, i.e. there is no NaT)
    df['Week'] = date_parts.isocalendar().week.astype('int64')
else:
    df['Week'] = date_parts.weekofyear
df['Quarter'] = date_parts.quarter

### Add one-day-distance features

In [None]:
## Temporal Features
# For each flag below, add the value the same store had on the previous and on
# the next calendar day ('<flag>_yesterday' / '<flag>_tomorrow').
shifted_flags = ['IsOpen', 'IsHoliday', 'HasPromotions']
new_columns = ['{}_{}'.format(flag, when)
               for flag in shifted_flags
               for when in ('yesterday', 'tomorrow')]

# create columns; NaN means "no neighbouring day available" until filled below
for col in new_columns:
    df[col] = np.nan

for store in df.StoreID.unique():
    clear_output()
    print("Working on {}".format(store))
    store_rows = (df.StoreID == store)
    # explicit copy: we add columns to it and must not write into a view of df
    temp = df.loc[store_rows].copy()
    # switch index to timestamps to make this easier
    # (time-based rolling windows need the dates of a store to be sorted)
    temp.index = temp['Date']

    for flag in shifted_flags:
        # window '1d' closed on the left covers [t - 1 day, t): exactly yesterday's row
        previous_day = temp[flag].rolling(window='1d', closed='left', min_periods=1).sum()
        temp[flag + '_yesterday'] = previous_day
        # moving the *index* back by two days puts the value computed at t + 2
        # (i.e. the flag of day t + 1) onto day t
        temp[flag + '_tomorrow'] = previous_day.shift(-2, freq='1d')

    # put it back in the dataframe; positional (.values) because temp holds the
    # same rows in the same order, so this does not depend on df's index labels
    df.loc[store_rows, new_columns] = temp[new_columns].values

# fix edge days (no previous / next day available for that store)
edge_defaults = {
    'IsOpen_yesterday': 1,
    'IsOpen_tomorrow': 1,
    'IsHoliday_yesterday': 0,
    'IsHoliday_tomorrow': 0,
    'HasPromotions_yesterday': 0,
    'HasPromotions_tomorrow': 0,
}
for col, default in edge_defaults.items():
    # plain assignment instead of attribute access + inplace=True, which is
    # chained in-place mutation and may silently act on a temporary copy
    df[col] = df[col].fillna(default)

### Drop columns
- NumberOfCustomers: not present in the test set
- WindDirDegrees: useless as a predictor
- Visibility (max / mean / min): too many missing values

In [None]:
# Drop useless columns
# NumberOfCustomers is (re)created first, so the drop below always finds it
df['NumberOfCustomers'] = np.nan
useless_columns = [
    'NumberOfCustomers',
    'WindDirDegrees',
    'Max_VisibilityKm',
    'Mean_VisibilityKm',
    'Min_VisibilitykM',
]
df = df.drop(useless_columns, axis=1)

### Drop columns with redundant information
Region_AreaKM2, Region_GDP and Region_PopulationK carry the same information: each of them takes one distinct value per region, so any single one is enough to identify the region. Keep only one of them.

In [None]:
# we keep the region population and remove the two other region descriptors
redundant_region_columns = ['Region_AreaKM2', 'Region_GDP']
df = df.drop(redundant_region_columns, axis=1)

### Drop days when the stores are closed

In [None]:
# Keep only the days on which the store is open (IsOpen == 1):
# on closed days the number of sales is always 0.
store_is_open = (df.IsOpen == 1)
df = df[store_is_open]
# every remaining row has IsOpen == 1, so the column is now useless
df = df.drop('IsOpen', axis=1)

## Add features: store average sales

In [None]:
# split train and test: everything before the cutoff is training data
test_start = datetime(2018, 3, 1, 0, 0, 0)
# .copy() makes both parts independent frames, so the columns added to them
# afterwards are real assignments and not writes into a slice of `df`
df_train = df[df.Date < test_start].copy()
df_test = df[df.Date >= test_start].copy()

In [None]:
# Store-level average sales, computed on the training rows only and then
# copied onto both the training and the test rows of the same store.
# Work on explicit copies so the column assignments never hit a slice of `df`.
df_train = df_train.copy()
df_test = df_test.copy()

# calendar year of every training row (only needed to count distinct years)
train_year = df_train['Date'].dt.year

# --- per-store aggregates ---
store_key = df_train['StoreID']
store_total_sales = df_train['NumberOfSales'].groupby(store_key).sum()
store_open_days = df_train['Date'].groupby(store_key).count()
store_years = train_year.groupby(store_key).nunique()

# avg daily sales: total sales / number of (open) days in the training data
daily_sales = store_total_sales / store_open_days
# avg yearly sales: total sales / number of distinct years with data
yearly_sales = store_total_sales / store_years

# --- per-store, per-calendar-month aggregate ---
# avg sales for each month (BASED ON specific MONTH, not just average of all months):
# total sales of that month / number of distinct years in which the month occurs
month_key = [df_train['StoreID'], df_train['Month']]
month_avg_sales = (df_train['NumberOfSales'].groupby(month_key).sum()
                   / train_year.groupby(month_key).nunique())

for part in (df_train, df_test):
    part['daily_sales'] = part['StoreID'].map(daily_sales)
    # look up (StoreID, Month) for every row; combinations that never occur in
    # the training data (including stores with no training rows) become NaN
    row_key = pd.MultiIndex.from_arrays([part['StoreID'], part['Month']])
    part['month_avg_sales'] = month_avg_sales.reindex(row_key).values
    part['yearly_sales'] = part['StoreID'].map(yearly_sales)

## Add features: linear regression

In [None]:
# put back in one dataframe
# ignore_index=True renumbers the rows 0..n-1 (calling df.reset_index(drop=True)
# without assigning its result has no effect)
df = pd.concat([df_train, df_test], ignore_index=True)
del df_train
del df_test

In [None]:
# prepare date as number for the regression
print('Converting date to a number...')
df['DateNumeric'] = df['Date'].values.astype(float)

# get all distinct (year, month) pairs present in the data, in chronological order
year_month = pd.DataFrame({
    'year': df['Date'].dt.year,
    'month': df['Date'].dt.month,
}).dropna().drop_duplicates()
months = sorted((int(y), int(m)) for y, m in zip(year_month['year'], year_month['month']))

# cycle on two months at a time: `l` is a list of two-month windows, each a
# tuple of (year, month) pairs. Slicing (instead of indexing i + 1) keeps a
# trailing single month as a one-element window when the month count is odd.
print('Number of months: {}'.format(len(months)))
l = [tuple(months[i:i + 2]) for i in range(0, len(months), 2)]
print('Number of folds: {}'.format(len(l) - 1))

# declare new features (NaN until a regression prediction is written)
print('Declaring new features...')
df['regression_whole'] = np.nan
df['regression_twomonths'] = np.nan

In [None]:
def add_regression_feature(frame, month_pairs, column, max_train_size=None):
    """Fill `column` of `frame` with linear-regression forecasts of NumberOfSales.

    The windows in `month_pairs` are walked forward in time with a
    TimeSeriesSplit: each window except the first is a target once, and the
    windows before it are the training data. Within a fold, one
    LinearRegression (NumberOfSales ~ DateNumeric + HasPromotions) is fitted
    per (StoreID, DayOfWeek) series on its training rows and used to predict
    its target rows.

    Parameters
    ----------
    frame : DataFrame with the columns Date, StoreID, DayOfWeek, DateNumeric,
        HasPromotions and NumberOfSales. Modified in place.
    month_pairs : chronologically ordered list of windows, each an iterable of
        (year, month) pairs.
    column : name of the column that receives the predictions.
    max_train_size : forwarded to TimeSeriesSplit; None trains on all
        preceding windows, 1 only on the window right before the target.

    Target rows of a series without any training row are set to NaN; rows that
    are never a target keep their current value.
    """
    regressors = frame[['DateNumeric', 'HasPromotions']].values
    sales = frame['NumberOfSales'].values
    # one integer per calendar month, so fold membership is a cheap isin()
    month_key = (frame['Date'].dt.year * 12 + frame['Date'].dt.month).values
    if column in frame.columns:
        result = frame[column].values.astype(float)
    else:
        result = np.full(len(frame), np.nan)
    # positional row numbers of every (store, weekday) series, computed once
    # instead of rebuilding full-length boolean masks inside every fold
    series_rows = frame.groupby(['StoreID', 'DayOfWeek']).indices

    splitter = TimeSeriesSplit(n_splits=len(month_pairs) - 1, max_train_size=max_train_size)
    for fold, (train_idx, target_idx) in enumerate(splitter.split(month_pairs), start=1):
        print('Fold {}'.format(fold))
        train_keys = [y * 12 + m for i in train_idx for (y, m) in month_pairs[i]]
        target_keys = [y * 12 + m for i in target_idx for (y, m) in month_pairs[i]]
        in_train = np.isin(month_key, train_keys)
        in_target = np.isin(month_key, target_keys)

        for rows in series_rows.values():
            target_rows = rows[in_target[rows]]
            if len(target_rows) == 0:
                # nothing to predict for this series in this fold
                continue
            train_rows = rows[in_train[rows]]
            if len(train_rows) == 0:
                # no history to fit on: leave empty
                result[target_rows] = np.nan
                continue
            # fit linear regression on sales and HasPromotions, predict the target window
            model = LinearRegression()
            model.fit(regressors[train_rows], sales[train_rows])
            result[target_rows] = model.predict(regressors[target_rows])

    frame[column] = result


# first 2 months = NaN (there is no earlier data to fit a regression on)
print('First two months...')
first_two_months = (df.Date < datetime(2016, 5, 1, 0, 0, 0))
df.loc[first_two_months, 'regression_whole'] = np.nan
df.loc[first_two_months, 'regression_twomonths'] = np.nan

# following months: linear regression

# version trained on all preceding months
add_regression_feature(df, l, 'regression_whole')

# version trained on the two preceding months
add_regression_feature(df, l, 'regression_twomonths', max_train_size=1)

In [None]:
# drop temporary column
df = df.drop('DateNumeric', axis=1)

# add another useful feature: the day of the month, plus 30 when the month
# number is even (computed with vectorized .dt accessors instead of a
# row-by-row apply)
date_parts = df['Date'].dt
df['RegressionDistance'] = date_parts.day + ((date_parts.month + 1) % 2) * 30

## Save preprocessed dataset

In [None]:
# split dataframe again at the same cutoff date: rows before it are training data
cutoff = datetime(2018, 3, 1, 0, 0, 0)
before_cutoff = df.Date < cutoff
from_cutoff = df.Date >= cutoff
df_train, df_test = df[before_cutoff], df[from_cutoff]
del df

In [None]:
# save both preprocessed parts to CSV (without the index column)
outputs = (
    (df_train, 'preprocessed_train_complete.csv'),
    (df_test, 'preprocessed_test_complete.csv'),
)
for frame, path in outputs:
    frame.to_csv(path, index=False)

In [None]:
# preview the first rows of the preprocessed training set
df_train.head()

In [None]:
# preview the first rows of the preprocessed test set
df_test.head()

# Training

### Get features

In [None]:
# Choose features
# The order of the list is the column order of the resulting frames.

target_column = ['NumberOfSales']

model_features = [
    'HasPromotions',
    'IsOpen_yesterday',
    'IsOpen_tomorrow',
    'IsHoliday_tomorrow',
    'IsHoliday_yesterday',
    'daily_sales',
    'month_avg_sales',
    'yearly_sales',
    'NearestCompetitor',
    'DayOfWeek',
    'Week',
    'Month',
    'regression_whole',
    'RegressionDistance',
]

# not a model input: we need this to group the predictions later
grouping_column = ['StoreID']

selected_features = target_column + model_features + grouping_column

df_train = df_train[selected_features]
df_test = df_test[selected_features]

### Handle NaN values

In [None]:
# Look for features with NaN values
null_cols = [col for col in df_train.columns if df_train[col].isnull().any()]

print('Features with NaN: {}'.format(len(null_cols)))
for col in null_cols:
    print(col)

# boolean mask of the training rows that contain at least one NaN
rows_with_nan = df_train.isnull().any(axis=1)
print('Num of rows containing NaNs: {}'.format(int(rows_with_nan.sum())))

# drop all rows with NaN values
# the first two months are inevitable since we can't apply the regression
# (the mask is built from df_train itself: `df` was deleted after the split,
# and a boolean mask works on every pandas version, unlike Series.nonzero())
df_train = df_train[~rows_with_nan]

### Split X and y

In [None]:
# target vector, and feature matrix without the target and the StoreID column
non_feature_columns = ['NumberOfSales', 'StoreID']
y_train = df_train["NumberOfSales"]
X_train = df_train.drop(non_feature_columns, axis=1)

### Train model

In [None]:
# random forest with a fixed seed so that the fit is reproducible
forest = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    max_features='sqrt',
    n_jobs=2,
    random_state=0,
)
# the fitted estimator returned by fit() is what the prediction cell uses
model = forest.fit(X_train, y_train)

In [None]:
# show model most important features
importances = forest.feature_importances_
# spread of each feature's importance across the individual trees (error bars)
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
# feature positions sorted from most to least important
indices = np.argsort(importances)[::-1]

# Use the matrix the forest was trained on: a leftover `X` from the regression
# cells has different columns and would mislabel (or break) the plot.
n_features = X_train.shape[1]

fig, ax = plt.subplots(figsize=(20, 5))
ax.set_title("Feature importances", fontsize=20)
ax.bar(range(n_features), importances[indices],
       color="brown", yerr=std[indices], align="center")
ax.set_xticks(range(n_features))
ax.set_xticklabels(X_train.columns[indices], rotation=90)
# font size of the tick labels on both axes
ax.tick_params(axis='both', labelsize=16)
ax.set_xlim([-1, n_features])
plt.show()

# Predictions

### Make predictions

In [None]:
# same feature columns as for training: everything except target and StoreID
non_feature_columns = ['NumberOfSales', 'StoreID']
X_test = df_test.drop(non_feature_columns, axis=1)
y_pred = model.predict(X_test)

### Prepare predictions dataframe

In [None]:
# one row per test day: store, month and predicted sales
final = pd.DataFrame(
    {
        'StoreID': df_test.StoreID,
        'Month': df_test.Month,
        'NumberOfSales': y_pred,
    },
    columns=['StoreID', 'Month', 'NumberOfSales'],
)
# total predicted sales per store and month
monthly = final.groupby(['StoreID', 'Month'], as_index=False)
final = monthly.agg({"NumberOfSales": "sum"})
# the totals are converted to integers (astype truncates the fractional part)
final['NumberOfSales'] = final['NumberOfSales'].astype('int64')

### Save to file

In [None]:
# write the monthly per-store predictions to CSV, without the index column
final.to_csv("predictions.csv", index = False)

In [None]:
# display the predictions table
final