In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import dateutil.relativedelta
from IPython.display import clear_output
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, TimeSeriesSplit

In [2]:
def find_nan_features(df):
    null_cols = []
    for col in df.columns:
        if df[col].isnull().values.any():
            null_cols.append(col)
    return null_cols

In [3]:
def remove_nan_rows(df):
    # getting indices (rows) of all NaN values
    inds = pd.isnull(df).any(1).nonzero()[0]

    # drop all the rows with NaN values
    return df.drop(df.index[inds])

In [4]:
def split(df, train_fraction):
    mindate = df.Date.min()
    maxdate = df.Date.max()
    splitdate = mindate + (maxdate - mindate) * train_fraction
    train = df[df.Date < splitdate]
    test = df[df.Date >= splitdate]
    return train, test

In [5]:
def get_x_y(df):
    # split set in data and target
    X = df.drop('NumberOfSales', axis=1)
    y = df["NumberOfSales"]
    return X, y

In [6]:
def train_model(X_train, y_train, n_estimators=250, n_jobs=1):
    # fit random forest with 250 trees
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=0, n_jobs=n_jobs)
    forest.fit(X_train, y_train)
    return forest

In [7]:
def bip_metric(X_val, y_val, y_pred, rows_region):
    e_r = []
    month_sum = []
    
    # add regions to compute the metric
    X_val['Region'] = rows_region
    # DEBUG
#     print(X_val.Region.unique())
    X_val = pd.get_dummies(X_val, columns=['Region'], prefix='Region')
    
    # adjust shape
    X_val = X_val.reset_index(drop=True)
    y_pred = y_pred.tolist()
    y_val = y_val.tolist()

    for r in rows_region.unique():

        region = 'Region_' + str(r)
        d = X_val.loc[X_val[region] == 1]

        error = 0
        y_somma = 0
  
        # cycle through stores
        for i in range(1000,1736): 
               
            for m in range(1,13):
                sum_pred_month = 0
                sum_actual_month = 0
                indexes = d.index[(d['StoreID'] == i) & (d['Month'] == m)].tolist()
                
                for j in indexes:
           
                    sum_pred_month += y_pred[j]
                    sum_actual_month += y_val[j]

                error += abs(sum_actual_month - sum_pred_month)
                y_somma = y_somma + sum_actual_month

    e_r.append(error/y_somma)

    return sum(e_r)/len(e_r)

In [8]:
def eval_model_with_predicted_sales(X_val, y_val, model, df_dates, store_ids, val_date, rows_region, verbose=False):
    
    # prepare the dataframe where store the prediction
    y_predicted = pd.DataFrame(np.nan, index=X_val.index, columns=['NumberOfSales'])
    
    i = 0
    for index, row in X_val.iterrows():
        i += 1
#         if verbose:
            # DEBUG
#             print('processing... ' , i/len(X_val.index))

        # get from train set all the rows about this store
        temp = df_dates.loc[df_dates.StoreID == store_ids[index]]

        # get the date of the 'row' 
        date = temp.loc[index]
        date = date['Date']
    
        # get all the data from the last month
        splitdate = date + dateutil.relativedelta.relativedelta(days=-31)
        temp = temp[temp.Date >= splitdate]
        temp = temp[temp.Date <= date]

        temp.sort_values(by='Date')
        # switch index to timestamps to make this easier
        oldindex = temp.index
        temp.index = temp['Date']

        temp['NumberOfSales_yesterday'] = temp.NumberOfSales.rolling(window='1d',closed='left', min_periods=1).sum()
        temp['NumberOfSales_lastweek'] = temp.NumberOfSales.rolling(window='7d',closed='left', min_periods=1).sum()
        temp['NumberOfSales_lastmonth'] = temp.NumberOfSales.rolling(window='30d',closed='left', min_periods=1).sum()

        temp.NumberOfSales_yesterday = temp.NumberOfSales_yesterday.fillna(0)
        temp.NumberOfSales_lastweek = temp.NumberOfSales_lastweek.fillna(0)

        # put it back in the dataframe
        temp.index = oldindex

        # put the result in 'row'
        row['NumberOfSales_yesterday'] = temp.loc[index,'NumberOfSales_yesterday']
        row['NumberOfSales_lastweek'] = temp.loc[index,'NumberOfSales_lastweek']
        row['NumberOfSales_lastmonth'] = temp.loc[index,'NumberOfSales_lastmonth']

        # predict NumberOfSales of today
        temp.loc[index,'NumberOfSales'] = model.predict(row.values.reshape(1, -1))

        # at the next iter my I can use this prediction for a new rolling
        df_dates.loc[index] = temp.loc[index]

        y_predicted.loc[index] = temp.loc[index,'NumberOfSales'] 
    
    y_pred = y_predicted['NumberOfSales']
    new_x_val = X_val 
    new_x_val['Month'] = pd.DatetimeIndex(val_date['Date']).month
    new_x_val['StoreID'] = store_ids
    score = bip_metric(new_x_val, y_val, y_pred, rows_region)
    return score

In [9]:
def crossvalidation_TS_pred_sales(df, rows_region, nfolds=8, n_estimators=50, n_jobs=1, verbose=False):
    '''Time-Series Crossvalidation on the dataset `df` with `nfolds` folds.
    Split the dataset in N training-validation folds,
    trains and evaluates results for each of them,
    returns the mean of the error and metrics'''
    # convert date to datetime
    if df['Date'].dtype == np.dtype('O'):
        df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
        
    # add temporary columns to compute splits
    df['Year_CV'] = df['Date'].dt.year
    df['Month_CV'] = df['Date'].dt.month
    
    # get all year-month pairs
    l = sorted(list(set(df[['Year_CV', 'Month_CV']].itertuples(index=False))))
    
    # drop temporary columns
    df = df.drop(['Year_CV', 'Month_CV'], axis=1)
    
    # split dataset in folds
    kf = TimeSeriesSplit(nfolds)
    indexes = []
    for train_months_idx, val_months_idx in kf.split(l):
        train_months = [l[i] for i in train_months_idx]
        val_months = [l[i] for i in val_months_idx]
        train_indexes = df.Date.apply(lambda d: (d.year, d.month) in train_months)
        val_indexes = df.Date.apply(lambda d: (d.year, d.month) in val_months)
        indexes.append((train_indexes, val_indexes))

    # iterate on the folds
    total_score = 0
    i = 0
    for train_indexes, val_indexes in indexes:
        i += 1
        if verbose:
            print('Starting fold {}'.format(i))
        # split
        df_train = df[train_indexes]
        df_validation = df[val_indexes]
        
        # store Date and ID
        val_date = pd.DataFrame(df_validation['Date'])
        val_store_id = df_validation['StoreID']
        
        # and drop them
        df_train = df_train.drop('Date', axis=1)
        df_validation = df_validation.drop('Date', axis=1)

        df_train = df_train.drop('StoreID', axis=1)
        df_validation = df_validation.drop('StoreID', axis=1)
        
        # train model
        X_train, y_train = get_x_y(df_train)
        model = train_model(X_train, y_train, n_estimators=n_estimators, n_jobs=n_jobs)
        
        # evaluate model
        X_val, y_val = get_x_y(df_validation)
        
#         months = pd.DatetimeIndex(val_date['Date']).month
        score = eval_model_with_predicted_sales(X_val, y_val, model, df, val_store_id, val_date, rows_region, verbose=verbose)
        if verbose:
            print('Partial score: {:6.4f}'.format(score))
        total_score += score
        
    return total_score / nfolds

### Load dataset

In [10]:
# load preprocessed csv to dataframe
df = pd.read_csv('preprocessed_train.csv')

# debug
df = df[df.StoreID == 1000]

### Prepare dataset

In [11]:
# Save region for each index
rows_region = df['Region']

# Choose features
selected_features=[
    'NumberOfSales',
    'NumberOfSales_lastmonth', 
    'HasPromotions', 
    'NumberOfSales_yesterday', 
    'NumberOfSales_lastweek',
    'IsOpen_yesterday',
    'DayOfWeek',
    'NearestCompetitor',
    'Week',
    'StoreID',
    'IsHoliday_tomorrow',
    'Date'] # droppata dopo

df = df[selected_features]

In [12]:
# Look for features with NaN values
null_cols = find_nan_features(df)
print('Features with NaN:')
for col in null_cols:
    print(col)
    
# drop all rows with NaN values
df = remove_nan_rows(df)

Features with NaN:
NumberOfSales_lastmonth
NumberOfSales_yesterday
NumberOfSales_lastweek
IsOpen_yesterday
IsHoliday_tomorrow


### Cross-validation

In [13]:
# debug
# nfolds = 20
nfolds = 8

score = crossvalidation_TS_pred_sales(df, rows_region, nfolds=nfolds, 
                        n_estimators=50, n_jobs=3, verbose=True)

print("Total score: {:6.4f}".format(score))

Starting fold 1
Partial score: 0.1898
Starting fold 2
Partial score: 0.0299
Starting fold 3
Partial score: 0.0979
Starting fold 4
Partial score: 0.0215
Starting fold 5
Partial score: 0.0410
Starting fold 6
Partial score: 0.0619
Starting fold 7
Partial score: 0.1451
Starting fold 8
Partial score: 0.1500
Total score: 0.0921
