In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import dateutil.relativedelta
from datetime import timedelta
from IPython.display import clear_output
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, TimeSeriesSplit

In [2]:
def find_nan_features(df):
    null_cols = []
    for col in df.columns:
        if df[col].isnull().values.any():
            null_cols.append(col)
    return null_cols

In [3]:
def remove_nan_rows(df):
    # getting indices (rows) of all NaN values
    inds = pd.isnull(df).any(1).nonzero()[0]

    # drop all the rows with NaN values
    return df.drop(df.index[inds])

In [4]:
def split(df, train_fraction):
    mindate = df.Date.min()
    maxdate = df.Date.max()
    splitdate = mindate + (maxdate - mindate) * train_fraction
    train = df[df.Date < splitdate]
    test = df[df.Date >= splitdate]
    return train, test

In [5]:
def get_x_y(df):
    # split set in data and target
    X = df.drop('NumberOfSales', axis=1)
    y = df["NumberOfSales"]
    return X, y

In [6]:
def train_model(X_train, y_train, n_estimators=250, max_depth=None, n_jobs=1):
    # fit random forest with 250 trees
    forest = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=0, n_jobs=n_jobs)
    forest.fit(X_train, y_train)
    return forest

In [7]:
def bip_metric(X_val, y_val, y_pred, rows_region):
    e_r = []
    month_sum = []
    
    # add regions to compute the metric
    X_val['Region'] = rows_region
    # DEBUG
#     print(X_val.Region.unique())
    X_val = pd.get_dummies(X_val, columns=['Region'], prefix='Region')
    
    # adjust shape
    X_val = X_val.reset_index(drop=True)
    y_pred = y_pred.tolist()
    y_val = y_val.tolist()

    for r in rows_region.unique():

        region = 'Region_' + str(r)
        d = X_val.loc[X_val[region] == 1]

        error = 0
        y_somma = 0
  
        # cycle through stores
        for i in range(1000,1736): 
               
            for m in range(1,13):
                sum_pred_month = 0
                sum_actual_month = 0
                indexes = d.index[(d['StoreID'] == i) & (d['Month'] == m)].tolist()
                
                for j in indexes:
           
                    sum_pred_month += y_pred[j]
                    sum_actual_month += y_val[j]

                error += abs(sum_actual_month - sum_pred_month)
                y_somma = y_somma + sum_actual_month

    e_r.append(error/y_somma)

    return sum(e_r)/len(e_r)

In [8]:
def eval_model_with_predicted_sales(X_val, y_val, model, whole_df, store_ids, val_date, rows_region, verbose=False):

    # prepare the dataframe where we store the predictions
    y_predicted = pd.DataFrame(np.nan, index=X_val.index, columns=['NumberOfSales'])

    # prepare the matrix to recompute features
    total_stores = 735
    total_days = 30 + len(val_date.Date.unique())
    matrix = pd.DataFrame(np.zeros((total_days, total_stores)))
    # matrice, una riga per store
    # le colonne sono i giorni
    # la prima volta copiamo i primi 30 giorni
    start_date = min(val_date.Date) - timedelta(30)
    for i in range(30):
        # print(i)  # DEBUg
        d = start_date + timedelta(i)
        rows = whole_df[whole_df.Date == d]
        idxs = rows.StoreID - 1000
        sls = rows.NumberOfSales
        sls.index = idxs
        matrix.iloc[i] = sls

    i = 29
    # per ogni giorno in X_val
    for d in sorted(val_date.Date.unique()):
        i += 1
        # print(i)  # DEBUg
        # carica da X_val tutte le righe con quel giorno
        indexes = val_date[val_date.Date == d].index
        matrix_idx = store_ids[indexes] - 1000

        # ricalcoliamo feature
        temp = matrix.iloc[i-30:i, matrix_idx].sum(axis=0)
        temp.index = indexes
        X_val.loc[indexes, 'NumberOfSales_lastmonth'] = temp
        temp = matrix.iloc[i-7:i, matrix_idx].sum(axis=0)
        temp.index = indexes
        X_val.loc[indexes, 'NumberOfSales_lastweek'] = temp
        temp = matrix.iloc[i-1:i, matrix_idx].sum(axis=0)
        temp.index = indexes
        X_val.loc[indexes, 'NumberOfSales_yesterday'] = temp

        # predictions
        temp_predict = model.predict(X_val.loc[indexes])
        
        # salva in y_predicted per avere lo score
        y_predicted.loc[indexes, 'NumberOfSales'] = temp_predict

        # salva nella matrice per ricalcolare le feature
        rows = whole_df[whole_df.Date == d]
        idxs = rows.StoreID - 1000
        newvalues = pd.Series(temp_predict)
        newvalues.index = idxs
        matrix.iloc[i] = newvalues

    y_pred = y_predicted['NumberOfSales']
    new_x_val = X_val
    new_x_val['Month'] = pd.DatetimeIndex(val_date['Date']).month
    new_x_val['StoreID'] = store_ids
    score = bip_metric(new_x_val, y_val, y_pred, rows_region)
    return score


In [9]:
def crossvalidation_TS_pred_sales(df, rows_region, nfolds=8, n_estimators=50, max_depth=None, n_jobs=1, verbose=False):
    '''Time-Series Crossvalidation on the dataset `df` with `nfolds` folds.
    Split the dataset in N training-validation folds,
    trains and evaluates results for each of them,
    returns the mean of the error and metrics'''
    # convert date to datetime
    if df['Date'].dtype == np.dtype('O'):
        df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
        
    # add temporary columns to compute splits
    df['Year_CV'] = df['Date'].dt.year
    df['Month_CV'] = df['Date'].dt.month
    
    # get all year-month pairs
    l = sorted(list(set(df[['Year_CV', 'Month_CV']].itertuples(index=False))))
    
    # drop temporary columns
    df = df.drop(['Year_CV', 'Month_CV'], axis=1)
    
    # split dataset in folds
    kf = TimeSeriesSplit(nfolds)
    indexes = []
    for train_months_idx, val_months_idx in kf.split(l):
        train_months = [l[i] for i in train_months_idx]
        val_months = [l[i] for i in val_months_idx]
        train_indexes = df.Date.apply(lambda d: (d.year, d.month) in train_months)
        val_indexes = df.Date.apply(lambda d: (d.year, d.month) in val_months)
        indexes.append((train_indexes, val_indexes))

    # iterate on the folds
    total_score = 0
    i = 0
    for train_indexes, val_indexes in indexes:
        i += 1
        if verbose:
            print('Starting fold {}'.format(i))
        # split
        df_train = df[train_indexes]
        df_validation = df[val_indexes]
        
        # store Date and ID
        val_date = pd.DataFrame(df_validation['Date'])
        val_store_id = df_validation['StoreID']
        
        # and drop them
        df_train = df_train.drop('Date', axis=1)
        df_validation = df_validation.drop('Date', axis=1)

        df_train = df_train.drop('StoreID', axis=1)
        df_validation = df_validation.drop('StoreID', axis=1)
        
        # train model
        X_train, y_train = get_x_y(df_train)
        model = train_model(X_train, y_train, n_estimators=n_estimators, max_depth=max_depth, n_jobs=n_jobs)
        
        # evaluate model
        X_val, y_val = get_x_y(df_validation)
        
#         months = pd.DatetimeIndex(val_date['Date']).month
        score = eval_model_with_predicted_sales(X_val, y_val, model, df, val_store_id, val_date, rows_region, verbose=verbose)
        if verbose:
            print('Partial score: {:6.4f}'.format(score))
        total_score += score
        
    return total_score / nfolds

### Load dataset

In [10]:
# load preprocessed csv to dataframe
df = pd.read_csv('preprocessed_train.csv')

# debug
# df = df[df.Region > 7]

### Prepare dataset

In [11]:
# Save region for each index
rows_region = df['Region']

# Choose features
selected_features=[
    'NumberOfSales',
    'NumberOfSales_lastmonth', 
    'HasPromotions', 
    'NumberOfSales_yesterday', 
    'NumberOfSales_lastweek',
    'IsOpen_yesterday',
    'DayOfWeek',
    'NearestCompetitor',
    'Week',
    'StoreID',
    'IsHoliday_tomorrow',
    'Date'] # droppata dopo

df = df[selected_features]

In [12]:
# Look for features with NaN values
null_cols = find_nan_features(df)
print('Features with NaN:')
for col in null_cols:
    print(col)
    
# drop all rows with NaN values
df = remove_nan_rows(df)

Features with NaN:
NumberOfSales_lastmonth
NumberOfSales_yesterday
NumberOfSales_lastweek
IsOpen_yesterday
IsHoliday_tomorrow


### Cross-validation

In [13]:
nfolds = 12  # two months at a time

score = crossvalidation_TS_pred_sales(df, rows_region, nfolds=nfolds, 
                        n_estimators=250, max_depth=40, n_jobs=3, verbose=True)

print("Total score: {:6.4f}".format(score))

Starting fold 1
Partial score: 0.0293
Starting fold 2
Partial score: 0.0807
Starting fold 3
Partial score: 0.0411
Starting fold 4
Partial score: 0.0605
Starting fold 5
Partial score: 0.0492
Starting fold 6


KeyError: 'Region_2'