### TRAIN-TEST SPLIT

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from IPython.display import clear_output
from sklearn.ensemble import RandomForestRegressor

In [2]:
def find_nan_features(df):
    null_cols = []
    for col in df.columns:
        if df[col].isnull().values.any():
            null_cols.append(col)
    return null_cols

In [3]:
def remove_nan_rows(df):
    # getting indices (rows) of all NaN values
    inds = pd.isnull(df).any(1).nonzero()[0]

    # drop all the rows with NaN values
    return df.drop(df.index[inds])

In [4]:
def split(df, train_fraction):
    mindate = df.Date.min()
    maxdate = df.Date.max()
    splitdate = mindate + (maxdate - mindate) * train_fraction
    train = df[df.Date < splitdate]
    test = df[df.Date >= splitdate]
    return train, test

In [5]:
def get_x_y(df):
    # split set in data and target
    X = df.drop('NumberOfSales', axis=1)
    y = df["NumberOfSales"]
    return X, y

In [6]:
def train_model(X_train, y_train):
    # fit random forest with 250 trees
    forest = RandomForestRegressor(n_estimators=250, random_state=0, n_jobs=3)
    forest.fit(X_train, y_train)
    return forest

In [7]:
def bip_metric(X_val, y_val, y_pred, rows_region):

    e_r = []
    month_sum = []
    # adjust shape
    X_val = X_val.reset_index(drop=True)
    X_val['Region'] = rows_region
    X_val = pd.get_dummies(X_val, columns=['Region'], prefix='Region')
    
    y_pred = y_pred.tolist()
    y_val = y_val.tolist()

    for r in range(11):

        region = 'Region_' + str(r)
        d = X_val.loc[X_val[region] == 1]

        error = 0
        y_somma = 0
  
#  cycle through stores
        for i in range(1000,1736): 
               
            for m in range(1,13):
                sum_pred_month = 0
                sum_actual_month = 0
                indexes = d.index[(d['StoreID'] == i) & (d['Month'] == m)].tolist()
                
                for j in indexes:
           
                    sum_pred_month += y_pred[j]
                    sum_actual_month += y_val[j]

                error += abs(sum_actual_month - sum_pred_month)
                y_somma = y_somma + sum_actual_month

    e_r.append(error/y_somma)

    return sum(e_r)/len(e_r)

In [8]:
def eval_model(X_val, y_val, model, val_id, months, rows_region):
    y_pred = model.predict(X_val)
    new_x_val = X_val 
    new_x_val['Month'] = months
    new_x_val['StoreID'] = val_id
    score = bip_metric(new_x_val, y_val, y_pred, rows_region)
    return score

### Load dataset

In [9]:
# load preprocessed csv to dataframe
df = pd.read_csv('preprocessed_train.csv')

### Prepare dataset

In [10]:
# Sistemo i dati per regression tree
## StoreID
# df.drop('StoreID',axis=1) droppo dopo

## StoreType
df = pd.get_dummies(df, columns=['StoreType'], prefix='StoreType')

## AssortmentType
df = pd.get_dummies(df, columns=['AssortmentType'], prefix='AssortmentType')

## Region
# df = pd.get_dummies(df, columns=['Region'], prefix='Region')

## Events
# No-Events (NaN) are considered as sunny days, with lowest value (0) on the events scale
df['Events'] = df['Events'].fillna(0)
df=df.replace({'Rain':1, 'Thunderstorm':1, 'Fog':1, 'Snow': 2, 'Fog-Rain': 2, 'Rain-Thunderstorm': 2, 'Rain-Snow':2, 'Fog-Snow':2, 'Fog-Rain-Snow':3, 'Rain-Hail':3, 'Snow-Hail':3, 'Rain-Snow-Hail':3, 'Fog-Rain-Hail':3, 'Fog-Thunderstorm':3, 'Fog-Rain-Thunderstorm':4, 'Fog-Snow-Hail':4, 'Fog-Rain-Snow-Hail':4, 'Rain-Snow-Thunderstorm':4, 'Rain-Hail-Thunderstorm':4, 'Fog-Rain-Hail-Thunderstorm':4, 'Rain-Snow-Hail-Thunderstorm':4})


In [11]:
rows_region = df['Region']


selected_features=[
    'NumberOfSales',
    'NumberOfSales_lastmonth', 
    'HasPromotions', 
    'NumberOfSales_yesterday', 
    'NumberOfSales_lastweek',
    'IsOpen_yesterday',
    'DayOfWeek',
    'NearestCompetitor',
    'Week',
    'StoreID',
    'IsHoliday_tomorrow',
    'Date'] # droppata dopo

df = df[selected_features]
# df_train = df[selected_features]
# df_validation = df[selected_features]

In [12]:
# Look for features with NaN values
null_cols = find_nan_features(df)
print('Features with NaN:')
for col in null_cols:
    print(col)
    
# drop all rows with NaN values
df = remove_nan_rows(df)

Features with NaN:
NumberOfSales_lastmonth
NumberOfSales_yesterday
NumberOfSales_lastweek
IsOpen_yesterday
IsHoliday_tomorrow


### Split

In [13]:
# convert date to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

In [14]:
# remove Region from training
# df = df.drop('Region', axis=1)

In [15]:
# split in train and validation
train_fraction = 21/24
df_train, df_validation = split(df, train_fraction)

In [16]:
# store Date and ID
val_date = df_validation['Date']
val_date['Date'] = pd.to_datetime(df_validation['Date'], format='%Y-%m-%d')
val_id = df_validation['StoreID']
# and drop them
df_train = df_train.drop('Date', axis=1)
df_validation = df_validation.drop('Date', axis=1)

df_train = df_train.drop('StoreID', axis=1)
df_validation = df_validation.drop('StoreID', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc[key] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


### Train model

In [17]:
X_train, y_train = get_x_y(df_train)

# checking shapes
print('X: ' + str(X_train.shape))
print('y: ' + str(y_train.shape))

X: (371913, 9)
y: (371913,)


In [18]:
model = train_model(X_train, y_train)

### Evaluate model

In [19]:
X_val, y_val = get_x_y(df_validation)

# checking shapes
print('X: ' + str(X_val.shape))
print('y: ' + str(y_val.shape))

X: (51597, 9)
y: (51597,)


In [20]:
# val_date = pd.to_datetime(val_date, format='%Y-%m-%d')
months = pd.DatetimeIndex(val_date['Date']).month
months

Int64Index([11, 12, 12, 12, 12, 12, 12, 12, 12, 12,
            ...
             2,  2,  2,  2,  2,  2,  2,  2,  2,  2],
           dtype='int64', name='Date', length=51597)

In [22]:

score = eval_model(X_val, y_val, model, val_id, months, rows_region)
print(score)

0.03791872490397624


result = 0.03791872490397624