# TRAIN-TEST SPLIT

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from IPython.display import clear_output
from sklearn.ensemble import RandomForestRegressor

In [2]:
def find_nan_features(df):
    null_cols = []
    for col in df.columns:
        if df[col].isnull().values.any():
            null_cols.append(col)
    return null_cols

In [3]:
def remove_nan_rows(df):
    # getting indices (rows) of all NaN values
    inds = pd.isnull(df).any(1).nonzero()[0]

    # drop all the rows with NaN values
    return df.drop(df.index[inds])

In [4]:
def split(df, train_fraction):
    mindate = df.Date.min()
    maxdate = df.Date.max()
    splitdate = mindate + (maxdate - mindate) * train_fraction
    train = df[df.Date < splitdate]
    test = df[df.Date >= splitdate]
    return train, test

In [5]:
def get_x_y(df):
    # split set in data and target
    X = df.drop('NumberOfSales', axis=1)
    y = df["NumberOfSales"]
    return X, y

In [6]:
def train_model(X_train, y_train):
    # fit random forest with 250 trees
    forest = RandomForestRegressor(n_estimators=250, random_state=0)
    forest.fit(X_train, y_train)
    return forest

In [7]:
def bip_metric(X_val, y_val, y_pred):
    e_r = []
    
    # adjust shape
    X_val = X_val.reset_index(drop=True)
    y_pred = y_pred.tolist()
    y_val = y_val.tolist()

    for i in range(11):

        error = 0
        y_somma = 0

        region = 'Region_' + str(i)
        indexes = X_val.index[X_val[region] == 1].tolist()

        for j in indexes:
            
            error += abs(y_pred[j] - y_val[j])
            y_somma = y_somma + y_val[j]

        e_r.append(error/y_somma)

    return sum(e_r)/len(e_r)

In [8]:
def eval_model(X_val, y_val, model):
    y_pred = model.predict(X_val)
    score = bip_metric(X_val, y_val, y_pred)
    return score

### Load dataset

In [9]:
# load preprocessed csv to dataframe
df = pd.read_csv('preprocessed_train.csv')

### Prepare dataset

In [10]:
# Sistemo i dati per regression tree
## StoreID
df.drop('StoreID',axis=1)

## StoreType
df = pd.get_dummies(df, columns=['StoreType'], prefix='StoreType')

## AssortmentType
df = pd.get_dummies(df, columns=['AssortmentType'], prefix='AssortmentType')

## Region
df = pd.get_dummies(df, columns=['Region'], prefix='Region')

## Events
# No-Events (NaN) are considered as sunny days, with lowest value (0) on the events scale
df['Events'] = df['Events'].fillna(0)
df=df.replace({'Rain':1, 'Thunderstorm':1, 'Fog':1, 'Snow': 2, 'Fog-Rain': 2, 'Rain-Thunderstorm': 2, 'Rain-Snow':2, 'Fog-Snow':2, 'Fog-Rain-Snow':3, 'Rain-Hail':3, 'Snow-Hail':3, 'Rain-Snow-Hail':3, 'Fog-Rain-Hail':3, 'Fog-Thunderstorm':3, 'Fog-Rain-Thunderstorm':4, 'Fog-Snow-Hail':4, 'Fog-Rain-Snow-Hail':4, 'Rain-Snow-Thunderstorm':4, 'Rain-Hail-Thunderstorm':4, 'Fog-Rain-Hail-Thunderstorm':4, 'Rain-Snow-Hail-Thunderstorm':4})


In [11]:
# Look for features with NaN values
null_cols = find_nan_features(df)
print('Features with NaN:')
for col in null_cols:
    print(col)
    
# drop all rows with NaN values
df = remove_nan_rows(df)

Features with NaN:
IsOpen_yesterday
IsOpen_tomorrow
IsHoliday_yesterday
IsHoliday_tomorrow
NumberOfSales_yesterday
NumberOfSales_lastweek
NumberOfSales_lastmonth


### Split

In [12]:
# convert date to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

In [13]:
# split in train and validation
train_fraction = 21/24
df_train, df_validation = split(df, train_fraction)

In [14]:
# remove Date
df_train = df_train.drop('Date', axis=1)
df_validation = df_validation.drop('Date', axis=1)

### Train model

In [15]:
X_train, y_train = get_x_y(df_train)

# checking shapes
print('X: ' + str(X_train.shape))
print('y: ' + str(y_train.shape))

X: (371913, 52)
y: (371913,)


In [16]:
model = train_model(X_train, y_train)

### Evaluate model

In [17]:
X_val, y_val = get_x_y(df_validation)

# checking shapes
print('X: ' + str(X_val.shape))
print('y: ' + str(y_val.shape))

X: (51597, 52)
y: (51597,)


In [18]:
score = eval_model(X_val, y_val, model)
print(score)

0.1301051044448495
