# TRAIN-TEST SPLIT

In [124]:
import pandas as pd
import numpy as np
import datetime as dt
from IPython.display import clear_output
from sklearn.ensemble import RandomForestRegressor

In [116]:
# load csv to dataframe
df = pd.read_csv('train.csv')

In [117]:
# create split date

split = '1/12/2017'
split = pd.to_datetime(split, format='%d/%m/%Y')

In [118]:
## Missing Values
# cloud coverage: 0 if no events, 8 if events
for row in range(len(df)):
    if row % 10000 == 0:
        clear_output()
        print("Working on row {}".format(row))
    if np.isnan(df.loc[row, 'CloudCover']):
        if df.loc[row, 'Events'] is np.nan:
            df.loc[row, 'CloudCover'] = 0
        else:
            df.loc[row, 'CloudCover'] = 8

# max gust speed = max wind speed
df.Max_Gust_SpeedKm_h = df.Max_Gust_SpeedKm_h.fillna(df.Max_Wind_SpeedKm_h)

Working on row 520000


In [119]:
## Date Features
# convert date to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# add features
# df['DayN']=df['Date'].dt.dayofyear    # non credo possa servire
df['DayOfWeek']=df['Date'].dt.dayofweek
df['Month']=df['Date'].dt.month
df['Week']=df['Date'].dt.weekofyear
df['Quarter']=df['Date'].dt.quarter

In [120]:
## Temporal Features
# create columns
df['IsOpen_yesterday'] = np.empty(len(df))
df['IsOpen_tomorrow'] = np.empty(len(df))
df['IsHoliday_yesterday'] = np.empty(len(df))
df['IsHoliday_tomorrow'] = np.empty(len(df))
df['NumberOfSales_yesterday'] = np.empty(len(df))
df['NumberOfSales_lastweek'] = np.empty(len(df))
df['NumberOfSales_lastmonth'] = np.empty(len(df))

for store in df.StoreID.unique():
    clear_output()
    print("Working on {}".format(store))
    temp = df.loc[df.StoreID == store]
    # switch index to timestamps to make this easier
    oldindex = temp.index
    temp.index = temp['Date']
    
    temp['IsOpen_yesterday'] = temp.IsOpen.rolling(window='1d',closed='left', min_periods=1).sum()
    temp['IsOpen_tomorrow'] = temp.IsOpen.rolling(window='1d',closed='left', min_periods=1).sum().shift(-2, '1d')
    temp['IsHoliday_yesterday'] = temp.IsHoliday.rolling(window='1d',closed='left', min_periods=1).sum()
    temp['IsHoliday_tomorrow'] = temp.IsHoliday.rolling(window='1d',closed='left', min_periods=1).sum().shift(-2, '1d')
    temp['NumberOfSales_yesterday'] = temp.NumberOfSales.rolling(window='1d',closed='left', min_periods=1).sum()
    temp['NumberOfSales_lastweek'] = temp.NumberOfSales.rolling(window='7d',closed='left', min_periods=1).sum()
    temp['NumberOfSales_lastmonth'] = temp.NumberOfSales.rolling(window='30d',closed='left', min_periods=1).sum()
    
    # put it back in the dataframe
    temp.index = oldindex
    df.loc[df.StoreID == store] = temp
    
# Attenzione: i valori di tomorrow nel test sono sputtanati a NaN
# vanno messi a mano qui o cambiato il modo di calcolo
    
# drop rows at the beginning where we have no past information
# NB: possiamo fare a meno se togliamo quelle feature
df = df.iloc[30:]

Working on 1748


In [121]:
# Drop useless columns
# df = df.drop('Date', axis=1)
df = df.drop('NumberOfCustomers', axis=1)  
df = df.drop('WindDirDegrees', axis=1)

# questi non cambiano mai, teniamo regione e population
df = df.drop('Region_AreaKM2', axis=1)
df = df.drop('Region_GDP', axis=1)
#df = df.drop('Region_PopulationK', axis=1)

In [122]:
# Drop rows with IsOpen = 0
# -> number of sales is always = 0 
df = df[df.IsOpen == 1]
df = df.drop('IsOpen', axis=1)

In [133]:
# Sistemo i dati per regression tree

# One-Hot Encoding 
# nb: pd.get_dummies rimuove le colonne direttamente

# ## StoreId
# df = pd.get_dummies(df, columns=['StoreID'], prefix='StoreID')

df.drop('StoreID',axis=1)

 ## StoreType
df = pd.get_dummies(df, columns=['StoreType'], prefix='StoreType')

 ## AssortmentType
df = pd.get_dummies(df, columns=['AssortmentType'], prefix='AssortmentType')

# ## Region
df = pd.get_dummies(df, columns=['Region'], prefix='Region')

# ## Events
# df = pd.get_dummies(df, columns=['Events'], prefix='Events', dummy_na=True)
### inutile se possiamo usare categorie con decision tree

# numeric features to categories (strings)
#df.StoreID = df.StoreID.astype(str)
#df.Region = df.Region.astype(str)

In [134]:
# No-Events (NaN) are considered as sunny days, with lowest value (0) on the events scale
df['Events'] = df['Events'].fillna(0)

In [135]:
df=df.replace({'Rain':1, 'Thunderstorm':1, 'Fog':1, 'Snow': 2, 'Fog-Rain': 2, 'Rain-Thunderstorm': 2, 'Rain-Snow':2, 'Fog-Snow':2, 'Fog-Rain-Snow':3, 'Rain-Hail':3, 'Snow-Hail':3, 'Rain-Snow-Hail':3, 'Fog-Rain-Hail':3, 'Fog-Thunderstorm':3, 'Fog-Rain-Thunderstorm':4, 'Fog-Snow-Hail':4, 'Fog-Rain-Snow-Hail':4, 'Rain-Snow-Thunderstorm':4, 'Rain-Hail-Thunderstorm':4, 'Fog-Rain-Hail-Thunderstorm':4, 'Rain-Snow-Hail-Thunderstorm':4})


array([0, 1, 2, 3, 4])

In [136]:
# train test split

df_train = df[df.Date < split]
df_validation = df[df.Date >= split]

In [142]:
# remove Date

df_train = df_train.drop('Date', axis=1)
df_validation = df_validation.drop('Date', axis=1)

In [143]:
# split trainset in data and target

y = df_train["NumberOfSales"]
X = df_train.drop('NumberOfSales', axis=1)

In [144]:
# Looking for features with NaN values
null_cols = []
print('Features with NaN:')
for col in X.columns:
    if X[col].isnull().values.any():
        print(col)
        null_cols.append(col)

Features with NaN:
Max_VisibilityKm
Mean_VisibilityKm
Min_VisibilitykM
IsOpen_yesterday
IsOpen_tomorrow
IsHoliday_yesterday
IsHoliday_tomorrow
NumberOfSales_yesterday
NumberOfSales_lastweek
NumberOfSales_lastmonth


In [145]:
# getting indices (rows) of all NaN values
inds = pd.isnull(X).any(1).nonzero()[0]

# drop all the rows with NaN values
y = y.drop(y.index[inds])
X = X.drop(X.index[inds])

In [146]:
# checking shapes
print('X: ' + str(X.shape))
print('y: ' + str(y.shape))

X: (370616, 55)
y: (370616,)


In [147]:
# fit random forest with 250 trees
forest = RandomForestRegressor(n_estimators=250, random_state=0)
forest.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [148]:
# split validation in data and target

y_val = df_validation["NumberOfSales"]
X_val = df_validation.drop('NumberOfSales', axis=1)

In [150]:
# Looking for features with NaN values
null_cols = []
print('Features with NaN:')
for col in X_val.columns:
    if X_val[col].isnull().values.any():
        print(col)
        null_cols.append(col)

Features with NaN:
Max_VisibilityKm
Mean_VisibilityKm
Min_VisibilitykM
IsOpen_tomorrow
IsHoliday_tomorrow


In [153]:
# getting indices (rows) of all NaN values
inds = pd.isnull(X_val).any(1).nonzero()[0]

# drop all the rows with NaN values
y_val = y_val.drop(y_val.index[inds])
X_val = X_val.drop(X_val.index[inds])

In [154]:
# predict
y_pred = forest.predict(X_val)

In [226]:
def eval(X_val, y_val, y_pred):
    e_r = []
    
    # adjust shape
    X_val = X_val.reset_index(drop=True)
    y_pred = y_pred.tolist()
    y_val = y_val.tolist()

    for i in range(11):

        error = 0
        y_somma = 0

        region = 'Region_' + str(i)
        indexes = X_val.index[X_val[region] == 1].tolist()

        for j in indexes:
            
            error += abs(y_pred[j] - y_val[j])
            y_somma = y_somma + y_val[j]

        e_r.append(error/y_somma)

    return sum(e_r)/len(e_r)


In [228]:
evaluation = eval(X_val, y_val, y_pred)

Region_0
Region_1
Region_2
Region_3
Region_4
Region_5
Region_6
Region_7
Region_8
Region_9
Region_10


In [229]:
evaluation

0.13169826588577588