### TRAIN-TEST SPLIT

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from IPython.display import clear_output
from sklearn.ensemble import RandomForestRegressor

In [2]:
def find_nan_features(df):
    """Return the names of the columns of ``df`` holding at least one NaN.

    The names are listed in the same order as ``df.columns``; the list is
    empty when no column contains a missing value.
    """
    return [name for name in df.columns if df[name].isnull().values.any()]

In [3]:
def remove_nan_rows(df):
    """Return a new frame containing only the rows of ``df`` without any NaN.

    The input frame is not modified and the surviving rows keep their
    original index labels.
    """
    # dropna replaces the former ``pd.isnull(df).any(1).nonzero()`` lookup:
    # ``Series.nonzero`` and the positional axis argument of ``any`` are no
    # longer available in current pandas, and dropping by label could also
    # remove clean rows that share an index label with a NaN row.
    return df.dropna(axis=0, how='any')

In [4]:
def split(df, train_fraction):
    """Split ``df`` chronologically on its ``Date`` column.

    The cut-off date lies ``train_fraction`` of the way between the earliest
    and the latest date. Rows strictly before the cut-off form the train
    part, rows on or after it form the test part.

    Returns the tuple ``(train, test)``.
    """
    earliest, latest = df.Date.min(), df.Date.max()
    cutoff = earliest + (latest - earliest) * train_fraction

    before_cutoff = df.Date < cutoff
    from_cutoff = df.Date >= cutoff
    return df[before_cutoff], df[from_cutoff]

In [5]:
def get_x_y(df):
    """Separate ``df`` into features and target.

    Returns ``(X, y)`` where ``X`` is ``df`` without the ``NumberOfSales``
    column and ``y`` is that column on its own.
    """
    target = 'NumberOfSales'
    features = df.drop(target, axis=1)
    return features, df[target]

In [6]:
def train_model(X_train, y_train):
    """Fit a random forest regressor on the training data and return it.

    The forest is built with ``n_estimators=250``, ``random_state=0`` and
    ``n_jobs=3``.
    """
    # ``fit`` returns the fitted estimator itself, so the call can be chained
    return RandomForestRegressor(
        n_estimators=250,
        random_state=0,
        n_jobs=3,
    ).fit(X_train, y_train)

### Load dataset

In [7]:
# Load the preprocessed training CSV (path relative to the working directory)
# into the DataFrame used by the following cells.
df = pd.read_csv('preprocessed_train.csv')

In [8]:
# Build two lookups from the (StoreID, Region) pairs found in df:
#   region_dict: region  -> list of the store ids belonging to it
#   store_dict:  store id -> region
region_dict = {region_id: [] for region_id in range(0, 11)}
store_dict = {}

selected_features = ['StoreID', 'Region']
storeIDs = df.groupby(selected_features)
for (store_id, region), group_rows in storeIDs:
    region_dict[region].append(store_id)
    store_dict[store_id] = region

print(region_dict)
print(store_dict)

{0: [1001, 1009, 1019, 1037, 1038, 1070, 1094, 1132, 1175, 1177, 1185, 1215, 1230, 1251, 1266, 1308, 1325, 1328, 1334, 1347, 1350, 1354, 1372, 1374, 1382, 1383, 1405, 1406, 1441, 1442, 1447, 1449, 1475, 1490, 1493, 1504, 1508, 1514, 1531, 1550, 1560, 1576, 1582, 1596, 1611, 1649, 1664, 1674, 1684, 1689, 1695, 1697, 1710, 1726], 1: [1015, 1024, 1048, 1055, 1067, 1072, 1107, 1144, 1147, 1295, 1302, 1377, 1419, 1461, 1471, 1489, 1568, 1572, 1591, 1605, 1612, 1633, 1713, 1724], 2: [1004, 1016, 1031, 1039, 1043, 1044, 1045, 1051, 1056, 1057, 1068, 1075, 1079, 1082, 1093, 1096, 1097, 1100, 1103, 1105, 1112, 1117, 1119, 1121, 1123, 1131, 1133, 1136, 1154, 1161, 1163, 1166, 1178, 1179, 1201, 1202, 1220, 1226, 1243, 1254, 1256, 1268, 1282, 1285, 1287, 1288, 1290, 1292, 1301, 1335, 1343, 1344, 1345, 1346, 1356, 1358, 1360, 1366, 1368, 1371, 1379, 1391, 1395, 1407, 1410, 1414, 1415, 1417, 1418, 1422, 1423, 1425, 1426, 1435, 1455, 1456, 1473, 1476, 1478, 1483, 1497, 1501, 1503, 1505, 1506, 1509, 1

### Prepare dataset

In [9]:
# Prepare the data for the regression trees.
## StoreID
# StoreID is not dropped here: it is still needed to split the data per store.

## StoreType
df = pd.get_dummies(df, columns=['StoreType'], prefix='StoreType')

## AssortmentType
df = pd.get_dummies(df, columns=['AssortmentType'], prefix='AssortmentType')

## Region
df = pd.get_dummies(df, columns=['Region'], prefix='Region')

## Events
# Weather events are mapped to an ordinal severity scale (0-4).
# No-Events (NaN) are considered as sunny days, with lowest value (0) on the events scale.
events_scale = {
    'Rain': 1, 'Thunderstorm': 1, 'Fog': 1,
    'Snow': 2, 'Fog-Rain': 2, 'Rain-Thunderstorm': 2, 'Rain-Snow': 2, 'Fog-Snow': 2,
    'Fog-Rain-Snow': 3, 'Rain-Hail': 3, 'Snow-Hail': 3, 'Rain-Snow-Hail': 3,
    'Fog-Rain-Hail': 3, 'Fog-Thunderstorm': 3,
    'Fog-Rain-Thunderstorm': 4, 'Fog-Snow-Hail': 4, 'Fog-Rain-Snow-Hail': 4,
    'Rain-Snow-Thunderstorm': 4, 'Rain-Hail-Thunderstorm': 4,
    'Fog-Rain-Hail-Thunderstorm': 4, 'Rain-Snow-Hail-Thunderstorm': 4,
}
# The mapping is applied to the Events column only: replacing on the whole
# frame scans every column and could rewrite matching strings elsewhere.
df['Events'] = df['Events'].fillna(0).replace(events_scale)


In [10]:
# Columns kept for the model, in the order they will have in the frame.
# 'NumberOfSales' is the target, 'StoreID' is used to split the data per
# store, and 'Date' is dropped later, right before training.
selected_features = [
    'NumberOfSales', 'NumberOfSales_lastmonth', 'HasPromotions',
    'NumberOfSales_yesterday', 'NumberOfSales_lastweek', 'IsOpen_yesterday',
    'DayOfWeek', 'NearestCompetitor', 'Week',
    'StoreID', 'IsHoliday_tomorrow', 'StoreType_Hyper Market',
    'Region_PopulationK', 'Month', 'IsOpen_tomorrow',
    'Max_TemperatureC', 'Min_Humidity', 'Min_Sea_Level_PressurehPa',
    'Mean_Wind_SpeedKm_h', 'Mean_Humidity', 'Date',
]

df = df[selected_features]


In [11]:
# Report which features contain NaN values
# (find_nan_features returns the names of the affected columns)
null_cols = find_nan_features(df)
print('Features with NaN:')
for col in null_cols:
    print(col)
    
# Drop every row that has a NaN in any column (see remove_nan_rows)
df = remove_nan_rows(df)

Features with NaN:
NumberOfSales_lastmonth
NumberOfSales_yesterday
NumberOfSales_lastweek
IsOpen_yesterday
IsHoliday_tomorrow
IsOpen_tomorrow


### Split

In [12]:
# Folder where the per-store CSV files are written and later read back
path_to_folder = './validation20VIF/'

In [13]:
# Split df by store id and write one CSV per store into path_to_folder.
for i in range(1000, 1736):
    store_df = df.loc[df['StoreID'] == i]
    # DataFrame.drop returns a new frame: the result has to be reassigned,
    # otherwise the StoreID column is still written to the CSV.
    store_df = store_df.drop('StoreID', axis=1)
    store_df.to_csv(path_to_folder+str(i)+'.csv', index=False)

In [14]:
# Release the full dataset and the last per-store slice; from here on the
# data is read back from the per-store CSV files.
del store_df, df

## Apply training and validation to all stores

In [15]:
# Per-store validation: for every store a random forest is trained on the
# first train_fraction of its date range and evaluated on the remaining days.
# Errors are accumulated per region on monthly sales totals.
train_fraction = 21/24
n_regions = 11
# per region: sum over stores and months of |actual - predicted| monthly sales
region_num_list = [0] * n_regions
# per region: sum over stores and months of the actual monthly sales
region_den_list = [0] * n_regions

for i in range(1000, 1736):
    # wait=True postpones the clearing until the next message is printed,
    # so the progress line is replaced without flickering
    clear_output(wait=True)
    print("Working on store: "+str(i))

    # retrieve the per-store dataset
    df = pd.read_csv(path_to_folder+str(i)+'.csv')
    if df.empty:
        # no rows for this store id: nothing to train or evaluate on
        continue

    # convert date to datetime
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

    # chronological split in train and validation
    df_train, df_validation = split(df, train_fraction)
    if df_train.empty or df_validation.empty:
        # e.g. a single distinct date: the model cannot be fitted and scored
        continue

    # calendar month of every validation row, taken before Date is dropped
    months = pd.DatetimeIndex(df_validation['Date']).month
    df_train = df_train.drop('Date', axis=1)
    df_validation = df_validation.drop('Date', axis=1)

    # train model
    X_train, y_train = get_x_y(df_train)
    model = train_model(X_train, y_train)

    # predict the validation period
    X_val, y_val = get_x_y(df_validation)
    y_pred = model.predict(X_val)
    del model

    # predicted and actual sales summed per calendar month; the grouping is
    # done on a separate frame so the feature matrix is left untouched
    monthly = pd.DataFrame({
        'Month': np.asarray(months),
        'pred': np.asarray(y_pred),
        'actual': y_val.values,
    }).groupby('Month').sum()

    region = store_dict[i]
    # float() keeps the accumulators plain Python numbers
    region_num_list[region] += float((monthly['actual'] - monthly['pred']).abs().sum())
    region_den_list[region] += float(monthly['actual'].sum())

# relative error per region; regions without any validation sales are skipped
# because their ratio would be a division by zero
e_r = [
    region_num_list[r] / region_den_list[r]
    for r in range(n_regions)
    if region_den_list[r] != 0
]

# final score: mean of the regional errors (NaN when no region could be scored)
result = sum(e_r) / len(e_r) if e_r else float('nan')
    
    
    


Working on store: 1000
Working on store: 1001
Working on store: 1002
Working on store: 1003
Working on store: 1004
Working on store: 1005
Working on store: 1006
Working on store: 1007
Working on store: 1008
Working on store: 1009
Working on store: 1010
Working on store: 1011
Working on store: 1012
Working on store: 1013
Working on store: 1014
Working on store: 1015
Working on store: 1016
Working on store: 1017
Working on store: 1018
Working on store: 1019
Working on store: 1020
Working on store: 1021
Working on store: 1022
Working on store: 1023
Working on store: 1024
Working on store: 1025
Working on store: 1026
Working on store: 1027
Working on store: 1028
Working on store: 1029
Working on store: 1030
Working on store: 1031
Working on store: 1032
Working on store: 1033
Working on store: 1034
Working on store: 1035
Working on store: 1036
Working on store: 1037
Working on store: 1038
Working on store: 1039
Working on store: 1040
Working on store: 1041
Working on store: 1042
Working on 

Working on store: 1357
Working on store: 1358
Working on store: 1359
Working on store: 1360
Working on store: 1361
Working on store: 1362
Working on store: 1363
Working on store: 1364
Working on store: 1365
Working on store: 1366
Working on store: 1367
Working on store: 1368
Working on store: 1369
Working on store: 1370
Working on store: 1371
Working on store: 1372
Working on store: 1373
Working on store: 1374
Working on store: 1375
Working on store: 1376
Working on store: 1377
Working on store: 1378
Working on store: 1379
Working on store: 1380
Working on store: 1381
Working on store: 1382
Working on store: 1383
Working on store: 1384
Working on store: 1385
Working on store: 1386
Working on store: 1387
Working on store: 1388
Working on store: 1389
Working on store: 1390
Working on store: 1391
Working on store: 1392
Working on store: 1393
Working on store: 1394
Working on store: 1395
Working on store: 1396
Working on store: 1397
Working on store: 1398
Working on store: 1399
Working on 

Working on store: 1714
Working on store: 1715
Working on store: 1716
Working on store: 1717
Working on store: 1718
Working on store: 1719
Working on store: 1720
Working on store: 1721
Working on store: 1722
Working on store: 1723
Working on store: 1724
Working on store: 1725
Working on store: 1726
Working on store: 1727
Working on store: 1728
Working on store: 1729
Working on store: 1730
Working on store: 1731
Working on store: 1732
Working on store: 1733
Working on store: 1734
Working on store: 1735


In [16]:
print("Result: ", result)

Result:  0.04401165105451335


### Train model

In [17]:
region_num_list

[1032721.3759999999,
 448087.0000000001,
 1492818.6600000004,
 1401675.2079999992,
 211031.516,
 849207.7079999999,
 552464.0679999997,
 1122283.4560000002,
 408306.8039999996,
 3292709.1640000045,
 1135746.8200000003]

In [18]:
region_den_list

[21367831.0,
 8905037.0,
 26634180.0,
 41512390.0,
 8366525.0,
 18053394.0,
 12219753.0,
 23098370.0,
 8910179.0,
 64412870.0,
 34773267.0]

### Evaluate model