# PREPROCESSING

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
from IPython.display import clear_output

In [None]:
# Read the raw train and test splits from disk.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Remember how many rows came from the test split.
len_test = df_test.shape[0]

# Stack train on top of test with a fresh 0..n-1 index:
# the temporal features need both splits in one frame.
df = pd.concat((df_train, df_test), ignore_index=True)

In [None]:
# preview the first rows of the combined train + test frame
df.head()

In [None]:
# show sample row: the first row of the frame, selected by position
df.iloc[0]

In [None]:
## Missing Values
# Cloud coverage: where it is missing, impute 0 when the row has no weather
# events and 8 when it has some.
# Done with boolean masks instead of a per-row loop: same result, far faster,
# and `isna()` does not depend on NaN object identity the way `is np.nan` does.
cloud_missing = df['CloudCover'].isna()
no_events = df['Events'].isna()
df.loc[cloud_missing & no_events, 'CloudCover'] = 0
df.loc[cloud_missing & ~no_events, 'CloudCover'] = 8

# Max gust speed: fall back to the max wind speed where it is missing.
df['Max_Gust_SpeedKm_h'] = df['Max_Gust_SpeedKm_h'].fillna(df['Max_Wind_SpeedKm_h'])

In [None]:
## Date Features
# Parse the day/month/year strings into datetimes.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Calendar features derived from the date.
# df['DayN'] = df['Date'].dt.dayofyear    # probably not useful
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['Month'] = df['Date'].dt.month
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is its replacement. Cast back to a plain integer
# column (assumes there are no missing dates -- TODO confirm).
df['Week'] = df['Date'].dt.isocalendar().week.astype(int)
df['Quarter'] = df['Date'].dt.quarter


In [None]:
## Temporal Features
# Per-store lag/lead features built with time-based rolling windows.
# NOTE(review): time-based rolling needs each store's dates to be sorted and
# the per-store dates to be unique for the index alignment below -- presumably
# true for the input files; confirm.
temporal_cols = [
    'IsOpen_yesterday',
    'IsOpen_tomorrow',
    'IsHoliday_yesterday',
    'IsHoliday_tomorrow',
    'NumberOfSales_yesterday',
    'NumberOfSales_lastweek',
    'NumberOfSales_lastmonth',
]

# Create the columns as NaN. (np.empty would leave uninitialised values in
# any row that is never overwritten by the loop below.)
for col in temporal_cols:
    df[col] = np.nan


def _sum_before(series, window):
    """Sum of `series` over the time `window` that ends just before each row.

    closed='left' excludes the row's own timestamp from the window;
    min_periods=1 yields a value as soon as one observation is in the window.
    """
    return series.rolling(window=window, closed='left', min_periods=1).sum()


for store in df.StoreID.unique():
    clear_output()
    print("Working on {}".format(store))
    store_mask = df.StoreID == store
    # Explicit copy: columns are assigned on it below, and doing that on a
    # slice of df raises SettingWithCopyWarning.
    temp = df.loc[store_mask].copy()
    # switch index to timestamps to make this easier
    temp.index = temp['Date']

    # "yesterday": a one-day window that stops before the current day.
    temp['IsOpen_yesterday'] = _sum_before(temp.IsOpen, '1D')
    temp['IsHoliday_yesterday'] = _sum_before(temp.IsHoliday, '1D')
    temp['NumberOfSales_yesterday'] = _sum_before(temp.NumberOfSales, '1D')

    # "tomorrow": take the "yesterday" series and move its timestamps two
    # days back, so that each date is labelled with the following day's value.
    # Assignment aligns on the date index; dates without a match get NaN.
    temp['IsOpen_tomorrow'] = _sum_before(temp.IsOpen, '1D').shift(-2, freq='1D')
    temp['IsHoliday_tomorrow'] = _sum_before(temp.IsHoliday, '1D').shift(-2, freq='1D')

    # Sales totals over the previous 7 and 30 days (current day excluded).
    temp['NumberOfSales_lastweek'] = _sum_before(temp.NumberOfSales, '7D')
    temp['NumberOfSales_lastmonth'] = _sum_before(temp.NumberOfSales, '30D')

    # Put the new features back in the dataframe. temp holds exactly the
    # masked rows in the same order, so a positional write is safe and only
    # the feature columns need to be touched.
    df.loc[store_mask, temporal_cols] = temp[temporal_cols].values

# Warning: the "tomorrow" values in the test set end up as NaN;
# they have to be filled in by hand here, or the computation changed.

# drop rows at the beginning where we have no past information
# NB: this can be skipped if those features are removed
# NOTE(review): this drops the first 30 rows of the whole frame, not the
# first 30 days of every store -- confirm that this is what is intended.
df = df.iloc[30:]

In [None]:
# One-Hot Encoding
# NB: pd.get_dummies drops the original (encoded) columns itself

# ## StoreId
# df = pd.get_dummies(df, columns=['StoreID'], prefix='StoreID')

 ## StoreType
# df = pd.get_dummies(df, columns=['StoreType'], prefix='StoreType')

 ## AssortmentType
# df = pd.get_dummies(df, columns=['AssortmentType'], prefix='AssortmentType')

# ## Region
# df = pd.get_dummies(df, columns=['Region'], prefix='Region')

# ## Events
# df = pd.get_dummies(df, columns=['Events'], prefix='Events', dummy_na=True)
### unnecessary if we can use categorical features directly with a decision tree

# numeric features to categories (strings)
# df.StoreID = df.StoreID.astype(str)
# df.Region = df.Region.astype(str)


In [None]:
# Drop useless columns
# The two region attributes never change; Region and Region_PopulationK are
# kept (dropping Region_PopulationK is left disabled).
unused_columns = [
    'Date',
    'NumberOfCustomers',
    'WindDirDegrees',
    'Region_AreaKM2',
    'Region_GDP',
    # 'Region_PopulationK',
]
df = df.drop(unused_columns, axis=1)

In [None]:
# Keep only the rows where the store is open (per the original note, the
# number of sales is always 0 otherwise), then discard the IsOpen flag,
# which is constant after the filter.
open_rows = df['IsOpen'] == 1
df = df.loc[open_rows].drop('IsOpen', axis=1)

In [None]:
# preview the first rows of the preprocessed frame
df.head()

In [None]:
# list the columns that remain
print(df.columns.tolist())

In [None]:
# save preprocessed data
# Split by row origin, not by row count: rows have been removed since the
# concat (leading rows, closed stores), so the last `len_test` rows are no
# longer guaranteed to be exactly the test rows. The concat used
# ignore_index=True and the index labels are never reset afterwards, so a
# label below len(df_train) marks a train row and anything else a test row.
is_test_row = df.index >= len(df_train)
df[~is_test_row].to_csv('preprocessed_train.csv', index=False)
df[is_test_row].to_csv('preprocessed_test.csv', index=False)

In [None]:
### REMEMBER
# the test-set sales have to be added and fixed up at runtime