# PREPROCESSING

In [85]:
import pandas as pd
import numpy as np
import datetime as dt
from IPython.display import clear_output

## Run with train.csv once and then test.csv

In [86]:
# load csvs to dataframe
#df = pd.read_csv('train.csv')
df = pd.read_csv('test.csv')

In [87]:
df.head()

Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,StoreType,AssortmentType,NearestCompetitor,Region,Region_AreaKM2,...,Mean_TemperatureC,Mean_VisibilityKm,Mean_Wind_SpeedKm_h,Min_Dew_PointC,Min_Humidity,Min_Sea_Level_PressurehPa,Min_TemperatureC,Min_VisibilitykM,Precipitationmm,WindDirDegrees
0,1000,01/03/2018,0,1,0,Hyper Market,General,326,7,9643,...,6,14.0,5,-1,46,1011,2,10.0,0.0,180
1,1000,02/03/2018,0,1,0,Hyper Market,General,326,7,9643,...,4,12.0,6,-1,52,1009,3,10.0,5.08,315
2,1000,03/03/2018,0,1,0,Hyper Market,General,326,7,9643,...,3,13.0,11,-3,41,1013,-2,10.0,0.0,210
3,1000,04/03/2018,0,0,0,Hyper Market,General,326,7,9643,...,6,11.0,18,-2,65,1002,1,6.0,3.05,193
4,1000,05/03/2018,0,1,1,Hyper Market,General,326,7,9643,...,6,10.0,23,-5,25,1000,2,8.0,0.25,247


In [88]:
# show sample row
df.iloc[0]

StoreID                               1000
Date                            01/03/2018
IsHoliday                                0
IsOpen                                   1
HasPromotions                            0
StoreType                     Hyper Market
AssortmentType                     General
NearestCompetitor                      326
Region                                   7
Region_AreaKM2                        9643
Region_GDP                           17130
Region_PopulationK                    2770
CloudCover                               6
Events                                Rain
Max_Dew_PointC                           3
Max_Gust_SpeedKm_h                     NaN
Max_Humidity                            95
Max_Sea_Level_PressurehPa             1022
Max_TemperatureC                         9
Max_VisibilityKm                        31
Max_Wind_SpeedKm_h                      18
Mean_Dew_PointC                          2
Mean_Humidity                           75
Mean_Sea_Le

## Missing values : CloudCover
We consider cloud cover = 0 when no events occur, 8 otherwise

In [89]:
## Missing Values
# cloud coverage: 0 if no events, 8 if events
for row in range(len(df)):
    if row % 10000 == 0:
        clear_output()
        print("Working on row {}".format(row))
    if np.isnan(df.loc[row, 'CloudCover']):
        if df.loc[row, 'Events'] is np.nan:
            df.loc[row, 'CloudCover'] = 0
        else:
            df.loc[row, 'CloudCover'] = 8

# max gust speed = max wind speed
df.Max_Gust_SpeedKm_h = df.Max_Gust_SpeedKm_h.fillna(df.Max_Wind_SpeedKm_h)

Working on row 40000


In [90]:
df.iloc[0]

StoreID                               1000
Date                            01/03/2018
IsHoliday                                0
IsOpen                                   1
HasPromotions                            0
StoreType                     Hyper Market
AssortmentType                     General
NearestCompetitor                      326
Region                                   7
Region_AreaKM2                        9643
Region_GDP                           17130
Region_PopulationK                    2770
CloudCover                               6
Events                                Rain
Max_Dew_PointC                           3
Max_Gust_SpeedKm_h                      18
Max_Humidity                            95
Max_Sea_Level_PressurehPa             1022
Max_TemperatureC                         9
Max_VisibilityKm                        31
Max_Wind_SpeedKm_h                      18
Mean_Dew_PointC                          2
Mean_Humidity                           75
Mean_Sea_Le

## Add features
Add new features:
- day of the week 
- month 
- week of the year 
- quarter of the year 

In [91]:
## Date Features
# convert date to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# add features
# df['DayN']=df['Date'].dt.dayofyear    # non credo possa servire
df['DayOfWeek']=df['Date'].dt.dayofweek
df['Month']=df['Date'].dt.month
df['Week']=df['Date'].dt.weekofyear
df['Quarter']=df['Date'].dt.quarter

In [92]:
## Temporal Features
# create columns
df['IsOpen_yesterday'] = np.empty(len(df))
df['IsOpen_tomorrow'] = np.empty(len(df))
df['IsHoliday_yesterday'] = np.empty(len(df))
df['IsHoliday_tomorrow'] = np.empty(len(df))

for store in df.StoreID.unique():
    clear_output()
    print("Working on {}".format(store))
    temp = df.loc[df.StoreID == store]
    # switch index to timestamps to make this easier
    oldindex = temp.index
    temp.index = temp['Date']
    
    temp['IsOpen_yesterday'] = temp.IsOpen.rolling(window='1d',closed='left', min_periods=1).sum()
    temp['IsOpen_tomorrow'] = temp.IsOpen.rolling(window='1d',closed='left', min_periods=1).sum().shift(-2, '1d')
    temp['IsHoliday_yesterday'] = temp.IsHoliday.rolling(window='1d',closed='left', min_periods=1).sum()
    temp['IsHoliday_tomorrow'] = temp.IsHoliday.rolling(window='1d',closed='left', min_periods=1).sum().shift(-2, '1d')  
    
    # put it back in the dataframe
    temp.index = oldindex
    df.loc[df.StoreID == store] = temp
    

Working on 1748


In [93]:
df.iloc[0]

StoreID                                      1000
Date                          2018-03-01 00:00:00
IsHoliday                                       0
IsOpen                                          1
HasPromotions                                   0
StoreType                            Hyper Market
AssortmentType                            General
NearestCompetitor                             326
Region                                          7
Region_AreaKM2                               9643
Region_GDP                                  17130
Region_PopulationK                           2770
CloudCover                                      6
Events                                       Rain
Max_Dew_PointC                                  3
Max_Gust_SpeedKm_h                             18
Max_Humidity                                   95
Max_Sea_Level_PressurehPa                    1022
Max_TemperatureC                                9
Max_VisibilityKm                               31


In [94]:
# No-Events (NaN) are considered as sunny days, with lowest value (0) on the events scale
df['Events'] = df['Events'].fillna(0)

df=df.replace({'Rain':1, 'Thunderstorm':1, 'Fog':1, 'Snow': 2, 'Fog-Rain': 2, 'Rain-Thunderstorm': 2, 'Rain-Snow':2, 'Fog-Snow':2, 'Fog-Rain-Snow':3, 'Rain-Hail':3, 'Snow-Hail':3, 'Rain-Snow-Hail':3, 'Fog-Rain-Hail':3, 'Fog-Thunderstorm':3, 'Fog-Rain-Thunderstorm':4, 'Fog-Snow-Hail':4, 'Fog-Rain-Snow-Hail':4, 'Rain-Snow-Thunderstorm':4, 'Rain-Hail-Thunderstorm':4, 'Fog-Rain-Hail-Thunderstorm':4, 'Rain-Snow-Hail-Thunderstorm':4})

df['Events'].unique()

array([1, 2, 0, 3, 4], dtype=int64)

In [95]:
# Get columns with NaN values
null_cols = []
print('Features with NaN:')
for col in df.columns:
    if df[col].isnull().values.any():
        print(col)
        null_cols.append(col)
 

Features with NaN:
Max_VisibilityKm
Mean_VisibilityKm
Min_VisibilitykM
IsOpen_yesterday
IsOpen_tomorrow
IsHoliday_yesterday
IsHoliday_tomorrow


### Drop columns
- NumberOfCustumers : not present in  the testset
- WindDirDegrees : useless

In [96]:
# Drop useless columns
df['NumberOfCustomers'] = np.nan
df = df.drop('NumberOfCustomers', axis=1)  
df = df.drop('WindDirDegrees', axis=1)
df = df.drop('Max_VisibilityKm', axis=1)
df = df.drop('Mean_VisibilityKm', axis=1)
df = df.drop('Min_VisibilitykM', axis=1)

### Drop columns with the same information
Region_AreaKM2, Region_GDP and Region_PupolationK have the same info (different number for each region). Keep only one of them.

In [97]:
# questi non cambiano mai, teniamo regione e population
df = df.drop('Region_AreaKM2', axis=1)
df = df.drop('Region_GDP', axis=1)

In [98]:
# Drop rows with IsOpen = 0
# -> number of sales is always = 0 
df = df[df.IsOpen == 1]
df = df.drop('IsOpen', axis=1)

In [99]:
df.iloc[0]

StoreID                                      1000
Date                          2018-03-01 00:00:00
IsHoliday                                       0
HasPromotions                                   0
StoreType                            Hyper Market
AssortmentType                            General
NearestCompetitor                             326
Region                                          7
Region_PopulationK                           2770
CloudCover                                      6
Events                                          1
Max_Dew_PointC                                  3
Max_Gust_SpeedKm_h                             18
Max_Humidity                                   95
Max_Sea_Level_PressurehPa                    1022
Max_TemperatureC                                9
Max_Wind_SpeedKm_h                             18
Mean_Dew_PointC                                 2
Mean_Humidity                                  75
Mean_Sea_Level_PressurehPa                   1019


In [100]:
df.IsOpen_yesterday.fillna(1, inplace = True)
df.IsOpen_tomorrow.fillna(1, inplace = True)
df.IsHoliday_yesterday.fillna(1, inplace = True)
df.IsHoliday_tomorrow.fillna(1, inplace = True)

In [101]:
# Get columns with NaN values
null_cols = []
print('Features with NaN:')
for col in df.columns:
    if df[col].isnull().values.any():
        print(col)
        null_cols.append(col)
 

Features with NaN:


In [102]:
# save preprocessed data
#df.to_csv('preprocessed_train.csv', index=False)
df.to_csv('preprocessed_test.csv', index=False)

In [70]:
df

Unnamed: 0,StoreID,Date,IsHoliday,HasPromotions,StoreType,AssortmentType,NearestCompetitor,Region,NumberOfSales,Region_PopulationK,...,Min_TemperatureC,Precipitationmm,DayOfWeek,Month,Week,Quarter,IsOpen_yesterday,IsOpen_tomorrow,IsHoliday_yesterday,IsHoliday_tomorrow
0,1000,2016-03-01,0,0,Hyper Market,General,326,7,5676,2770,...,1,0.00,1,3,9,1,1.0,1.0,1.0,1.0
1,1000,2016-03-02,0,0,Hyper Market,General,326,7,8111,2770,...,1,0.00,2,3,9,1,1.0,1.0,0.0,1.0
2,1000,2016-03-04,0,0,Hyper Market,General,326,7,8300,2770,...,2,0.00,4,3,9,1,1.0,1.0,1.0,0.0
3,1000,2016-03-05,0,0,Hyper Market,General,326,7,7154,2770,...,-1,0.00,5,3,9,1,1.0,0.0,0.0,0.0
5,1000,2016-03-07,0,1,Hyper Market,General,326,7,10110,2770,...,-2,0.00,0,3,10,1,0.0,1.0,0.0,0.0
6,1000,2016-03-08,0,1,Hyper Market,General,326,7,9019,2770,...,-2,0.00,1,3,10,1,1.0,1.0,0.0,0.0
7,1000,2016-03-09,0,1,Hyper Market,General,326,7,8804,2770,...,-1,0.00,2,3,10,1,1.0,1.0,0.0,0.0
8,1000,2016-03-10,0,1,Hyper Market,General,326,7,7823,2770,...,6,0.00,3,3,10,1,1.0,1.0,0.0,0.0
9,1000,2016-03-11,0,1,Hyper Market,General,326,7,7989,2770,...,7,0.51,4,3,10,1,1.0,1.0,0.0,0.0
10,1000,2016-03-12,0,0,Hyper Market,General,326,7,5895,2770,...,6,0.00,5,3,10,1,1.0,0.0,0.0,0.0
