# PREPROCESSING

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# load csvs to dataframe
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
len_test = len(df_test)

# we need both for the temporal features
df = pd.concat([df_train, df_test], ignore_index=True)

In [3]:
df.head()

Unnamed: 0,AssortmentType,CloudCover,Date,Events,HasPromotions,IsHoliday,IsOpen,Max_Dew_PointC,Max_Gust_SpeedKm_h,Max_Humidity,...,NumberOfCustomers,NumberOfSales,Precipitationmm,Region,Region_AreaKM2,Region_GDP,Region_PopulationK,StoreID,StoreType,WindDirDegrees
0,General,8.0,01/03/2016,Rain-Snow,0,0,1,1,,100,...,495.0,5676.0,0.0,7,9643,17130,2770,1000,Hyper Market,23
1,General,8.0,02/03/2016,Snow,0,0,1,0,,87,...,608.0,8111.0,0.0,7,9643,17130,2770,1000,Hyper Market,56
2,General,8.0,04/03/2016,Rain,0,0,1,0,,81,...,665.0,8300.0,0.0,7,9643,17130,2770,1000,Hyper Market,22
3,General,6.0,05/03/2016,,0,0,1,-3,,80,...,630.0,7154.0,0.0,7,9643,17130,2770,1000,Hyper Market,108
4,General,6.0,06/03/2016,,0,0,0,0,,93,...,0.0,0.0,0.0,7,9643,17130,2770,1000,Hyper Market,46


In [4]:
# show sample row
df.iloc[0]

AssortmentType                     General
CloudCover                               8
Date                            01/03/2016
Events                           Rain-Snow
HasPromotions                            0
IsHoliday                                0
IsOpen                                   1
Max_Dew_PointC                           1
Max_Gust_SpeedKm_h                     NaN
Max_Humidity                           100
Max_Sea_Level_PressurehPa             1032
Max_TemperatureC                         2
Max_VisibilityKm                        19
Max_Wind_SpeedKm_h                      21
Mean_Dew_PointC                         -1
Mean_Humidity                           82
Mean_Sea_Level_PressurehPa            1030
Mean_TemperatureC                        1
Mean_VisibilityKm                       11
Mean_Wind_SpeedKm_h                     16
Min_Dew_PointC                          -2
Min_Humidity                            70
Min_Sea_Level_PressurehPa             1029
Min_Tempera

In [5]:
## Missing Values
# cloud coverage: 0 if no events, 8 if events
for row in range(len(df)):
    if row % 10000 == 0:
        print("Working on row {}".format(row))
    if np.isnan(df.loc[row, 'CloudCover']):
        if df.loc[row, 'Events'] is np.nan:
            df.loc[row, 'CloudCover'] = 0
        else:
            df.loc[row, 'CloudCover'] = 8

# max gust speed = max wind speed
df.Max_Gust_SpeedKm_h = df.Max_Gust_SpeedKm_h.fillna(df.Max_Wind_SpeedKm_h)

Working on row 0
Working on row 10000
Working on row 20000
Working on row 30000
Working on row 40000
Working on row 50000
Working on row 60000
Working on row 70000
Working on row 80000
Working on row 90000
Working on row 100000
Working on row 110000
Working on row 120000
Working on row 130000
Working on row 140000
Working on row 150000
Working on row 160000
Working on row 170000
Working on row 180000
Working on row 190000
Working on row 200000
Working on row 210000
Working on row 220000
Working on row 230000
Working on row 240000
Working on row 250000
Working on row 260000
Working on row 270000
Working on row 280000
Working on row 290000
Working on row 300000
Working on row 310000
Working on row 320000
Working on row 330000
Working on row 340000
Working on row 350000
Working on row 360000
Working on row 370000
Working on row 380000
Working on row 390000
Working on row 400000
Working on row 410000
Working on row 420000
Working on row 430000
Working on row 440000
Working on row 450000
Wo

In [6]:
## Date Features
# convert date to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# add features
# df['DayN']=df['Date'].dt.dayofyear    # non credo possa servire
df['DayOfWeek']=df['Date'].dt.dayofweek
df['Month']=df['Date'].dt.month
df['Week']=df['Date'].dt.weekofyear
df['Quarter']=df['Date'].dt.quarter


In [None]:
## Temporal Features
# create columns
df['IsOpen_yesterday'] = np.empty(len(df))
df['IsOpen_tomorrow'] = np.empty(len(df))
df['IsHoliday_yesterday'] = np.empty(len(df))
df['IsHoliday_tomorrow'] = np.empty(len(df))
df['NumberOfSales_yesterday'] = np.empty(len(df))
df['NumberOfSales_lastweek'] = np.empty(len(df))
df['NumberOfSales_lastmonth'] = np.empty(len(df))

for store in df.StoreID.unique():
    print("Working on {}".format(store))
    temp = df.loc[df.StoreID == store]
    # switch index to timestamps to make this easier
    oldindex = temp.index
    temp.index = temp['Date']
    
    temp['IsOpen_yesterday'] = temp.IsOpen.rolling(window='1d',closed='left', min_periods=1).sum()
    temp['IsOpen_tomorrow'] = temp.IsOpen.rolling(window='1d',closed='left', min_periods=1).sum().shift(-2, '1d')
    temp['IsHoliday_yesterday'] = temp.IsHoliday.rolling(window='1d',closed='left', min_periods=1).sum()
    temp['IsHoliday_tomorrow'] = temp.IsHoliday.rolling(window='1d',closed='left', min_periods=1).sum().shift(-2, '1d')
    temp['NumberOfSales_yesterday'] = temp.NumberOfSales.rolling(window='1d',closed='left', min_periods=1).sum()
    temp['NumberOfSales_lastweek'] = temp.NumberOfSales.rolling(window='7d',closed='left', min_periods=1).sum()
    temp['NumberOfSales_lastmonth'] = temp.NumberOfSales.rolling(window='30d',closed='left', min_periods=1).sum()
    
    # put it back in the dataframe
    temp.index = oldindex
    df.loc[df.StoreID == store] = temp
    
# Attenzione: i valori di tomorrow nel test sono sputtanati a NaN
# vanno messi a mano qui o cambiato il modo di calcolo
    
# drop rows at the beginning where we have no past information
# NB: possiamo fare a meno se togliamo quelle feature
df = df.iloc[30:]

Working on 1000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Working on 1001
Working on 1002
Working on 1003
Working on 1004
Working on 1005
Working on 1006
Working on 1007
Working on 1008
Working on 1009
Working on 1010
Working on 1011
Working on 1012
Working on 1013
Working on 1014
Working on 1015
Working on 1016
Working on 1017
Working on 1018
Working on 1019
Working on 1020
Working on 1021
Working on 1022
Working on 1023
Working on 1024
Working on 1025
Working on 1026
Working on 1027
Working on 1028
Working on 1029
Working on 1030
Working on 1031
Working on 1032
Working on 1033
Working on 1034
Working on 1035
Working on 1036
Working on 1037
Working on 1038
Working on 1039
Working on 1040
Working on 1041
Working on 1042
Working on 1043
Working on 1044
Working on 1045
Working on 1046
Working on 1047
Working on 1048
Working on 1049
Working on 1050
Working on 1051
Working on 1052
Working on 1053
Working on 1054
Working on 1055
Working on 1056
Working on 1057
Working on 1058
Working on 1059
Working on 1060
Working on 1061
Working on 1062
Working 

Working on 1514
Working on 1515
Working on 1516
Working on 1517
Working on 1518
Working on 1519
Working on 1520
Working on 1521
Working on 1522
Working on 1523
Working on 1524
Working on 1525
Working on 1526
Working on 1527
Working on 1528
Working on 1529
Working on 1530
Working on 1531
Working on 1532
Working on 1533
Working on 1534
Working on 1535
Working on 1536
Working on 1537
Working on 1538
Working on 1539
Working on 1540
Working on 1541
Working on 1542
Working on 1543
Working on 1544
Working on 1545
Working on 1546
Working on 1547
Working on 1548
Working on 1549
Working on 1550
Working on 1551
Working on 1552
Working on 1553
Working on 1554
Working on 1555
Working on 1556
Working on 1557
Working on 1558
Working on 1559
Working on 1560
Working on 1561
Working on 1562
Working on 1563
Working on 1564
Working on 1565
Working on 1566
Working on 1567
Working on 1568
Working on 1569
Working on 1570
Working on 1571
Working on 1572
Working on 1573
Working on 1574
Working on 1575
Working 

In [None]:
# One-Hot Encoding
# nb: pd.get_dummies rimuove le colonne direttamente

# ## StoreId
# df = pd.get_dummies(df, columns=['StoreID'], prefix='StoreID')

# ## StoreType
# df = pd.get_dummies(df, columns=['StoreType'], prefix='StoreType')

# ## AssortmentType
# df = pd.get_dummies(df, columns=['AssortmentType'], prefix='AssortmentType')

# ## Region
# df = pd.get_dummies(df, columns=['Region'], prefix='Region')

# ## Events
# df = pd.get_dummies(df, columns=['Events'], prefix='Events', dummy_na=True)
### inutile se possiamo usare categorie con decision tree

# numeric features to categories (strings)
df.StoreID = df.StoreID.astype(str)
df.Region = df.Region.astype(str)


In [None]:
# Drop useless columns
df = df.drop('Date', axis=1)
df = df.drop('NumberOfCustomers', axis=1)  
df = df.drop('WindDirDegrees', axis=1)

# questi non cambiano mai, teniamo regione e population
df = df.drop('Region_AreaKM2', axis=1)
df = df.drop('Region_GDP', axis=1)
#df = df.drop('Region_PopulationK', axis=1)

In [None]:
df.head()

In [None]:
print(list(df.columns))

In [None]:
# save preprocessed data
df[:-len_test].to_csv('preprocessed_train.csv',index=False)
df[-len_test:].to_csv('preprocessed_test.csv', index=False)

In [None]:
### REMEMBER
# le sales di testing vanno aggiunte e sistemate a runtime

In [None]:
# ###############################    IGNORARE QUESTA CELLA
# import numpy as np
# import matplotlib.pyplot as plt
# import pandas as pd
# import random

# from sklearn import datasets
# from sklearn import linear_model
# from sklearn import naive_bayes
# from sklearn import neighbors
# from sklearn import linear_model
# from sklearn.ensemble import ExtraTreesRegressor

# from sklearn.feature_selection import SelectFromModel
# from sklearn.feature_selection import VarianceThreshold

# from sklearn.decomposition import PCA

# from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import KFold
# from sklearn.model_selection import cross_val_score


# kfolds = KFold(10,shuffle=True,random_state=1234)
# model = linear_model.LinearRegression()
# X=pd.read_csv('preprocessed_train.csv')
# X.drop('NumberOfSales',axis=1)
# X.head()
# y=df['NumberOfSales']