In [1]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [2]:
def find_nan_features(df):
    null_cols = []
    for col in df.columns:
        if df[col].isnull().values.any():
            null_cols.append(col)
    return null_cols

In [3]:
def remove_nan_rows(df):
    # getting indices (rows) of all NaN values
    inds = pd.isnull(df).any(1).nonzero()[0]

    # drop all the rows with NaN values
    return df.drop(df.index[inds])

In [4]:
df = pd.read_csv('preprocessed_train_with_avg.csv')

In [5]:
# Sistemo i dati per regression tree
## StoreID
# df.drop('StoreID',axis=1) droppo dopo

## StoreType
df = pd.get_dummies(df, columns=['StoreType'], prefix='StoreType')

## AssortmentType
df = pd.get_dummies(df, columns=['AssortmentType'], prefix='AssortmentType')

## Region
# df = pd.get_dummies(df, columns=['Region'], prefix='Region')

## Events
# No-Events (NaN) are considered as sunny days, with lowest value (0) on the events scale
df['Events'] = df['Events'].fillna(0)
df=df.replace({'Rain':1, 'Thunderstorm':1, 'Fog':1, 'Snow': 2, 'Fog-Rain': 2, 'Rain-Thunderstorm': 2, 'Rain-Snow':2, 'Fog-Snow':2, 'Fog-Rain-Snow':3, 'Rain-Hail':3, 'Snow-Hail':3, 'Rain-Snow-Hail':3, 'Fog-Rain-Hail':3, 'Fog-Thunderstorm':3, 'Fog-Rain-Thunderstorm':4, 'Fog-Snow-Hail':4, 'Fog-Rain-Snow-Hail':4, 'Rain-Snow-Thunderstorm':4, 'Rain-Hail-Thunderstorm':4, 'Fog-Rain-Hail-Thunderstorm':4, 'Rain-Snow-Hail-Thunderstorm':4})


In [6]:
# Look for features with NaN values
null_cols = find_nan_features(df)
print('Features with NaN:')
for col in null_cols:
    print(col)
    
# drop all rows with NaN values
df = remove_nan_rows(df)

Features with NaN:
Max_VisibilityKm
Mean_VisibilityKm
Min_VisibilitykM
IsOpen_yesterday
IsOpen_tomorrow
IsHoliday_yesterday
IsHoliday_tomorrow
NumberOfSales_yesterday
NumberOfSales_lastweek
NumberOfSales_lastmonth


In [7]:
features = set(df.columns.tolist())
toremove = set(['Date', 'NumberOfSales', 'NumberOfSales_yesterday', 
            'NumberOfSales_lastweek', 'NumberOfSales_lastmonth'])
features = list(features - toremove)
X = df[features]
y = df['NumberOfSales']

In [8]:
selector = SelectKBest(f_regression, k=10)
selector.fit(X, y)

SelectKBest(k=10, score_func=<function f_regression at 0x7fa5b77c1158>)

In [9]:
X.columns

Index(['Max_Dew_PointC', 'Year', 'daily_sales', 'Mean_TemperatureC',
       'Min_VisibilitykM', 'Min_Humidity', 'IsOpen_yesterday',
       'IsHoliday_yesterday', 'Events', 'Mean_Sea_Level_PressurehPa',
       'Mean_Dew_PointC', 'Week', 'StoreType_Hyper Market', 'Max_Humidity',
       'Precipitationmm', 'Region', 'StoreType_Shopping Center',
       'Region_PopulationK', 'Max_Sea_Level_PressurehPa', 'Mean_Humidity',
       'HasPromotions', 'month_avg_sales', 'Mean_Wind_SpeedKm_h',
       'NearestCompetitor', 'Max_Wind_SpeedKm_h', 'Max_Gust_SpeedKm_h',
       'Quarter', 'CloudCover', 'IsHoliday_tomorrow', 'AssortmentType_General',
       'Min_Dew_PointC', 'DayOfWeek', 'StoreID', 'Min_TemperatureC',
       'Min_Sea_Level_PressurehPa', 'AssortmentType_With Fish Department',
       'AssortmentType_With Non-Food Department', 'Mean_VisibilityKm',
       'IsOpen_tomorrow', 'IsHoliday', 'Max_VisibilityKm', 'Month',
       'StoreType_Super Market', 'yearly_sales', 'Max_TemperatureC',
       'Stor

In [10]:
z = X.columns[selector.get_support()].tolist()
z

['daily_sales',
 'IsOpen_yesterday',
 'StoreType_Shopping Center',
 'HasPromotions',
 'month_avg_sales',
 'AssortmentType_General',
 'DayOfWeek',
 'AssortmentType_With Non-Food Department',
 'IsOpen_tomorrow',
 'yearly_sales']

In [11]:
for i, score in enumerate(selector.scores_):
    print('{:30s} -> {:9.5f}'.format(X.columns[i], score))

Max_Dew_PointC                 -> 856.26975
Year                           ->   1.04420
daily_sales                    -> 598586.52235
Mean_TemperatureC              -> 752.83250
Min_VisibilitykM               ->  15.67358
Min_Humidity                   -> 201.49472
IsOpen_yesterday               -> 13247.83469
IsHoliday_yesterday            -> 552.41552
Events                         ->  57.96781
Mean_Sea_Level_PressurehPa     ->   0.02505
Mean_Dew_PointC                -> 784.58981
Week                           -> 2726.98027
StoreType_Hyper Market         -> 576.46638
Max_Humidity                   ->  66.26615
Precipitationmm                -> 192.11503
Region                         -> 808.90225
StoreType_Shopping Center      -> 8682.76105
Region_PopulationK             -> 198.73784
Max_Sea_Level_PressurehPa      ->   6.70062
Mean_Humidity                  ->  99.21445
HasPromotions                  -> 60176.20971
month_avg_sales                -> 649978.82160
Mean_Wind_SpeedKm_h 

In [23]:
# add day of the year
df['Day'] = pd.to_datetime(df.Date).apply(lambda x: x.toordinal())
features.append('Day')

In [24]:
scores = [0] * len(features)
for storeid in range(1000, 1735):
    X = df[df.StoreID == storeid][features]
    y = df[df.StoreID == storeid]['NumberOfSales']
    selector = SelectKBest(f_regression)
    selector.fit(X, y)
    for i in range(len(features)):
        scores[i] += selector.scores_.tolist()[i] / 1736

  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  n_samples * X_means ** 2)


In [25]:
for i, score in enumerate(scores):
    print('{:30s} -> {:9.5f}'.format(X.columns[i], score))

Max_Dew_PointC                 ->   2.45248
Year                           ->   3.33695
daily_sales                    ->       nan
Mean_TemperatureC              ->   2.87663
Min_VisibilitykM               ->   0.79059
Min_Humidity                   ->   2.01672
IsOpen_yesterday               ->       nan
IsHoliday_yesterday            ->   1.93960
Events                         ->   0.82962
Mean_Sea_Level_PressurehPa     ->   0.36002
Mean_Dew_PointC                ->   2.23999
Week                           ->   5.38158
StoreType_Hyper Market         ->       nan
Max_Humidity                   ->   0.41724
Precipitationmm                ->       nan
Region                         ->       nan
StoreType_Shopping Center      ->       nan
Region_PopulationK             ->       nan
Max_Sea_Level_PressurehPa      ->   0.36260
Mean_Humidity                  ->   1.61885
HasPromotions                  -> 167.08142
month_avg_sales                ->  22.95986
Mean_Wind_SpeedKm_h            -