In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from nb_008 import *

# Rossmann

## Data preparation

To create the feature-engineered files `train_clean` and `test_clean` from the initial data, run nb009a.

In [None]:
# Directory holding the feature-engineered feather files.
PATH = Path('data/rossmann/')
# Read the training frame first, then the test frame.
train_df, test_df = (pd.read_feather(PATH/fname) for fname in ('train_clean', 'test_clean'))

In [None]:
# Preview the first rows of the loaded training frame.
train_df.head()

In [None]:
# Columns treated as categorical variables (passed to `Categorify` and used as `cat_names` further down).
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw']

# Columns treated as continuous variables (retyped, NaN-filled and scaled further down).
contin_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
   'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
   'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
   'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']

# Number of rows in the training frame; the trailing `n` displays it as the cell output.
n = len(train_df); n

In [None]:
class TabularTransform():
    "Base class for `DataFrame` transforms that behave differently on training and test data."

    def apply_train(self, df):
        "Training-mode transform of `df`; subclasses must override."
        raise NotImplementedError

    def apply_test(self, df):
        "Test-mode transform of `df`; subclasses must override."
        raise NotImplementedError

    def __call__(self, df, test=False):
        "Run `apply_test(df)` when `test` is truthy, otherwise `apply_train(df)`; returns `None`."
        if test: self.apply_test(df)
        else:    self.apply_train(df)

In [None]:
@dataclass
class Categorify(TabularTransform):
    "Replace the columns listed in `col_names` by their integer category codes, in place."
    col_names:Collection[str]

    def apply_train(self, df):
        "Store each column's ordered categories in `self.categories`, then overwrite the column with its codes."
        self.categories = {}
        for name in self.col_names:
            ordered = df[name].astype('category').cat.as_ordered()
            self.categories[name] = ordered.cat.categories
            df[name] = ordered.cat.codes

    def apply_test(self, df):
        "Encode each column of `df` with the categories recorded by `apply_train`."
        for name in self.col_names:
            known = self.categories[name]
            # NOTE(review): values missing from `known` presumably come out as code -1 (pandas convention) — confirm downstream.
            recoded = pd.Series(pd.Categorical(df[name], categories=known, ordered=True), index=df.index)
            df[name] = recoded.cat.codes

In [None]:
# Record the categories of each categorical column on the training frame and encode it,
# then encode the test frame with those same recorded categories. Both frames are modified in place.
categorify = Categorify(cat_vars)
categorify(train_df)
categorify(test_df, test=True)

In [None]:
@dataclass
class Retype(TabularTransform):
    "Cast the columns in `col_names` to the matching entries of `dtypes`, in place (same for train and test)."
    col_names:Collection[str]
    dtypes:Collection[str]

    def __post_init__(self):
        # `listify` is defined outside this section; presumably it expands `dtypes` to one entry per column — confirm.
        self.dtypes = listify(self.dtypes, self.col_names)

    def apply_train(self, df):
        "Cast each listed column of `df` to its paired dtype."
        pairs = zip(self.col_names, self.dtypes)
        for name,dtype in pairs:
            df[name] = df[name].astype(dtype)

    def apply_test(self, df):
        "Test data gets exactly the same casts as training data."
        self.apply_train(df)

In [None]:
# Cast the continuous columns of both frames to float32, in place. The single-element dtype list is
# passed through `listify` in `Retype.__post_init__` (presumably expanded to one dtype per column).
retype = Retype(contin_vars, ['float32'])
retype(train_df)
retype(test_df, test=True)

In [None]:
@dataclass
class FillNA(TabularTransform):
    "Replace missing values in the columns `col_names` by the constant `fill_val`, in place."
    col_names:Collection[str]
    fill_val:float=0.

    def apply_train(self, df):
        "Fill every listed column of `df` with `self.fill_val`."
        for name in self.col_names:
            filled = df[name].fillna(self.fill_val)
            df[name] = filled

    def apply_test(self, df):
        "Test data is filled exactly like training data."
        self.apply_train(df)

In [None]:
# Replace missing values in the continuous columns of both frames by the default `fill_val` (0.), in place.
fillna = FillNA(contin_vars)
fillna(train_df)
fillna(test_df, test=True)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

In [None]:
@dataclass
class ScaleVar(TabularTransform):
    "Scale the columns in `col_names` with one `StandardScaler` per column, in place."
    col_names:Collection[str]

    def apply_train(self, df):
        "Build and fit the per-column scalers on `df`, then write the scaled values back."
        per_column = [([name], StandardScaler()) for name in self.col_names]
        self.scaler = DataFrameMapper(per_column)
        scaled = self.scaler.fit_transform(df)
        # `transformed_names_` is read only after `fit_transform` has run, as in the original evaluation order.
        df[self.scaler.transformed_names_] = scaled

    def apply_test(self, df):
        "Write back the values of `df` transformed by the mapper created in `apply_train`."
        scaled = self.scaler.transform(df)
        df[self.scaler.transformed_names_] = scaled

In [None]:
# Fit the per-column scalers on the training frame, then transform the test frame with the
# same fitted mapper. Both frames are modified in place.
scaler = ScaleVar(contin_vars)
scaler(train_df)
scaler(test_df, test=True)

In [None]:
# Preview the first rows of the columns written by the scaler.
train_df[scaler.scaler.transformed_names_].head()

In [None]:
# How `FillMissing` picks a replacement value: the column median, or its most common value.
FillStrategy = IntEnum('FillStrategy', 'MEDIAN COMMON')

@dataclass
class FillMissing(TabularTransform):
    """Fill missing values column by column, in place.

    `apply_train` looks at every column of the frame, and for each one that contains missing values
    it stores a filler in `self.na_dict` (median for `FillStrategy.MEDIAN`, otherwise the most
    frequent non-null value) and fills the column with it. `apply_test` fills the columns recorded
    in `self.na_dict` with those stored fillers. When `add_col` is true, a boolean `<name>_na`
    column marking the originally-missing rows is added for each filled column.
    """
    fill_strategy:FillStrategy=FillStrategy.MEDIAN
    add_col:bool=True

    def apply_train(self, df):
        "Compute and store one filler per column of `df` that has missing values, then fill it."
        self.na_dict = {}
        # Snapshot the columns first: `_na` indicator columns are added to `df` inside the loop.
        for name,col in list(df.items()):
            missing = pd.isnull(col)
            if not missing.any(): continue
            if self.add_col: df[name+'_na'] = missing
            if self.fill_strategy == FillStrategy.MEDIAN: filler = col.median()
            else: filler = col.dropna().value_counts().idxmax()
            df[name] = col.fillna(filler)
            self.na_dict[name] = filler

    def apply_test(self, df):
        "Fill the columns recorded by `apply_train` using the stored fillers."
        # Snapshot the columns first: `_na` indicator columns are added to `df` inside the loop.
        for name,col in list(df.items()):
            if name not in self.na_dict: continue
            if self.add_col: df[name+'_na'] = pd.isnull(col)
            # Pass the scalar stored for this column. Handing the whole dict to `Series.fillna`
            # matches its keys against index labels, so the missing values were left unfilled.
            df[name] = col.fillna(self.na_dict[name])

In [None]:
# Train mode: record a filler (median, the default strategy) for each column of the training frame that
# has missing values and fill it. Then run the test-mode transform on the test frame.
fill_missing = FillMissing()
fill_missing(train_df)
fill_missing(test_df, test=True)

In [None]:
from pandas.api.types import is_numeric_dtype

In [None]:
class TabularDataset():
    """Dataset over a `DataFrame` whose items are `((cats, conts), y)`.

    - `df`: source frame; it is only read, never modified, by the constructor.
    - `dep_var`: name of the target column. A non-numeric target is replaced by its category codes.
    - `cat_names` / `cont_names`: columns stacked into the `int64` tensor `self.cats` and the
      `float32` tensor `self.conts`. When a list is missing or empty, an `(n, 1)` tensor of zeros
      is used instead.
    - `log_output`: if true, the target is converted to float and its logarithm is stored.
    """
    def __init__(self, df, dep_var, cat_names=None, cont_names=None, log_output=False):
        # Work on a local Series so the caller's frame keeps its target column and the same frame
        # can be used to build a dataset more than once.
        target = df[dep_var]
        # `astype('category')` leaves categorical columns unchanged and also accepts object/string
        # targets, which have no `.cat` accessor.
        if not is_numeric_dtype(target): target = target.astype('category').cat.codes
        self.y = torch.tensor(target.values)
        if log_output: self.y = torch.log(self.y.float())
        n_rows = len(self.y)
        self.cats = LongTensor(self._stack_columns(df, cat_names, n_rows).astype(np.int64))
        self.conts = FloatTensor(self._stack_columns(df, cont_names, n_rows).astype(np.float32))

    @staticmethod
    def _stack_columns(df, names, n_rows):
        "Stack the `names` columns of `df` side by side, or return an `(n_rows, 1)` zero placeholder if there are none."
        if not names: return np.zeros((n_rows,1))
        return np.stack([col.values for _,col in df[names].items()], 1)

    def __len__(self): return len(self.y)
    def __getitem__(self, idx): return ((self.cats[idx], self.conts[idx]), self.y[idx])

    @classmethod
    def from_dataframes(cls, train_df, test_df, dep_var, tfms=None, **kwargs):
        """Apply each transform in `tfms`, in order, to both frames and build one dataset per frame.

        Every transform is called on `train_df` in train mode, then on `test_df` with `test=True`.
        The transforms modify the frames in place. `kwargs` are forwarded to the constructor.
        """
        if tfms is None: tfms = []
        for tfm in tfms:
            tfm(train_df)
            tfm(test_df, test=True)
        return cls(train_df, dep_var, **kwargs), cls(test_df, dep_var, **kwargs)

In [None]:
# Name of the target column.
dep_var = 'Sales'
# Reload the cleaned training data, keep only the model inputs, the target and 'Date',
# and index the rows by 'Date'.
train_df = (
    pd.read_feather(PATH/'train_clean')[cat_vars+contin_vars+[dep_var, 'Date']]
    .copy()
    .set_index('Date')
)

In [None]:
# Hold out the first 10% of the rows as the validation set; the rest stays as the training set.
cut = int(len(train_df) * 0.1)
# Slice by position explicitly and take independent copies: the transforms applied later assign
# columns in place, which on plain slices of the parent frame is chained assignment
# (SettingWithCopyWarning, ambiguous view/copy semantics).
train_df,valid_df = train_df.iloc[cut:].copy(), train_df.iloc[:cut].copy()
len(train_df),len(valid_df)

In [None]:
# Preprocessing pipeline; `TabularDataset.from_dataframes` applies the transforms in list order.
tfms = [
    Categorify(cat_vars),
    Retype(contin_vars, ['float32']),
    FillNA(contin_vars),
    ScaleVar(contin_vars),
    FillMissing(),
]

In [None]:
# Run every transform in `tfms` on the training frame (train mode) and the validation frame
# (test mode), then wrap both frames as datasets. With log_output=True the stored target is log(Sales).
train_ds, valid_ds = TabularDataset.from_dataframes(train_df, valid_df, dep_var, tfms, cat_names=cat_vars, 
                                                    cont_names=contin_vars, log_output=True)

In [None]:
# Sanity check: number of items in each dataset.
len(train_ds), len(valid_ds)

In [None]:
# Bundle both datasets with batch size 64 and one worker. `DataBunch` is not defined in this
# section; presumably it comes from the `nb_008` star import — confirm.
data = DataBunch.create(train_ds, valid_ds, bs=64, num_workers=1)